Compare commits

..

71 Commits

Author SHA1 Message Date
Joonas Koivunen
ca1ed3dc3b drive by typo fix 2022-11-02 21:11:05 +02:00
Joonas Koivunen
dc2554dff6 chore: remove no longer needed empty rel fix
this seems to have been fixed long enough ago.
2022-11-02 21:10:44 +02:00
Joonas Koivunen
5112142997 fix: use different port for temporary postgres (#2743)
`test_tenant_relocation` ends up starting a temporary postgres instance with a fixed port. The change makes the port configurable in scripts/export_import_between_pageservers.py and uses that in test_tenant_relocation.
2022-11-02 18:37:48 +00:00
bojanserafimov
a0a74868a4 Fix clippy (#2742) 2022-11-02 12:30:09 -04:00
Christian Schwarz
b154992510 timeline_list_handler: avoid spawn_blocking
As per https://github.com/neondatabase/neon/issues/2731#issuecomment-1299335813

refs https://github.com/neondatabase/neon/issues/2731
2022-11-02 16:22:58 +01:00
Christian Schwarz
a86a38c96e README: fix instructions on how to run tests
The `make debug` target doesn't exist, and I can't find it in the Git
history.
2022-11-02 16:22:58 +01:00
Christian Schwarz
590f894db8 tenant_status: remove unnecessary spawn_blocking
The spawn_blocking is pointless in this case: get_tenant is not
expected to block for any meaningful amount of time. There are
get_tenant calls in most other functions in the file too, and they don't
bother with spawn_blocking. Let's remove the spawn_blocking from
tenant_status, too, to be consistent.

fixes https://github.com/neondatabase/neon/issues/2731
2022-11-02 16:22:58 +01:00
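A minimal sketch of the difference described above, assuming a tokio runtime; `get_tenant` and both handler shapes are hypothetical stand-ins for the real pageserver HTTP handlers:

```
use tokio::task;

// Hypothetical cheap lookup standing in for the pageserver's get_tenant;
// it does not block for any meaningful amount of time.
fn get_tenant(id: u64) -> Result<String, String> {
    Ok(format!("tenant {id}: active"))
}

// Before: wrapping a cheap call in spawn_blocking only adds a thread hop.
async fn tenant_status_spawn_blocking(id: u64) -> Result<String, String> {
    task::spawn_blocking(move || get_tenant(id))
        .await
        .map_err(|e| e.to_string())?
}

// After: call it directly, like the other handlers in the file already do.
async fn tenant_status(id: u64) -> Result<String, String> {
    get_tenant(id)
}

#[tokio::main]
async fn main() {
    println!("{:?}", tenant_status(1).await);
    println!("{:?}", tenant_status_spawn_blocking(1).await);
}
```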
Alexander Bayandin
0a0595b98d test_backward_compatibility: assign random port to compute (#2738) 2022-11-02 15:22:38 +00:00
Dmitry Rodionov
e56d11c8e1 fix style if possible (cannot really split long lines in mermaid) 2022-11-02 17:15:49 +02:00
Dmitry Rodionov
ccdc3188ed update according to discussion and comments 2022-11-02 17:15:49 +02:00
Dmitry Rodionov
67401cbdb8 pageserver s3 coordination 2022-11-02 17:15:49 +02:00
Kirill Bulatov
d42700280f Remove daemonize from storage components (#2677)
Move daemonization logic into `control_plane`.
Storage binaries now only create a lockfile to avoid concurrent services running in the same directory.
2022-11-02 02:26:37 +02:00
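A minimal sketch of the lockfile idea above, assuming the `fs2` and `anyhow` crates (the Cargo.lock diff further down shows `fs2` being added); the file name and error message are illustrative:

```
use std::fs::{File, OpenOptions};
use std::path::Path;

use fs2::FileExt;

/// Take an exclusive lock in the working directory instead of daemonizing.
/// The returned File must be kept alive for the lifetime of the process.
fn claim_workdir_lock(workdir: &Path) -> anyhow::Result<File> {
    let lock_path = workdir.join("service.lock"); // illustrative name
    let file = OpenOptions::new()
        .create(true)
        .write(true)
        .open(&lock_path)?;
    // Fails right away if another instance already runs in this directory.
    file.try_lock_exclusive()
        .map_err(|e| anyhow::anyhow!("another instance holds {lock_path:?}: {e}"))?;
    Ok(file)
}
```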
Kirill Bulatov
6df4d5c911 Bump rustc to 1.62.1 (#2728)
Changelog: https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1621-2022-07-19
2022-11-02 01:21:33 +02:00
Dmitry Rodionov
32d14403bd remove wrong is_active filter for timelines in compaction/gc
GC needs to know about all branch points, not only those of
timelines that are active at the moment of GC. If a timeline
is inactive, we won't know about its branch point, and in that
case GC can delete data that is still needed by the child timeline.

For compaction it is less severe: delaying compaction can
affect performance, so it is still better to run it. There is
logic to exit quickly if there is nothing to compact.
2022-11-01 18:07:08 +02:00
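A minimal sketch of the GC argument above, with hypothetical types; the point is that branch-point LSNs must be collected from every child timeline, active or not:

```
#[derive(Clone, Copy, PartialEq)]
enum TimelineState {
    Active,
    Inactive,
}

struct Timeline {
    state: TimelineState,
    ancestor_lsn: Option<u64>, // LSN where this timeline branched off, if any
}

/// LSNs that GC must retain because some child timeline branches there.
fn gc_retain_lsns(timelines: &[Timeline]) -> Vec<u64> {
    timelines
        .iter()
        // No `.filter(|t| t.state == TimelineState::Active)` here:
        // an inactive child still pins its branch point.
        .filter_map(|t| t.ancestor_lsn)
        .collect()
}

fn main() {
    let timelines = vec![
        Timeline { state: TimelineState::Active, ancestor_lsn: None },
        Timeline { state: TimelineState::Inactive, ancestor_lsn: Some(0x1000) },
    ];
    assert_eq!(gc_retain_lsns(&timelines), vec![0x1000]);
}
```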
Dmitry Ivanov
0df3467146 Refactoring: replace utils::connstring with Url-based APIs 2022-11-01 18:17:36 +03:00
Dmitry Rodionov
c64a121aa8 do not nest wal_connection_manager span inside parent one 2022-11-01 15:08:23 +02:00
Heikki Linnakangas
22cc8760b9 Move walredo process code under pgxn in the main 'neon' repository.
- Refactor the way the WalProposerMain function is called when started
  with --sync-safekeepers. The postgres binary now explicitly loads
  the 'neon.so' library and calls the WalProposerMain in it. This is
  simpler than the global function callback "hook" we previously used.

- Move the WAL redo process code to a new library, neon_walredo.so,
  and use the same mechanism as for --sync-safekeepers to call the
  WalRedoMain function, when launched with --walredo argument.

- Also move the seccomp code to neon_walredo.so library. I kept the
  configure check in the postgres side for now, though.
2022-10-31 01:11:50 +01:00
Arseny Sher
596d622a82 Fix test_prepare_snapshot.
It should checkpoint the pageserver after waiting for all data to arrive, not before.
2022-10-28 22:12:31 +04:00
Sergey Melnikov
7481fb082c Fix bugs in #2713 (#2716) 2022-10-28 14:12:49 +00:00
Arseny Sher
1eb9bd052a Bump vendor/postgres-v15 to fix XLP_FIRST_IS_CONTRECORD issue.
ref https://github.com/neondatabase/cloud/issues/2688
2022-10-28 16:45:11 +03:00
Sergey Melnikov
59a3ca4ec6 Deploy proxy to new prod regions (#2713)
* Refactor proxy deploy

* Test new prod deploy

* Remove assume role

* Add new values

* Add all regions
2022-10-28 16:25:28 +03:00
Sergey Melnikov
e86a9105a4 Deploy storage to new prod regions (#2709) 2022-10-28 10:17:27 +00:00
Stas Kelvich
d3c8749da5 Build compute postgres with openssl support
The main reason for that change is that Postgres 15 requires OpenSSL
for `pgcrypto` to work. It's also not a bad idea to have SSL-enabled
Postgres in general.
2022-10-28 10:39:22 +03:00
Alexander Bayandin
128dc8d405 Nightly Benchmarks: fix workflow (#2708) 2022-10-27 19:26:10 +03:00
Alexander Bayandin
0cbae6e8f3 test_backward_compatibility: friendlier error message (#2707) 2022-10-27 15:54:49 +00:00
Alexander Stanovoy
78e412b84b The fix of #2650. (#2686)
* Wrappers and drop implementations for image and delta layer writers.
* Two regression tests for the image and delta layer files.
2022-10-27 14:02:55 +00:00
Rory de Zoete
6dbf202e0d Update crane copy target (#2704)
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
2022-10-27 16:00:40 +02:00
Arseny Sher
b42bf9265a Enable etcd compaction in neon_local. 2022-10-27 10:47:08 +03:00
Stas Kelvich
1f08ba5790 Avoid debian-testing packages in compute Dockerfiles
plv8 can only be built with a fairly new gold linker version. We used to install
it via binutils packages from testing, but that also updates libc, which causes
trouble in the resulting image as different extensions end up built against
different libc versions. We could either use libc from debian-testing everywhere
or refrain from using testing packages and install the necessary programs manually.
This patch uses the latter approach: gold for plv8 and cmake for h3 are
installed manually.

In passing, declare h3_postgis as a safe extension (a previous omission).
2022-10-27 09:44:16 +03:00
bojanserafimov
0c54eb65fb Move pagestream api to libs/pageserver_api (#2698) 2022-10-26 17:32:31 -04:00
mikecaat
259a5f356e Add a docker-compose example file (#1943) (#2666)
Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>
2022-10-26 13:59:25 +03:00
Sergey Melnikov
a3cb8c11e0 Do not release to new staging proxies on release (#2685) 2022-10-25 23:51:23 +00:00
bojanserafimov
9fb2287f87 Add draw_timeline binary (#2688) 2022-10-25 11:25:22 -04:00
Alexander Bayandin
834ffe1bac Add data format backward compatibility tests (#2626) 2022-10-25 16:41:50 +02:00
Stas Kelvich
df18b041c0 Use apt version pinning instead of repo priorities
Higher `bullseye` priority doesn't work for packages installed
via `bullseye-updates`, e.g.:

```
libc-bin:
  Installed: 2.31-13+deb11u5
  Candidate: 2.35-3
  Version table:
     2.35-3 500
        500 http://ftp.debian.org/debian testing/main amd64 Packages
 *** 2.31-13+deb11u5 500
        500 http://deb.debian.org/debian bullseye-updates/main amd64 Packages
        100 /var/lib/dpkg/status
     2.31-13+deb11u4 990
        990 http://deb.debian.org/debian bullseye/main amd64 Packages
```

Try version pinning instead
2022-10-25 14:29:11 +03:00
Anastasia Lubennikova
39897105b2 Check the postgres version and ensure that the public schema exists
before running the GRANT query on it
2022-10-25 09:55:24 +03:00
Stas Kelvich
2f399f08b2 Hotfix to disable grant create on public schema
`GRANT CREATE ON SCHEMA public` fails if there is no schema `public`.
Disable it in release for now and make a better fix later (it is
needed for v15 support).
2022-10-25 09:55:24 +03:00
Arseny Sher
9f49605041 Fix division by zero panic in determine_offloader. 2022-10-22 18:25:12 +03:00
Konstantin Knizhnik
7b6431cbd7 Disable wal_log_hints by default (#2598)
* Disable wal_log_hints by default

* Remove obsolete comment anbout wal_log_hints
2022-10-22 14:59:18 +03:00
Lassi Pölönen
321aeac3d4 Json logging capability (#2624)
* Support configuring the log format as json or plain.

Separately test json and plain logger. They would be competing on the
same global subscriber otherwise.

* Implement log_format for pageserver config

* Implement configurable log format for safekeeper.
2022-10-21 17:30:20 +00:00
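A minimal sketch of such a format switch, assuming `tracing-subscriber` with its `json` feature enabled (the Cargo.lock diff further down shows `tracing-serde` and `serde_json` being pulled in); `LogFormat` is a hypothetical stand-in for the real config value:

```
use tracing_subscriber::fmt;

enum LogFormat {
    Plain,
    Json,
}

/// Install the global subscriber once, in the chosen format.
/// (This is also why the tests exercise the two formats separately:
/// they would otherwise compete for the same global subscriber.)
fn init_logging(format: LogFormat) {
    match format {
        LogFormat::Plain => fmt().init(),
        LogFormat::Json => fmt().json().init(),
    }
}

fn main() {
    init_logging(LogFormat::Json);
    tracing::info!(component = "pageserver", "logging initialized");
}
```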
Andrés
71ef7b6663 Remove cached_property package (#2673)
Co-authored-by: andres <andres.rodriguez@outlook.es>
2022-10-21 20:02:31 +03:00
Kirill Bulatov
5928cb33c5 Introduce timeline state (#2651)
Similar to https://github.com/neondatabase/neon/pull/2395, this introduces a state field in Timeline that can be subscribed to.

Adjusts:

* walreceiver, so it keeps no connections if the timeline is not Active
* remote storage sync, so it does not schedule uploads if the timeline is Broken
* timeline creation, so timelines are not created if a tenant/timeline is broken
* timelines' states, which now switch automatically based on the tenant state

Does not adjust the timeline's GC, checkpointing and layer flush behaviour much, since it's not safe to cancel these processes abruptly and task_mgr::shutdown_tasks already does a similar thing.
2022-10-21 15:51:48 +00:00
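A minimal sketch of the subscribable state field described above, assuming `tokio::sync::watch`; `Active` and `Broken` come from the commit message, while `Loading` and the struct layout are illustrative:

```
use tokio::sync::watch;

#[derive(Clone, Copy, Debug, PartialEq)]
enum TimelineState {
    Loading, // illustrative initial state
    Active,
    Broken,
}

struct Timeline {
    state: watch::Sender<TimelineState>,
}

impl Timeline {
    fn new() -> Self {
        let (tx, _rx) = watch::channel(TimelineState::Loading);
        Timeline { state: tx }
    }

    fn set_state(&self, new_state: TimelineState) {
        // Ignore the case where nobody is subscribed anymore.
        let _ = self.state.send(new_state);
    }

    /// E.g. the walreceiver subscribes here and drops its connections
    /// whenever the state is no longer Active.
    fn subscribe(&self) -> watch::Receiver<TimelineState> {
        self.state.subscribe()
    }
}

#[tokio::main]
async fn main() {
    let timeline = Timeline::new();
    let mut state_rx = timeline.subscribe();
    timeline.set_state(TimelineState::Active);
    state_rx.changed().await.unwrap();
    assert_eq!(*state_rx.borrow(), TimelineState::Active);
}
```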
Sergey Melnikov
6ff2c61ae0 Refactor safekeeper s3 config and change it for new account (#2672) 2022-10-21 13:44:08 +00:00
Arseny Sher
7480a0338a Determine safekeeper for offloading WAL without etcd election API.
This API is rather pointless: a sane choice requires knowledge of peer
status anyway, and leader lifetimes can intersect in any case, which is fine
for us -- so manual elections are straightforward. Here, we deterministically
choose among the reasonably caught-up safekeepers, shifting by timeline id to
spread the load.

A step towards custom broker https://github.com/neondatabase/neon/issues/2394
2022-10-21 15:33:27 +03:00
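A minimal sketch of the manual election described above, with hypothetical id types; it also guards against an empty candidate list, the division-by-zero case fixed by 9f49605041 earlier in this list:

```
/// Hypothetical id type; the real code uses safekeeper and timeline id types.
type NodeId = u64;

/// Deterministically pick which safekeeper offloads WAL for a timeline:
/// take the reasonably caught-up peers, then shift the choice by a
/// timeline-derived seed so different timelines spread the load.
fn determine_offloader(caught_up: &[NodeId], timeline_seed: u64) -> Option<NodeId> {
    if caught_up.is_empty() {
        // Avoid the `% 0` panic when no safekeeper is caught up yet.
        return None;
    }
    let idx = (timeline_seed % caught_up.len() as u64) as usize;
    Some(caught_up[idx])
}

fn main() {
    let caught_up = vec![11, 12, 13];
    assert_eq!(determine_offloader(&caught_up, 7), Some(12));
    assert_eq!(determine_offloader(&[], 7), None);
}
```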
Sergey Melnikov
2709878b8b Deploy scram proxies into new account (#2643) 2022-10-21 14:21:22 +03:00
Kirill Bulatov
39e4bdb99e Actualize tenant and timeline API modifiers (#2661)
* Actualize tenant and timeline API modifiers
* Use anyhow::Result explicitly
2022-10-21 10:58:43 +00:00
Anastasia Lubennikova
52e75fead9 Use anyhow::Result explicitly 2022-10-21 12:47:06 +03:00
Anastasia Lubennikova
a347d2b6ac #2616 handle 'Unsupported pg_version' error properly 2022-10-21 12:47:06 +03:00
Heikki Linnakangas
fc4ea3553e test_gc_cutoff.py fixes (#2655)
* Fix bogus early exit from GC.

Commit 91411c415a added this failpoint, but the early exit was not
intentional.

* Cleanup test_gc_cutoff.py test.

- Remove the 'scale' parameter; this isn't a benchmark
- Tweak pgbench and pageserver options to create garbage faster than the
  GC can collect it away. The test used to take just under 5 minutes,
  which was uncomfortably close to the default 5 minute test timeout, and
  annoyingly long even without the hard limit. These changes bring it down
  to about 1-2 minutes.
- Improve comments, fix typos
- Rename the failpoint. The old name, 'gc-before-save-metadata', implied
  that the failpoint was before the metadata update, but it was in fact
  much later in the function.
- Move the call to persist the metadata outside the lock, to avoid
  holding it for too long.

To verify that this test still covers the original bug,
https://github.com/neondatabase/neon/issues/2539, I commented out
updating the metadata file like this:
```
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1e857a9a..f8a9f34a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1962,7 +1962,7 @@ impl Timeline {
         }
         // Persist the new GC cutoff value in the metadata file, before
         // we actually remove anything.
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
+        //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;

         info!("GC starting");

```
It doesn't fail every time with that, but it did fail after about 5
runs.
2022-10-21 02:39:55 +03:00
Dmitry Rodionov
cca1ace651 make launch_wal_receiver infallible 2022-10-21 00:40:12 +03:00
Sergey Melnikov
30984c163c Fix race between pushing image to ECR and copying to dockerhub (#2662) 2022-10-20 23:01:01 +03:00
Konstantin Knizhnik
7404777efc Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged (#2657)
* Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged

refer #2587

* Bump postgres versions
2022-10-20 20:06:05 +03:00
Heikki Linnakangas
eb1bdcc6cf If an FSM or VM page cannot be reconstructed, fill it with zeros.
If we cannot reconstruct an FSM or VM page, while creating image
layers, fill it with zeros instead. That should always be safe, for
the FSM and VM, in the sense that you won't lose actual user data. It
will get cleaned up by VACUUM later.

We had a bug with FSM/VM truncation, where we truncated the FSM and VM
at WAL replay to a smaller size than PostgreSQL originally did. We
thought that was harmless, as the FSM and VM are not critical for
correctness and can be zeroed out or truncated without affecting user
data. However, it led to a situation where PostgreSQL created
incremental WAL records for pages that we had already truncated away
in the pageserver, and when we tried to replay those WAL records, that
failed. That led to a permanent error in image layer creation, and
prevented it from ever finishing. See
https://github.com/neondatabase/neon/issues/2601. With this patch,
those pages will be filled with zeros in the image layer, which allows
the image layer creation to finish.
2022-10-20 17:27:01 +03:00
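A minimal sketch of the fallback described above, with hypothetical fork numbers and an `anyhow`-based reconstruction callback; the real logic lives in the pageserver's image-layer creation path:

```
const BLCKSZ: usize = 8192;

// Hypothetical fork numbers, mirroring PostgreSQL's FSM and visibility map forks.
const FSM_FORKNUM: u8 = 1;
const VISIBILITYMAP_FORKNUM: u8 = 2;

fn materialize_page(
    forknum: u8,
    reconstruct: impl Fn() -> anyhow::Result<[u8; BLCKSZ]>,
) -> anyhow::Result<[u8; BLCKSZ]> {
    match reconstruct() {
        Ok(page) => Ok(page),
        // FSM/VM pages are not critical for correctness: a zeroed page loses no
        // user data and VACUUM rebuilds it later, so don't let a reconstruction
        // error block image layer creation forever.
        Err(_) if forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM => {
            Ok([0u8; BLCKSZ])
        }
        Err(err) => Err(err),
    }
}

fn main() {
    let failing = || -> anyhow::Result<[u8; BLCKSZ]> { Err(anyhow::anyhow!("walredo failed")) };
    let page = materialize_page(FSM_FORKNUM, failing);
    assert!(page.is_ok()); // FSM reconstruction failure falls back to a zero page
}
```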
Arthur Petukhovsky
f5ab9f761b Remove flaky checks in test_delete_force (#2567) 2022-10-20 17:14:32 +04:00
Kirill Bulatov
306a47c4fa Use uninit mark files during timeline init for atomic creation (#2489)
Part of https://github.com/neondatabase/neon/pull/2239

Regular, from-scratch timeline creation involves running initdb in a separate directory, importing the data from that directory into the pageserver and, finally, starting the timeline-related background tasks.

This PR ensures we don't leave behind any directories that are not marked as temporary, and that the pageserver removes such directories on restart, allowing timeline creation to be retried with the same IDs, if needed.

It would be good to later rewrite the logic to use a temporary directory, similar to what tenant creation does. That is currently harder than this change, so it is not done here.
2022-10-20 14:19:17 +03:00
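A minimal sketch of the uninit-mark idea above, with illustrative paths and names; the real implementation ties this into timeline creation and pageserver startup:

```
use std::fs;
use std::path::Path;

/// Create the mark before the timeline directory, remove it only after
/// the initdb import and initialization fully succeed.
fn create_timeline_atomically(timeline_dir: &Path) -> anyhow::Result<()> {
    let uninit_mark = timeline_dir.with_extension("uninit"); // illustrative suffix
    fs::write(&uninit_mark, b"")?;
    fs::create_dir_all(timeline_dir)?;
    // ... run initdb and import its data into the timeline here ...
    fs::remove_file(&uninit_mark)?; // creation is now considered complete
    Ok(())
}

/// On restart, any directory whose mark survived belongs to an interrupted
/// creation: delete it so the same IDs can be reused for a retry.
fn cleanup_uninitialized(timeline_dir: &Path) -> anyhow::Result<()> {
    let uninit_mark = timeline_dir.with_extension("uninit");
    if uninit_mark.exists() {
        if timeline_dir.exists() {
            fs::remove_dir_all(timeline_dir)?;
        }
        fs::remove_file(&uninit_mark)?;
    }
    Ok(())
}
```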
Kirill Bulatov
84c5f681b0 Fix test feature detection (#2659)
Follow-up of #2636 and #2654, fixing the test feature detection.

Pageserver currently outputs features as

```
/target/debug/pageserver --version
Neon page server git:7734929a8202c8cc41596a861ffbe0b51b5f3cb9 failpoints: true, features: ["testing", "profiling"]
```
2022-10-20 13:44:03 +03:00
Kirill Bulatov
50297bef9f RFC about Tenant / Timeline guard objects (#2660)
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2022-10-20 12:49:54 +03:00
Andrés
9211923bef Pageserver Python tests should not fail if the server is built with no testing feature (#2636)
Co-authored-by: andres <andres.rodriguez@outlook.es>
2022-10-20 10:46:57 +03:00
bojanserafimov
7734929a82 Remove stale todos (#2630) 2022-10-19 22:59:22 +00:00
Heikki Linnakangas
bc5ec43056 Fix flaky physical-size tests in test_timeline_size.py.
These two tests, test_timeline_physical_size_post_compaction and
test_timeline_physical_size_post_gc, assumed that after you have
waited for the WAL from a bulk insertion to arrive, and you run a
cycle of checkpoint and compaction, no new layer files are created.
Because if a new layer file is created while we are calculating the
incremental and non-incremental physical sizes, they might differ.

However, the tests used a very small checkpoint_distance, so even a
small amount of WAL generated in PostgreSQL could cause a new layer
file to be created. Autovacuum can kick in at any time, and do that.
That caused occasional failures in the test. I was able to reproduce it
reliably by adding a long delay between the incremental and
non-incremental size calculations:

```
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -129,6 +129,9 @@ async fn build_timeline_info(
         }
     };
     let current_physical_size = Some(timeline.get_physical_size());
+    if include_non_incremental_physical_size {
+        std::thread::sleep(std::time::Duration::from_millis(60000));
+    }

     let info = TimelineInfo {
         tenant_id: timeline.tenant_id,
```

To fix, disable autovacuum for the table. Autovacuum could still kick
in for other tables, e.g. catalog tables, but that seems less likely
to generate enough WAL to cause a new layer file to be flushed.

If this continues to be a problem in the future, we could simply retry
the physical size call a few times, if there's a mismatch. A mismatch
could happen every once in a while, but it's very unlikely to happen
more than once or twice in a row.

Fixes https://github.com/neondatabase/neon/issues/2212
2022-10-19 23:50:21 +03:00
MMeent
b237feedab Add more redo metrics: (#2645)
- Measure size of redo WAL (new histogram), with bounds between 24B-32kB
- Add 2 more buckets at the upper end of the redo time histogram
  We often (>0.1% of several hours each day) take more than 250ms to do the
  redo round-trip to the postgres process. We need to measure these redo
  times more precisely.
2022-10-19 22:47:11 +02:00
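A minimal sketch of a histogram with explicit bounds like those described above, assuming the `prometheus` crate; the metric name and exact bucket edges are illustrative:

```
use prometheus::{Histogram, HistogramOpts};

fn wal_redo_record_size_histogram() -> prometheus::Result<Histogram> {
    // Record-size buckets spanning roughly 24 B .. 32 kB.
    let buckets = vec![24.0, 64.0, 256.0, 1024.0, 4096.0, 8192.0, 16384.0, 32768.0];
    let opts = HistogramOpts::new(
        "wal_redo_record_bytes", // illustrative name
        "Size of WAL records sent to the walredo process",
    )
    .buckets(buckets);
    Histogram::with_opts(opts)
}

fn main() -> prometheus::Result<()> {
    let hist = wal_redo_record_size_histogram()?;
    hist.observe(120.0); // e.g. a 120-byte record
    Ok(())
}
```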
Alexey Kondratov
4d1e48f3b9 [compute_ctl] Use postgres::config to properly escape database names (#2652)
We've got at least one user in production who cannot create a
database with a trailing space in the name.

This happens because we use the `url` crate for manipulating the
DATABASE_URL, but it follows a standard that doesn't fit Postgres
very well. For example, it trims all trailing spaces
from the path:

  > Remove any leading and trailing C0 control or space from input.
  > See: https://url.spec.whatwg.org/#url-parsing

But we used `set_path()` to set the database name, and it's totally valid
to have trailing spaces in the database name in Postgres.

Thus, use `postgres::config::Config` to modify the database name in the
connection details.
2022-10-19 19:20:06 +02:00
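A minimal sketch of the approach described above, assuming the `postgres` crate's `Config`; the connection details are illustrative:

```
use postgres::config::Config;

fn connection_config_for(dbname: &str) -> Config {
    let mut config = Config::new();
    config
        .host("localhost") // illustrative connection details
        .port(5432)
        .user("cloud_admin")
        .dbname(dbname); // preserves trailing spaces, unlike Url::set_path
    config
}

fn main() {
    // A database name with a trailing space stays intact.
    let cfg = connection_config_for("tricky db ");
    println!("{:?}", cfg.get_dbname());
}
```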
Anastasia Lubennikova
7576b18b14 [compute_tools] fix GRANT CREATE ON SCHEMA public -
run the grant query in each database
2022-10-19 18:37:52 +03:00
Konstantin Knizhnik
6b49b370fc Fix build after applying PR #2558 2022-10-19 13:55:30 +03:00
Konstantin Knizhnik
91411c415a Persists latest_gc_cutoff_lsn before performing GC (#2558)
* Persists latest_gc_cutoff_lsn before performing GC

* Peform some refactoring and code deduplication

refer #2539

* Add test for persisting GC cutoff

* Fix python test style warnings

* Bump postgres version

* Reduce number of iterations in test_gc_cutoff test

* Bump postgres version

* Undo bumping postgres version
2022-10-19 12:32:03 +03:00
Kirill Bulatov
c67cf34040 Update GH Action version (#2646) 2022-10-19 11:16:36 +03:00
bojanserafimov
8fbe437768 Improve pageserver IO metrics (#2629) 2022-10-18 11:53:28 -04:00
Heikki Linnakangas
989d78aac8 Buffer the TCP incoming stream on libpq connections.
Reduces the number of syscalls needed to read the commands from the
compute.

Here's a snippet of strace output from the pageserver, when performing
a sequential scan on a table, with prefetch:

    3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1
    3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4
    3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\3", 27, 0, NULL, NULL) = 27
    3084934 pread64(28, "\0\0\0\1\0\0\0\0\0\0\0\253                    "..., 8192, 25190400) = 8192
    3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\3A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}])
    3084934 read(46, "\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3084934 sendto(47, "d\0\0 \5f\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1
    3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4
    3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\4", 27, 0, NULL, NULL) = 27
    3084934 pread64(28, "    \0=\0L\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0;;\0\0\0\4\4\0"..., 8192, 25198592) = 8192
    3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\4A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}])
    3084934 read(46, "\0\0\0\0\260\344q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3084934 sendto(47, "d\0\0 \5f\0\0\0\0\260\344q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1
    3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4
    3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\5", 27, 0, NULL, NULL) = 27
    3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\5A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}])
    3084934 read(46, "\0\0\0\0\330\377q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3084934 sendto(47, "d\0\0 \5f\0\0\0\0\330\377q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198

This shows the interaction for three get_page_at_lsn requests. For
each request, the pageserver performs three recvfrom syscalls to read
the incoming request from the socket. After this patch, those recvfrom
calls are gone:

    3086123 read(47, "\0\0\0\0\360\222q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3086123 sendto(45, "d\0\0 \5f\0\0\0\0\360\222q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3086123 pread64(29, "                                "..., 8192, 25182208) = 8192
    3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\2A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}])
    3086123 read(47, "\0\0\0\0000\256q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3086123 sendto(45, "d\0\0 \5f\0\0\0\0000\256q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3086123 pread64(29, "\0\0\0\1\0\0\0\0\0\0\0\253                    "..., 8192, 25190400) = 8192
    3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\3A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}])
    3086123 read(47, "\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3086123 sendto(45, "d\0\0 \5f\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3086123 pread64(29, "    \0=\0L\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0;;\0\0\0\4\4\0"..., 8192, 25198592) = 8192
    3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\4A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}])

In this test, the compute sends a batch of prefetch requests, and they
are read from the socket in one syscall. That syscall was not captured
by the strace snippet above, but there are much fewer of them than
before.
2022-10-18 18:46:07 +03:00
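A minimal sketch of the effect of the change above, using std networking types; wrapping the socket in a buffered reader turns the three small `recvfrom` calls per request into reads from an 8 kB userspace buffer:

```
use std::io::{BufReader, Read};
use std::net::TcpStream;

fn read_message(stream: TcpStream) -> std::io::Result<Vec<u8>> {
    // One read(2) typically fills the 8 kB buffer; the per-message reads
    // below are then served from memory instead of separate syscalls.
    let mut reader = BufReader::with_capacity(8192, stream);

    let mut tag = [0u8; 1];
    reader.read_exact(&mut tag)?; // message type byte, e.g. b'd'

    let mut len_bytes = [0u8; 4];
    reader.read_exact(&mut len_bytes)?; // length, including itself
    let len = u32::from_be_bytes(len_bytes) as usize;

    let mut body = vec![0u8; len.saturating_sub(4)];
    reader.read_exact(&mut body)?; // payload
    Ok(body)
}
```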
Stas Kelvich
7ca72578f9 Enable plv8 again
Now with quickfix for https://github.com/plv8/plv8/issues/503
2022-10-18 18:34:27 +03:00
Heikki Linnakangas
41550ec8bf Remove unnecessary indirections of libpqwalproposer functions
In the Postgres backend, we cannot link directly with libpq (check the
pgsql-hackers arhive for all kinds of fun that ensued when we tried to
do that). Therefore, the libpq functions are used through the thin
wrapper functions in libpqwalreceiver.so, and libpqwalreceiver.so is
loaded dynamically. To hide the dynamic loading and make the calls
look like regular functions, we use macros to hide the function
pointers.

We had inherited the same indirections in libpqwalproposer, but they are
not needed since the neon extension is already a shared library that's
loaded dynamically. There's no problem calling the functions directly
there. Remove the indirections.
2022-10-18 18:25:30 +03:00
Sergey Melnikov
0cd2d91b9d Fix deploy-new job by installing sivel.toiletwater (#2641) 2022-10-18 14:44:19 +00:00
140 changed files with 6733 additions and 2738 deletions

View File

@@ -73,6 +73,14 @@ runs:
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest
- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
@@ -80,6 +88,8 @@ runs:
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -154,6 +164,15 @@ runs:
scripts/generate_and_push_perf_report.sh
fi
- name: Upload compatibility snapshot for Postgres 14
if: github.ref_name == 'release'
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
# The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
prefix: latest
- name: Create Allure report
if: always()
uses: ./.github/actions/allure-report

View File

@@ -3,7 +3,6 @@ storage:
bucket_name: neon-storage-ireland
bucket_region: eu-west-1
console_mgmt_base_url: http://neon-stress-console.local
env_name: neon-stress
etcd_endpoints: neon-stress-etcd.local:2379
safekeeper_enable_s3_offload: 'false'
pageserver_config_stub:
@@ -12,6 +11,7 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: neon-stress/wal
hostname_suffix: ".local"
remote_user: admin
children:

View File

@@ -0,0 +1,35 @@
storage:
vars:
bucket_name: neon-prod-storage-ap-southeast-1
bucket_region: ap-southeast-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: ap-southeast-1
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
console_region_id: aws-ap-southeast-1
children:
pageservers:
hosts:
pageserver-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-064de8ea28bdb495b
pageserver-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0b180defcaeeb6b93
safekeepers:
hosts:
safekeeper-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-0d6f1dc5161eef894
safekeeper-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0e338adda8eb2d19f
safekeeper-2.ap-southeast-1.aws.neon.tech:
ansible_host: i-04fb63634e4679eb9

View File

@@ -0,0 +1,35 @@
storage:
vars:
bucket_name: neon-prod-storage-eu-central-1
bucket_region: eu-central-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: eu-central-1
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
console_region_id: aws-eu-central-1
children:
pageservers:
hosts:
pageserver-0.eu-central-1.aws.neon.tech:
ansible_host: i-0cd8d316ecbb715be
pageserver-1.eu-central-1.aws.neon.tech:
ansible_host: i-090044ed3d383fef0
safekeepers:
hosts:
safekeeper-0.eu-central-1.aws.neon.tech:
ansible_host: i-0b238612d2318a050
safekeeper-1.eu-central-1.aws.neon.tech:
ansible_host: i-07b9c45e5c2637cd4
safekeeper-2.eu-central-1.aws.neon.tech:
ansible_host: i-020257302c3c93d88

View File

@@ -0,0 +1,36 @@
storage:
vars:
bucket_name: neon-prod-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
console_region_id: aws-us-east-2
children:
pageservers:
hosts:
pageserver-0.us-east-2.aws.neon.tech:
ansible_host: i-062227ba7f119eb8c
pageserver-1.us-east-2.aws.neon.tech:
ansible_host: i-0b3ec0afab5968938
safekeepers:
hosts:
safekeeper-0.us-east-2.aws.neon.tech:
ansible_host: i-0e94224750c57d346
safekeeper-1.us-east-2.aws.neon.tech:
ansible_host: i-06d113fb73bfddeb0
safekeeper-2.us-east-2.aws.neon.tech:
ansible_host: i-09f66c8e04afff2e8

View File

@@ -1,7 +1,6 @@
---
storage:
vars:
env_name: prod-1
console_mgmt_base_url: http://console-release.local
bucket_name: zenith-storage-oregon
bucket_region: us-west-2
@@ -12,6 +11,7 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: prod-1/wal
hostname_suffix: ".local"
remote_user: admin

View File

@@ -1,3 +1,2 @@
ansible_connection: aws_ssm
ansible_aws_ssm_bucket_name: neon-dev-bucket
ansible_python_interpreter: /usr/bin/python3

View File

@@ -3,7 +3,6 @@ storage:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
env_name: us-stage
etcd_endpoints: zenith-us-stage-etcd.local:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
@@ -11,6 +10,7 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: us-stage/wal
hostname_suffix: ".local"
remote_user: admin

View File

@@ -3,7 +3,6 @@ storage:
bucket_name: neon-staging-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-staging.local
env_name: us-stage
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
@@ -11,9 +10,11 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
console_region_id: aws-us-east-2
children:

View File

@@ -6,7 +6,7 @@ After=network.target auditd.service
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

View File

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: dev
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: ap-southeast-1
zenith_region_slug: ap-southeast-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: eu-central-1
zenith_region_slug: eu-central-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -127,8 +127,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
- name: Cache postgres v14 build
id: cache_pg_14
@@ -389,7 +389,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -481,6 +481,7 @@ jobs:
neon-image:
runs-on: dev
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
@@ -494,10 +495,11 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build neon
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
compute-tools-image:
runs-on: dev
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
@@ -508,11 +510,12 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute tools
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
compute-node-image:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -527,11 +530,12 @@ jobs:
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
- name: Kaniko build compute node with extensions v14 (compatibility)
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
compute-node-image-v14:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -543,12 +547,13 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
compute-node-image-v15:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -560,11 +565,11 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
promote-images:
runs-on: dev
needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
@@ -577,8 +582,9 @@ jobs:
steps:
- name: Promote image to latest
run:
MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
run: |
export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text)
aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
push-docker-hub:
runs-on: dev
@@ -597,19 +603,19 @@ jobs:
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Pull neon image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon
- name: Pull compute tools image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
- name: Pull compute node image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
- name: Pull compute node v15 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
- name: Pull rust image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
@@ -619,11 +625,11 @@ jobs:
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
- name: Configure Docker Hub login
run: |
@@ -750,9 +756,9 @@ jobs:
defaults:
run:
shell: bash
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
strategy:
matrix:
target_region: [ us-east-2 ]
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -774,7 +780,48 @@ jobs:
exit 1
fi
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
@@ -818,3 +865,94 @@ jobs:
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-new:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
promote-compatibility-test-snapshot:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ deploy, deploy-proxy ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
steps:
- name: Promote compatibility snapshot for the release
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest
run: |
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done

View File

@@ -36,7 +36,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 2
@@ -106,7 +106,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
- name: Run cargo clippy
run: ./run_clippy.sh

Cargo.lock generated
View File

@@ -317,12 +317,6 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boxfnonce"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
[[package]]
name = "bstr"
version = "1.0.1"
@@ -600,6 +594,7 @@ dependencies = [
"tar",
"thiserror",
"toml",
"url",
"utils",
"workspace_hack",
]
@@ -849,16 +844,6 @@ dependencies = [
"syn",
]
[[package]]
name = "daemonize"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
dependencies = [
"boxfnonce",
"libc",
]
[[package]]
name = "darling"
version = "0.14.1"
@@ -2140,7 +2125,6 @@ dependencies = [
"crc32c",
"criterion",
"crossbeam-utils",
"daemonize",
"etcd_broker",
"fail",
"futures",
@@ -2170,6 +2154,7 @@ dependencies = [
"serde_json",
"serde_with",
"signal-hook",
"svg_fmt",
"tar",
"tempfile",
"thiserror",
@@ -2188,7 +2173,10 @@ dependencies = [
name = "pageserver_api"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"const_format",
"postgres_ffi",
"serde",
"serde_with",
"utils",
@@ -3083,7 +3071,6 @@ dependencies = [
"clap 4.0.15",
"const_format",
"crc32c",
"daemonize",
"etcd_broker",
"fs2",
"git-version",
@@ -3091,6 +3078,7 @@ dependencies = [
"humantime",
"hyper",
"metrics",
"nix 0.25.0",
"once_cell",
"parking_lot 0.12.1",
"postgres",
@@ -3461,6 +3449,12 @@ version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
[[package]]
name = "svg_fmt"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
[[package]]
name = "symbolic-common"
version = "8.8.0"
@@ -3932,6 +3926,16 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.16"
@@ -3942,12 +3946,15 @@ dependencies = [
"nu-ansi-term",
"once_cell",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]]
@@ -4042,6 +4049,8 @@ dependencies = [
"serde_json",
"serde_with",
"signal-hook",
"strum",
"strum_macros",
"tempfile",
"thiserror",
"tokio",

View File

@@ -44,7 +44,7 @@ COPY . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \
&& cachepot -s
# Build final image
@@ -65,6 +65,7 @@ RUN set -e \
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin

View File

@@ -1,50 +1,50 @@
ARG TAG=pinned
# apparently, ARGs don't get replaced in RUN commands in kaniko
# ARG POSTGIS_VERSION=3.3.0
# ARG PLV8_VERSION=3.1.4
# ARG PG_VERSION=v14
#
# This file is identical to the Dockerfile.compute-node-v15 file
# except for the version of Postgres that is built.
#
ARG TAG=pinned
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
COPY vendor/postgres-v14 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
# PostGIS compiles against neon postgres sources without changes. Perhaps we
# could even use the upstream binaries, compiled against vanilla Postgres, but
# it would require some investigation to check that it works, and also keeps
# working in the future. So for now, we compile our own binaries.
#########################################################################################
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
tar xvzf postgis-3.3.0.tar.gz && \
cd postgis-3.3.0 && \
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
tar xvzf postgis-3.3.1.tar.gz && \
cd postgis-3.3.1 && \
./autogen.sh && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
./configure && \
@@ -57,39 +57,55 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
#########################################################################################
#
# Layer "plv8-build"
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils
# https://github.com/plv8/plv8/issues/475:
# v8 uses gold for linking and sets `--thread-count=4` which breaks
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
# Install a newer gold version manually, as the debian-testing binutils package updates the
# libc version, which in turn breaks other extensions built against the non-testing libc.
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
tar xvzf binutils-2.38.tar.gz && \
cd binutils-2.38 && \
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
cd ../bfd && ./configure && make bfdver.h && \
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
cp /usr/local/bin/ld.gold /usr/bin/gold
# Sed is used to patch plv8 for https://github.com/plv8/plv8/issues/503
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
tar xvzf v3.1.4.tar.gz && \
cd plv8-3.1.4 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#########################################################################################
#
# Layer "h3-pg-build"
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
@@ -108,16 +124,18 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
# plv8 still sometimes crashes during the creation
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY pgxn/ pgxn/
@@ -127,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
-C pgxn/neon \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
@@ -154,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -174,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libreadline8 for psql
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
# GLIBC 2.34 for plv8.
# Debian bullseye provides GLIBC 2.31, so we install the library from testing
#
# Lastly, link compute_ctl into zenith_ctl while we're at it,
# so that we don't need to put this in another layer.
@@ -188,12 +212,6 @@ RUN apt update && \
libproj19 \
libprotobuf-c1 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
echo "Installing GLIBC 2.34" && \
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
apt install -y --no-install-recommends -t testing libc6 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
USER postgres

View File

@@ -4,44 +4,39 @@
#
ARG TAG=pinned
# apparently, ARGs don't get replaced in RUN commands in kaniko
# ARG POSTGIS_VERSION=3.3.1
# ARG PLV8_VERSION=3.1.4
# ARG PG_VERSION=v15
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
COPY vendor/postgres-v15 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
# PostGIS compiles against neon postgres sources without changes. Perhaps we
# could even use the upstream binaries, compiled against vanilla Postgres, but
# it would require some investigation to check that it works, and also keeps
# working in the future. So for now, we compile our own binaries.
#########################################################################################
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
@@ -62,39 +57,55 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
#########################################################################################
#
# Layer "plv8-build"
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils
# https://github.com/plv8/plv8/issues/475:
# v8 uses gold for linking and sets `--thread-count=4` which breaks
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
# Install a newer gold version manually, as the debian-testing binutils package updates the
# libc version, which in turn breaks other extensions built against the non-testing libc.
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
tar xvzf binutils-2.38.tar.gz && \
cd binutils-2.38 && \
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
cd ../bfd && ./configure && make bfdver.h && \
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
cp /usr/local/bin/ld.gold /usr/bin/gold
# Sed is used to patch plv8 for https://github.com/plv8/plv8/issues/503
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
tar xvzf v3.1.4.tar.gz && \
cd plv8-3.1.4 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#########################################################################################
#
# Layer "h3-pg-build"
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
@@ -113,16 +124,18 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
# plv8 still sometimes crashes during the creation
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY pgxn/ pgxn/
@@ -132,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
-C pgxn/neon \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
@@ -159,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -179,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libreadline8 for psql
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
# GLIBC 2.34 for plv8.
# Debian bullseye provides GLIBC 2.31, so we install the library from testing
#
# Lastly, link compute_ctl into zenith_ctl while we're at it,
# so that we don't need to put this in another layer.
@@ -193,12 +212,6 @@ RUN apt update && \
libproj19 \
libprotobuf-c1 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
echo "Installing GLIBC 2.34" && \
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
apt install -y --no-install-recommends -t testing libc6 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
USER postgres

View File

@@ -151,6 +151,11 @@ neon-pg-ext-v14: postgres-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v14"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v14
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
@@ -163,6 +168,11 @@ neon-pg-ext-v15: postgres-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v15"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v15
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \

View File

@@ -223,10 +223,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
```sh
git clone --recursive https://github.com/neondatabase/neon.git
# either:
CARGO_BUILD_FLAGS="--features=testing" make
# or:
make debug
./scripts/pytest
```

View File

@@ -8,11 +8,10 @@ use std::process::Child;
use std::time::{Duration, Instant};
use anyhow::{bail, Result};
use notify::{RecursiveMode, Watcher};
use postgres::{Client, Transaction};
use serde::Deserialize;
use notify::{RecursiveMode, Watcher};
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
/// Rust representation of Postgres role info with only those fields
@@ -169,7 +168,7 @@ impl Database {
/// it may require proper quoting too.
pub fn to_pg_options(&self) -> String {
let mut params: String = self.options.as_pg_options();
write!(params, " OWNER {}", &self.owner.quote())
write!(params, " OWNER {}", &self.owner.pg_quote())
.expect("String is documented to not to error during write operations");
params
@@ -180,18 +179,17 @@ impl Database {
/// intended to be used for DB / role names.
pub type PgIdent = String;
/// Generic trait used to provide quoting for strings used in the
/// Postgres SQL queries. Currently used only to implement quoting
/// of identifiers, but could be used for literals in the future.
pub trait PgQuote {
fn quote(&self) -> String;
/// Generic trait used to provide quoting / encoding for strings used in the
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
fn pg_quote(&self) -> String;
}
impl PgQuote for PgIdent {
impl Escaping for PgIdent {
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
/// always quotes provided string with `""` and escapes every `"`. Not idempotent,
/// i.e. if string is already escaped it will be escaped again.
fn quote(&self) -> String {
/// always quotes the provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if the string is already escaped it will be escaped again.
fn pg_quote(&self) -> String {
let result = format!("\"{}\"", self.replace('"', "\"\""));
result
}
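
For reference, here is a small self-contained sketch of how the renamed `pg_quote` helper behaves on an identifier containing a double quote. The trait and alias are re-declared locally purely for illustration; the real definitions are the ones shown in the diff above.

```rust
pub type PgIdent = String;

pub trait Escaping {
    fn pg_quote(&self) -> String;
}

impl Escaping for PgIdent {
    // Mirrors the implementation above: wrap in `"` and double every embedded `"`.
    fn pg_quote(&self) -> String {
        format!("\"{}\"", self.replace('"', "\"\""))
    }
}

fn main() {
    let ident: PgIdent = PgIdent::from("we\"ird role");
    // Prints: CREATE ROLE "we""ird role"
    println!("CREATE ROLE {}", ident.pg_quote());
    assert_eq!(ident.pg_quote(), "\"we\"\"ird role\"");
}
```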

View File

@@ -1,7 +1,9 @@
use std::path::Path;
use std::str::FromStr;
use anyhow::Result;
use log::{info, log_enabled, warn, Level};
use postgres::config::Config;
use postgres::{Client, NoTls};
use serde::Deserialize;
@@ -115,8 +117,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
if existing_roles.iter().any(|r| r.name == op.name) {
let query: String = format!(
"ALTER ROLE {} RENAME TO {}",
op.name.quote(),
new_name.quote()
op.name.pg_quote(),
new_name.pg_quote()
);
warn!("renaming role '{}' to '{}'", op.name, new_name);
@@ -162,7 +164,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
}
if update_role {
let mut query: String = format!("ALTER ROLE {} ", name.quote());
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
info_print!(" -> update");
query.push_str(&role.to_pg_options());
@@ -170,7 +172,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
}
} else {
info!("role name: '{}'", &name);
let mut query: String = format!("CREATE ROLE {} ", name.quote());
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
info!("role create query: '{}'", &query);
info_print!(" -> create");
@@ -179,7 +181,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let grant_query = format!(
"GRANT pg_read_all_data, pg_write_all_data TO {}",
name.quote()
name.pg_quote()
);
xact.execute(grant_query.as_str(), &[])?;
info!("role grant query: '{}'", &grant_query);
@@ -215,7 +217,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
// We do not check whether the role exists or not;
// Postgres will take care of it for us
if op.action == "delete_role" {
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote());
warn!("deleting role '{}'", &op.name);
xact.execute(query.as_str(), &[])?;
@@ -230,17 +232,16 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
for db in &node.spec.cluster.databases {
if db.owner != *role_name {
let mut connstr = node.connstr.clone();
// database name is always the last and the only component of the path
connstr.set_path(&db.name);
let mut conf = Config::from_str(node.connstr.as_str())?;
conf.dbname(&db.name);
let mut client = Client::connect(connstr.as_str(), NoTls)?;
let mut client = conf.connect(NoTls)?;
// This will reassign all dependent objects to the db owner
let reassign_query = format!(
"REASSIGN OWNED BY {} TO {}",
role_name.quote(),
db.owner.quote()
role_name.pg_quote(),
db.owner.pg_quote()
);
info!(
"reassigning objects owned by '{}' in db '{}' to '{}'",
@@ -249,7 +250,7 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()>
client.simple_query(&reassign_query)?;
// This now will only drop privileges of the role
let drop_query = format!("DROP OWNED BY {}", role_name.quote());
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
client.simple_query(&drop_query)?;
}
}
@@ -279,7 +280,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// We do not check whether the DB exists or not;
// Postgres will take care of it for us
"delete_db" => {
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.quote());
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());
warn!("deleting database '{}'", &op.name);
client.execute(query.as_str(), &[])?;
@@ -291,8 +292,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
if existing_dbs.iter().any(|r| r.name == op.name) {
let query: String = format!(
"ALTER DATABASE {} RENAME TO {}",
op.name.quote(),
new_name.quote()
op.name.pg_quote(),
new_name.pg_quote()
);
warn!("renaming database '{}' to '{}'", op.name, new_name);
@@ -320,7 +321,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// XXX: db owner name is returned as quoted string from Postgres,
// when quoting is needed.
let new_owner = if r.owner.starts_with('"') {
db.owner.quote()
db.owner.pg_quote()
} else {
db.owner.clone()
};
@@ -328,15 +329,15 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
if new_owner != r.owner {
let query: String = format!(
"ALTER DATABASE {} OWNER TO {}",
name.quote(),
db.owner.quote()
name.pg_quote(),
db.owner.pg_quote()
);
info_print!(" -> update");
client.execute(query.as_str(), &[])?;
}
} else {
let mut query: String = format!("CREATE DATABASE {} ", name.quote());
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
info_print!(" -> create");
query.push_str(&db.to_pg_options());
@@ -366,7 +367,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
.cluster
.roles
.iter()
.map(|r| r.name.quote())
.map(|r| r.name.pg_quote())
.collect::<Vec<_>>();
for db in &spec.cluster.databases {
@@ -374,27 +375,22 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
let query: String = format!(
"GRANT CREATE ON DATABASE {} TO {}",
dbname.quote(),
dbname.pg_quote(),
roles.join(", ")
);
info!("grant query {}", &query);
client.execute(query.as_str(), &[])?;
// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
// This is needed since postgres 15, where this privilege is removed by default.
client.execute("GRANT CREATE ON SCHEMA public TO web_access", &[])?;
}
// Do some per-database access adjustments. We'd better do this at db creation time,
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
// atomically.
let mut db_connstr = node.connstr.clone();
for db in &node.spec.cluster.databases {
// database name is always the last and the only component of the path
db_connstr.set_path(&db.name);
let mut conf = Config::from_str(node.connstr.as_str())?;
conf.dbname(&db.name);
let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
let mut db_client = conf.connect(NoTls)?;
// This will only change ownership on the schema itself, not the objects
// inside it. Without it owner of the `public` schema will be `cloud_admin`
@@ -423,9 +419,36 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
END IF;\n\
END\n\
$$;",
db.owner.quote()
db.owner.pg_quote()
);
db_client.simple_query(&alter_query)?;
// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
// This is needed because, since Postgres 15, this privilege is removed by default.
let grant_query = "DO $$\n\
BEGIN\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
) AND\n\
current_setting('server_version_num')::int/10000 >= 15\n\
THEN\n\
IF EXISTS(\n\
SELECT rolname\n\
FROM pg_catalog.pg_roles\n\
WHERE rolname = 'web_access'\n\
)\n\
THEN\n\
GRANT CREATE ON SCHEMA public TO web_access;\n\
END IF;\n\
END IF;\n\
END\n\
$$;"
.to_string();
info!("grant query for db {} : {}", &db.name, &grant_query);
db_client.simple_query(&grant_query)?;
}
Ok(())

View File

@@ -33,9 +33,9 @@ mod pg_helpers_tests {
}
#[test]
fn quote_ident() {
fn ident_pg_quote() {
let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;");
assert_eq!(ident.quote(), "\"\"\"name\"\";\\n select 1;\"");
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
}

View File

@@ -4,20 +4,21 @@ version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0"
clap = "4.0"
comfy-table = "6.1"
git-version = "0.3.5"
tar = "0.4.38"
nix = "0.25"
once_cell = "1.13.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
regex = "1"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
toml = "0.5"
once_cell = "1.13.0"
regex = "1"
anyhow = "1.0"
tar = "0.4.38"
thiserror = "1"
nix = "0.25"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
toml = "0.5"
url = "2.2.2"
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.

View File

@@ -0,0 +1,264 @@
//! Spawns and kills background processes that are needed by Neon CLI.
//! Applies common set-up such as log and pid files (if needed) to every process.
//!
//! Neon CLI does not run in the background, so it needs to store information about
//! spawned processes, which it does in this module.
//! We do that by storing the pid of the process in the "${process_name}.pid" file.
//! The pid file can be created by the process itself
//! (Neon storage binaries do that and also ensure that a lock is taken on that file)
//! or we create such file after starting the process
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
//! The pid stored in the file is later used to stop the service.
//!
//! See [`lock_file`] module for more info.
use std::ffi::OsStr;
use std::io::Write;
use std::path::Path;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
use anyhow::{anyhow, bail, Context, Result};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use utils::lock_file;
const RETRIES: u32 = 15;
const RETRY_TIMEOUT_MILLIS: u64 = 500;
/// Argument to `start_process`, indicating whether the caller should create the pidfile,
/// or whether the process is expected to create it itself.
pub enum InitialPidFile<'t> {
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
Create(&'t Path),
/// The process will create the pidfile itself; we need to wait for that event.
Expect(&'t Path),
}
/// Start a background child process using the parameters given.
pub fn start_process<F, S: AsRef<OsStr>>(
process_name: &str,
datadir: &Path,
command: &Path,
args: &[S],
initial_pid_file: InitialPidFile,
process_status_check: F,
) -> anyhow::Result<Child>
where
F: Fn() -> anyhow::Result<bool>,
{
let log_path = datadir.join(format!("{process_name}.log"));
let process_log_file = fs::OpenOptions::new()
.create(true)
.write(true)
.append(true)
.open(&log_path)
.with_context(|| {
format!("Could not open {process_name} log file {log_path:?} for writing")
})?;
let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
})?;
let mut command = Command::new(command);
let background_command = command
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
let pid = spawned_process.id();
let pid = Pid::from_raw(
i32::try_from(pid)
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(target_pid_file_path) => {
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
lock_file::LockCreationResult::Created { .. } => {
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
// as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
}
lock_file::LockCreationResult::AlreadyLocked { .. } => {
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
}
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!(
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
)))
}
}
None
}
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
};
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
}
Ok(false) => {
if retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("{process_name} has not started yet, retrying ({retries})...");
}
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
}
Err(e) => {
println!("{process_name} failed to start: {e:#}");
if let Err(e) = spawned_process.kill() {
println!("Could not stop {process_name} subprocess: {e:#}")
};
return Err(e);
}
}
}
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
}
/// Stops the process, using the given pid file. Also returns Ok if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
if !pid_file.exists() {
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
return Ok(());
}
let pid = read_pidfile(pid_file)?;
let sig = if immediate {
print!("Stopping {process_name} with pid {pid} immediately..");
Signal::SIGQUIT
} else {
print!("Stopping {process_name} with pid {pid} gracefully..");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(()) => (),
Err(Errno::ESRCH) => {
println!(
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
);
return Ok(());
}
Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
}
// Wait until process is gone
for _ in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
if let Err(e) = fs::remove_file(pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
return Ok(());
}
Ok(false) => {
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1))
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
return Err(e);
}
}
}
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
filled_cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
filled_cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}
fn process_started<F>(
pid: Pid,
pid_file_to_check: Option<&Path>,
status_check: &F,
) -> anyhow::Result<bool>
where
F: Fn() -> anyhow::Result<bool>,
{
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => {
if pid_file_path.exists() {
let pid_in_file = read_pidfile(pid_file_path)?;
Ok(pid_in_file == pid)
} else {
Ok(false)
}
}
None => Ok(true),
},
Ok(false) => Ok(false),
Err(e) => anyhow::bail!("process failed to start: {e}"),
}
}
/// Read a PID file
///
/// We expect a file that contains a single integer.
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
if pid < 1 {
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
}
Ok(Pid::from_raw(pid))
}
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
match kill(pid, None) {
// Process exists, keep waiting
Ok(_) => Ok(false),
// Process not found, we're done
Err(Errno::ESRCH) => Ok(true),
Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
}
}
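
For orientation, a rough sketch of how a caller inside `control_plane` might drive this module; since `background_process` is declared as a crate-private module, such code would live alongside `pageserver.rs`/`safekeeper.rs`. The `my_service` binary, port, and flags below are hypothetical.

```rust
use std::path::Path;
use std::process::Child;

use crate::background_process::{self, InitialPidFile};

// Hypothetical service: start it, let the CLI create the pid file, and poll a
// TCP port until the process is considered "started".
fn start_my_service(datadir: &Path) -> anyhow::Result<Child> {
    let pid_file = datadir.join("my_service.pid");
    let args: &[&str] = &["--listen", "127.0.0.1:9999"];
    background_process::start_process(
        "my_service",
        datadir,
        Path::new("/usr/local/bin/my_service"), // hypothetical binary path
        args,
        InitialPidFile::Create(pid_file.as_path()),
        // Readiness probe: a successful TCP connect means the service is up.
        || Ok(std::net::TcpStream::connect("127.0.0.1:9999").is_ok()),
    )
}

fn stop_my_service(datadir: &Path) -> anyhow::Result<()> {
    // Graceful stop (SIGTERM); pass `true` for an immediate SIGQUIT.
    background_process::stop_process(false, "my_service", &datadir.join("my_service.pid"))
}
```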

View File

@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env::{EtcdBroker, LocalEnv};
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage::PageServerNode;
use control_plane::{etcd, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::{

View File

@@ -12,15 +12,14 @@ use std::time::Duration;
use anyhow::{Context, Result};
use utils::{
connstring::connection_host_port,
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
};
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use crate::storage::PageServerNode;
//
// ComputeControlPlane
@@ -183,18 +182,18 @@ impl PostgresNode {
}
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir(pg_version).join("postgres");
let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
let mut cmd = Command::new(&pg_path);
cmd.arg("--sync-safekeepers")
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version).to_str().unwrap(),
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
.env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version).to_str().unwrap(),
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
.env("PGDATA", self.pgdata().to_str().unwrap())
.stdout(Stdio::piped())
@@ -282,9 +281,7 @@ impl PostgresNode {
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
let mut conf = PostgresConf::new();
conf.append("max_wal_senders", "10");
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
conf.append("wal_log_hints", "on");
conf.append("wal_log_hints", "off");
conf.append("max_replication_slots", "10");
conf.append("hot_standby", "on");
conf.append("shared_buffers", "1MB");
@@ -302,7 +299,8 @@ impl PostgresNode {
// Configure the node to fetch pages from pageserver
let pageserver_connstr = {
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// Set up authentication
//
@@ -422,7 +420,7 @@ impl PostgresNode {
}
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl");
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
let mut cmd = Command::new(pg_ctl_path);
cmd.args(
[
@@ -440,11 +438,11 @@ impl PostgresNode {
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
)
.env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
);
if let Some(token) = auth_token {
cmd.env("ZENITH_AUTH_TOKEN", token);

View File

@@ -0,0 +1,57 @@
use url::Url;
#[derive(Debug)]
pub struct PgConnectionConfig {
url: Url,
}
impl PgConnectionConfig {
pub fn host(&self) -> &str {
self.url.host_str().expect("BUG: no host")
}
pub fn port(&self) -> u16 {
self.url.port().expect("BUG: no port")
}
/// Return a `<host>:<port>` string.
pub fn raw_address(&self) -> String {
format!("{}:{}", self.host(), self.port())
}
/// Connect using postgres protocol with TLS disabled.
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
postgres::Client::connect(self.url.as_str(), postgres::NoTls)
}
}
impl std::str::FromStr for PgConnectionConfig {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut url: Url = s.parse()?;
match url.scheme() {
"postgres" | "postgresql" => {}
other => anyhow::bail!("invalid scheme: {other}"),
}
// It's not a valid connection URL if the host is missing.
if url.host().is_none() {
anyhow::bail!(url::ParseError::EmptyHost);
}
// E.g. `postgres:bar`.
if url.cannot_be_a_base() {
anyhow::bail!("URL cannot be a base");
}
// Set the default PG port if it's missing.
if url.port().is_none() {
url.set_port(Some(5432))
.expect("BUG: couldn't set the default port");
}
Ok(Self { url })
}
}
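
A quick usage sketch for the new parser, assuming it is called through the public `connection` module of the `control_plane` crate added in this change; the addresses are illustrative only.

```rust
use std::str::FromStr;

use control_plane::connection::PgConnectionConfig;

fn main() -> anyhow::Result<()> {
    // The default Postgres port (5432) is filled in when the URL omits it.
    let config = PgConnectionConfig::from_str("postgresql://no_user@127.0.0.1/no_db")?;
    assert_eq!(config.host(), "127.0.0.1");
    assert_eq!(config.port(), 5432);
    assert_eq!(config.raw_address(), "127.0.0.1:5432");

    // Non-postgres schemes and host-less URLs are rejected.
    assert!(PgConnectionConfig::from_str("http://127.0.0.1:5432/db").is_err());
    assert!(PgConnectionConfig::from_str("postgres:no_host").is_err());
    Ok(())
}
```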

View File

@@ -1,95 +1,75 @@
use std::{
fs,
path::PathBuf,
process::{Command, Stdio},
};
use std::{fs, path::PathBuf};
use anyhow::Context;
use nix::{
sys::signal::{kill, Signal},
unistd::Pid,
};
use crate::{local_env, read_pidfile};
use crate::{background_process, local_env};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
println!(
"Starting etcd broker using {}",
etcd_broker.etcd_binary_path.display()
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
);
let etcd_data_dir = env.base_data_dir.join("etcd");
fs::create_dir_all(&etcd_data_dir).with_context(|| {
format!(
"Failed to create etcd data dir: {}",
etcd_data_dir.display()
)
})?;
fs::create_dir_all(&etcd_data_dir)
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
let etcd_stdout_file =
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
format!(
"Failed to create etcd stout file in directory {}",
etcd_data_dir.display()
)
})?;
let etcd_stderr_file =
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
format!(
"Failed to create etcd stderr file in directory {}",
etcd_data_dir.display()
)
})?;
let client_urls = etcd_broker.comma_separated_endpoints();
let args = [
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
// etcd doesn't compact (vacuum) with default settings,
// enable it to prevent space exhaustion.
"--auto-compaction-mode=revision".to_string(),
"--auto-compaction-retention=1".to_string(),
];
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
.args(&[
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
])
.stdout(Stdio::from(etcd_stdout_file))
.stderr(Stdio::from(etcd_stderr_file))
.spawn()
.context("Failed to spawn etcd subprocess")?;
let pid = etcd_process.id();
let pid_file_path = etcd_pid_file_path(env);
let etcd_pid_file_path = etcd_pid_file_path(env);
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
format!(
"Failed to create etcd pid file at {}",
etcd_pid_file_path.display()
)
})?;
let client = reqwest::blocking::Client::new();
background_process::start_process(
"etcd",
&etcd_data_dir,
&etcd_broker.etcd_binary_path,
&args,
background_process::InitialPidFile::Create(&pid_file_path),
|| {
for broker_endpoint in &etcd_broker.broker_endpoints {
let request = broker_endpoint
.join("health")
.with_context(|| {
format!(
"Failed to append /health path to broker endopint {}",
broker_endpoint
)
})
.and_then(|url| {
client.get(&url.to_string()).build().with_context(|| {
format!("Failed to construct request to etcd endpoint {url}")
})
})?;
if client.execute(request).is_ok() {
return Ok(true);
}
}
Ok(false)
},
)
.context("Failed to spawn etcd subprocess")?;
Ok(())
}
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_path = &env.etcd_broker.etcd_binary_path;
println!("Stopping etcd broker at {}", etcd_path.display());
let etcd_pid_file_path = etcd_pid_file_path(env);
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
format!(
"Failed to read etcd pid file at {}",
etcd_pid_file_path.display()
)
})?);
kill(pid, Signal::SIGTERM).with_context(|| {
format!(
"Failed to stop etcd with pid {pid} at {}",
etcd_pid_file_path.display()
)
})?;
Ok(())
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
}
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {

View File

@@ -6,59 +6,12 @@
// Intended to be used in integration tests and in CLI tools for
// local installations.
//
use anyhow::{anyhow, bail, Context, Result};
use std::fs;
use std::path::Path;
use std::process::Command;
mod background_process;
pub mod compute;
pub mod connection;
pub mod etcd;
pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod storage;
/// Read a PID file
///
/// We expect a file that contains a single integer.
/// We return an i32 for compatibility with libc and nix.
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
if pid < 1 {
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
}
Ok(pid)
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}

View File

@@ -201,37 +201,37 @@ impl LocalEnv {
self.pg_distrib_dir.clone()
}
pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
match pg_version {
14 => path.join(format!("v{pg_version}")),
15 => path.join(format!("v{pg_version}")),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => self.pg_distrib_dir(pg_version).join("bin"),
15 => self.pg_distrib_dir(pg_version).join("bin"),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => self.pg_distrib_dir(pg_version).join("lib"),
15 => self.pg_distrib_dir(pg_version).join("lib"),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
Ok(self.neon_distrib_dir.join("pageserver"))
pub fn pageserver_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("pageserver")
}
pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
Ok(self.neon_distrib_dir.join("safekeeper"))
pub fn safekeeper_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("safekeeper")
}
pub fn pg_data_dirs_path(&self) -> PathBuf {
@@ -422,10 +422,10 @@ impl LocalEnv {
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if !self.pg_bin_dir(pg_version).join("postgres").exists() {
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_bin_dir(pg_version).display()
self.pg_bin_dir(pg_version)?.display()
);
}
for binary in ["pageserver", "safekeeper"] {

View File

@@ -1,33 +1,27 @@
use std::collections::HashMap;
use std::fs::File;
use std::fs::{self, File};
use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::Duration;
use std::{io, result, thread};
use std::process::Child;
use std::{io, result};
use crate::connection::PgConnectionConfig;
use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{
connstring::connection_address,
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
};
use crate::local_env::LocalEnv;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
use crate::{background_process, local_env::LocalEnv};
#[derive(Error, Debug)]
pub enum PageserverHttpError {
@@ -75,7 +69,7 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct PageServerNode {
pub pg_connection_config: Config,
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
@@ -101,7 +95,7 @@ impl PageServerNode {
}
/// Construct libpq connection string for connecting to the pageserver.
fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
format!("postgresql://no_user:{password}@{listen_addr}/no_db")
.parse()
.unwrap()
@@ -161,7 +155,15 @@ impl PageServerNode {
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
}
self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
let mut pageserver_process = self
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
.with_context(|| {
format!(
"Failed to start a process for pageserver {}",
self.env.pageserver.id,
)
})?;
let init_result = self
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
.context("Failed to create initial tenant and timeline for pageserver");
@@ -171,7 +173,29 @@ impl PageServerNode {
}
Err(e) => eprintln!("{e:#}"),
}
self.stop(false)?;
match pageserver_process.kill() {
Err(e) => {
eprintln!(
"Failed to stop pageserver {} process with pid {}: {e:#}",
self.env.pageserver.id,
pageserver_process.id(),
)
}
Ok(()) => {
println!(
"Stopped pageserver {} process with pid {}",
self.env.pageserver.id,
pageserver_process.id(),
);
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
let pid_file = self.pid_file();
if let Err(e) = fs::remove_file(&pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
}
}
init_result
}
@@ -196,11 +220,14 @@ impl PageServerNode {
self.env.pageserver_data_dir()
}
pub fn pid_file(&self) -> PathBuf {
/// The pid file is created by the pageserver process, with its pid stored inside.
/// Other pageservers cannot lock the same file and overwrite it for as long as the current
/// pageserver runs. (Unless someone removes the file manually; never do that!)
fn pid_file(&self) -> PathBuf {
self.repo_path().join("pageserver.pid")
}
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, &self.repo_path(), false)
}
@@ -209,10 +236,10 @@ impl PageServerNode {
config_overrides: &[&str],
datadir: &Path,
update_config: bool,
) -> anyhow::Result<()> {
) -> anyhow::Result<Child> {
println!(
"Starting pageserver at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.pg_connection_config.raw_address(),
datadir.display()
);
io::stdout().flush()?;
@@ -220,10 +247,7 @@ impl PageServerNode {
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!(
"Datadir path '{}' cannot be represented as a unicode string",
datadir.display()
)
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
];
@@ -235,48 +259,18 @@ impl PageServerNode {
args.extend(["-c", config_override]);
}
let mut cmd = Command::new(self.env.pageserver_bin()?);
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
filled_cmd = fill_aws_secrets_vars(filled_cmd);
if !filled_cmd.status()?.success() {
bail!(
"Pageserver failed to start. See console output and '{}' for details.",
datadir.join("pageserver.log").display()
);
}
// It takes a while for the page server to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(()) => {
println!("\nPageserver started");
return Ok(());
}
Err(err) => {
match err {
PageserverHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("Pageserver not responding yet, err {err} retrying ({retries})...");
}
}
PageserverHttpError::Response(msg) => {
bail!("pageserver failed to start: {msg} ")
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("pageserver failed to start in {RETRIES} seconds");
background_process::start_process(
"pageserver",
datadir,
&self.env.pageserver_bin(),
&args,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(PageserverHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
}
///
@@ -288,69 +282,18 @@ impl PageServerNode {
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Pageserver is already stopped");
return Ok(());
}
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
let sig = if immediate {
print!("Stopping pageserver immediately..");
Signal::SIGQUIT
} else {
print!("Stopping pageserver gracefully..");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!("Pageserver with pid {pid} does not exist, but a PID file was found");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {pid}: {}",
err.desc()
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop pageserver with pid {pid}");
background_process::stop_process(immediate, "pageserver", &self.pid_file())
}
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
println!("Pageserver query: '{sql}'");
client.simple_query(sql).unwrap()
}
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
self.pg_connection_config.connect(NoTls)
self.pg_connection_config.connect_no_tls()
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
@@ -549,7 +492,7 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
// Init base reader
let (start_lsn, base_tarfile_path) = base;

View File

@@ -1,23 +1,21 @@
use std::io::Write;
use std::path::PathBuf;
use std::process::Command;
use std::process::Child;
use std::sync::Arc;
use std::time::Duration;
use std::{io, result, thread};
use std::{io, result};
use anyhow::bail;
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::Config;
use anyhow::Context;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
use utils::{http::error::HttpErrorBody, id::NodeId};
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::storage::PageServerNode;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
use crate::connection::PgConnectionConfig;
use crate::pageserver::PageServerNode;
use crate::{
background_process,
local_env::{LocalEnv, SafekeeperConf},
};
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {
@@ -63,7 +61,7 @@ pub struct SafekeeperNode {
pub conf: SafekeeperConf,
pub pg_connection_config: Config,
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
@@ -87,15 +85,15 @@ impl SafekeeperNode {
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(port: u16) -> Config {
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
// TODO safekeeper authentication not implemented yet
format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
format!("postgresql://no_user@127.0.0.1:{port}/no_db")
.parse()
.unwrap()
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
env.safekeeper_data_dir(&format!("sk{sk_id}"))
}
pub fn datadir_path(&self) -> PathBuf {
@@ -106,92 +104,78 @@ impl SafekeeperNode {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self) -> anyhow::Result<()> {
pub fn start(&self) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.pg_connection_config.raw_address(),
self.datadir_path().display()
);
io::stdout().flush().unwrap();
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id;
let datadir = self.datadir_path();
let mut cmd = Command::new(self.env.safekeeper_bin()?);
fill_rust_env_vars(
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
.args(&["--id", self.id.to_string().as_ref()])
.args(&["--listen-pg", &listen_pg])
.args(&["--listen-http", &listen_http])
.args(&["--recall", "1 second"])
.arg("--daemonize"),
);
let id_string = id.to_string();
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
"--id",
&id_string,
"--listen-pg",
&listen_pg,
"--listen-http",
&listen_http,
];
if !self.conf.sync {
cmd.arg("--no-sync");
args.push("--no-sync");
}
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
if !comma_separated_endpoints.is_empty() {
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
}
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
cmd.args(&["--broker-etcd-prefix", prefix]);
args.extend(["--broker-etcd-prefix", prefix]);
}
let mut backup_threads = String::new();
if let Some(threads) = self.conf.backup_threads {
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
backup_threads = threads.to_string();
args.extend(["--backup-threads", &backup_threads]);
} else {
drop(backup_threads);
}
if let Some(ref remote_storage) = self.conf.remote_storage {
cmd.args(&["--remote-storage", remote_storage]);
args.extend(["--remote-storage", remote_storage]);
}
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
if self.conf.auth_enabled {
cmd.arg("--auth-validation-public-key-path");
// PathBuf is better be passed as is, not via `String`.
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
args.extend([
"--auth-validation-public-key-path",
key_path.to_str().with_context(|| {
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?,
]);
}
fill_aws_secrets_vars(&mut cmd);
if !cmd.status()?.success() {
bail!(
"Safekeeper failed to start. See '{}' for details.",
self.datadir_path().join("safekeeper.log").display()
);
}
// It takes a while for the safekeeper to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(_) => {
println!("\nSafekeeper started");
return Ok(());
}
Err(err) => {
match err {
SafekeeperHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!(
"Safekeeper not responding yet, err {} retrying ({})...",
err, retries
);
}
}
SafekeeperHttpError::Response(msg) => {
bail!("safekeeper failed to start: {} ", msg)
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("safekeeper failed to start in {} seconds", RETRIES);
background_process::start_process(
&format!("safekeeper {id}"),
&datadir,
&self.env.safekeeper_bin(),
&args,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
}
///
@@ -203,63 +187,11 @@ impl SafekeeperNode {
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Safekeeper {} is already stopped", self.id);
return Ok(());
}
let pid = read_pidfile(&pid_file)?;
let pid = Pid::from_raw(pid);
let sig = if immediate {
print!("Stopping safekeeper {} immediately..", self.id);
Signal::SIGQUIT
} else {
print!("Stopping safekeeper {} gracefully..", self.id);
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Safekeeper with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to safekeeper with pid {}: {}",
pid,
err.desc()
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop safekeeper with pid {}", pid);
background_process::stop_process(
immediate,
&format!("safekeeper {}", self.id),
&self.pid_file(),
)
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {

View File

@@ -0,0 +1,48 @@
#!/bin/bash
set -eux
PG_VERSION=${PG_VERSION:-14}
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
SPEC_FILE=/tmp/spec.json
echo "Waiting pageserver become ready."
while ! nc -z pageserver 6400; do
sleep 1;
done
echo "Page server is ready."
echo "Create a tenant and timeline"
PARAMS=(
-sb
-X POST
-H "Content-Type: application/json"
-d "{}"
http://pageserver:9898/v1/tenant/
)
tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
PARAMS=(
-sb
-X POST
-H "Content-Type: application/json"
-d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
echo "Overwrite tenant id and timeline id in spec file"
tenant_id=$(echo ${result} | jq -r .tenant_id)
timeline_id=$(echo ${result} | jq -r .timeline_id)
sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
cat ${SPEC_FILE}
echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
-S ${SPEC_FILE}

View File

@@ -0,0 +1,141 @@
{
"format_version": 1.0,
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"cluster": {
"cluster_id": "docker_compose",
"name": "docker_compose_test",
"state": "restarted",
"roles": [
{
"name": "cloud_admin",
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
"options": null
}
],
"databases": [
],
"settings": [
{
"name": "fsync",
"value": "off",
"vartype": "bool"
},
{
"name": "wal_level",
"value": "replica",
"vartype": "enum"
},
{
"name": "hot_standby",
"value": "on",
"vartype": "bool"
},
{
"name": "wal_log_hints",
"value": "on",
"vartype": "bool"
},
{
"name": "log_connections",
"value": "on",
"vartype": "bool"
},
{
"name": "port",
"value": "55433",
"vartype": "integer"
},
{
"name": "shared_buffers",
"value": "1MB",
"vartype": "string"
},
{
"name": "max_connections",
"value": "100",
"vartype": "integer"
},
{
"name": "listen_addresses",
"value": "0.0.0.0",
"vartype": "string"
},
{
"name": "max_wal_senders",
"value": "10",
"vartype": "integer"
},
{
"name": "max_replication_slots",
"value": "10",
"vartype": "integer"
},
{
"name": "wal_sender_timeout",
"value": "5s",
"vartype": "string"
},
{
"name": "wal_keep_size",
"value": "0",
"vartype": "integer"
},
{
"name": "password_encryption",
"value": "md5",
"vartype": "enum"
},
{
"name": "restart_after_crash",
"value": "off",
"vartype": "bool"
},
{
"name": "synchronous_standby_names",
"value": "walproposer",
"vartype": "string"
},
{
"name": "shared_preload_libraries",
"value": "neon",
"vartype": "string"
},
{
"name": "neon.safekeepers",
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
"vartype": "string"
},
{
"name": "neon.timeline_id",
"value": "TIMELINE_ID",
"vartype": "string"
},
{
"name": "neon.tenant_id",
"value": "TENANT_ID",
"vartype": "string"
},
{
"name": "neon.pageserver_connstring",
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",
"vartype": "string"
},
{
"name": "max_replication_flush_lag",
"value": "10GB",
"vartype": "string"
}
]
},
"delta_operations": [
]
}

View File

@@ -0,0 +1,200 @@
version: '3'
services:
etcd:
image: quay.io/coreos/etcd:v3.5.4
ports:
- 2379:2379
- 2380:2380
environment:
# This significantly speeds up etcd, and we don't need data persistency there anyway.
ETCD_UNSAFE_NO_FSYNC: "1"
command:
- "etcd"
- "--auto-compaction-mode=revision"
- "--auto-compaction-retention=1"
- "--name=etcd-cluster"
- "--initial-cluster-state=new"
- "--initial-cluster-token=etcd-cluster-1"
- "--initial-cluster=etcd-cluster=http://etcd:2380"
- "--initial-advertise-peer-urls=http://etcd:2380"
- "--advertise-client-urls=http://etcd:2379"
- "--listen-client-urls=http://0.0.0.0:2379"
- "--listen-peer-urls=http://0.0.0.0:2380"
- "--quota-backend-bytes=134217728" # 128 MB
minio:
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
ports:
- 9000:9000
- 9001:9001
environment:
- MINIO_ROOT_USER=minio
- MINIO_ROOT_PASSWORD=password
command: server /data --address :9000 --console-address ":9001"
minio_create_buckets:
image: minio/mc
environment:
- MINIO_ROOT_USER=minio
- MINIO_ROOT_PASSWORD=password
entrypoint:
- "/bin/sh"
- "-c"
command:
- "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
echo 'Waiting to start minio...' && sleep 1;
done;
/usr/bin/mc mb minio/neon --region=eu-north-1;
exit 0;"
depends_on:
- minio
pageserver:
image: neondatabase/neon:${TAG:-latest}
environment:
- BROKER_ENDPOINT='http://etcd:2379'
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 6400:6400 # pg protocol handler
- 9898:9898 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "/usr/local/bin/pageserver -D /data/.neon/
-c \"broker_endpoints=[$$BROKER_ENDPOINT]\"
-c \"listen_pg_addr='0.0.0.0:6400'\"
-c \"listen_http_addr='0.0.0.0:9898'\"
-c \"remote_storage={endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/pageserver/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper1:
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
- SAFEKEEPER_ID=1
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7676:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper2:
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
- SAFEKEEPER_ID=2
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7677:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper3:
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
- SAFEKEEPER_ID=3
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7678:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
compute:
build:
context: ./image/compute
args:
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
- http_proxy=$http_proxy
- https_proxy=$https_proxy
environment:
- PG_VERSION=${PG_VERSION:-14}
#- RUST_BACKTRACE=1
volumes:
- ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute/shell/:/shell/
ports:
- 55433:55433 # pg protocol handler
- 3080:3080 # http endpoints
entrypoint:
- "/shell/compute.sh"
depends_on:
- safekeeper1
- safekeeper2
- safekeeper3
- pageserver
compute_is_ready:
image: postgres:latest
entrypoint:
- "/bin/bash"
- "-c"
command:
- "until pg_isready -h compute -p 55433 ; do
echo 'Waiting to start compute...' && sleep 1;
done"
depends_on:
- compute

View File

@@ -0,0 +1,10 @@
ARG COMPUTE_IMAGE=compute-node-v14:latest
FROM neondatabase/${COMPUTE_IMAGE}
USER root
RUN apt-get update && \
apt-get install -y curl \
jq \
netcat
USER postgres

View File

@@ -80,4 +80,6 @@
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)

View File

@@ -18,3 +18,67 @@ We build all images after a successful `release` tests run and push automaticall
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
2. `neondatabase/neon`
## Docker Compose example
You can see a [docker compose](https://docs.docker.com/compose/) example for creating a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
- etcd x 1
- pageserver x 1
- safekeeper x 3
- compute x 1
- MinIO x 1 # Amazon S3-compatible object storage
### How to use
1. create containers
You can specify the version of the neon cluster using the following environment variables.
- PG_VERSION: postgres version for compute (default is 14)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
```
$ cd docker-compose
$ docker-compose down # remove the containers if they exist
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
Creating network "dockercompose_default" with the default driver
Creating dockercompose_etcd3_1 ...
(...omit...)
```
2. connect compute node
```
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
$ psql -h localhost -p 55433 -U cloud_admin
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
INSERT 0 1
postgres=# select * from t;
key | value
-----+-------
1 | 1
(1 row)
```
3. If you want to see the logs, you can use the `docker-compose logs` command.
```
# check the container name you want to see
$ docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1
(...omit...)
$ docker logs -f dockercompose_compute_1
2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql
2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
(...omit...)
```
4. If you want to see durable data in MinIO, which is S3-compatible storage,
access http://localhost:9001 and sign in.
- Username: `minio`
- Password: `password`
You can see durable pages and WAL data in the `neon` bucket.

View File

@@ -0,0 +1,91 @@
# Managing Tenant and Timeline lifecycles
## Summary
The pageserver has a Tenant object in memory for each tenant it manages, and a
Timeline for each timeline. There are a lot of tasks that operate on the tenants
and timelines with references to those objects. We have some mechanisms to track
which tasks are operating on each Tenant and Timeline, and to request them to
shut down when a tenant or timeline is deleted, but they do not cover all uses,
and as a result we have many race conditions around tenant/timeline shutdown.
## Motivation
We have a bunch of race conditions that can produce weird errors and can be hard
to track down.
## Non Goals
This RFC only covers the problem of ensuring that no task/thread is still operating
on a Tenant or Timeline when it is removed. It does not cover what states, aside from Active and
non-Active, each Tenant and Timeline should have, or when exactly the transitions
should happen.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Pageserver. Although I wonder if the safekeeper should have a similar mechanism.
## Current situation
Most pageserver tasks are managed by task_mgr.rs:
- LibpqEndpointListener
- HttpEndPointListener
- WalReceiverManager and -Connection
- GarbageCollector and Compaction
- InitialLogicalSizeCalculation
In addition to those tasks, the walreceiver performs some direct tokio::spawn
calls to spawn tasks that are not registered with 'task_mgr'. And all of these
tasks can spawn extra operations with tokio spawn_blocking.
Whenever a tenant or timeline is removed from the system, by pageserver
shutdown, delete_timeline or tenant-detach operation, we rely on the task
registry in 'task_mgr.rs' to wait until there are no tasks operating on the
tenant or timeline, before its Tenant/Timeline object is removed. That relies on
each task to register itself with the tenant/timeline ID in
'task_mgr.rs'. However, there are many gaps in that. For example,
GarbageCollection and Compaction tasks are registered with the tenant, but when
they proceed to operate on a particular timeline of the tenant, they don't
register with the timeline ID. Because of that, the timeline can be deleted while GC
or compaction is running on it, causing failures in the GC or compaction (see
https://github.com/neondatabase/neon/issues/2442).
Another problem is that the task registry only works for tokio Tasks. There is
no way to register a piece of code that runs inside spawn_blocking(), for
example.
## Proposed implementation
This "voluntary" registration of tasks is fragile. Let's use Rust language features
to enforce that a tenant/timeline cannot be removed from the system when there is
still some code operating on it.
Let's introduce new Guard objects for Tenant and Timeline, and do all actions through
the Guard object. Something like:
TenantActiveGuard: Guard object over Arc<Tenant>. When you acquire the guard,
the code checks that the tenant is in Active state. If it's not, you get an
error. You can change the state of the tenant to Stopping while there are
TenantActiveGuard objects still on it, to prevent new TenantActiveGuards from
being acquired, but the Tenant cannot be removed until all the guards are gone.
TenantMaintenanceGuard: Like TenantActiveGuard, but can be held even when the
tenant is not in Active state. Used for operations like attach/detach. Perhaps
allow only one such guard on a Tenant at a time.
Similarly for Timelines. We don't currently have a "state" on Timeline, but I think
we need at least two states: Active and Stopping. The Stopping state is used at
deletion, to prevent new TimelineActiveGuards from appearing, while you wait for
existing TimelineActiveGuards to die out.
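As a rough illustration, a minimal sketch of what TenantActiveGuard could look like (all types and fields here are hypothetical; a real implementation would also need a way to wait at deletion until all outstanding guards are dropped, e.g. a guard counter, which is not shown):
```rust
use std::ops::Deref;
use std::sync::{Arc, RwLock};

use anyhow::bail;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TenantState {
    Active,
    Stopping,
    Broken,
}

pub struct Tenant {
    state: RwLock<TenantState>,
    // ... timelines, layers, etc.
}

/// Held by any code that operates on an Active tenant. While a guard is
/// alive, the Arc keeps the Tenant in memory; removal has to wait until
/// all guards are dropped.
pub struct TenantActiveGuard {
    tenant: Arc<Tenant>,
}

impl TenantActiveGuard {
    /// Acquire a guard, failing if the tenant is not Active.
    pub fn acquire(tenant: &Arc<Tenant>) -> anyhow::Result<Self> {
        let state = tenant.state.read().unwrap();
        if *state != TenantState::Active {
            bail!("tenant is not active: {:?}", *state);
        }
        Ok(TenantActiveGuard {
            tenant: Arc::clone(tenant),
        })
    }
}

impl Deref for TenantActiveGuard {
    type Target = Tenant;
    fn deref(&self) -> &Tenant {
        &self.tenant
    }
}
```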
The shutdown-signaling, using shutdown_watcher() and is_shutdown_requested(),
probably also needs changes to deal with the new Guards. The rule is that if you
have a TenantActiveGuard, and the tenant's state changes from Active to
Stopping, the is_shutdown_requested() function should return true, and the
shutdown_watcher() future should return.
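A sketch of that rule, continuing the hypothetical types above and assuming the tenant publishes its state through a tokio watch channel (an assumption; the actual signaling mechanism is not decided here):
```rust
use tokio::sync::watch;

/// Same hypothetical state enum as in the previous sketch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TenantState {
    Active,
    Stopping,
    Broken,
}

/// Continuing the sketch: each guard keeps a watch receiver for the tenant
/// state, in addition to the Arc<Tenant> shown earlier.
pub struct TenantActiveGuard {
    state_rx: watch::Receiver<TenantState>,
}

impl TenantActiveGuard {
    /// Returns true once the tenant has left the Active state.
    pub fn is_shutdown_requested(&self) -> bool {
        *self.state_rx.borrow() != TenantState::Active
    }

    /// Completes when the tenant leaves the Active state (or the sender,
    /// i.e. the Tenant itself, is dropped).
    pub async fn shutdown_watcher(&mut self) {
        while *self.state_rx.borrow() == TenantState::Active {
            if self.state_rx.changed().await.is_err() {
                break;
            }
        }
    }
}
```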
This signaling doesn't necessarily need to cover all cases. For example, if you
have a block of code in spawn_blocking(), it might be acceptable if
is_shutdown_requested() doesn't return true even though the tenant is in
Stopping state, as long as the code finishes reasonably fast.

View File

@@ -0,0 +1,246 @@
# Coordinating access of multiple pageservers to the same s3 data
## Motivation
There are some blind spots around coordinating access of multiple pageservers
to the same s3 data. Currently this is applicable only to the tenant relocation
case, but in the future we'll need to solve similar problems for
replica/standby pageservers.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Pageserver
## The problem
### Relocation
During relocation both pageservers can write to s3. This should be ok for all
data except `index_part.json`. For the index part it causes problems during
compaction/gc, because those remove files from the index and from s3.
Imagine this case:
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
participant PS2
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
PS2->>S3: Attach called, sees L1, L2
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
note over S3: Index now L2, L3
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
note over S3: Index now L1, L2, L4
```
At this point it is not possible to restore from the index: it references L1, which
is no longer available in s3, and doesn't contain L3 added by compaction on the
first pageserver. So if either of the pageservers restarts, initial sync will fail
(or, in the on-demand world, it will fail a bit later, during a page request for the
missing layer).
### Standby pageserver
Another related case is a standby pageserver. In this case the second pageserver can
be used as a replica to scale reads and serve as a failover target in case the
first one fails.
In this mode the second pageserver needs to have the same picture of s3 files to
be able to load layers on-demand. To accomplish that, the second pageserver
cannot run gc/compaction jobs. Instead it needs to receive updates for the index
contents. (There is no need to run the walreceiver on the second pageserver then.)
## Observations
- If both pageservers ingest wal then their layer sets diverge, because layer
file generation is not deterministic
- If one of the pageservers does not ingest wal (and just picks up layer
updates) then it lags behind and cannot really answer queries at the same
pace as the primary one
- Can compaction help make layers deterministic? E.g. we do not upload level
zero layers, and construction of higher levels should be deterministic.
This way we can guarantee that layer creation by timeout won't mess things up.
This way one pageserver uploads data and the second one can just ingest it.
But we still need some form of election
## Solutions
### Manual orchestration
One possible solution for the relocation case is to orchestrate background jobs
from outside. The oracle who runs the migration can turn off background jobs on
PS1 before the migration, run the migration, and then enable them on PS2. The problem
comes if the migration fails. In this case, in order to resume background jobs the
oracle needs to guarantee that PS2 doesn't run background jobs, and if PS2 doesn't
respond then PS1 is stuck, unable to run compaction/gc. This cannot be solved
without a human ensuring that no upload from PS2 can happen. In order to resolve
this automatically, CAS is required on the S3 side so a pageserver can
avoid overwriting the index part if it is no longer the leading one.
Note that the flag that disables background jobs needs to be persistent, because
otherwise a pageserver restart will clear it.
### Avoid index_part.json
The index part consists of two pieces: the list of layers and the metadata. The list
of layers can be easily obtained with the `ListObjects` S3 API method. But what to do
with the metadata? Create a metadata instance for each checkpoint and add some counter
to the file name?
That brings us back to a potentially long s3 ls.
### Coordination based approach
Do it like safekeepers choose a leader for WAL upload. Ping each other and decide
based on some heuristic, e.g. the smallest node id. During relocation PS1 sends a
"resign" ping message so others can start an election without waiting for a timeout.
This still leaves the metadata question open, and non-deterministic layers are a
problem as well.
### Avoid metadata file
One way to eliminate the metadata file is to store it in layer files under some
special key. This may resonate with the intention to keep all relation sizes in
some special segment to avoid the initial download during size calculation.
Maybe with that we can even store a pre-calculated value.
As a downside, each checkpoint gets 512 bytes larger.
If we entirely avoid the metadata file, this opens up many approaches.
* * *
During discussion it seems that we converged on the approach consisting of:
- index files stored per pageserver in the same timeline directory. With that,
the index file name starts to look like: `<pageserver_node_id>_index_part.json`.
In such a setup there are no concurrent overwrites of the index file by different
pageservers.
- For replica pageservers the solution would be for the primary to broadcast index
changes to any followers, with an ability to check index files in s3 and
restore the full state. To properly merge changes with index files we can use
a counter that is persisted in the index file, is incremented on every change
to it, and is passed along with each broadcasted change. This way we can determine
whether we need to apply a change to the index state or not (see the sketch after this list).
- Responsibility for running background jobs is assigned externally. The pageserver
keeps a locally persistent flag for each tenant that indicates whether this
pageserver is considered the primary one or not. TODO: what happens if we
crash and cannot start for some extended period of time? The control plane can
assign ownership to some other pageserver. The pageserver needs some way to check
if it's still the blessed one. Maybe by an explicit request to the control plane on
start.
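A minimal sketch of the counter-based merge on the follower side (the types and the index file format here are hypothetical, only meant to show the counter check):
```rust
use std::collections::BTreeSet;

/// Hypothetical in-memory index state a follower keeps for one timeline.
struct IndexState {
    /// Monotonic counter persisted in the index file; the primary bumps it
    /// on every change and attaches it to the broadcasted change.
    change_counter: u64,
    layers: BTreeSet<String>,
}

/// One broadcasted index change from the primary.
struct IndexChange {
    counter: u64,
    added: Vec<String>,
    removed: Vec<String>,
}

impl IndexState {
    /// Apply a broadcasted change only if it is newer than our state; older
    /// changes are already reflected in the index file we fetched from s3.
    /// Returns false if the change was skipped. A gap in counters would mean
    /// we missed a change and should re-fetch the primary's index file.
    fn apply(&mut self, change: &IndexChange) -> bool {
        if change.counter <= self.change_counter {
            return false;
        }
        for layer in &change.added {
            self.layers.insert(layer.clone());
        }
        for layer in &change.removed {
            self.layers.remove(layer);
        }
        self.change_counter = change.counter;
        true
    }
}
```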
The requirement for deterministic layer generation was considered overly strict
for two reasons:
- It can limit possible optimizations, e.g. when a pageserver wants to reshuffle
some data locally and doesn't want to coordinate this
- The deterministic algorithm itself can change, so during deployments for some
time there will be two different versions running at the same time, which can
cause non-determinism
### External elections
With externally managed leadership, the above case with lost state is
represented like this:
Note that here we keep the objects list in the index file.
```mermaid
sequenceDiagram
autonumber
participant PS1
participant CP as Control Plane
participant S3
participant PS2
note over PS1,PS2: PS1 starts up and is still the leader
PS1->>CP: Am I still the leader for Tenant X?
activate CP
CP->>PS1: Yes
deactivate CP
PS1->>S3: Fetch PS1 index.
note over PS1: Continue operations, start background jobs
note over PS1,PS2: PS1 starts up and is not a leader anymore
PS1->>CP: Am I still the leader for Tenant X?
CP->>PS1: No
PS1->>PS2: Subscribe to index changes
PS1->>S3: Fetch PS1 and PS2 indexes
note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
note over PS1: Continue operations, do not start background jobs
```
### Internal elections
To manage leadership internally we can use the broker to exchange pings so nodes
can decide on leader roles. In case multiple pageservers are active, the leader
is the one with the lowest node id.
Operations with internally managed elections:
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
note over PS1: Starts up
note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
PS1->>S3: Fetch indexes from s3
alt there is a leader
note over PS1: do not start background jobs, <br> continue applying index updates
else there is no leader
note over PS1: start background jobs, <br> broadcast index changes
end
note over PS1,S3: Then the picture is similar to external elections. <br> The difference is that a follower can become the leader: <br> if there are no pings for some timeout, a new leader gets elected
```
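A sketch of the lowest-node-id heuristic, assuming each pageserver keeps the last ping time it has seen from every peer (including itself); the names and the ping transport are illustrative:
```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

/// Last time a ping was seen from each pageserver node id (including our own).
type LastPings = HashMap<u64, Instant>;

/// A pageserver considers itself the leader for a tenant if it has the
/// lowest node id among the pageservers that pinged within the timeout.
fn is_leader(my_node_id: u64, pings: &LastPings, ping_timeout: Duration, now: Instant) -> bool {
    let lowest_live = pings
        .iter()
        .filter(|(_, last_seen)| now.duration_since(**last_seen) < ping_timeout)
        .map(|(node_id, _)| *node_id)
        .min();
    lowest_live == Some(my_node_id)
}
```
As the diagram above shows, a node that just started should wait a couple of ping timeouts before evaluating this, so it does not preempt a leader it simply has not heard from yet.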
### Eviction
When two pageservers operate on a tenant for an extended period of time, the follower
doesn't perform write operations in s3. When a layer is evicted, the follower relies
on updates from the primary to get info about the layers it needs to cover the range
of the evicted layer.
Note that they won't match the evicted layer exactly, so layers will overlap and the
lookup code needs to handle that correctly.
### Relocation flow
Actions become:
- Attach the tenant to the new pageserver
- The new pageserver becomes a follower since the previous one is still leading
- The new pageserver starts replicating from safekeepers but does not upload layers
- Detach is called on the old one
- The new pageserver becomes the leader after it realizes that the old one disappeared
### Index File
Using `s3 ls` on startup simplifies things, but we still need the metadata, so we
need to fetch the index files anyway. If they contain the list of files, we can combine
them and avoid a costly `s3 ls`.
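For illustration, a sketch of combining the per-pageserver index files fetched on startup (the struct is hypothetical, and taking the metadata from the index with the highest disk-consistent LSN is an assumption rather than a decided rule):
```rust
use std::collections::BTreeSet;

/// Hypothetical contents of one `<pageserver_node_id>_index_part.json`.
struct IndexPart {
    layers: BTreeSet<String>,
    disk_consistent_lsn: u64,
}

/// Union the layer lists of all fetched index files so no `s3 ls` is needed;
/// take the metadata from the most advanced index (an assumption, see above).
fn combine(indexes: &[IndexPart]) -> Option<IndexPart> {
    let newest = indexes.iter().max_by_key(|index| index.disk_consistent_lsn)?;
    let layers = indexes
        .iter()
        .flat_map(|index| index.layers.iter().cloned())
        .collect();
    Some(IndexPart {
        layers,
        disk_consistent_lsn: newest.disk_consistent_lsn,
    })
}
```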
### Remaining issues
- More than one remote consistent lsn for safekeepers to know
Anything else?
### Proposed solution
To recap: in the meeting we converged on the approach with external elections, but I
think it will be overall harder to manage and will introduce a dependency on the
control plane for the pageserver. Using separate index files for each pageserver,
consisting of a log of operations and a metadata snapshot, should be enough.
### What we need to get there?
- Change the index file structure to contain a log of changes instead of just the
file list
- Implement pinging/elections for pageservers

View File

@@ -52,6 +52,10 @@ PostgreSQL extension that implements storage manager API and network communicati
PostgreSQL extension that contains functions needed for testing and debugging.
`/pgxn/neon_walredo`:
Library to run Postgres as a "WAL redo process" in the pageserver.
`/safekeeper`:
The neon WAL service that receives WAL from primary compute nodes and streams it to the pageserver.

View File

@@ -29,6 +29,9 @@ pub struct SkTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub peer_horizon_lsn: Option<Lsn>,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub local_start_lsn: Option<Lsn>,
/// A connection string to use for WAL receiving.
#[serde(default)]
pub safekeeper_connstr: Option<String>,

View File

@@ -7,6 +7,9 @@ edition = "2021"
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
const_format = "0.2.21"
anyhow = { version = "1.0", features = ["backtrace"] }
bytes = "1.0.1"
utils = { path = "../utils" }
postgres_ffi = { path = "../postgres_ffi" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,6 +2,7 @@ use const_format::formatcp;
/// Public API types
pub mod models;
pub mod reltag;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

View File

@@ -7,6 +7,10 @@ use utils::{
lsn::Lsn,
};
use crate::reltag::RelTag;
use anyhow::bail;
use bytes::{Buf, BufMut, Bytes, BytesMut};
/// A state of a tenant in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TenantState {
@@ -19,6 +23,22 @@ pub enum TenantState {
Broken,
}
/// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
/// Timeline is fully operational, its background jobs are running.
Active,
/// A timeline is recognized by pageserver, but not yet ready to operate.
/// The status indicates, that the timeline could eventually go back to Active automatically:
/// for example, if the owning tenant goes back to Active again.
Suspended,
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
/// automatically become Active after certain events: only a management call can change this status.
Paused,
/// A timeline is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
Broken,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
@@ -160,6 +180,8 @@ pub struct TimelineInfo {
pub remote_consistent_lsn: Option<Lsn>,
pub awaits_download: bool,
pub state: TimelineState,
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-
// compatibility with older clients.
pub local: LocalTimelineInfo,
@@ -201,3 +223,160 @@ pub struct FailpointConfig {
pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
// Wrapped in libpq CopyData
pub enum PagestreamFeMessage {
Exists(PagestreamExistsRequest),
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
}
// Wrapped in libpq CopyData
pub enum PagestreamBeMessage {
Exists(PagestreamExistsResponse),
Nblocks(PagestreamNblocksResponse),
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
}
#[derive(Debug)]
pub struct PagestreamExistsRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug)]
pub struct PagestreamNblocksRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug)]
pub struct PagestreamGetPageRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug)]
pub struct PagestreamDbSizeRequest {
pub latest: bool,
pub lsn: Lsn,
pub dbnode: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub exists: bool,
}
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
pub n_blocks: u32,
}
#[derive(Debug)]
pub struct PagestreamGetPageResponse {
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub message: String,
}
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
pub db_size: i64,
}
impl PagestreamFeMessage {
pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.get_u8();
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
blkno: body.get_u32(),
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
dbnode: body.get_u32(),
})),
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
}
}
}
impl PagestreamBeMessage {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(resp) => {
bytes.put_u8(100); /* tag from pagestore_client.h */
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(101); /* tag from pagestore_client.h */
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(102); /* tag from pagestore_client.h */
bytes.put(&resp.page[..]);
}
Self::Error(resp) => {
bytes.put_u8(103); /* tag from pagestore_client.h */
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(104); /* tag from pagestore_client.h */
bytes.put_i64(resp.db_size);
}
}
bytes.into()
}
}

View File

@@ -37,22 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
});
impl Conf {
pub fn pg_distrib_dir(&self) -> PathBuf {
pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
match self.pg_version {
14 => path.join(format!("v{}", self.pg_version)),
15 => path.join(format!("v{}", self.pg_version)),
_ => panic!("Unsupported postgres version: {}", self.pg_version),
14 => Ok(path.join(format!("v{}", self.pg_version))),
15 => Ok(path.join(format!("v{}", self.pg_version))),
_ => bail!("Unsupported postgres version: {}", self.pg_version),
}
}
fn pg_bin_dir(&self) -> PathBuf {
self.pg_distrib_dir().join("bin")
fn pg_bin_dir(&self) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir()?.join("bin"))
}
fn pg_lib_dir(&self) -> PathBuf {
self.pg_distrib_dir().join("lib")
fn pg_lib_dir(&self) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir()?.join("lib"))
}
pub fn wal_dir(&self) -> PathBuf {
@@ -60,12 +60,12 @@ impl Conf {
}
fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
let path = self.pg_bin_dir().join(command);
let path = self.pg_bin_dir()?.join(command);
ensure!(path.exists(), "Command {:?} does not exist", path);
let mut cmd = Command::new(path);
cmd.env_clear()
.env("LD_LIBRARY_PATH", self.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir());
.env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?);
Ok(cmd)
}

View File

@@ -16,7 +16,7 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tracing::*;
use utils::crashsafe_dir::path_with_suffix_extension;
use utils::crashsafe::path_with_suffix_extension;
use crate::{Download, DownloadError, RemoteObjectId};

View File

@@ -19,7 +19,7 @@ thiserror = "1.0"
tokio = { version = "1.17", features = ["macros"]}
tokio-rustls = "0.23"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
nix = "0.25"
signal-hook = "0.3.10"
rand = "0.8.3"
@@ -30,6 +30,8 @@ rustls-split = "0.3.0"
git-version = "0.3.5"
serde_with = "2.0"
once_cell = "1.13.0"
strum = "0.24"
strum_macros = "0.24"
metrics = { path = "../metrics" }

View File

@@ -1,52 +0,0 @@
use postgres::Config;
pub fn connection_host_port(config: &Config) -> (String, u16) {
assert_eq!(
config.get_hosts().len(),
1,
"only one pair of host and port is supported in connection string"
);
assert_eq!(
config.get_ports().len(),
1,
"only one pair of host and port is supported in connection string"
);
let host = match &config.get_hosts()[0] {
postgres::config::Host::Tcp(host) => host.as_ref(),
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
};
(host.to_owned(), config.get_ports()[0])
}
pub fn connection_address(config: &Config) -> String {
let (host, port) = connection_host_port(config);
format!("{}:{}", host, port)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_connection_host_port() {
let config: Config = "postgresql://no_user@localhost:64000/no_db"
.parse()
.unwrap();
assert_eq!(
connection_host_port(&config),
("localhost".to_owned(), 64000)
);
}
#[test]
#[should_panic(expected = "only one pair of host and port is supported in connection string")]
fn test_connection_host_port_multiple_ports() {
let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
.parse()
.unwrap();
assert_eq!(
connection_host_port(&config),
("localhost".to_owned(), 64000)
);
}
}

View File

@@ -12,16 +12,8 @@ pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
let path = path.as_ref();
fs::create_dir(path)?;
File::open(path)?.sync_all()?;
if let Some(parent) = path.parent() {
File::open(parent)?.sync_all()
} else {
Err(io::Error::new(
io::ErrorKind::InvalidInput,
"can't find parent",
))
}
fsync_file_and_parent(path)?;
Ok(())
}
/// Similar to [`std::fs::create_dir_all`], except we fsync all
@@ -65,12 +57,12 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
// Fsync the created directories from child to parent.
for &path in dirs_to_create.iter() {
File::open(path)?.sync_all()?;
fsync(path)?;
}
// If we created any new directories, fsync the parent.
if !dirs_to_create.is_empty() {
File::open(path)?.sync_all()?;
fsync(path)?;
}
Ok(())
@@ -92,6 +84,33 @@ pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str)
.with_extension(new_extension.as_ref())
}
pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
let parent = file_path.parent().ok_or_else(|| {
io::Error::new(
io::ErrorKind::Other,
format!("File {file_path:?} has no parent"),
)
})?;
fsync(file_path)?;
fsync(parent)?;
Ok(())
}
pub fn fsync(path: &Path) -> io::Result<()> {
File::open(path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}")))
.and_then(|file| {
file.sync_all().map_err(|e| {
io::Error::new(
e.kind(),
format!("Failed to sync file {path:?} data and metadata: {e}"),
)
})
})
.map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
}
#[cfg(test)]
mod tests {
use tempfile::tempdir;

View File

@@ -75,6 +75,12 @@ impl From<[u8; 16]> for Id {
}
}
impl From<Id> for u128 {
fn from(id: Id) -> Self {
u128::from_le_bytes(id.0)
}
}
impl fmt::Display for Id {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(&self.hex_encode())
@@ -136,6 +142,12 @@ macro_rules! id_newtype {
}
}
impl From<$t> for u128 {
fn from(id: $t) -> Self {
u128::from(id.0)
}
}
impl fmt::Display for $t {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)

View File

@@ -19,11 +19,8 @@ pub mod postgres_backend;
pub mod postgres_backend_async;
pub mod pq_proto;
// dealing with connstring parsing and handy access to it's parts
pub mod connstring;
// helper functions for creating and fsyncing directories/trees
pub mod crashsafe_dir;
// helper functions for creating and fsyncing
pub mod crashsafe;
// common authentication routines
pub mod auth;
@@ -39,6 +36,8 @@ pub mod sock_split;
// common log initialisation routine
pub mod logging;
pub mod lock_file;
// Misc
pub mod accum;
pub mod shutdown;

View File

@@ -0,0 +1,81 @@
//! A module to create and read lock files. A lock file ensures that only one
//! process is running at a time, in a particular directory.
//!
//! File locking is done using [`fcntl::flock`], which means that holding the
//! lock on a file only prevents acquiring another lock on it; all other
//! operations are still possible on the file. Other processes can still open, read,
//! write, or remove the file, for example.
//! If the file is removed while a process is holding a lock on it,
//! the process that holds the lock does not get any error or notification.
//! Furthermore, you can create a new file with the same name and lock the new file,
//! while the old process is still running.
//! Deleting the lock file while the locking process is still running is a bad idea!
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
use anyhow::Context;
use nix::fcntl;
use crate::crashsafe;
pub enum LockCreationResult {
Created {
new_lock_contents: String,
file: fs::File,
},
AlreadyLocked {
existing_lock_contents: String,
},
CreationFailed(anyhow::Error),
}
/// Creates a lock file in the path given and writes the given contents into the file.
/// Note: The lock is automatically released when the file is closed. You might want to use Box::leak to make sure it lives until the end of the program.
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
let lock_file = match fs::OpenOptions::new()
.create(true) // O_CREAT
.write(true)
.open(lock_file_path)
.context("Failed to open lock file")
{
Ok(file) => file,
Err(e) => return LockCreationResult::CreationFailed(e),
};
match fcntl::flock(
lock_file.as_raw_fd(),
fcntl::FlockArg::LockExclusiveNonblock,
) {
Ok(()) => {
match lock_file
.set_len(0)
.context("Failed to truncate lockfile")
.and_then(|()| {
fs::write(lock_file_path, &contents).with_context(|| {
format!("Failed to write '{contents}' contents into lockfile")
})
})
.and_then(|()| {
crashsafe::fsync_file_and_parent(lock_file_path)
.context("Failed to fsync lockfile")
}) {
Ok(()) => LockCreationResult::Created {
new_lock_contents: contents,
file: lock_file,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(nix::errno::Errno::EAGAIN) => {
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
existing_lock_contents,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(e) => {
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
}
}
}

View File

@@ -1,19 +1,28 @@
use std::{
fs::{File, OpenOptions},
path::Path,
};
use std::str::FromStr;
use anyhow::{Context, Result};
use anyhow::Context;
use strum_macros::{EnumString, EnumVariantNames};
pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
// Don't open the same file for output multiple times;
// the different fds could overwrite each other's output.
let log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
#[strum(serialize_all = "snake_case")]
pub enum LogFormat {
Plain,
Json,
}
impl LogFormat {
pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
use strum::VariantNames;
LogFormat::from_str(s).with_context(|| {
format!(
"Unrecognized log format. Please specify one of: {:?}",
LogFormat::VARIANTS
)
})
}
}
pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
let default_filter_str = "info";
// We fall back to printing all spans at info-level or above if
@@ -23,20 +32,14 @@ pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
let base_logger = tracing_subscriber::fmt()
.with_env_filter(env_filter)
.with_target(false) // don't include event targets
.with_ansi(false); // don't use colors in log file;
.with_target(false)
.with_ansi(false)
.with_writer(std::io::stdout);
// we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
// if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
// for example to be in line with docker log command which expects logs comimg from stdout
if daemonize {
let x = log_file.try_clone().unwrap();
base_logger
.with_writer(move || x.try_clone().unwrap())
.init();
} else {
base_logger.init();
match log_format {
LogFormat::Json => base_logger.json().init(),
LogFormat::Plain => base_logger.init(),
}
Ok(log_file)
Ok(())
}

View File

@@ -15,7 +15,7 @@ use std::sync::Arc;
use std::task::Poll;
use tracing::{debug, error, trace};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
use tokio_rustls::TlsAcceptor;
#[async_trait::async_trait]
@@ -66,8 +66,8 @@ pub enum ProcessMsgResult {
/// Always-writeable sock_split stream.
/// May not be readable. See [`PostgresBackend::take_stream_in`]
pub enum Stream {
Unencrypted(tokio::net::TcpStream),
Tls(Box<tokio_rustls::server::TlsStream<tokio::net::TcpStream>>),
Unencrypted(BufReader<tokio::net::TcpStream>),
Tls(Box<tokio_rustls::server::TlsStream<BufReader<tokio::net::TcpStream>>>),
Broken,
}
@@ -157,7 +157,7 @@ impl PostgresBackend {
let peer_addr = socket.peer_addr()?;
Ok(Self {
stream: Stream::Unencrypted(socket),
stream: Stream::Unencrypted(BufReader::new(socket)),
buf_out: BytesMut::with_capacity(10 * 1024),
state: ProtoState::Initialization,
md5_salt: [0u8; 4],

View File

@@ -24,7 +24,6 @@ hex = "0.4.3"
hyper = "0.14"
itertools = "0.10.3"
clap = { version = "4.0", features = ["string"] }
daemonize = "0.4.1"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -67,6 +66,7 @@ remote_storage = { path = "../libs/remote_storage" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
close_fds = "0.3.2"
walkdir = "2.3.2"
svg_fmt = "0.4.1"
[dev-dependencies]
criterion = "0.4"

View File

@@ -22,8 +22,8 @@ use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tracing::*;
use crate::reltag::{RelTag, SlruKind};
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};

View File

@@ -0,0 +1,150 @@
//! A tool for visualizing the arrangement of layerfiles within a timeline.
//!
//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in
//! page-lsn space, where every delta layer is a rectangle and every image layer is a
//! thick line. Legend:
//! - The x axis (left to right) represents page index.
//! - The y axis represents LSN, growing upwards.
//!
//! Coordinates on both axes are compressed for better readability.
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
//!
//! Example use:
//! ```
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ firefox out.svg
//! ```
//!
//! This API was chosen so that we can easily work with filenames extracted from ssh,
//! or from pageserver log files.
//!
//! TODO Consider shipping this as a grafana panel plugin:
//! https://grafana.com/tutorials/build-a-panel-plugin/
use anyhow::Result;
use pageserver::repository::Key;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::{
collections::{BTreeMap, BTreeSet},
ops::Range,
};
use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke};
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T, usize> {
let set: BTreeSet<T> = coords.into_iter().collect();
let mut map: BTreeMap<T, usize> = BTreeMap::new();
for (i, e) in set.iter().enumerate() {
map.insert(*e, i);
}
map
}
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
let split: Vec<&str> = name.split("__").collect();
let keys: Vec<&str> = split[0].split('-').collect();
let mut lsns: Vec<&str> = split[1].split('-').collect();
if lsns.len() == 1 {
lsns.push(lsns[0]);
}
let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
(keys, lsns)
}
fn main() -> Result<()> {
// Parse layer filenames from stdin
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
let stdin = io::stdin();
for line in stdin.lock().lines() {
let range = parse_filename(&line.unwrap());
ranges.push(range);
}
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for (keyr, lsnr) in &ranges {
keys.push(keyr.start);
keys.push(keyr.end);
lsns.push(lsnr.start);
lsns.push(lsnr.end);
}
// Analyze
let key_map = build_coordinate_compression_map(keys);
let lsn_map = build_coordinate_compression_map(lsns);
// Initialize stats
let mut num_deltas = 0;
let mut num_images = 0;
// Draw
let stretch = 3.0; // Stretch out vertically for better visibility
println!(
"{}",
BeginSvg {
w: key_map.len() as f32,
h: stretch * lsn_map.len() as f32
}
);
for (keyr, lsnr) in &ranges {
let key_start = *key_map.get(&keyr.start).unwrap();
let key_end = *key_map.get(&keyr.end).unwrap();
let key_diff = key_end - key_start;
let lsn_max = lsn_map.len();
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
}
let lsn_start = *lsn_map.get(&lsnr.start).unwrap();
let lsn_end = *lsn_map.get(&lsnr.end).unwrap();
let mut lsn_diff = (lsn_end - lsn_start) as f32;
let mut fill = Fill::None;
let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let mut lsn_offset = 0.0;
// Fill in and thicken rectangle if it's an
// image layer so that we can see it.
match lsn_start.cmp(&lsn_end) {
Ordering::Less => num_deltas += 1,
Ordering::Equal => {
num_images += 1;
lsn_diff = 0.3;
lsn_offset = -lsn_diff / 2.0;
margin = 0.05;
fill = Fill::Color(rgb(0, 0, 0));
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
}
println!(
" {}",
rectangle(
key_start as f32 + stretch * margin,
stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * margin,
stretch * (lsn_diff - 2.0 * margin)
)
.fill(fill)
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
.border_radius(0.4)
);
}
println!("{}", EndSvg);
eprintln!("num_images: {}", num_images);
eprintln!("num_deltas: {}", num_deltas);
Ok(())
}

View File

@@ -1,17 +1,14 @@
//! Main entry point for the Page Server executable.
use remote_storage::GenericRemoteStorage;
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use nix::unistd::Pid;
use tracing::*;
use anyhow::{anyhow, bail, Context, Result};
use clap::{Arg, ArgAction, Command};
use daemonize::Daemonize;
use fail::FailScenario;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
http, page_cache, page_service, profiling, task_mgr,
@@ -19,20 +16,22 @@ use pageserver::{
task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
},
tenant_mgr, virtual_file, LOG_FILE_NAME,
tenant_mgr, virtual_file,
};
use remote_storage::GenericRemoteStorage;
use utils::{
auth::JwtAuth,
logging,
lock_file, logging,
postgres_backend::AuthType,
project_git_version,
shutdown::exit_now,
signals::{self, Signal},
tcp_listener,
};
project_git_version!(GIT_VERSION);
const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
@@ -65,6 +64,7 @@ fn main() -> anyhow::Result<()> {
let workdir = workdir
.canonicalize()
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
let cfg_file_path = workdir.join("pageserver.toml");
// Set CWD to workdir for non-daemon modes
@@ -75,8 +75,6 @@ fn main() -> anyhow::Result<()> {
)
})?;
let daemonize = arg_matches.get_flag("daemonize");
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
ControlFlow::Continue(conf) => conf,
ControlFlow::Break(()) => {
@@ -87,7 +85,7 @@ fn main() -> anyhow::Result<()> {
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| {
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
format!(
"Failed to create tenants root dir at '{}'",
tenants_path.display()
@@ -102,7 +100,7 @@ fn main() -> anyhow::Result<()> {
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf.page_cache_size);
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
start_pageserver(conf).context("Failed to start pageserver")?;
scenario.teardown();
Ok(())
@@ -197,12 +195,34 @@ fn initialize_config(
})
}
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
// Initialize logger
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
logging::init(conf.log_format)?;
info!("version: {}", version());
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {
new_lock_contents,
file,
} => {
info!("Created lock file at {lock_file_path:?} with contents {new_lock_contents}");
file
}
lock_file::LockCreationResult::AlreadyLocked {
existing_lock_contents,
} => anyhow::bail!(
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
conf.workdir,
existing_lock_contents
),
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
}
};
// ensure that the lock file is held even if the main thread of the process panics
// we need to release the lock file only when the current process is gone
let _ = Box::leak(Box::new(lock_file));
// TODO: Check that it looks like a valid repository before going further
// bind sockets before daemonizing so we report errors early and do not return until we are listening
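The lock-file handling a few lines above deliberately leaks the guard with Box::leak so that the file handle, and the lock it represents, lives for the whole process. A minimal self-contained sketch of that idiom, with a hypothetical LockGuard type standing in for the real lock-file object:

use std::fs::File;

// Hypothetical guard type standing in for the real lock-file handle.
struct LockGuard(File);

// Leaking the boxed guard gives it a 'static lifetime: it is never dropped,
// so the underlying file (and any lock held on it) is released only when the
// operating system reaps the process.
fn hold_for_process_lifetime(guard: LockGuard) -> &'static LockGuard {
    Box::leak(Box::new(guard))
}

fn main() -> std::io::Result<()> {
    let file = File::create("/tmp/example.pid")?;
    let _held = hold_for_process_lifetime(LockGuard(file));
    Ok(())
}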
@@ -218,33 +238,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
);
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
// NB: Don't spawn any threads before daemonizing!
if daemonize {
info!("daemonizing...");
// There shouldn't be any logging to stdout/stderr. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file
.try_clone()
.with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
let stderr = log_file;
let daemonize = Daemonize::new()
.pid_file("pageserver.pid")
.working_directory(".")
.stdout(stdout)
.stderr(stderr);
// XXX: The parent process should exit abruptly right after
// it has spawned a child to prevent coverage machinery from
// dumping stats into a `profraw` file now owned by the child.
// Otherwise, the coverage data will be damaged.
match daemonize.exit_action(|| exit_now(0)).start() {
Ok(_) => info!("Success, daemonized"),
Err(err) => bail!("{err}. could not daemonize. bailing."),
}
}
let signals = signals::install_shutdown_handlers()?;
// start profiler (if enabled)
@@ -347,14 +340,6 @@ fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(version())
.arg(
Arg::new("daemonize")
.short('d')
.long("daemonize")
.action(ArgAction::SetTrue)
.help("Run in the background"),
)
.arg(
Arg::new("init")
.long("init")

View File

@@ -7,6 +7,7 @@
use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::RemoteStorageConfig;
use std::env;
use utils::crashsafe::path_with_suffix_extension;
use std::path::{Path, PathBuf};
use std::str::FromStr;
@@ -16,6 +17,7 @@ use toml_edit::{Document, Item};
use url::Url;
use utils::{
id::{NodeId, TenantId, TimelineId},
logging::LogFormat,
postgres_backend::AuthType,
};
@@ -24,6 +26,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt};
/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
const TENANT_CONFIG_NAME: &str = "config";
pub mod defaults {
@@ -43,6 +46,8 @@ pub mod defaults {
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
pub const DEFAULT_LOG_FORMAT: &str = "plain";
///
/// Default built-in configuration file.
///
@@ -61,6 +66,7 @@ pub mod defaults {
# initial superuser role name to use when creating a new tenant
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
#log_format = '{DEFAULT_LOG_FORMAT}'
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -124,6 +130,8 @@ pub struct PageServerConf {
/// Etcd broker endpoints to connect to.
pub broker_endpoints: Vec<Url>,
pub log_format: LogFormat,
}
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -190,6 +198,8 @@ struct PageServerConfigBuilder {
profiling: BuilderValue<ProfilingConfig>,
broker_etcd_prefix: BuilderValue<String>,
broker_endpoints: BuilderValue<Vec<Url>>,
log_format: BuilderValue<LogFormat>,
}
impl Default for PageServerConfigBuilder {
@@ -217,6 +227,7 @@ impl Default for PageServerConfigBuilder {
profiling: Set(ProfilingConfig::Disabled),
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
broker_endpoints: Set(Vec::new()),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
}
}
}
@@ -289,6 +300,10 @@ impl PageServerConfigBuilder {
self.profiling = BuilderValue::Set(profiling)
}
pub fn log_format(&mut self, log_format: LogFormat) {
self.log_format = BuilderValue::Set(log_format)
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let broker_endpoints = self
.broker_endpoints
@@ -333,6 +348,7 @@ impl PageServerConfigBuilder {
broker_etcd_prefix: self
.broker_etcd_prefix
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
})
}
}
@@ -364,6 +380,17 @@ impl PageServerConf {
self.timelines_path(tenant_id).join(timeline_id.to_string())
}
pub fn timeline_uninit_mark_file_path(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> PathBuf {
path_with_suffix_extension(
self.timeline_path(&timeline_id, &tenant_id),
TIMELINE_UNINIT_MARK_SUFFIX,
)
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -374,28 +401,28 @@ impl PageServerConf {
//
// Postgres distribution paths
//
pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
match pg_version {
14 => path.join(format!("v{pg_version}")),
15 => path.join(format!("v{pg_version}")),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => self.pg_distrib_dir(pg_version).join("bin"),
15 => self.pg_distrib_dir(pg_version).join("bin"),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => self.pg_distrib_dir(pg_version).join("lib"),
15 => self.pg_distrib_dir(pg_version).join("lib"),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
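Since pg_distrib_dir/pg_bin_dir/pg_lib_dir now return anyhow::Result instead of panicking, callers propagate unsupported versions with `?`. The snippet below is a hypothetical caller against a cut-down stand-in for PageServerConf, shown only to illustrate the call shape; it is not code from this patch:

use anyhow::{bail, Context};
use std::path::PathBuf;

// Cut-down stand-in for PageServerConf, with the same fallible accessor shape.
struct Conf {
    pg_distrib_dir: PathBuf,
}

impl Conf {
    fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        match pg_version {
            14 | 15 => Ok(self.pg_distrib_dir.join(format!("v{pg_version}")).join("bin")),
            _ => bail!("Unsupported postgres version: {pg_version}"),
        }
    }
}

fn initdb_path(conf: &Conf, pg_version: u32) -> anyhow::Result<PathBuf> {
    let bin_dir = conf
        .pg_bin_dir(pg_version)
        .context("resolving postgres bin dir")?;
    Ok(bin_dir.join("initdb"))
}

fn main() -> anyhow::Result<()> {
    let conf = Conf {
        pg_distrib_dir: PathBuf::from("/usr/local/neon/pg_install"),
    };
    println!("{}", initdb_path(&conf, 14)?.display());
    assert!(initdb_path(&conf, 12).is_err()); // no panic, just an error
    Ok(())
}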
@@ -446,6 +473,9 @@ impl PageServerConf {
})
.collect::<anyhow::Result<_>>()?,
),
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -558,6 +588,7 @@ impl PageServerConf {
default_tenant_conf: TenantConf::dummy_conf(),
broker_endpoints: Vec::new(),
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
}
}
}
@@ -652,6 +683,8 @@ max_file_descriptors = 333
initial_superuser_name = 'zzzz'
id = 10
log_format = 'json'
"#;
#[test]
@@ -691,6 +724,7 @@ id = 10
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -735,6 +769,7 @@ id = 10
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::Json,
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -618,6 +618,7 @@ components:
- last_record_lsn
- disk_consistent_lsn
- awaits_download
- state
properties:
timeline_id:
type: string
@@ -660,6 +661,8 @@ components:
type: integer
awaits_download:
type: boolean
state:
type: string
# These 'local' and 'remote' fields just duplicate some of the fields
# above. They are kept for backwards-compatibility. They can be removed,

View File

@@ -129,6 +129,7 @@ async fn build_timeline_info(
}
};
let current_physical_size = Some(timeline.get_physical_size());
let state = timeline.current_state();
let info = TimelineInfo {
tenant_id: timeline.tenant_id,
@@ -158,6 +159,7 @@ async fn build_timeline_info(
remote_consistent_lsn,
awaits_download,
state,
// Duplicate some fields in the 'local' and 'remote' fields, for backwards-compatibility
// with the control plane.
@@ -225,13 +227,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
let state = get_state(&request);
let timelines = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
Ok(tenant.list_timelines())
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
})?;
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
@@ -294,7 +293,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
let timeline_info = async {
let timeline = tokio::task::spawn_blocking(move || {
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id, false)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
@@ -331,14 +330,13 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let timeline = tenant_mgr::get_tenant(tenant_id, true)
.and_then(|tenant| tenant.get_timeline(timeline_id))
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
.map_err(ApiError::NotFound)?;
let result = match timeline
.find_lsn_for_timestamp(timestamp_pg)
.map_err(ApiError::InternalServerError)?
{
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
@@ -522,9 +520,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
check_permission(&request, Some(tenant_id))?;
// if the tenant is still being downloaded, it can be absent from the global tenant map
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
let tenant = tenant_mgr::get_tenant(tenant_id, false);
let state = get_state(&request);
let remote_index = &state.remote_index;
@@ -781,11 +777,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
}
// Run GC immediately on given timeline.
// FIXME: This is just for tests. See test_runner/regress/test_gc.py.
// This probably should require special authentication or a global flag to
// enable, I don't think we want to or need to allow regular clients to invoke
// GC.
// @hllinnaka in commits ec44f4b29, 3aca717f3
#[cfg(feature = "testing")]
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
@@ -793,16 +784,16 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(tenant_id))?;
// FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
let repo = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let _span_guard =
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon());
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = repo.get_pitr_interval();
let result = repo
let pitr = tenant.get_pitr_interval();
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
@@ -811,19 +802,15 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
}
// Run compaction immediately on given timeline.
// FIXME This is just for tests. Don't expect this to be exposed to
// the users or the api.
// @dhammika in commit a0781f229
#[cfg(feature = "testing")]
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = repo
.get_timeline(timeline_id)
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
timeline.compact().map_err(ApiError::InternalServerError)?;
@@ -837,10 +824,9 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = repo
.get_timeline(timeline_id)
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
timeline
.checkpoint(CheckpointConfig::Forced)

View File

@@ -12,10 +12,10 @@ use tracing::*;
use walkdir::WalkDir;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::waldecoder::WalStreamDecoder;
@@ -43,19 +43,19 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir(
path: &Path,
tline: &Timeline,
lsn: Lsn,
pgdata_path: &Path,
pgdata_lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
// TODO this should be start_lsn, which is not necessarily equal to end_lsn (aka pgdata_lsn)
// Then fishing out pg_control would be unnecessary
let mut modification = tline.begin_modification(lsn);
let mut modification = tline.begin_modification(pgdata_lsn);
modification.init_empty()?;
// Import all but pg_wal
let all_but_wal = WalkDir::new(path)
let all_but_wal = WalkDir::new(pgdata_path)
.into_iter()
.filter_entry(|entry| !entry.path().ends_with("pg_wal"));
for entry in all_but_wal {
@@ -63,7 +63,7 @@ pub fn import_timeline_from_postgres_datadir(
let metadata = entry.metadata().expect("error getting dir entry metadata");
if metadata.is_file() {
let absolute_path = entry.path();
let relative_path = absolute_path.strip_prefix(path)?;
let relative_path = absolute_path.strip_prefix(pgdata_path)?;
let file = File::open(absolute_path)?;
let len = metadata.len() as usize;
@@ -84,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir(
"Postgres cluster was not shut down cleanly"
);
ensure!(
pg_control.checkPointCopy.redo == lsn.0,
pg_control.checkPointCopy.redo == pgdata_lsn.0,
"unexpected checkpoint REDO pointer"
);
@@ -92,10 +92,10 @@ pub fn import_timeline_from_postgres_datadir(
// this reads the checkpoint record itself, advancing the tip of the timeline to
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
import_wal(
&path.join("pg_wal"),
&pgdata_path.join("pg_wal"),
tline,
Lsn(pg_control.checkPointCopy.redo),
lsn,
pgdata_lsn,
)?;
Ok(())

View File

@@ -8,7 +8,6 @@ pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod profiling;
pub mod reltag;
pub mod repository;
pub mod storage_sync;
pub mod task_mgr;
@@ -44,7 +43,7 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
pub const LOG_FILE_NAME: &str = "pageserver.log";
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
@@ -80,7 +79,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
// There should be nothing left, but let's be sure
task_mgr::shutdown_tasks(None, None, None).await;
info!("Shut down successfully completed");
std::process::exit(exit_code);
}

View File

@@ -107,18 +107,20 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -275,11 +277,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
/// smallest redo processing times. These buckets allow us to measure down
/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
///
/// Values up to 1s are recorded because metrics show that we have redo
/// durations and lock times larger than 0.250s.
macro_rules! redo_histogram_time_buckets {
() => {
vec![
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
1.000_000,
]
};
}
@@ -294,6 +300,17 @@ macro_rules! redo_histogram_count_buckets {
};
}
macro_rules! redo_bytes_histogram_count_buckets {
() => {
// powers of (2^.5), from 2^4.5 to 2^15 (22 buckets)
// rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too.
vec![
24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0,
2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0,
]
};
}
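The bucket values above can be reproduced directly from the rule in the comment (powers of 2^0.5 from 2^4.5 to 2^15, each rounded up to the next multiple of 8). A small standalone check, not part of the patch:

fn main() {
    // Exponents in half steps: 9/2 = 4.5 up to 30/2 = 15.0, i.e. 22 buckets.
    let buckets: Vec<f64> = (9..=30)
        .map(|half_exp| {
            let raw = 2f64.powf(half_exp as f64 / 2.0);
            (raw / 8.0).ceil() * 8.0 // round up to the next multiple of 8
        })
        .collect();
    assert_eq!(buckets.len(), 22);
    assert_eq!(buckets[0], 24.0); // 2^4.5 ~= 22.6 -> 24
    assert_eq!(buckets[7], 256.0); // exact powers of two stay as-is
    assert_eq!(buckets[21], 32768.0);
    println!("{buckets:?}");
}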
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
@@ -321,6 +338,15 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_bytes_histogram",
"Histogram of number of records replayed per redo",
redo_bytes_histogram_count_buckets!(),
)
.expect("failed to define a metric")
});
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_replayed_wal_records_total",
@@ -386,8 +412,12 @@ impl TimelineMetrics {
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED.clone();
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN.clone();
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
TimelineMetrics {
tenant_id,
@@ -419,6 +449,8 @@ impl Drop for TimelineMetrics {
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
for op in STORAGE_TIME_OPERATIONS {
let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
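The change above turns two process-global counters into per-timeline counter vectors, keyed by tenant_id/timeline_id labels and removed again when the timeline is dropped. A minimal sketch of that pattern using the prometheus crate directly (the patch goes through the repository's metrics wrapper, so treat the exact imports here as an assumption):

use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

static FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_created_persistent_files_total",
        "Number of files created that are meant to be uploaded to cloud storage",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

fn main() {
    // Each (tenant, timeline) pair gets its own child counter...
    FILES_CREATED
        .with_label_values(&["tenant-a", "timeline-1"])
        .inc();
    // ...which can be dropped from the vector when the timeline goes away,
    // mirroring the remove_label_values() calls in the Drop impl above.
    let _ = FILES_CREATED.remove_label_values(&["tenant-a", "timeline-1"]);
}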

View File

@@ -10,8 +10,14 @@
//
use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use bytes::Bytes;
use futures::{Stream, StreamExt};
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
};
use std::io;
use std::net::TcpListener;
use std::str;
@@ -32,10 +38,9 @@ use utils::{
use crate::basebackup;
use crate::config::{PageServerConf, ProfilingConfig};
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::profiling::profpoint_start;
use crate::reltag::RelTag;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
@@ -45,163 +50,6 @@ use crate::CheckpointConfig;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
// Wrapped in libpq CopyData
enum PagestreamFeMessage {
Exists(PagestreamExistsRequest),
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
}
// Wrapped in libpq CopyData
enum PagestreamBeMessage {
Exists(PagestreamExistsResponse),
Nblocks(PagestreamNblocksResponse),
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
}
#[derive(Debug)]
struct PagestreamExistsRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
}
#[derive(Debug)]
struct PagestreamNblocksRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
}
#[derive(Debug)]
struct PagestreamGetPageRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
blkno: u32,
}
#[derive(Debug)]
struct PagestreamDbSizeRequest {
latest: bool,
lsn: Lsn,
dbnode: u32,
}
#[derive(Debug)]
struct PagestreamExistsResponse {
exists: bool,
}
#[derive(Debug)]
struct PagestreamNblocksResponse {
n_blocks: u32,
}
#[derive(Debug)]
struct PagestreamGetPageResponse {
page: Bytes,
}
#[derive(Debug)]
struct PagestreamErrorResponse {
message: String,
}
#[derive(Debug)]
struct PagestreamDbSizeResponse {
db_size: i64,
}
impl PagestreamFeMessage {
fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.get_u8();
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
blkno: body.get_u32(),
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
dbnode: body.get_u32(),
})),
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
}
}
}
impl PagestreamBeMessage {
fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(resp) => {
bytes.put_u8(100); /* tag from pagestore_client.h */
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(101); /* tag from pagestore_client.h */
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(102); /* tag from pagestore_client.h */
bytes.put(&resp.page[..]);
}
Self::Error(resp) => {
bytes.put_u8(103); /* tag from pagestore_client.h */
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(104); /* tag from pagestore_client.h */
bytes.put_i64(resp.db_size);
}
}
bytes.into()
}
}
fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
async_stream::try_stream! {
loop {
@@ -500,11 +348,8 @@ impl PageServerHandler {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline
info!("creating new timeline");
let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline(
timeline_id,
base_lsn,
pg_version,
)?;
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
@@ -527,7 +372,8 @@ impl PageServerHandler {
// - use block_in_place()
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?;
tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
timeline.initialize()?;
// Drain the rest of the Copy data
let mut bytes_after_tar = 0;
@@ -544,12 +390,6 @@ impl PageServerHandler {
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
// Flush data to disk, then upload to s3
info!("flushing layers");
timeline.checkpoint(CheckpointConfig::Flush)?;
timeline.launch_wal_receiver()?;
info!("done");
Ok(())
}
@@ -1068,7 +908,8 @@ impl postgres_backend_async::Handler for PageServerHandler {
}
fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result<Arc<Timeline>> {
tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id))
tenant_mgr::get_tenant(tenant_id, true)
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
}
///

View File

@@ -7,12 +7,12 @@
//! Clarify that)
//!
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::reltag::{RelTag, SlruKind};
use crate::repository::*;
use crate::tenant::Timeline;
use crate::walrecord::NeonWalRecord;
use anyhow::{bail, ensure, Result};
use bytes::{Buf, Bytes};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, TimestampTz, TransactionId};
@@ -1373,6 +1373,17 @@ fn is_rel_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
@@ -1403,7 +1414,9 @@ pub fn create_test_timeline(
timeline_id: utils::id::TimelineId,
pg_version: u32,
) -> Result<std::sync::Arc<Timeline>> {
let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?;
let tline = tenant
.create_empty_timeline(timeline_id, Lsn(8), pg_version)?
.initialize()?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;

View File

@@ -22,7 +22,7 @@ use crate::{
TEMP_FILE_SUFFIX,
};
use utils::{
crashsafe_dir::path_with_suffix_extension,
crashsafe::path_with_suffix_extension,
id::{TenantId, TenantTimelineId, TimelineId},
};

File diff suppressed because it is too large

View File

@@ -610,9 +610,9 @@ impl DeltaLayer {
///
/// 3. Call `finish`.
///
pub struct DeltaLayerWriter {
struct DeltaLayerWriterInner {
conf: &'static PageServerConf,
path: PathBuf,
pub path: PathBuf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -624,17 +624,17 @@ pub struct DeltaLayerWriter {
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
}
impl DeltaLayerWriter {
impl DeltaLayerWriterInner {
///
/// Start building a new delta layer.
///
pub fn new(
fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> Result<DeltaLayerWriter> {
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename. We don't know
// the end key yet, so we cannot form the final filename yet. We will
// rename it when we're done.
@@ -653,7 +653,7 @@ impl DeltaLayerWriter {
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
Ok(DeltaLayerWriter {
Ok(Self {
conf,
path,
timeline_id,
@@ -670,17 +670,17 @@ impl DeltaLayerWriter {
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
}
pub fn put_value_bytes(
fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> Result<()> {
) -> anyhow::Result<()> {
assert!(self.lsn_range.start <= lsn);
let off = self.blob_writer.write_blob(val)?;
@@ -693,14 +693,14 @@ impl DeltaLayerWriter {
Ok(())
}
pub fn size(&self) -> u64 {
fn size(&self) -> u64 {
self.blob_writer.size() + self.tree.borrow_writer().size()
}
///
/// Finish writing the delta layer.
///
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -768,6 +768,102 @@ impl DeltaLayerWriter {
}
}
/// A builder object for constructing a new delta layer.
///
/// Usage:
///
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_value` for every page
/// version to store in the layer.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to be dropped before `finish` is ever called. That
/// would leave stray temporary files in the directory and could eventually
/// exhaust the file system. This structure wraps `DeltaLayerWriterInner` and
/// adds a `Drop` implementation that cleans up the temporary file on failure.
/// It's not possible to do this directly in `DeltaLayerWriterInner`, since
/// `finish` moves out some fields, making it impossible to implement `Drop`.
///
#[must_use]
pub struct DeltaLayerWriter {
inner: Option<DeltaLayerWriterInner>,
}
impl DeltaLayerWriter {
///
/// Start building a new delta layer.
///
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(DeltaLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_start,
lsn_range,
)?),
})
}
///
/// Append a key-value pair to the file.
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val)
}
pub fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> anyhow::Result<()> {
self.inner
.as_mut()
.unwrap()
.put_value_bytes(key, lsn, val, will_init)
}
pub fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
}
///
/// Finish writing the delta layer.
///
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end)
}
}
impl Drop for DeltaLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
match inner.blob_writer.into_inner().into_inner() {
Ok(vfile) => vfile.remove(),
Err(err) => warn!(
"error while flushing buffer of image layer temporary file: {}",
err
),
}
}
}
}
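The wrapper-plus-Drop arrangement described in the Note above is a general pattern: keep the real state in an Option, have finish() take it out, and let Drop clean up only if finish() never ran. A generic, self-contained sketch of that pattern (names are illustrative stand-ins, not the layer code):

use std::path::PathBuf;

struct Inner {
    tmp_path: PathBuf,
}

impl Inner {
    fn finish(self) -> PathBuf {
        // Success path: the temporary file would be renamed/promoted here.
        self.tmp_path
    }
}

#[must_use]
struct Writer {
    inner: Option<Inner>,
}

impl Writer {
    fn new(tmp_path: PathBuf) -> Self {
        Self { inner: Some(Inner { tmp_path }) }
    }

    fn finish(mut self) -> PathBuf {
        // Taking the inner value out disarms the Drop cleanup below.
        self.inner.take().unwrap().finish()
    }
}

impl Drop for Writer {
    fn drop(&mut self) {
        if let Some(inner) = self.inner.take() {
            // Abandoned before finish(): remove the half-written temp file.
            let _ = std::fs::remove_file(&inner.tmp_path);
        }
    }
}

fn main() {
    let writer = Writer::new(PathBuf::from("/tmp/example.tmp"));
    let _path = writer.finish(); // dropping `writer` unfinished would delete the temp file instead
}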
///
/// Iterator over all key-value pairs stored in a delta layer
///

View File

@@ -411,7 +411,7 @@ impl ImageLayer {
///
/// 3. Call `finish`.
///
pub struct ImageLayerWriter {
struct ImageLayerWriterInner {
conf: &'static PageServerConf,
path: PathBuf,
timeline_id: TimelineId,
@@ -423,14 +423,17 @@ pub struct ImageLayerWriter {
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
impl ImageLayerWriter {
pub fn new(
impl ImageLayerWriterInner {
///
/// Start building a new image layer.
///
fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
let path = ImageLayer::temp_path_for(
@@ -455,7 +458,7 @@ impl ImageLayerWriter {
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
let writer = ImageLayerWriter {
let writer = Self {
conf,
path,
timeline_id,
@@ -474,7 +477,7 @@ impl ImageLayerWriter {
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img)?;
@@ -485,7 +488,10 @@ impl ImageLayerWriter {
Ok(())
}
pub fn finish(self) -> anyhow::Result<ImageLayer> {
///
/// Finish writing the image layer.
///
fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -552,3 +558,76 @@ impl ImageLayerWriter {
Ok(layer)
}
}
/// A builder object for constructing a new image layer.
///
/// Usage:
///
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_image` for every key-value
/// pair in the key range.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to be dropped before `finish` is ever called. That
/// would leave stray temporary files in the directory and could eventually
/// exhaust the file system. This structure wraps `ImageLayerWriterInner` and
/// adds a `Drop` implementation that cleans up the temporary file on failure.
/// It's not possible to do this directly in `ImageLayerWriterInner`, since
/// `finish` moves out some fields, making it impossible to implement `Drop`.
///
#[must_use]
pub struct ImageLayerWriter {
inner: Option<ImageLayerWriterInner>,
}
impl ImageLayerWriter {
///
/// Start building a new image layer.
///
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(ImageLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_range,
lsn,
)?),
})
}
///
/// Write next value to the file.
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img)
}
///
/// Finish writing the image layer.
///
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish()
}
}
impl Drop for ImageLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.blob_writer.into_inner().remove();
}
}
}

View File

@@ -1,10 +1,12 @@
//!
use anyhow::{anyhow, bail, ensure, Context, Result};
use anyhow::{anyhow, bail, ensure, Context};
use bytes::Bytes;
use fail::fail_point;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pageserver_api::models::TimelineState;
use tokio::sync::watch;
use tokio::task::spawn_blocking;
use tracing::*;
@@ -34,8 +36,9 @@ use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::TimelineMetrics;
use crate::pgdatadir_mapping::BlockNumber;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::reltag::RelTag;
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::tenant_config::TenantConfOpt;
use pageserver_api::reltag::RelTag;
use postgres_ffi::to_pg_timestamp;
use utils::{
@@ -52,6 +55,7 @@ use crate::task_mgr::TaskKind;
use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task};
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::ZERO_PAGE;
use crate::{
page_cache,
storage_sync::{self, index::LayerFileMetadata},
@@ -158,6 +162,8 @@ pub struct Timeline {
/// Relation size cache
pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
state: watch::Sender<TimelineState>,
}
/// Internal structure to hold all data needed for logical size calculation.
@@ -305,10 +311,6 @@ pub struct GcInfo {
/// Public interface functions
impl Timeline {
//------------------------------------------------------------------------------
// Public GET functions
//------------------------------------------------------------------------------
/// Get the LSN where this branch was created
pub fn get_ancestor_lsn(&self) -> Lsn {
self.ancestor_lsn
@@ -418,9 +420,11 @@ impl Timeline {
/// those functions with an LSN that has not been processed yet is an error.
///
pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
// This should never be called from the WAL receiver, because that could lead
// to a deadlock.
ensure!(
anyhow::ensure!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection),
"wait_lsn cannot be called in WAL receiver"
);
@@ -443,7 +447,7 @@ impl Timeline {
&self,
lsn: Lsn,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
) -> Result<()> {
) -> anyhow::Result<()> {
ensure!(
lsn >= **latest_gc_cutoff_lsn,
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
@@ -453,12 +457,6 @@ impl Timeline {
Ok(())
}
//------------------------------------------------------------------------------
// Public PUT functions, to update the repository with new page versions.
//
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
@@ -477,6 +475,91 @@ impl Timeline {
}
}
pub fn compact(&self) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
// Last record LSN could be zero if the timeline was just created
if !last_record_lsn.is_valid() {
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
return Ok(());
}
//
// High level strategy for compaction / image creation:
//
// 1. First, calculate the desired "partitioning" of the
// currently in-use key space. The goal is to partition the
// key space into roughly fixed-size chunks, but also take into
// account any existing image layers, and try to align the
// chunk boundaries with the existing image layers to avoid
// too much churn. Also try to align chunk boundaries with
// relation boundaries. In principle, we don't know about
// relation boundaries here, we just deal with key-value
// pairs, and the code in pgdatadir_mapping.rs knows how to
// map relations into key-value pairs. But in practice we know
// that 'field6' is the block number, and the fields 1-5
// identify a relation. This is just an optimization,
// though.
//
// 2. Once we know the partitioning, for each partition,
// decide if it's time to create a new image layer. The
// criterion is: has there been too much "churn" since the last
// image layer? "Churn" is a fuzzy concept; it's a combination
// of too many delta files, or too much WAL in total in the
// delta files. Or perhaps: would creating an image file allow
// us to delete some older files?
//
// 3. After that, we compact all level0 delta files if there
// are too many of them. While compacting, we also garbage
// collect any page versions that are no longer needed because
// of the new image layers we created in step 2.
//
// TODO: This high level strategy hasn't been implemented yet.
// Below are functions compact_level0() and create_image_layers()
// but they are a bit ad hoc and don't quite work like it's explained
// above. Rewrite it.
let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
let target_file_size = self.get_checkpoint_distance();
// Define partitioning schema if needed
match self.repartition(
self.get_last_record_lsn(),
self.get_compaction_target_size(),
) {
Ok((partitioning, lsn)) => {
// 2. Create new image layers for partitions that have been modified
// "enough".
let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
if !layer_paths_to_upload.is_empty()
&& self.upload_layers.load(atomic::Ordering::Relaxed)
{
storage_sync::schedule_layer_upload(
self.tenant_id,
self.timeline_id,
layer_paths_to_upload,
None,
);
}
// 3. Compact
let timer = self.metrics.compact_time_histo.start_timer();
self.compact_level0(target_file_size)?;
timer.stop_and_record();
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
// as an empty timeline. Also in unit tests, when we use the timeline
// as a simple key-value store, ignoring the datadir layout. Log the
// error but continue.
error!("could not compact, repartitioning keyspace failed: {err:?}");
}
};
Ok(())
}
/// Mutate the timeline with a [`TimelineWriter`].
pub fn writer(&self) -> TimelineWriter<'_> {
TimelineWriter {
@@ -484,6 +567,109 @@ impl Timeline {
_write_guard: self.write_lock.lock().unwrap(),
}
}
/// Retrieve current logical size of the timeline.
///
/// The size could be lagging behind the actual number, in case
/// the initial size calculation has not been run (gets triggered on the first size access).
pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
let current_size = self.current_logical_size.current_size()?;
debug!("Current size: {current_size:?}");
let size = current_size.size();
if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
(current_size, self.current_logical_size.initial_part_end)
{
self.try_spawn_size_init_task(init_lsn);
}
Ok(size)
}
/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
/// the in-memory layer, and initiate flushing it if so.
///
/// Also flush after a period of time without new data -- it helps
/// safekeepers to regard pageserver as caught up and suspend activity.
pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
let last_lsn = self.get_last_record_lsn();
let layers = self.layers.read().unwrap();
if let Some(open_layer) = &layers.open_layer {
let open_layer_size = open_layer.size()?;
drop(layers);
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
let distance = last_lsn.widening_sub(last_freeze_at);
// Checkpointing the open layer can be triggered by layer size or LSN range.
// S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
// we want to stay below that with a big margin. The LSN distance determines how
// much WAL the safekeepers need to store.
if distance >= self.get_checkpoint_distance().into()
|| open_layer_size > self.get_checkpoint_distance()
|| (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
{
info!(
"check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
distance,
open_layer_size,
last_freeze_ts.elapsed()
);
self.freeze_inmem_layer(true);
self.last_freeze_at.store(last_lsn);
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
// Launch a task to flush the frozen layer to disk, unless
// a task was already running. (If the task was running
// at the time that we froze the layer, it must've seen the
// layer we just froze before it exited; see comments
// in flush_frozen_layers())
if let Ok(guard) = self.layer_flush_lock.try_lock() {
drop(guard);
let self_clone = Arc::clone(self);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
async move { self_clone.flush_frozen_layers(false) },
);
}
}
}
Ok(())
}
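The flush trigger in check_checkpoint_distance() above combines three conditions: enough WAL since the last freeze, an oversized open in-memory layer, or a timeout with some pending data. Restated as a pure function with the tunables passed in explicitly (illustrative only; the 256 MiB distance in the example is an assumption, not taken from this patch):

use std::time::Duration;

fn should_freeze_and_flush(
    lsn_distance: i128,          // WAL accumulated since the last freeze
    open_layer_size: u64,        // bytes buffered in the open in-memory layer
    since_last_freeze: Duration, // time since the last freeze
    checkpoint_distance: u64,
    checkpoint_timeout: Duration,
) -> bool {
    lsn_distance >= checkpoint_distance as i128
        || open_layer_size > checkpoint_distance
        || (lsn_distance > 0 && since_last_freeze >= checkpoint_timeout)
}

fn main() {
    let distance = 256 * 1024 * 1024; // example checkpoint_distance
    let timeout = Duration::from_secs(600);
    assert!(should_freeze_and_flush(distance as i128, 0, Duration::ZERO, distance, timeout));
    // No new WAL at all: the timeout alone does not trigger a flush.
    assert!(!should_freeze_and_flush(0, 0, Duration::from_secs(3600), distance, timeout));
}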
pub fn set_state(&self, new_state: TimelineState) {
match (self.current_state(), new_state) {
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
}
(TimelineState::Broken, _) => {
error!("Ignoring state update {new_state:?} for broken tenant");
}
(TimelineState::Paused, TimelineState::Active) => {
debug!("Not activating a paused timeline");
}
(_, new_state) => {
self.state.send_replace(new_state);
}
}
}
pub fn current_state(&self) -> TimelineState {
*self.state.borrow()
}
pub fn is_active(&self) -> bool {
self.current_state() == TimelineState::Active
}
pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
self.state.subscribe()
}
}
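The state plumbing above (set_state, current_state, subscribe_for_state_updates) is built on tokio::sync::watch: one sender owned by the Timeline, any number of receivers that wake up on changes. A standalone sketch of that mechanism with an illustrative state enum, not the Timeline code itself:

use tokio::sync::watch;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum State {
    Suspended,
    Active,
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(State::Suspended);

    let waiter = tokio::spawn(async move {
        // Wake up on every state change; stop once the state becomes Active.
        while rx.changed().await.is_ok() {
            if *rx.borrow() == State::Active {
                println!("became active");
                break;
            }
        }
    });

    // send_replace publishes the new state even if no receiver is currently polling.
    tx.send_replace(State::Active);
    waiter.await.unwrap();
}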
// Private functions
@@ -527,7 +713,7 @@ impl Timeline {
///
/// Loads the metadata for the timeline into memory, but not the layer map.
#[allow(clippy::too_many_arguments)]
pub fn new(
pub(super) fn new(
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
metadata: TimelineMetadata,
@@ -537,8 +723,9 @@ impl Timeline {
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
upload_layers: bool,
pg_version: u32,
) -> Timeline {
) -> Self {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(TimelineState::Suspended);
let mut result = Timeline {
conf,
@@ -595,16 +782,17 @@ impl Timeline {
last_received_wal: Mutex::new(None),
rel_size_cache: RwLock::new(HashMap::new()),
state,
};
result.repartition_threshold = result.get_checkpoint_distance() / 10;
result
}
pub fn launch_wal_receiver(self: &Arc<Self>) -> anyhow::Result<()> {
pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
if !is_etcd_client_initialized() {
if cfg!(test) {
info!("not launching WAL receiver because etcd client hasn't been initialized");
return Ok(());
return;
} else {
panic!("etcd client not initialized");
}
@@ -632,16 +820,14 @@ impl Timeline {
walreceiver_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
)?;
Ok(())
);
}
///
/// Scan the timeline directory to populate the layer map.
/// Returns all timeline-related files that were found and loaded.
///
pub fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
let mut layers = self.layers.write().unwrap();
let mut num_layers = 0;
@@ -727,33 +913,13 @@ impl Timeline {
Ok(())
}
pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
pub(super) fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
self.layer_removal_cs
.try_lock()
.map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
}
/// Retrieve current logical size of the timeline.
///
/// The size could be lagging behind the actual number, in case
/// the initial size calculation has not been run (gets triggered on the first size access).
pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
let current_size = self.current_logical_size.current_size()?;
debug!("Current size: {current_size:?}");
let size = current_size.size();
if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
(current_size, self.current_logical_size.initial_part_end)
{
self.try_spawn_size_init_task(init_lsn);
}
Ok(size)
}
fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn) {
let timeline_id = self.timeline_id;
// Atomically check if the timeline size calculation had already started.
// If the flag was not already set, this sets it.
if !self
@@ -770,17 +936,42 @@ impl Timeline {
"initial size calculation",
false,
async move {
let calculated_size = self_clone.calculate_logical_size(init_lsn)?;
let result = spawn_blocking(move || {
self_clone.current_logical_size.initial_logical_size.set(calculated_size)
}).await?;
match result {
Ok(()) => info!("Successfully calculated initial logical size"),
Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
let mut timeline_state_updates = self_clone.subscribe_for_state_updates();
let self_calculation = Arc::clone(&self_clone);
tokio::select! {
calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => {
let calculated_size = calculation_result
.context("Failed to spawn calculation result task")?
.context("Failed to calculate logical size")?;
match self_clone.current_logical_size.initial_logical_size.set(calculated_size) {
Ok(()) => info!("Successfully calculated initial logical size"),
Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
}
Ok(())
},
new_event = async {
loop {
match timeline_state_updates.changed().await {
Ok(()) => {
let new_state = *timeline_state_updates.borrow();
match new_state {
// we're running this job for active timelines only
TimelineState::Active => continue,
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return Some(new_state),
}
}
Err(_sender_dropped_error) => return None,
}
}
} => {
match new_event {
Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"),
None => info!("Timeline dropped state updates sender, stopping init size calculation"),
}
Ok(())
},
}
Ok(())
}
.instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id))
}.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)),
);
}
}
@@ -971,7 +1162,7 @@ impl Timeline {
Some((lsn, img))
}
fn get_ancestor_timeline(&self) -> Result<Arc<Timeline>> {
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
@@ -1030,14 +1221,14 @@ impl Timeline {
Ok(layer)
}
fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
//info!("PUT: key {} at {}", key, lsn);
let layer = self.get_layer_for_write(lsn)?;
layer.put_value(key, lsn, val)?;
Ok(())
}
fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
let layer = self.get_layer_for_write(lsn)?;
layer.put_tombstone(key_range, lsn)?;
@@ -1076,64 +1267,6 @@ impl Timeline {
drop(layers);
}
///
/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
/// the in-memory layer, and initiate flushing it if so.
///
/// Also flush after a period of time without new data -- it helps
/// safekeepers to regard pageserver as caught up and suspend activity.
///
pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> Result<()> {
let last_lsn = self.get_last_record_lsn();
let layers = self.layers.read().unwrap();
if let Some(open_layer) = &layers.open_layer {
let open_layer_size = open_layer.size()?;
drop(layers);
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
let distance = last_lsn.widening_sub(last_freeze_at);
// Checkpointing the open layer can be triggered by layer size or LSN range.
// S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
// we want to stay below that with a big margin. The LSN distance determines how
// much WAL the safekeepers need to store.
if distance >= self.get_checkpoint_distance().into()
|| open_layer_size > self.get_checkpoint_distance()
|| (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
{
info!(
"check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
distance,
open_layer_size,
last_freeze_ts.elapsed()
);
self.freeze_inmem_layer(true);
self.last_freeze_at.store(last_lsn);
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
// Launch a task to flush the frozen layer to disk, unless
// a task was already running. (If the task was running
// at the time that we froze the layer, it must've seen the
// layer we just froze before it exited; see comments
// in flush_frozen_layers())
if let Ok(guard) = self.layer_flush_lock.try_lock() {
drop(guard);
let self_clone = Arc::clone(self);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
async move { self_clone.flush_frozen_layers(false) },
);
}
}
}
Ok(())
}
/// Flush all frozen layers to disk.
///
/// Only one task at a time can be doing layer-flushing for a
@@ -1141,7 +1274,7 @@ impl Timeline {
/// currently doing the flushing, this function will wait for it
/// to finish. If 'wait' is false, this function will return
/// immediately instead.
fn flush_frozen_layers(&self, wait: bool) -> Result<()> {
fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
let flush_lock_guard = if wait {
self.layer_flush_lock.lock().unwrap()
} else {
@@ -1180,7 +1313,7 @@ impl Timeline {
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
// As a special case, when we have just imported an image into the repository,
// instead of writing out an L0 delta layer, we directly write out image layer
// files. This is possible as long as *all* the data imported into the
@@ -1220,78 +1353,76 @@ impl Timeline {
// TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
// *all* the layers, to avoid fsyncing the file multiple times.
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?;
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
// If we were able to advance 'disk_consistent_lsn', save it in the metadata file.
// After crash, we will restart WAL streaming and processing from that point.
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)?;
// Also update the in-memory copy
self.disk_consistent_lsn.store(disk_consistent_lsn);
}
Ok(())
}
/// Update metadata file
fn update_disk_consistent_lsn(
fn update_metadata_file(
&self,
disk_consistent_lsn: Lsn,
layer_paths_to_upload: HashMap<PathBuf, LayerFileMetadata>,
) -> Result<()> {
// If we were able to advance 'disk_consistent_lsn', save it in the metadata file.
// After crash, we will restart WAL streaming and processing from that point.
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
) -> anyhow::Result<()> {
// We can only save a valid 'prev_record_lsn' value on disk if we
// flushed *all* in-memory changes to disk. We only track
// 'prev_record_lsn' in memory for the latest processed record, so we
// don't remember what the correct value that corresponds to some old
// LSN is. But if we flush everything, then the value corresponding to the
// current 'last_record_lsn' is correct and we can store it on disk.
let RecordLsn {
last: last_record_lsn,
prev: prev_record_lsn,
} = self.last_record_lsn.load();
let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
Some(prev_record_lsn)
} else {
None
};
// We can only save a valid 'prev_record_lsn' value on disk if we
// flushed *all* in-memory changes to disk. We only track
// 'prev_record_lsn' in memory for the latest processed record, so we
// don't remember what the correct value that corresponds to some old
// LSN is. But if we flush everything, then the value corresponding to the
// current 'last_record_lsn' is correct and we can store it on disk.
let RecordLsn {
last: last_record_lsn,
prev: prev_record_lsn,
} = self.last_record_lsn.load();
let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
Some(prev_record_lsn)
} else {
None
};
let ancestor_timeline_id = self
.ancestor_timeline
.as_ref()
.map(|ancestor| ancestor.timeline_id);
let ancestor_timeline_id = self
.ancestor_timeline
.as_ref()
.map(|ancestor| ancestor.timeline_id);
let metadata = TimelineMetadata::new(
disk_consistent_lsn,
ondisk_prev_record_lsn,
ancestor_timeline_id,
self.ancestor_lsn,
*self.latest_gc_cutoff_lsn.read(),
self.initdb_lsn,
self.pg_version,
);
let metadata = TimelineMetadata::new(
disk_consistent_lsn,
ondisk_prev_record_lsn,
ancestor_timeline_id,
self.ancestor_lsn,
*self.latest_gc_cutoff_lsn.read(),
self.initdb_lsn,
self.pg_version,
);
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
"{}",
x.unwrap()
));
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
"{}",
x.unwrap()
));
save_metadata(
self.conf,
self.timeline_id,
self.tenant_id,
&metadata,
false,
)?;
save_metadata(
self.conf,
self.timeline_id,
if self.can_upload_layers() {
storage_sync::schedule_layer_upload(
self.tenant_id,
&metadata,
false,
)?;
if self.upload_layers.load(atomic::Ordering::Relaxed) {
storage_sync::schedule_layer_upload(
self.tenant_id,
self.timeline_id,
layer_paths_to_upload,
Some(metadata),
);
}
// Also update the in-memory copy
self.disk_consistent_lsn.store(disk_consistent_lsn);
self.timeline_id,
layer_paths_to_upload,
Some(metadata),
);
}
Ok(())
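The 'checkpoint-before-saving-metadata' failpoint above is the hook used to simulate a crash between flushing layers and persisting metadata. A rough sketch of arming it through the fail crate's public API (the test name and the "boom" payload are assumptions for illustration, not taken from the change):
#[test]
fn metadata_save_failpoint_surfaces_error() {
    // Arm the failpoint so the closure above bails with "boom".
    fail::cfg("checkpoint-before-saving-metadata", "return(boom)").unwrap();
    // ... drive the code path that reaches update_metadata_file() and assert
    // that the injected error propagates to the caller ...
    // Disarm so other tests are unaffected.
    fail::remove("checkpoint-before-saving-metadata");
}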
@@ -1301,7 +1432,7 @@ impl Timeline {
fn create_delta_layer(
&self,
frozen_layer: &InMemoryLayer,
) -> Result<(PathBuf, LayerFileMetadata)> {
) -> anyhow::Result<(PathBuf, LayerFileMetadata)> {
// Write it out
let new_delta = frozen_layer.write_to_disk()?;
let new_delta_path = new_delta.path();
@@ -1336,92 +1467,7 @@ impl Timeline {
Ok((new_delta_path, LayerFileMetadata::new(sz)))
}
pub fn compact(&self) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
// Last record Lsn could be zero in case the timeline was just created
if !last_record_lsn.is_valid() {
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
return Ok(());
}
//
// High level strategy for compaction / image creation:
//
// 1. First, calculate the desired "partitioning" of the
// currently in-use key space. The goal is to partition the
// key space into roughly fixed-size chunks, but also take into
// account any existing image layers, and try to align the
// chunk boundaries with the existing image layers to avoid
// too much churn. Also try to align chunk boundaries with
// relation boundaries. In principle, we don't know about
// relation boundaries here, we just deal with key-value
// pairs, and the code in pgdatadir_mapping.rs knows how to
// map relations into key-value pairs. But in practice we know
// that 'field6' is the block number, and the fields 1-5
// identify a relation. This is just an optimization,
// though.
//
// 2. Once we know the partitioning, for each partition,
// decide if it's time to create a new image layer. The
// criteria is: there has been too much "churn" since the last
// image layer? The "churn" is fuzzy concept, it's a
// combination of too many delta files, or too much WAL in
// total in the delta file. Or perhaps: if creating an image
// file would allow to delete some older files.
//
// 3. After that, we compact all level0 delta files if there
// are too many of them. While compacting, we also garbage
// collect any page versions that are no longer needed because
// of the new image layers we created in step 2.
//
// TODO: This high level strategy hasn't been implemented yet.
// Below are functions compact_level0() and create_image_layers()
// but they are a bit ad hoc and don't quite work as explained
// above. Rewrite it.
let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
let target_file_size = self.get_checkpoint_distance();
// Define partitioning schema if needed
match self.repartition(
self.get_last_record_lsn(),
self.get_compaction_target_size(),
) {
Ok((partitioning, lsn)) => {
// 2. Create new image layers for partitions that have been modified
// "enough".
let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
if !layer_paths_to_upload.is_empty()
&& self.upload_layers.load(atomic::Ordering::Relaxed)
{
storage_sync::schedule_layer_upload(
self.tenant_id,
self.timeline_id,
layer_paths_to_upload,
None,
);
}
// 3. Compact
let timer = self.metrics.compact_time_histo.start_timer();
self.compact_level0(target_file_size)?;
timer.stop_and_record();
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
// as an empty timeline. Also in unit tests, when we use the timeline
// as a simple key-value store, ignoring the datadir layout. Log the
// error but continue.
error!("could not compact, repartitioning keyspace failed: {err:?}");
}
};
Ok(())
}
fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> {
fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> {
let mut partitioning_guard = self.partitioning.lock().unwrap();
if partitioning_guard.1 == Lsn(0)
|| lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
@@ -1435,7 +1481,7 @@ impl Timeline {
}
// Is it time to create a new image layer for the given partition?
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<bool> {
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
let layers = self.layers.read().unwrap();
for part_range in &partition.ranges {
@@ -1480,7 +1526,7 @@ impl Timeline {
partitioning: &KeyPartitioning,
lsn: Lsn,
force: bool,
) -> Result<HashMap<PathBuf, LayerFileMetadata>> {
) -> anyhow::Result<HashMap<PathBuf, LayerFileMetadata>> {
let timer = self.metrics.create_images_time_histo.start_timer();
let mut image_layers: Vec<ImageLayer> = Vec::new();
for partition in partitioning.parts.iter() {
@@ -1495,10 +1541,39 @@ impl Timeline {
lsn,
)?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
});
for range in &partition.ranges {
let mut key = range.start;
while key < range.end {
let img = self.get(key, lsn)?;
let img = match self.get(key, lsn) {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(err);
}
}
};
image_layer_writer.put_image(key, &img)?;
key = key.next();
}
@@ -1548,7 +1623,7 @@ impl Timeline {
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them
/// as Level 1 files.
///
fn compact_level0(&self, target_file_size: u64) -> Result<()> {
fn compact_level0(&self, target_file_size: u64) -> anyhow::Result<()> {
let layers = self.layers.read().unwrap();
let mut level0_deltas = layers.get_level0_deltas()?;
drop(layers);
@@ -1764,6 +1839,11 @@ impl Timeline {
},
)?);
}
fail_point!("delta-layer-writer-fail-before-finish", |_| {
anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
});
writer.as_mut().unwrap().put_value(key, lsn, value)?;
prev_key = Some(key);
}
@@ -1815,7 +1895,7 @@ impl Timeline {
}
drop(layers);
if self.upload_layers.load(atomic::Ordering::Relaxed) {
if self.can_upload_layers() {
storage_sync::schedule_layer_upload(
self.tenant_id,
self.timeline_id,
@@ -1858,12 +1938,12 @@ impl Timeline {
///
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
/// whether a record is needed for PITR.
pub fn update_gc_info(
pub(super) fn update_gc_info(
&self,
retain_lsns: Vec<Lsn>,
cutoff_horizon: Lsn,
pitr: Duration,
) -> Result<()> {
) -> anyhow::Result<()> {
let mut gc_info = self.gc_info.write().unwrap();
gc_info.horizon_cutoff = cutoff_horizon;
@@ -1918,8 +1998,8 @@ impl Timeline {
/// within a layer file. We can only remove the whole file if it's fully
/// obsolete.
///
pub fn gc(&self) -> Result<GcResult> {
let mut result: GcResult = Default::default();
pub(super) fn gc(&self) -> anyhow::Result<GcResult> {
let mut result: GcResult = GcResult::default();
let now = SystemTime::now();
fail_point!("before-timeline-gc");
@@ -1962,6 +2042,9 @@ impl Timeline {
);
write_guard.store_and_unlock(new_gc_cutoff).wait();
}
// Persist the new GC cutoff value in the metadata file, before
// we actually remove anything.
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
info!("GC starting");
@@ -2087,7 +2170,16 @@ impl Timeline {
result.layers_removed += 1;
}
if self.upload_layers.load(atomic::Ordering::Relaxed) {
info!(
"GC completed removing {} layers, cutoff {}",
result.layers_removed, new_gc_cutoff
);
if result.layers_removed != 0 {
fail_point!("after-timeline-gc-removed-layers");
}
if self.can_upload_layers() {
storage_sync::schedule_layer_delete(
self.tenant_id,
self.timeline_id,
@@ -2176,6 +2268,11 @@ impl Timeline {
}
}
}
fn can_upload_layers(&self) -> bool {
self.upload_layers.load(atomic::Ordering::Relaxed)
&& self.current_state() != TimelineState::Broken
}
}
/// Helper function for get_reconstruct_data() to add the path of layers traversed
@@ -2226,11 +2323,11 @@ impl<'a> TimelineWriter<'a> {
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
self.tl.put_value(key, lsn, value)
}
pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
self.tl.put_tombstone(key_range, lsn)
}

View File

@@ -12,7 +12,7 @@ use tracing::*;
use remote_storage::GenericRemoteStorage;
use crate::config::{PageServerConf, METADATA_FILE_NAME};
use crate::config::{PageServerConf, METADATA_FILE_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
use crate::http::models::TenantInfo;
use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
@@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt;
use crate::walredo::PostgresRedoManager;
use crate::TEMP_FILE_SUFFIX;
use utils::crashsafe_dir::{self, path_with_suffix_extension};
use utils::crashsafe::{self, path_with_suffix_extension};
use utils::id::{TenantId, TimelineId};
mod tenants_state {
@@ -265,58 +265,98 @@ fn create_tenant_files(
temporary_tenant_dir.display()
);
let temporary_tenant_timelines_dir = rebase_directory(
&conf.timelines_path(&tenant_id),
&target_tenant_directory,
&temporary_tenant_dir,
)?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_id),
&target_tenant_directory,
&temporary_tenant_dir,
)?;
// top-level dir may exist if we are creating it through CLI
crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| {
crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
format!(
"could not create temporary tenant directory {}",
temporary_tenant_dir.display()
)
})?;
// first, create a config in the top-level temp directory, fsync the file
Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?;
// then, create a subdirectory in the top-level temp directory, fsynced
crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
let creation_result = try_create_target_tenant_dir(
conf,
tenant_conf,
tenant_id,
&temporary_tenant_dir,
&target_tenant_directory,
);
if creation_result.is_err() {
error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
} else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
error!(
"Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
)
}
}
creation_result
}
fn try_create_target_tenant_dir(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
temporary_tenant_dir: &Path,
target_tenant_directory: &Path,
) -> Result<(), anyhow::Error> {
let temporary_tenant_timelines_dir = rebase_directory(
&conf.timelines_path(&tenant_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?;
Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context(
|| {
format!(
"Failed to write tenant {} config to {}",
tenant_id,
temporary_tenant_config_path.display()
)
},
)?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
"could not create temporary tenant timelines directory {}",
"could not create tenant {} temporary timelines directory {}",
tenant_id,
temporary_tenant_timelines_dir.display()
)
})?;
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
});
// move-rename tmp directory with all files synced into a permanent directory, fsync its parent
fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| {
fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| {
format!(
"failed to move temporary tenant directory {} into the permanent one {}",
"failed to move tenant {} temporary directory {} into the permanent one {}",
tenant_id,
temporary_tenant_dir.display(),
target_tenant_directory.display()
)
})?;
let target_dir_parent = target_tenant_directory.parent().with_context(|| {
format!(
"Failed to get tenant dir parent for {}",
"Failed to get tenant {} dir parent for {}",
tenant_id,
target_tenant_directory.display()
)
})?;
fs::File::open(target_dir_parent)?.sync_all()?;
info!(
"created tenant directory structure in {}",
target_tenant_directory.display()
);
crashsafe::fsync(target_dir_parent).with_context(|| {
format!(
"Failed to fsync renamed directory's parent {} for tenant {}",
target_dir_parent.display(),
tenant_id,
)
})?;
Ok(())
}
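try_create_target_tenant_dir follows the usual crash-safe creation recipe: build everything under a temporary directory, fsync what was written, rename the temporary directory into place, and fsync the parent so the rename itself is durable. The final step, reduced to a std-only sketch (the real code goes through utils::crashsafe):
use std::{fs, io, path::Path};
// Make a fully prepared temporary directory visible under its final name and
// persist the rename by fsyncing the parent directory.
fn install_dir_durably(tmp: &Path, target: &Path) -> io::Result<()> {
    fs::rename(tmp, target)?; // atomic on POSIX filesystems
    let parent = target
        .parent()
        .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "target has no parent"))?;
    fs::File::open(parent)?.sync_all()?; // flush the new directory entry to disk
    Ok(())
}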
@@ -602,6 +642,15 @@ fn is_temporary(path: &Path) -> bool {
}
}
fn is_uninit_mark(path: &Path) -> bool {
match path.file_name() {
Some(name) => name
.to_string_lossy()
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
None => false,
}
}
fn collect_timelines_for_tenant(
config: &'static PageServerConf,
tenant_path: &Path,
@@ -644,28 +693,74 @@ fn collect_timelines_for_tenant(
e
);
}
} else if is_uninit_mark(&timeline_dir) {
let timeline_uninit_mark_file = &timeline_dir;
info!(
"Found an uninit mark file {}, removing the timeline and its uninit mark",
timeline_uninit_mark_file.display()
);
let timeline_id = timeline_uninit_mark_file
.file_stem()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
timeline_uninit_mark_file.display()
)
})?;
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
if let Err(e) =
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else {
match collect_timeline_files(&timeline_dir) {
Ok((timeline_id, metadata, timeline_files)) => {
tenant_timelines.insert(
timeline_id,
TimelineLocalFiles::collected(metadata, timeline_files),
);
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline dir name {}",
timeline_dir.display()
)
})?;
let timeline_uninit_mark_file =
config.timeline_uninit_mark_file_path(tenant_id, timeline_id);
if timeline_uninit_mark_file.exists() {
info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark");
if let Err(e) = remove_timeline_and_uninit_mark(
&timeline_dir,
&timeline_uninit_mark_file,
) {
error!("Failed to clean up uninit marked timeline: {e:?}");
}
Err(e) => {
error!(
"Failed to process timeline dir contents at '{}', reason: {:?}",
timeline_dir.display(),
e
);
match remove_if_empty(&timeline_dir) {
Ok(true) => info!(
"Removed empty timeline directory {}",
timeline_dir.display()
),
Ok(false) => (),
Err(e) => {
error!("Failed to remove empty timeline directory: {e:?}")
} else {
match collect_timeline_files(&timeline_dir) {
Ok((metadata, timeline_files)) => {
tenant_timelines.insert(
timeline_id,
TimelineLocalFiles::collected(metadata, timeline_files),
);
}
Err(e) => {
error!(
"Failed to process timeline dir contents at '{}', reason: {:?}",
timeline_dir.display(),
e
);
match remove_if_empty(&timeline_dir) {
Ok(true) => info!(
"Removed empty timeline directory {}",
timeline_dir.display()
),
Ok(false) => (),
Err(e) => {
error!("Failed to remove empty timeline directory: {e:?}")
}
}
}
}
@@ -688,24 +783,41 @@ fn collect_timelines_for_tenant(
Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
}
fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
fs::remove_dir_all(&timeline_dir)
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// we can leave the uninit mark without a timeline dir,
// just remove the mark then
Ok(())
} else {
Err(e)
}
})
.with_context(|| {
format!(
"Failed to remove unit marked timeline directory {}",
timeline_dir.display()
)
})?;
fs::remove_file(&uninit_mark).with_context(|| {
format!(
"Failed to remove timeline uninit mark file {}",
uninit_mark.display()
)
})?;
Ok(())
}
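The removal above tolerates a missing timeline directory, so a stray uninit mark can still be cleaned up on its own. The same "ignore NotFound" idiom in isolation, as a std-only sketch:
use std::{fs, io, path::Path};
// Remove a directory tree, treating "already gone" as success.
fn remove_dir_all_if_exists(dir: &Path) -> io::Result<()> {
    match fs::remove_dir_all(dir) {
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        other => other,
    }
}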
// discover timeline files and extract timeline metadata
// NOTE: ephemeral files are excluded from the list
fn collect_timeline_files(
timeline_dir: &Path,
) -> anyhow::Result<(
TimelineId,
TimelineMetadata,
HashMap<PathBuf, LayerFileMetadata>,
)> {
) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> {
let mut timeline_files = HashMap::new();
let mut timeline_metadata_path = None;
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.context("Could not parse timeline id out of the timeline dir name")?;
let timeline_dir_entries =
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
for entry in timeline_dir_entries {
@@ -754,5 +866,5 @@ fn collect_timeline_files(
"Timeline has no ancestor and no layer files"
);
Ok((timeline_id, metadata, timeline_files))
Ok((metadata, timeline_files))
}

View File

@@ -70,8 +70,10 @@ async fn compaction_loop(tenant_id: TenantId) {
// Run compaction
let mut sleep_duration = tenant.get_compaction_period();
if let Err(e) = tenant.compaction_iteration() {
error!("Compaction failed, retrying: {e:#}");
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
}
// Sleep
@@ -119,8 +121,10 @@ async fn gc_loop(tenant_id: TenantId) {
if gc_horizon > 0 {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
{
error!("Gc failed, retrying: {e:#}");
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
}
}
@@ -171,7 +175,7 @@ async fn wait_for_active_tenant(
}
state => {
debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}");
tokio::time::sleep(wait).await;
continue;
}
}
}

View File

@@ -319,6 +319,12 @@ impl VirtualFile {
Ok(result)
}
pub fn remove(self) {
let path = self.path.clone();
drop(self);
std::fs::remove_file(path).expect("failed to remove the virtual file");
}
}
impl Drop for VirtualFile {

View File

@@ -31,9 +31,10 @@ use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::tenant::Timeline;
use crate::walrecord::*;
use crate::ZERO_PAGE;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
@@ -43,8 +44,6 @@ use postgres_ffi::TransactionId;
use postgres_ffi::BLCKSZ;
use utils::lsn::Lsn;
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
pub struct WalIngest<'a> {
timeline: &'a Timeline,

View File

@@ -155,22 +155,19 @@ impl<E: Clone> TaskHandle<E> {
/// Aborts current task, waiting for it to finish.
pub async fn shutdown(self) {
match self.join_handle {
Some(jh) => {
self.cancellation.send(()).ok();
match jh.await {
Ok(Ok(())) => debug!("Shutdown success"),
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
Err(join_error) => {
if join_error.is_cancelled() {
error!("Shutdown task was cancelled");
} else {
error!("Shutdown task join error: {join_error}")
}
if let Some(jh) = self.join_handle {
self.cancellation.send(()).ok();
match jh.await {
Ok(Ok(())) => debug!("Shutdown success"),
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
Err(join_error) => {
if join_error.is_cancelled() {
error!("Shutdown task was cancelled");
} else {
error!("Shutdown task join error: {join_error}")
}
}
}
None => {}
}
}
}

View File

@@ -12,6 +12,7 @@
use std::{
collections::{hash_map, HashMap},
num::NonZeroU64,
ops::ControlFlow,
sync::Arc,
time::Duration,
};
@@ -26,7 +27,8 @@ use etcd_broker::{
subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
BrokerUpdate, Client,
};
use tokio::select;
use pageserver_api::models::TimelineState;
use tokio::{select, sync::watch};
use tracing::*;
use crate::{
@@ -47,7 +49,7 @@ pub fn spawn_connection_manager_task(
wal_connect_timeout: Duration,
lagging_wal_timeout: Duration,
max_lsn_wal_lag: NonZeroU64,
) -> anyhow::Result<()> {
) {
let mut etcd_client = get_etcd_client().clone();
let tenant_id = timeline.tenant_id;
@@ -58,10 +60,7 @@ pub fn spawn_connection_manager_task(
TaskKind::WalReceiverManager,
Some(tenant_id),
Some(timeline_id),
&format!(
"walreceiver for tenant {} timeline {}",
timeline.tenant_id, timeline.timeline_id
),
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
false,
async move {
info!("WAL receiver broker started, connecting to etcd");
@@ -75,27 +74,28 @@ pub fn spawn_connection_manager_task(
select! {
_ = task_mgr::shutdown_watcher() => {
info!("WAL receiver shutdown requested, shutting down");
// Kill current connection, if any
if let Some(wal_connection) = walreceiver_state.wal_connection.take()
{
wal_connection.connection_task.shutdown().await;
}
walreceiver_state.shutdown().await;
return Ok(());
},
_ = connection_manager_loop_step(
loop_step_result = connection_manager_loop_step(
&broker_loop_prefix,
&mut etcd_client,
&mut walreceiver_state,
) => {},
) => match loop_step_result {
ControlFlow::Continue(()) => continue,
ControlFlow::Break(()) => {
info!("Connection manager loop ended, shutting down");
walreceiver_state.shutdown().await;
return Ok(());
}
},
}
}
}
.instrument(
info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
),
);
Ok(())
}
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
@@ -105,7 +105,17 @@ async fn connection_manager_loop_step(
broker_prefix: &str,
etcd_client: &mut Client,
walreceiver_state: &mut WalreceiverState,
) {
) -> ControlFlow<(), ()> {
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
match wait_for_active_timeline(&mut timeline_state_updates).await {
ControlFlow::Continue(()) => {}
ControlFlow::Break(()) => {
info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
return ControlFlow::Break(());
}
}
let id = TenantTimelineId {
tenant_id: walreceiver_state.timeline.tenant_id,
timeline_id: walreceiver_state.timeline.timeline_id,
@@ -130,10 +140,12 @@ async fn connection_manager_loop_step(
// - change connection if the rules decide so, or if the current connection dies
// - receive updates from broker
// - this might change the current desired connection
// - timeline state changes to something that does not allow walreceiver to run concurrently
select! {
broker_connection_result = &mut broker_subscription.watcher_handle => {
info!("Broker connection was closed from the other side, ending current broker loop step");
cleanup_broker_connection(broker_connection_result, walreceiver_state);
return;
return ControlFlow::Continue(());
},
Some(wal_connection_update) = async {
@@ -186,11 +198,36 @@ async fn connection_manager_loop_step(
(&mut broker_subscription.watcher_handle).await,
walreceiver_state,
);
return;
return ControlFlow::Continue(());
}
}
},
new_event = async {
loop {
match timeline_state_updates.changed().await {
Ok(()) => {
let new_state = walreceiver_state.timeline.current_state();
match new_state {
// we're already active as walreceiver, no need to reactivate
TimelineState::Active => continue,
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state),
}
}
Err(_sender_dropped_error) => return ControlFlow::Break(()),
}
}
} => match new_event {
ControlFlow::Continue(new_state) => {
info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates");
return ControlFlow::Continue(());
}
ControlFlow::Break(()) => {
info!("Timeline dropped state updates sender, stopping wal connection manager loop");
return ControlFlow::Break(());
}
},
_ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
}
@@ -217,6 +254,34 @@ async fn connection_manager_loop_step(
}
}
async fn wait_for_active_timeline(
timeline_state_updates: &mut watch::Receiver<TimelineState>,
) -> ControlFlow<(), ()> {
let current_state = *timeline_state_updates.borrow();
if current_state == TimelineState::Active {
return ControlFlow::Continue(());
}
loop {
match timeline_state_updates.changed().await {
Ok(()) => {
let new_state = *timeline_state_updates.borrow();
match new_state {
TimelineState::Active => {
debug!("Timeline state changed to active, continuing the walreceiver connection manager");
return ControlFlow::Continue(());
}
state => {
debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}");
continue;
}
}
}
Err(_sender_dropped_error) => return ControlFlow::Break(()),
}
}
}
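wait_for_active_timeline is an instance of the standard tokio watch-channel wait: check the current value, then loop on changed() until the sender either publishes the wanted state or goes away. A generic sketch of that pattern, using a made-up State enum in place of TimelineState:
use std::ops::ControlFlow;
use tokio::sync::watch;
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum State {
    Idle,
    Active,
}
// Wait until the watched value becomes Active; Break means the sender was dropped.
async fn wait_until_active(rx: &mut watch::Receiver<State>) -> ControlFlow<(), ()> {
    if *rx.borrow() == State::Active {
        return ControlFlow::Continue(());
    }
    loop {
        match rx.changed().await {
            Ok(()) if *rx.borrow() == State::Active => return ControlFlow::Continue(()),
            Ok(()) => continue,
            Err(_sender_dropped) => return ControlFlow::Break(()),
        }
    }
}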
fn cleanup_broker_connection(
broker_connection_result: Result<Result<(), etcd_broker::BrokerError>, tokio::task::JoinError>,
walreceiver_state: &mut WalreceiverState,
@@ -724,6 +789,12 @@ impl WalreceiverState {
self.wal_connection_retries.remove(&node_id);
}
}
async fn shutdown(mut self) {
if let Some(wal_connection) = self.wal_connection.take() {
wal_connection.connection_task.shutdown().await;
}
}
}
#[derive(Debug, PartialEq, Eq)]
@@ -765,15 +836,20 @@ fn wal_stream_connection_string(
listen_pg_addr_str: &str,
) -> anyhow::Result<String> {
let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db");
let me_conf = sk_connstr
.parse::<postgres::config::Config>()
.with_context(|| {
format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one")
})?;
let (host, port) = utils::connstring::connection_host_port(&me_conf);
Ok(format!(
"host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
))
sk_connstr
.parse()
.context("bad url")
.and_then(|url: url::Url| {
let host = url.host_str().context("host is missing")?;
let port = url.port().unwrap_or(5432); // default PG port
Ok(format!(
"host={host} \
port={port} \
options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
))
})
.with_context(|| format!("Failed to parse pageserver connection URL '{sk_connstr}'"))
}
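The new version leans on the url crate instead of parsing a full postgres::config::Config just to recover host and port. The extraction in isolation (the address below is made up; 5432 is the usual Postgres default used as a fallback):
use anyhow::Context;
// Pull host and port out of a safekeeper URL of the form used above.
fn host_and_port(sk_connstr: &str) -> anyhow::Result<(String, u16)> {
    let url: url::Url = sk_connstr.parse().context("bad url")?;
    let host = url.host_str().context("host is missing")?.to_string();
    let port = url.port().unwrap_or(5432); // default Postgres port
    Ok((host, port))
}
// host_and_port("postgresql://no_user@safekeeper-1.local:6500/no_db")
// => ("safekeeper-1.local", 6500)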
#[cfg(test)]
@@ -802,6 +878,7 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
@@ -818,7 +895,9 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some("no commit_lsn".to_string()),
local_start_lsn: None,
safekeeper_connstr: Some("no_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -834,7 +913,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some("no commit_lsn".to_string()),
local_start_lsn: None,
safekeeper_connstr: Some("no_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -850,6 +930,7 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
@@ -909,6 +990,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
@@ -925,7 +1008,9 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some("not advanced Lsn".to_string()),
local_start_lsn: None,
safekeeper_connstr: Some("not_advanced_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -941,7 +1026,9 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some("not enough advanced Lsn".to_string()),
local_start_lsn: None,
safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -975,6 +1062,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
@@ -1007,7 +1096,9 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some("smaller commit_lsn".to_string()),
local_start_lsn: None,
safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -1023,6 +1114,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
@@ -1039,6 +1132,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
@@ -1084,6 +1179,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
@@ -1100,6 +1197,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
@@ -1169,6 +1268,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
@@ -1185,7 +1286,9 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()),
local_start_lsn: None,
safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -1209,7 +1312,7 @@ mod tests {
);
assert!(over_threshcurrent_candidate
.wal_source_connstr
.contains("advanced by Lsn safekeeper"));
.contains("advanced_by_lsn_safekeeper"));
Ok(())
}
@@ -1256,6 +1359,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
@@ -1327,6 +1432,8 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
@@ -1374,7 +1481,9 @@ mod tests {
timeline: harness
.load()
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION)
.expect("Failed to create an empty timeline for dummy wal connection manager"),
.expect("Failed to create an empty timeline for dummy wal connection manager")
.initialize()
.unwrap(),
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),

View File

@@ -10,7 +10,7 @@
//! process. Then we get the page image back. Communication with the
//! postgres process happens via stdin/stdout
//!
//! See src/backend/tcop/zenith_wal_redo.c for the other side of
//! See pgxn/neon_walredo/walredoproc.c for the other side of
//! this communication.
//!
//! The Postgres process is assumed to be secure against malicious WAL
@@ -35,17 +35,18 @@ use std::sync::Mutex;
use std::time::Duration;
use std::time::Instant;
use tracing::*;
use utils::crashsafe_dir::path_with_suffix_extension;
use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
use crate::metrics::{
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME,
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
WAL_REDO_WAIT_TIME,
};
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Key;
use crate::walrecord::NeonWalRecord;
use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
use postgres_ffi::v14::nonrelfile_utils::{
@@ -244,12 +245,23 @@ impl PostgresRedoManager {
let end_time = Instant::now();
let duration = end_time.duration_since(lock_time);
let len = records.len();
let nbytes = records.iter().fold(0, |accumulator, record| {
accumulator
+ match &record.1 {
NeonWalRecord::Postgres { rec, .. } => rec.len(),
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
}
});
WAL_REDO_TIME.observe(duration.as_secs_f64());
WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64);
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
debug!(
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
records.len(),
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
len,
nbytes,
duration.as_micros(),
lsn
);
@@ -258,8 +270,9 @@ impl PostgresRedoManager {
// next request will launch a new one.
if result.is_err() {
error!(
"error applying {} WAL records to reconstruct page image at LSN {}",
"error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
records.len(),
nbytes,
lsn
);
let process = process_guard.take().unwrap();
@@ -597,13 +610,26 @@ impl PostgresRedoProcess {
);
fs::remove_dir_all(&datadir)?;
}
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).map_err(|e| {
Error::new(
ErrorKind::Other,
format!("incorrect pg_bin_dir path: {}", e),
)
})?;
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).map_err(|e| {
Error::new(
ErrorKind::Other,
format!("incorrect pg_lib_dir path: {}", e),
)
})?;
info!("running initdb in {}", datadir.display());
let initdb = Command::new(conf.pg_bin_dir(pg_version).join("initdb"))
let initdb = Command::new(pg_bin_dir_path.join("initdb"))
.args(&["-D", &datadir.to_string_lossy()])
.arg("-N")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS
.close_fds()
.output()
.map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?;
@@ -618,38 +644,33 @@ impl PostgresRedoProcess {
),
));
} else {
// Limit shared cache for wal-redo-postres
// Limit shared cache for wal-redo-postgres
let mut config = OpenOptions::new()
.append(true)
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
config.write_all(b"shared_buffers=128kB\n")?;
config.write_all(b"fsync=off\n")?;
config.write_all(b"shared_preload_libraries=neon\n")?;
config.write_all(b"neon.wal_redo=on\n")?;
}
// Start postgres itself
let mut child = Command::new(conf.pg_bin_dir(pg_version).join("postgres"))
let mut child = Command::new(pg_bin_dir_path.join("postgres"))
.arg("--wal-redo")
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
.env("PGDATA", &datadir)
// The redo process is not trusted, so it runs in seccomp mode
// (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't
// inherit any file descriptors from the pageserver that would allow
// an attacker to do bad things.
// The redo process is not trusted, and runs in seccomp mode that
// doesn't allow it to open any files. We have to also make sure it
// doesn't inherit any file descriptors from the pageserver, that
// would allow an attacker to read any files that happen to be open
// in the pageserver.
//
// The Rust standard library makes sure to mark any file descriptors
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
//
// One example is the pidfile of the daemonize library, which doesn't
// currently mark file descriptors as close-on-exec. Either way, we
// want to be on the safe side and prevent accidental regression.
.close_fds()
.spawn()
.map_err(|e| {
@@ -818,7 +839,7 @@ impl PostgresRedoProcess {
}
// Functions for constructing messages to send to the postgres WAL redo
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
// process. See pgxn/neon_walredo/walredoproc.c for
// explanation of the protocol.
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {

View File

@@ -4,7 +4,6 @@
MODULE_big = neon
OBJS = \
$(WIN32RES) \
inmem_smgr.o \
libpagestore.o \
libpqwalproposer.o \
pagestore_smgr.o \

View File

@@ -419,15 +419,6 @@ pg_init_libpagestore(void)
0, /* no flags required */
check_neon_id, NULL, NULL);
DefineCustomBoolVariable("neon.wal_redo",
"start in wal-redo mode",
NULL,
&wal_redo,
false,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.max_cluster_size",
"cluster size limit",
NULL,
@@ -452,13 +443,7 @@ pg_init_libpagestore(void)
neon_timeline_walproposer = neon_timeline;
neon_tenant_walproposer = neon_tenant;
if (wal_redo)
{
neon_log(PageStoreTrace, "set inmem_smgr hook");
smgr_hook = smgr_inmem;
smgr_init_hook = smgr_init_inmem;
}
else if (page_server_connstring && page_server_connstring[0])
if (page_server_connstring && page_server_connstring[0])
{
neon_log(PageStoreTrace, "set neon_smgr hook");
smgr_hook = smgr_neon;

View File

@@ -10,51 +10,12 @@ struct WalProposerConn
PGconn *pg_conn;
bool is_nonblocking; /* whether the connection is non-blocking */
char *recvbuf; /* last received data from
* libpqprop_async_read */
* walprop_async_read */
};
/* Prototypes for exported functions */
static char *libpqprop_error_message(WalProposerConn * conn);
static WalProposerConnStatusType libpqprop_status(WalProposerConn * conn);
static WalProposerConn * libpqprop_connect_start(char *conninfo);
static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn * conn);
static bool libpqprop_send_query(WalProposerConn * conn, char *query);
static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn * conn);
static pgsocket libpqprop_socket(WalProposerConn * conn);
static int libpqprop_flush(WalProposerConn * conn);
static void libpqprop_finish(WalProposerConn * conn);
static PGAsyncReadResult libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount);
static PGAsyncWriteResult libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size);
static bool libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size);
static WalProposerFunctionsType PQWalProposerFunctions =
{
libpqprop_error_message,
libpqprop_status,
libpqprop_connect_start,
libpqprop_connect_poll,
libpqprop_send_query,
libpqprop_get_query_result,
libpqprop_socket,
libpqprop_flush,
libpqprop_finish,
libpqprop_async_read,
libpqprop_async_write,
libpqprop_blocking_write,
};
/* Module initialization */
void
pg_init_libpqwalproposer(void)
{
if (WalProposerFunctions != NULL)
elog(ERROR, "libpqwalproposer already loaded");
WalProposerFunctions = &PQWalProposerFunctions;
}
/* Helper function */
static bool
ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking)
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
{
/* If we're already correctly blocking or nonblocking, all good */
if (is_nonblocking == conn->is_nonblocking)
@@ -69,14 +30,14 @@ ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking)
}
/* Exported function definitions */
static char *
libpqprop_error_message(WalProposerConn * conn)
char *
walprop_error_message(WalProposerConn *conn)
{
return PQerrorMessage(conn->pg_conn);
}
static WalProposerConnStatusType
libpqprop_status(WalProposerConn * conn)
WalProposerConnStatusType
walprop_status(WalProposerConn *conn)
{
switch (PQstatus(conn->pg_conn))
{
@@ -89,8 +50,8 @@ libpqprop_status(WalProposerConn * conn)
}
}
static WalProposerConn *
libpqprop_connect_start(char *conninfo)
WalProposerConn *
walprop_connect_start(char *conninfo)
{
WalProposerConn *conn;
PGconn *pg_conn;
@@ -119,8 +80,8 @@ libpqprop_connect_start(char *conninfo)
return conn;
}
static WalProposerConnectPollStatusType
libpqprop_connect_poll(WalProposerConn * conn)
WalProposerConnectPollStatusType
walprop_connect_poll(WalProposerConn *conn)
{
WalProposerConnectPollStatusType return_val;
@@ -160,8 +121,8 @@ libpqprop_connect_poll(WalProposerConn * conn)
return return_val;
}
static bool
libpqprop_send_query(WalProposerConn * conn, char *query)
bool
walprop_send_query(WalProposerConn *conn, char *query)
{
/*
* We need to be in blocking mode for sending the query to run without
@@ -177,8 +138,8 @@ libpqprop_send_query(WalProposerConn * conn, char *query)
return true;
}
static WalProposerExecStatusType
libpqprop_get_query_result(WalProposerConn * conn)
WalProposerExecStatusType
walprop_get_query_result(WalProposerConn *conn)
{
PGresult *result;
WalProposerExecStatusType return_val;
@@ -255,20 +216,20 @@ libpqprop_get_query_result(WalProposerConn * conn)
return return_val;
}
static pgsocket
libpqprop_socket(WalProposerConn * conn)
pgsocket
walprop_socket(WalProposerConn *conn)
{
return PQsocket(conn->pg_conn);
}
static int
libpqprop_flush(WalProposerConn * conn)
int
walprop_flush(WalProposerConn *conn)
{
return (PQflush(conn->pg_conn));
}
static void
libpqprop_finish(WalProposerConn * conn)
void
walprop_finish(WalProposerConn *conn)
{
if (conn->recvbuf != NULL)
PQfreemem(conn->recvbuf);
@@ -282,8 +243,8 @@ libpqprop_finish(WalProposerConn * conn)
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*/
static PGAsyncReadResult
libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount)
PGAsyncReadResult
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
{
int result;
@@ -353,8 +314,8 @@ libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount)
}
}
static PGAsyncWriteResult
libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size)
PGAsyncWriteResult
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
{
int result;
@@ -408,8 +369,12 @@ libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size)
}
}
static bool
libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size)
/*
* This function is very similar to walprop_async_write. For more
* information, refer to the comments there.
*/
bool
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
{
int result;
@@ -417,10 +382,6 @@ libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size)
if (!ensure_nonblocking_status(conn, false))
return false;
/*
* Ths function is very similar to libpqprop_async_write. For more
* information, refer to the comments there
*/
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
return false;

View File

@@ -32,7 +32,6 @@ void
_PG_init(void)
{
pg_init_libpagestore();
pg_init_libpqwalproposer();
pg_init_walproposer();
EmitWarningsOnPlaceholders("neon");

View File

@@ -13,7 +13,6 @@
#define NEON_H
extern void pg_init_libpagestore(void);
extern void pg_init_libpqwalproposer(void);
extern void pg_init_walproposer(void);
#endif /* NEON_H */

View File

@@ -155,10 +155,6 @@ extern int32 max_cluster_size;
extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
extern void smgr_init_neon(void);
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
extern void smgr_init_inmem(void);
extern void smgr_shutdown_inmem(void);
/* Neon storage manager functionality */
extern void neon_init(void);
@@ -188,29 +184,6 @@ extern void neon_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);
/* neon wal-redo storage manager functionality */
extern void inmem_init(void);
extern void inmem_open(SMgrRelation reln);
extern void inmem_close(SMgrRelation reln, ForkNumber forknum);
extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void inmem_extend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern void inmem_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
/* utils for neon relsize cache */
extern void relsize_hash_init(void);
extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size);

View File

@@ -99,7 +99,6 @@ char *page_server_connstring;
/*with substituted password*/
char *neon_timeline;
char *neon_tenant;
bool wal_redo = false;
int32 max_cluster_size;
/* unlogged relation build states */

View File

@@ -43,6 +43,7 @@
#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif
#include "storage/fd.h"
#include "storage/latch.h"
#include "miscadmin.h"
#include "pgstat.h"
@@ -69,7 +70,8 @@
#include "neon.h"
#include "walproposer.h"
#include "walproposer_utils.h"
#include "replication/walpropshim.h"
static bool syncSafekeepers = false;
char *wal_acceptors_list;
int wal_acceptor_reconnect_timeout;
@@ -79,9 +81,6 @@ bool am_wal_proposer;
char *neon_timeline_walproposer = NULL;
char *neon_tenant_walproposer = NULL;
/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */
WalProposerFunctionsType *WalProposerFunctions = NULL;
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
static int n_safekeepers = 0;
@@ -120,8 +119,8 @@ static TimestampTz last_reconnect_attempt;
static WalproposerShmemState * walprop_shared;
/* Prototypes for private functions */
static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId);
static void WalProposerStartImpl(void);
static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
static void WalProposerStart(void);
static void WalProposerLoop(void);
static void InitEventSet(void);
static void UpdateEventSet(Safekeeper *sk, uint32 events);
@@ -189,9 +188,56 @@ pg_init_walproposer(void)
ProcessInterruptsCallback = backpressure_throttling_impl;
WalProposerRegister();
}
WalProposerInit = &WalProposerInitImpl;
WalProposerStart = &WalProposerStartImpl;
/*
* Entry point for `postgres --sync-safekeepers`.
*/
void
WalProposerSync(int argc, char *argv[])
{
struct stat stat_buf;
syncSafekeepers = true;
#if PG_VERSION_NUM < 150000
ThisTimeLineID = 1;
#endif
/*
* Initialize postmaster_alive_fds as WaitEventSet checks them.
*
* Copied from InitPostmasterDeathWatchHandle()
*/
if (pipe(postmaster_alive_fds) < 0)
ereport(FATAL,
(errcode_for_file_access(),
errmsg_internal("could not create pipe to monitor postmaster death: %m")));
if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1)
ereport(FATAL,
(errcode_for_socket_access(),
errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m")));
ChangeToDataDir();
/* Create pg_wal directory, if it doesn't exist */
if (stat(XLOGDIR, &stat_buf) != 0)
{
ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR)));
if (MakePGDirectory(XLOGDIR) < 0)
{
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create directory \"%s\": %m",
XLOGDIR)));
exit(1);
}
}
WalProposerInit(0, 0);
BackgroundWorkerUnblockSignals();
WalProposerStart();
}
static void
@@ -432,16 +478,12 @@ WalProposerRegister(void)
}
static void
WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
{
char *host;
char *sep;
char *port;
/* Load the libpq-specific functions */
if (WalProposerFunctions == NULL)
elog(ERROR, "libpqwalproposer didn't initialize correctly");
load_file("libpqwalreceiver", false);
if (WalReceiverFunctions == NULL)
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
@@ -515,7 +557,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
}
static void
WalProposerStartImpl(void)
WalProposerStart(void)
{
/* Initiate connections to all safekeeper nodes */

View File

@@ -446,31 +446,31 @@ typedef enum
} WalProposerConnStatusType;
/* Re-exported PQerrorMessage */
typedef char *(*walprop_error_message_fn) (WalProposerConn * conn);
extern char *walprop_error_message(WalProposerConn *conn);
/* Re-exported PQstatus */
typedef WalProposerConnStatusType(*walprop_status_fn) (WalProposerConn * conn);
extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
/* Re-exported PQconnectStart */
typedef WalProposerConn * (*walprop_connect_start_fn) (char *conninfo);
extern WalProposerConn * walprop_connect_start(char *conninfo);
/* Re-exported PQconectPoll */
typedef WalProposerConnectPollStatusType(*walprop_connect_poll_fn) (WalProposerConn * conn);
extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
/* Blocking wrapper around PQsendQuery */
typedef bool (*walprop_send_query_fn) (WalProposerConn * conn, char *query);
extern bool walprop_send_query(WalProposerConn *conn, char *query);
/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
typedef WalProposerExecStatusType(*walprop_get_query_result_fn) (WalProposerConn * conn);
extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn);
/* Re-exported PQsocket */
typedef pgsocket (*walprop_socket_fn) (WalProposerConn * conn);
extern pgsocket walprop_socket(WalProposerConn *conn);
/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
typedef int (*walprop_flush_fn) (WalProposerConn * conn);
extern int walprop_flush(WalProposerConn *conn);
/* Re-exported PQfinish */
typedef void (*walprop_finish_fn) (WalProposerConn * conn);
extern void walprop_finish(WalProposerConn *conn);
/*
* Ergonomic wrapper around PGgetCopyData
@@ -486,9 +486,7 @@ typedef void (*walprop_finish_fn) (WalProposerConn * conn);
* performs a bit of extra checking work that's always required and is normally
* somewhat verbose.
*/
typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn,
char **buf,
int *amount);
extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount);
/*
* Ergonomic wrapper around PQputCopyData + PQflush
@@ -497,69 +495,14 @@ typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn,
*
* For information on the meaning of return codes, refer to PGAsyncWriteResult.
*/
typedef PGAsyncWriteResult(*walprop_async_write_fn) (WalProposerConn * conn,
void const *buf,
size_t size);
extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size);
/*
* Blocking equivalent to walprop_async_write_fn
*
* Returns 'true' if successful, 'false' on failure.
*/
typedef bool (*walprop_blocking_write_fn) (WalProposerConn * conn, void const *buf, size_t size);
/* All libpqwalproposer exported functions collected together. */
typedef struct WalProposerFunctionsType
{
walprop_error_message_fn walprop_error_message;
walprop_status_fn walprop_status;
walprop_connect_start_fn walprop_connect_start;
walprop_connect_poll_fn walprop_connect_poll;
walprop_send_query_fn walprop_send_query;
walprop_get_query_result_fn walprop_get_query_result;
walprop_socket_fn walprop_socket;
walprop_flush_fn walprop_flush;
walprop_finish_fn walprop_finish;
walprop_async_read_fn walprop_async_read;
walprop_async_write_fn walprop_async_write;
walprop_blocking_write_fn walprop_blocking_write;
} WalProposerFunctionsType;
/* Allow the above functions to be "called" with normal syntax */
#define walprop_error_message(conn) \
WalProposerFunctions->walprop_error_message(conn)
#define walprop_status(conn) \
WalProposerFunctions->walprop_status(conn)
#define walprop_connect_start(conninfo) \
WalProposerFunctions->walprop_connect_start(conninfo)
#define walprop_connect_poll(conn) \
WalProposerFunctions->walprop_connect_poll(conn)
#define walprop_send_query(conn, query) \
WalProposerFunctions->walprop_send_query(conn, query)
#define walprop_get_query_result(conn) \
WalProposerFunctions->walprop_get_query_result(conn)
#define walprop_set_nonblocking(conn, arg) \
WalProposerFunctions->walprop_set_nonblocking(conn, arg)
#define walprop_socket(conn) \
WalProposerFunctions->walprop_socket(conn)
#define walprop_flush(conn) \
WalProposerFunctions->walprop_flush(conn)
#define walprop_finish(conn) \
WalProposerFunctions->walprop_finish(conn)
#define walprop_async_read(conn, buf, amount) \
WalProposerFunctions->walprop_async_read(conn, buf, amount)
#define walprop_async_write(conn, buf, size) \
WalProposerFunctions->walprop_async_write(conn, buf, size)
#define walprop_blocking_write(conn, buf, size) \
WalProposerFunctions->walprop_blocking_write(conn, buf, size)
/*
* The runtime location of the libpqwalproposer functions.
*
* This pointer is set by the initializer in libpqwalproposer, so that we
* can use it later.
*/
extern PGDLLIMPORT WalProposerFunctionsType * WalProposerFunctions;
extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size);
extern uint64 BackpressureThrottlingTime(void);
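The hunk above drops the WalProposerFunctionsType function-pointer table and its call-forwarding macros in favor of plain extern declarations, so walproposer code now calls the libpq wrappers directly instead of indirecting through WalProposerFunctions. A minimal sketch of what a call site looks like after the change; the helper below is hypothetical, assumes postgres.h and this header are included, and only uses functions actually declared above:

/*
 * Before: status = WalProposerFunctions->walprop_status(conn);  (via macro)
 * After:  status = walprop_status(conn);                        (direct call)
 *
 * Hypothetical helper, for illustration only.
 */
static void
report_safekeeper_error(WalProposerConn *conn)
{
	/* walprop_error_message re-exports PQerrorMessage for this connection */
	elog(WARNING, "safekeeper connection failed: %s",
		 walprop_error_message(conn));
	/* walprop_finish re-exports PQfinish */
	walprop_finish(conn);
}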


@@ -0,0 +1,22 @@
# pgxn/neon_walredo/Makefile
MODULE_big = neon_walredo
OBJS = \
$(WIN32RES) \
inmem_smgr.o \
walredoproc.o \
# This really should be guarded by $(with_libseccomp), but I couldn't
# make that work with pgxs. So we always compile it, but its contents
# are wrapped in #ifdef HAVE_LIBSECCOMP instead.
OBJS += seccomp.o
PGFILEDESC = "neon_walredo - helper process that runs in Neon pageserver"
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
ifeq ($(with_libseccomp),yes)
SHLIB_LINK += -lseccomp
endif


@@ -3,9 +3,8 @@
* inmem_smgr.c
*
* This is an implementation of the SMGR interface, used in the WAL redo
* process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
* storage, the pages that are written out are kept in a small number of
* in-memory buffers.
* process. It has no persistent storage, the pages that are written out
* are kept in a small number of in-memory buffers.
*
* Normally, replaying a WAL record only needs to access a handful of
* buffers, which fit in the normal buffer cache, so this is just for
@@ -15,15 +14,11 @@
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* contrib/neon/inmem_smgr.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/xlog.h"
#include "pagestore_client.h"
#include "storage/block.h"
#include "storage/buf_internals.h"
#include "storage/relfilenode.h"
@@ -33,6 +28,8 @@
#include "access/xlogutils.h"
#endif
#include "inmem_smgr.h"
/* Size of the in-memory smgr */
#define MAX_PAGES 64
@@ -59,10 +56,34 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
return -1;
}
/* neon wal-redo storage manager functionality */
static void inmem_init(void);
static void inmem_open(SMgrRelation reln);
static void inmem_close(SMgrRelation reln, ForkNumber forknum);
static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
static void inmem_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
/*
* inmem_init() -- Initialize private state
*/
void
static void
inmem_init(void)
{
used_pages = 0;
@@ -71,7 +92,7 @@ inmem_init(void)
/*
* inmem_exists() -- Does the physical file exist?
*/
bool
static bool
inmem_exists(SMgrRelation reln, ForkNumber forknum)
{
for (int i = 0; i < used_pages; i++)
@@ -90,7 +111,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
*
* If isRedo is true, it's okay for the relation to exist already.
*/
void
static void
inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
{
}
@@ -98,7 +119,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
/*
* inmem_unlink() -- Unlink a relation.
*/
void
static void
inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
{
}
@@ -112,7 +133,7 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
* EOF). Note that we assume writing a block beyond current EOF
* causes intervening file space to become filled with zeroes.
*/
void
static void
inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *buffer, bool skipFsync)
{
@@ -123,7 +144,7 @@ inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
/*
* inmem_open() -- Initialize newly-opened relation.
*/
void
static void
inmem_open(SMgrRelation reln)
{
}
@@ -131,7 +152,7 @@ inmem_open(SMgrRelation reln)
/*
* inmem_close() -- Close the specified relation, if it isn't closed already.
*/
void
static void
inmem_close(SMgrRelation reln, ForkNumber forknum)
{
}
@@ -139,7 +160,7 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
/*
* inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
*/
bool
static bool
inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
return true;
@@ -148,7 +169,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
/*
* inmem_writeback() -- Tell the kernel to write pages back to storage.
*/
void
static void
inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks)
{
@@ -157,7 +178,7 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
/*
* inmem_read() -- Read the specified block from a relation.
*/
void
static void
inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *buffer)
{
@@ -177,7 +198,7 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
* relation (ie, those before the current EOF). To extend a relation,
* use mdextend().
*/
void
static void
inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
@@ -224,7 +245,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
/*
* inmem_nblocks() -- Get the number of blocks stored in a relation.
*/
BlockNumber
static BlockNumber
inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
{
/*
@@ -243,7 +264,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
/*
* inmem_truncate() -- Truncate relation to specified number of blocks.
*/
void
static void
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
}
@@ -251,7 +272,7 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
/*
* inmem_immedsync() -- Immediately sync a relation to stable storage.
*/
void
static void
inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
{
}
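With the inmem_* functions made static in the hunks above, they are presumably exposed only through the f_smgr callback table returned by smgr_inmem() (declared in inmem_smgr.h below). A sketch of that wiring, assuming the field names mirror PostgreSQL's internal f_smgr struct as exposed by the patched storage/smgr.h; the actual table in inmem_smgr.c may differ:

/* Sketch only: the field names are assumed, not copied from the diff. */
static const f_smgr inmem_smgr_table = {
	.smgr_init = inmem_init,
	.smgr_open = inmem_open,
	.smgr_close = inmem_close,
	.smgr_create = inmem_create,
	.smgr_exists = inmem_exists,
	.smgr_unlink = inmem_unlink,
	.smgr_extend = inmem_extend,
	.smgr_prefetch = inmem_prefetch,
	.smgr_read = inmem_read,
	.smgr_write = inmem_write,
	.smgr_writeback = inmem_writeback,
	.smgr_nblocks = inmem_nblocks,
	.smgr_truncate = inmem_truncate,
	.smgr_immedsync = inmem_immedsync,
};

const f_smgr *
smgr_inmem(BackendId backend, RelFileNode rnode)
{
	/* One shared in-memory smgr; the arguments are ignored here. */
	return &inmem_smgr_table;
}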


@@ -0,0 +1,17 @@
/*-------------------------------------------------------------------------
*
* inmem_smgr.h
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef INMEM_SMGR_H
#define INMEM_SMGR_H
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
extern void smgr_init_inmem(void);
#endif /* INMEM_SMGR_H */


@@ -0,0 +1,22 @@
#ifndef NEON_SECCOMP_H
#define NEON_SECCOMP_H
#include <seccomp.h>
typedef struct {
int psr_syscall; /* syscall number */
uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */
} PgSeccompRule;
#define PG_SCMP(syscall, action) \
(PgSeccompRule) { \
.psr_syscall = SCMP_SYS(syscall), \
.psr_action = (action), \
}
#define PG_SCMP_ALLOW(syscall) \
PG_SCMP(syscall, SCMP_ACT_ALLOW)
extern void seccomp_load_rules(PgSeccompRule *syscalls, int count);
#endif /* NEON_SECCOMP_H */
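A minimal illustration of the API above, with a tiny hypothetical allowlist; the real allowlist is built in enter_seccomp_mode() in walredoproc.c further down. Assumes a HAVE_LIBSECCOMP build:

#include "postgres.h"		/* for lengthof() */
#include "neon_seccomp.h"

static void
lock_down_example(void)
{
	PgSeccompRule rules[] = {
		PG_SCMP_ALLOW(read),		/* stdin */
		PG_SCMP_ALLOW(write),		/* stdout/stderr */
		PG_SCMP_ALLOW(exit_group),	/* allow the process to exit */
	};

	/* After this call, any syscall not listed above raises SIGSYS */
	seccomp_load_rules(rules, lengthof(rules));
}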

pgxn/neon_walredo/seccomp.c

@@ -0,0 +1,257 @@
/*-------------------------------------------------------------------------
*
* seccomp.c
* Secure Computing BPF API wrapper.
*
* Pageserver delegates complex WAL decoding duties to postgres,
* which means that the latter might fall victim to carefully designed
* malicious WAL records and start doing harmful things to the system.
* To prevent this, it has been decided to limit possible interactions
* with the outside world using the Secure Computing BPF mode.
*
* We use this mode to disable all syscalls not in the allowlist. This
* approach has its pros & cons:
*
* - We have to carefully handpick and maintain the set of syscalls
* required for the WAL redo process. Core dumps help with that.
* The method of trial and error seems to work reasonably well,
* but it would be nice to find a proper way to "prove" that
* the set in question is both necessary and sufficient.
*
* - Once we enter the seccomp bpf mode, it's impossible to lift those
* restrictions (otherwise, what kind of "protection" would that be?).
* Thus, we have to either enable extra syscalls for the clean shutdown,
* or exit the process immediately via _exit() instead of proc_exit().
*
* - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom
* facility to deal with the forbidden syscalls? If we'd like to embed
* a startup security test, we should go with the latter; in that
* case, which one of the following options is preferable?
*
* * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP.
* Provide a common signal handler with a static switch to override
* its behavior for the test case. This would undermine the whole
* purpose of such protection, so we'd have to go further and remap
* the memory backing the switch as readonly, then ban mprotect().
* Ugly and fragile, to say the least.
*
* * Yet again, catch the denied syscalls using SCMP_ACT_TRAP.
* Provide 2 different signal handlers: one for a test case,
* another for the main processing loop. Install the first one,
* enable seccomp, perform the test, switch to the second one,
* finally ban sigaction(), presto!
*
* * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the
* test, then ban it altogether with another filter. The downside
* of this solution is that we don't actually check that
* SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works.
*
* Either approach seems to require two eBPF filter programs,
* which is unfortunate: the man page says this is uncommon.
* Maybe I (@funbringer) am missing something, though; I encourage
* any reader to get familiar with it and scrutinize my conclusions.
*
* TODOs and ideas in no particular order:
*
* - Do something about mmap() in musl's malloc().
* Definitely not a priority if we don't care about musl.
*
* - See if we can untangle PG's shutdown sequence (involving unlink()):
*
* * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode.
* * Investigate chroot() or mount namespaces for better FS isolation.
* * (Per Heikki) Simply call _exit(), no big deal.
* * Come up with a better idea?
*
* - Make use of seccomp's argument inspection (for what?).
* Unfortunately, it views all syscall arguments as scalars,
* so it won't work for e.g. string comparison in unlink().
*
* - Benchmark with bpf jit on/off, try seccomp_syscall_priority().
*
* - Test against various linux distros & glibc versions.
* I suspect that certain libc functions might involve slightly
* different syscalls, e.g. select/pselect6/pselect6_time64/whatever.
*
* - Test on any arch other than amd64 to see if it works there.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
/*
* I couldn't find a good way to do a conditional OBJS += seccomp.o in
* the Makefile, so this file is compiled even when seccomp is disabled;
* it's just empty in that case.
*/
#ifdef HAVE_LIBSECCOMP
#include <fcntl.h>
#include <unistd.h>
#include "miscadmin.h"
#include "neon_seccomp.h"
static void die(int code, const char *str);
static bool seccomp_test_sighandler_done = false;
static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt);
static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt);
static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action);
void
seccomp_load_rules(PgSeccompRule *rules, int count)
{
struct sigaction action = { .sa_flags = SA_SIGINFO };
PgSeccompRule rule;
long fd;
/*
* Install a test signal handler.
* XXX: pqsignal() is too restrictive for our purposes,
* since we'd like to examine the contents of siginfo_t.
*/
action.sa_sigaction = seccomp_test_sighandler;
if (sigaction(SIGSYS, &action, NULL) != 0)
ereport(FATAL,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("seccomp: could not install test SIGSYS handler")));
/*
* First, check that open of a well-known file works.
* XXX: We use a raw syscall() to invoke open() directly.
*/
fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0);
if (seccomp_test_sighandler_done)
ereport(FATAL,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("seccomp: signal handler test flag was set unexpectedly")));
if (fd < 0)
ereport(FATAL,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("seccomp: could not open /dev/null for seccomp testing: %m")));
close((int) fd);
/* Set a trap on open() to test seccomp bpf */
rule = PG_SCMP(open, SCMP_ACT_TRAP);
if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0)
ereport(FATAL,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("seccomp: could not load test trap")));
/* Finally, check that open() now raises SIGSYS */
(void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0);
if (!seccomp_test_sighandler_done)
ereport(FATAL,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("seccomp: SIGSYS handler doesn't seem to work")));
/* Now that everything seems to work, install a proper handler */
action.sa_sigaction = seccomp_deny_sighandler;
if (sigaction(SIGSYS, &action, NULL) != 0)
ereport(FATAL,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("seccomp: could not install SIGSYS handler")));
/* If this succeeds, any syscall not in the list will crash the process */
if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0)
ereport(FATAL,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("seccomp: could not enter seccomp mode")));
}
/*
* Enter seccomp mode with a BPF filter that will only allow
* certain syscalls to proceed.
*/
static int
do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action)
{
scmp_filter_ctx ctx;
int rc = -1;
/* Create a context with a default action for syscalls not in the list */
if ((ctx = seccomp_init(def_action)) == NULL)
goto cleanup;
for (int i = 0; i < count; i++)
{
PgSeccompRule *rule = &rules[i];
if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0)
goto cleanup;
}
/* Try building & loading the program into the kernel */
if ((rc = seccomp_load(ctx)) != 0)
goto cleanup;
cleanup:
/*
* We don't need the context anymore regardless of the result,
* since either we failed or the eBPF program has already been
* loaded into the linux kernel.
*/
seccomp_release(ctx);
return rc;
}
static void
die(int code, const char *str)
{
/* work around gcc ignoring that it shouldn't warn on (void) result being unused */
ssize_t _unused pg_attribute_unused();
/* Best effort write to stderr */
_unused = write(fileno(stderr), str, strlen(str));
/* XXX: we don't want to run any atexit callbacks */
_exit(code);
}
static void
seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused())
{
#define DIE_PREFIX "seccomp test signal handler: "
/* Check that this signal handler is used only for a single test case */
if (seccomp_test_sighandler_done)
die(1, DIE_PREFIX "test handler should only be used for 1 test\n");
seccomp_test_sighandler_done = true;
if (signum != SIGSYS)
die(1, DIE_PREFIX "bad signal number\n");
/* TODO: maybe somehow extract the hardcoded syscall number */
if (info->si_syscall != SCMP_SYS(open))
die(1, DIE_PREFIX "bad syscall number\n");
#undef DIE_PREFIX
}
static void
seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused())
{
/*
* Unfortunately, we can't use seccomp_syscall_resolve_num_arch()
* to resolve the syscall's name, since it calls strdup()
* under the hood (wtf!).
*/
char buffer[128];
(void)snprintf(buffer, lengthof(buffer),
"---------------------------------------\n"
"seccomp: bad syscall %d\n"
"---------------------------------------\n",
info->si_syscall);
/*
* Instead of silently crashing the process with
* a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS,
* we'd like to receive a real SIGSYS to print the
* message and *then* immediately exit.
*/
die(1, buffer);
}
#endif /* HAVE_LIBSECCOMP */


@@ -0,0 +1,847 @@
/*-------------------------------------------------------------------------
*
* walredoproc.c
* Entry point for WAL redo helper
*
*
* This file contains an alternative main() function for the 'postgres'
* binary. In the WAL redo mode, we go into a special mode that's similar
* to the single user mode. We don't launch postmaster or any auxiliary
* processes. Instead, we wait for commands on 'stdin', and respond on
* 'stdout'.
*
* The protocol through stdin/stdout is loosely based on the libpq protocol.
* The process accepts messages through stdin, and each message has the format:
*
* char msgtype;
* int32 length; // length of message including 'length' but excluding
* // 'msgtype', in network byte order
* <payload>
*
* There are four message types:
*
* BeginRedoForBlock ('B'): Prepare for WAL replay for given block
* PushPage ('P'): Copy a page image (in the payload) to buffer cache
* ApplyRecord ('A'): Apply a WAL record (in the payload)
* GetPage ('G'): Return a page image from buffer cache.
*
* Currently, you only get a response to GetPage requests; the response is
* simply an 8k page, without any headers. Errors are logged to stderr.
*
* FIXME:
* - this currently requires a valid PGDATA, and creates a lock file there
* like a normal postmaster. There's no fundamental reason for that, though.
* - should have EndRedoForBlock, and flush page cache, to allow using this
* mechanism for more than one block without restarting the process.
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <unistd.h>
#include <sys/socket.h>
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/time.h>
#include <sys/resource.h>
#endif
#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__)
#define MALLOC_NO_MMAP
#include <malloc.h>
#endif
#ifndef HAVE_GETRUSAGE
#include "rusagestub.h"
#endif
#include "access/xlog.h"
#include "access/xlog_internal.h"
#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif
#include "access/xlogutils.h"
#include "catalog/pg_class.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "postmaster/postmaster.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "inmem_smgr.h"
#ifdef HAVE_LIBSECCOMP
#include "neon_seccomp.h"
#endif
PG_MODULE_MAGIC;
static int ReadRedoCommand(StringInfo inBuf);
static void BeginRedoForBlock(StringInfo input_message);
static void PushPage(StringInfo input_message);
static void ApplyRecord(StringInfo input_message);
static void apply_error_callback(void *arg);
static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
static void GetPage(StringInfo input_message);
static ssize_t buffered_read(void *buf, size_t count);
static BufferTag target_redo_tag;
static XLogReaderState *reader_state;
#define TRACE DEBUG5
#ifdef HAVE_LIBSECCOMP
static void
enter_seccomp_mode(void)
{
PgSeccompRule syscalls[] =
{
/* Hard requirements */
PG_SCMP_ALLOW(exit_group),
PG_SCMP_ALLOW(pselect6),
PG_SCMP_ALLOW(read),
PG_SCMP_ALLOW(select),
PG_SCMP_ALLOW(write),
/* Memory allocation */
PG_SCMP_ALLOW(brk),
#ifndef MALLOC_NO_MMAP
/* TODO: musl doesn't have mallopt */
PG_SCMP_ALLOW(mmap),
PG_SCMP_ALLOW(munmap),
#endif
/*
* getpid() is called on assertion failure, in ExceptionalCondition.
* It's not really needed, but it seems pointless to hide it either. The
* system call is unlikely to expose a kernel vulnerability, and the PID
* is stored in MyProcPid anyway.
*/
PG_SCMP_ALLOW(getpid),
/* Enable those for a proper shutdown.
PG_SCMP_ALLOW(munmap),
PG_SCMP_ALLOW(shmctl),
PG_SCMP_ALLOW(shmdt),
PG_SCMP_ALLOW(unlink), // shm_unlink
*/
};
#ifdef MALLOC_NO_MMAP
/* Ask glibc not to use mmap() */
mallopt(M_MMAP_MAX, 0);
#endif
seccomp_load_rules(syscalls, lengthof(syscalls));
}
#endif /* HAVE_LIBSECCOMP */
/*
* Entry point for the WAL redo process.
*
* Performs initialization similar to what PostgresMain does for normal
* backend processes. Some initialization was done in CallExtMain
* already.
*/
void
WalRedoMain(int argc, char *argv[])
{
int firstchar;
StringInfoData input_message;
#ifdef HAVE_LIBSECCOMP
bool enable_seccomp;
#endif
am_wal_redo_postgres = true;
/*
* WAL redo does not need a large number of buffers. And the speed of
* DropRelFileNodeAllLocalBuffers() is proportional to the number of
* buffers, so let's keep it small (the default value is 1024).
*/
num_temp_buffers = 4;
/*
* install the simple in-memory smgr
*/
smgr_hook = smgr_inmem;
smgr_init_hook = smgr_init_inmem;
/*
* Validate we have been given a reasonable-looking DataDir and change into it.
*/
checkDataDir();
ChangeToDataDir();
/*
* Create lockfile for data directory.
*/
CreateDataDirLockFile(false);
/* read control file (error checking and contains config ) */
LocalProcessControlFile(false);
/*
* process any libraries that should be preloaded at postmaster start
*/
process_shared_preload_libraries();
/* Initialize MaxBackends (if under postmaster, was done already) */
InitializeMaxBackends();
#if PG_VERSION_NUM >= 150000
/*
* Give preloaded libraries a chance to request additional shared memory.
*/
process_shmem_requests();
/*
* Now that loadable modules have had their chance to request additional
* shared memory, determine the value of any runtime-computed GUCs that
* depend on the amount of shared memory required.
*/
InitializeShmemGUCs();
/*
* Now that modules have been loaded, we can process any custom resource
* managers specified in the wal_consistency_checking GUC.
*/
InitializeWalConsistencyChecking();
#endif
CreateSharedMemoryAndSemaphores();
/*
* Remember stand-alone backend startup time, roughly at the same point
* during startup that postmaster does so.
*/
PgStartTime = GetCurrentTimestamp();
/*
* Create a per-backend PGPROC struct in shared memory. We must do
* this before we can use LWLocks.
*/
InitAuxiliaryProcess();
SetProcessingMode(NormalProcessing);
/* Redo routines won't work if we're not "in recovery" */
InRecovery = true;
/*
* Create the memory context we will use in the main loop.
*
* MessageContext is reset once per iteration of the main loop, ie, upon
* completion of processing of each command message from the client.
*/
MessageContext = AllocSetContextCreate(TopMemoryContext,
"MessageContext",
ALLOCSET_DEFAULT_SIZES);
/* we need a ResourceOwner to hold buffer pins */
Assert(CurrentResourceOwner == NULL);
CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo");
/* Initialize resource managers */
for (int rmid = 0; rmid <= RM_MAX_ID; rmid++)
{
if (RmgrTable[rmid].rm_startup != NULL)
RmgrTable[rmid].rm_startup();
}
reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL);
#ifdef HAVE_LIBSECCOMP
/* We prefer opt-out to opt-in for greater security */
enable_seccomp = true;
for (int i = 1; i < argc; i++)
if (strcmp(argv[i], "--disable-seccomp") == 0)
enable_seccomp = false;
/*
* We deliberately delay the transition to the seccomp mode
* until it's time to enter the main processing loop;
* else we'd have to add a lot more syscalls to the allowlist.
*/
if (enable_seccomp)
enter_seccomp_mode();
#endif /* HAVE_LIBSECCOMP */
/*
* Main processing loop
*/
MemoryContextSwitchTo(MessageContext);
initStringInfo(&input_message);
for (;;)
{
/* Release memory left over from prior query cycle. */
resetStringInfo(&input_message);
set_ps_display("idle");
/*
* (3) read a command (loop blocks here)
*/
firstchar = ReadRedoCommand(&input_message);
switch (firstchar)
{
case 'B': /* BeginRedoForBlock */
BeginRedoForBlock(&input_message);
break;
case 'P': /* PushPage */
PushPage(&input_message);
break;
case 'A': /* ApplyRecord */
ApplyRecord(&input_message);
break;
case 'G': /* GetPage */
GetPage(&input_message);
break;
/*
* EOF means we're done. Perform normal shutdown.
*/
case EOF:
ereport(LOG,
(errmsg("received EOF on stdin, shutting down")));
#ifdef HAVE_LIBSECCOMP
/*
* Skip the shutdown sequence, leaving some garbage behind.
* Hopefully, postgres will clean it up in the next run.
* This way we don't have to enable extra syscalls, which is nice.
* See enter_seccomp_mode() above.
*/
if (enable_seccomp)
_exit(0);
#endif /* HAVE_LIBSECCOMP */
/*
* NOTE: if you are tempted to add more code here, DON'T!
* Whatever you had in mind to do should be set up as an
* on_proc_exit or on_shmem_exit callback, instead. Otherwise
* it will fail to be called during other backend-shutdown
* scenarios.
*/
proc_exit(0);
default:
ereport(FATAL,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
errmsg("invalid frontend message type %d",
firstchar)));
}
} /* end of input-reading loop */
}
/* Version compatibility wrapper for ReadBufferWithoutRelcache */
static inline Buffer
NeonRedoReadBuffer(RelFileNode rnode,
ForkNumber forkNum, BlockNumber blockNum,
ReadBufferMode mode)
{
#if PG_VERSION_NUM >= 150000
return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode,
NULL, /* no strategy */
true); /* WAL redo is only performed on permanent rels */
#else
return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode,
NULL); /* no strategy */
#endif
}
/*
* Some debug function that may be handy for now.
*/
pg_attribute_unused()
static char *
pprint_buffer(char *data, int len)
{
StringInfoData s;
initStringInfo(&s);
appendStringInfo(&s, "\n");
for (int i = 0; i < len; i++) {
appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) );
if (i % 32 == 31) {
appendStringInfo(&s, "\n");
}
}
appendStringInfo(&s, "\n");
return s.data;
}
/* ----------------------------------------------------------------
* routines to obtain user input
* ----------------------------------------------------------------
*/
/*
* Read the next command from the client.
*
* The message payload read from stdin is placed in the inBuf parameter.
*
* EOF is returned if end-of-file input is seen; time to shut down.
* ----------------
*/
static int
ReadRedoCommand(StringInfo inBuf)
{
ssize_t ret;
char hdr[1 + sizeof(int32)];
int qtype;
int32 len;
/* Read message type and message length */
ret = buffered_read(hdr, sizeof(hdr));
if (ret != sizeof(hdr))
{
if (ret == 0)
return EOF;
else if (ret < 0)
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("could not read message header: %m")));
else
ereport(ERROR,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
errmsg("unexpected EOF")));
}
qtype = hdr[0];
memcpy(&len, &hdr[1], sizeof(int32));
len = pg_ntoh32(len);
if (len < 4)
ereport(ERROR,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
errmsg("invalid message length")));
len -= 4; /* discount length itself */
/* Read the message payload */
enlargeStringInfo(inBuf, len);
ret = buffered_read(inBuf->data, len);
if (ret != len)
{
if (ret < 0)
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("could not read message: %m")));
else
ereport(ERROR,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
errmsg("unexpected EOF")));
}
inBuf->len = len;
inBuf->data[len] = '\0';
return qtype;
}
/*
* Prepare for WAL replay on given block
*/
static void
BeginRedoForBlock(StringInfo input_message)
{
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blknum;
SMgrRelation reln;
/*
* message format:
*
* spcNode
* dbNode
* relNode
* ForkNumber
* BlockNumber
*/
forknum = pq_getmsgbyte(input_message);
rnode.spcNode = pq_getmsgint(input_message, 4);
rnode.dbNode = pq_getmsgint(input_message, 4);
rnode.relNode = pq_getmsgint(input_message, 4);
blknum = pq_getmsgint(input_message, 4);
wal_redo_buffer = InvalidBuffer;
INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum);
elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u",
target_redo_tag.rnode.spcNode,
target_redo_tag.rnode.dbNode,
target_redo_tag.rnode.relNode,
target_redo_tag.forkNum,
target_redo_tag.blockNum);
reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT);
if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber ||
reln->smgr_cached_nblocks[forknum] < blknum + 1)
{
reln->smgr_cached_nblocks[forknum] = blknum + 1;
}
}
/*
* Receive a page given by the client, and put it into buffer cache.
*/
static void
PushPage(StringInfo input_message)
{
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blknum;
const char *content;
Buffer buf;
Page page;
/*
* message format:
*
* spcNode
* dbNode
* relNode
* ForkNumber
* BlockNumber
* 8k page content
*/
forknum = pq_getmsgbyte(input_message);
rnode.spcNode = pq_getmsgint(input_message, 4);
rnode.dbNode = pq_getmsgint(input_message, 4);
rnode.relNode = pq_getmsgint(input_message, 4);
blknum = pq_getmsgint(input_message, 4);
content = pq_getmsgbytes(input_message, BLCKSZ);
buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_ZERO_AND_LOCK);
wal_redo_buffer = buf;
page = BufferGetPage(buf);
memcpy(page, content, BLCKSZ);
MarkBufferDirty(buf); /* pro forma */
UnlockReleaseBuffer(buf);
}
/*
* Receive a WAL record, and apply it.
*
* All the pages should be loaded into the buffer cache by PushPage calls already.
*/
static void
ApplyRecord(StringInfo input_message)
{
char *errormsg;
XLogRecPtr lsn;
XLogRecord *record;
int nleft;
ErrorContextCallback errcallback;
#if PG_VERSION_NUM >= 150000
DecodedXLogRecord *decoded;
#endif
/*
* message format:
*
* LSN (the *end* of the record)
* record
*/
lsn = pq_getmsgint64(input_message);
smgrinit(); /* reset inmem smgr state */
/* note: the input must be aligned here */
record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord));
nleft = input_message->len - input_message->cursor;
if (record->xl_tot_len != sizeof(XLogRecord) + nleft)
elog(ERROR, "mismatch between record (%d) and message size (%d)",
record->xl_tot_len, (int) sizeof(XLogRecord) + nleft);
/* Setup error traceback support for ereport() */
errcallback.callback = apply_error_callback;
errcallback.arg = (void *) reader_state;
errcallback.previous = error_context_stack;
error_context_stack = &errcallback;
XLogBeginRead(reader_state, lsn);
#if PG_VERSION_NUM >= 150000
decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true);
if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg))
elog(ERROR, "failed to decode WAL record: %s", errormsg);
else
{
/* Record the location of the next record. */
decoded->next_lsn = reader_state->NextRecPtr;
/*
* If it's in the decode buffer, mark the decode buffer space as
* occupied.
*/
if (!decoded->oversized)
{
/* The new decode buffer head must be MAXALIGNed. */
Assert(decoded->size == MAXALIGN(decoded->size));
if ((char *) decoded == reader_state->decode_buffer)
reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size;
else
reader_state->decode_buffer_tail += decoded->size;
}
/* Insert it into the queue of decoded records. */
Assert(reader_state->decode_queue_tail != decoded);
if (reader_state->decode_queue_tail)
reader_state->decode_queue_tail->next = decoded;
reader_state->decode_queue_tail = decoded;
if (!reader_state->decode_queue_head)
reader_state->decode_queue_head = decoded;
/*
* Update the pointers to the beginning and one-past-the-end of this
* record, again for the benefit of historical code that expected the
* decoder to track this rather than accessing these fields of the record
* itself.
*/
reader_state->record = reader_state->decode_queue_head;
reader_state->ReadRecPtr = reader_state->record->lsn;
reader_state->EndRecPtr = reader_state->record->next_lsn;
}
#else
/*
* In lieu of calling XLogReadRecord, store the record in the 'decoded_record'
* buffer directly.
*/
reader_state->ReadRecPtr = lsn;
reader_state->decoded_record = record;
if (!DecodeXLogRecord(reader_state, record, &errormsg))
elog(ERROR, "failed to decode WAL record: %s", errormsg);
#endif
/* Ignore any other blocks than the ones the caller is interested in */
redo_read_buffer_filter = redo_block_filter;
RmgrTable[record->xl_rmid].rm_redo(reader_state);
/*
* If no base image of the page was provided by PushPage, initialize
* wal_redo_buffer here. The first WAL record must initialize the page
* in that case.
*/
if (BufferIsInvalid(wal_redo_buffer))
{
wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode,
target_redo_tag.forkNum,
target_redo_tag.blockNum,
RBM_NORMAL);
Assert(!BufferIsInvalid(wal_redo_buffer));
ReleaseBuffer(wal_redo_buffer);
}
redo_read_buffer_filter = NULL;
/* Pop the error context stack */
error_context_stack = errcallback.previous;
elog(TRACE, "applied WAL record with LSN %X/%X",
(uint32) (lsn >> 32), (uint32) lsn);
#if PG_VERSION_NUM >= 150000
if (decoded && decoded->oversized)
pfree(decoded);
#endif
}
/*
* Error context callback for errors occurring during ApplyRecord
*/
static void
apply_error_callback(void *arg)
{
XLogReaderState *record = (XLogReaderState *) arg;
StringInfoData buf;
initStringInfo(&buf);
xlog_outdesc(&buf, record);
/* translator: %s is a WAL record description */
errcontext("WAL redo at %X/%X for %s",
LSN_FORMAT_ARGS(record->ReadRecPtr),
buf.data);
pfree(buf.data);
}
static bool
redo_block_filter(XLogReaderState *record, uint8 block_id)
{
BufferTag target_tag;
#if PG_VERSION_NUM >= 150000
XLogRecGetBlockTag(record, block_id,
&target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum);
#else
if (!XLogRecGetBlockTag(record, block_id,
&target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum))
{
/* Caller specified a bogus block_id */
elog(PANIC, "failed to locate backup block with ID %d", block_id);
}
#endif
/*
* Can a WAL redo function ever access a relation other than the one that
* it modifies? I don't see why it would.
*/
if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode))
elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum);
/*
* If this block isn't one we are currently restoring, then return 'true'
* so that this gets ignored
*/
return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag);
}
/*
* Get a page image back from the buffer cache, after applying some records.
*/
static void
GetPage(StringInfo input_message)
{
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blknum;
Buffer buf;
Page page;
int tot_written;
/*
* message format:
*
* spcNode
* dbNode
* relNode
* ForkNumber
* BlockNumber
*/
forknum = pq_getmsgbyte(input_message);
rnode.spcNode = pq_getmsgint(input_message, 4);
rnode.dbNode = pq_getmsgint(input_message, 4);
rnode.relNode = pq_getmsgint(input_message, 4);
blknum = pq_getmsgint(input_message, 4);
/* FIXME: check that we got a BeginRedoForBlock message or this earlier */
buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_NORMAL);
Assert(buf == wal_redo_buffer);
page = BufferGetPage(buf);
/* single thread, so don't bother locking the page */
/* Response: Page content */
tot_written = 0;
do {
ssize_t rc;
rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written);
if (rc < 0) {
/* If interrupted by signal, just retry */
if (errno == EINTR)
continue;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to stdout: %m")));
}
tot_written += rc;
} while (tot_written < BLCKSZ);
ReleaseBuffer(buf);
DropRelFileNodeAllLocalBuffers(rnode);
wal_redo_buffer = InvalidBuffer;
elog(TRACE, "Page sent back for block %u", blknum);
}
/* Buffer used by buffered_read() */
static char stdin_buf[16 * 1024];
static size_t stdin_len = 0; /* # of bytes in buffer */
static size_t stdin_ptr = 0; /* # of bytes already consumed */
/*
* Like read() on stdin, but buffered.
*
* We cannot use libc's buffered fread(), because it uses syscalls that we
* have disabled with seccomp(). Depending on the platform, it can call
* 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat'
* seems problematic because it allows interrogating files by path name.
*
* The return value is the number of bytes read. On error, -1 is returned, and
* errno is set appropriately. Unlike read(), this fills the buffer completely
* unless an error happens or EOF is reached.
*/
static ssize_t
buffered_read(void *buf, size_t count)
{
char *dst = buf;
while (count > 0)
{
size_t nthis;
if (stdin_ptr == stdin_len)
{
ssize_t ret;
ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf));
if (ret < 0)
{
/* don't do anything here that could set 'errno' */
return ret;
}
if (ret == 0)
{
/* EOF */
break;
}
stdin_len = (size_t) ret;
stdin_ptr = 0;
}
nthis = Min(stdin_len - stdin_ptr, count);
memcpy(dst, &stdin_buf[stdin_ptr], nthis);
stdin_ptr += nthis;
count -= nthis;
dst += nthis;
}
return (dst - (char *) buf);
}
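For reference, the stdin/stdout protocol described at the top of walredoproc.c frames every message as a one-byte type, a big-endian int32 length (which counts itself but not the type byte), and the payload. Below is a hypothetical client-side framing helper for a BeginRedoForBlock ('B') message, matching the field order parsed by BeginRedoForBlock() above (fork number as a single byte, then four 32-bit integers); the real client is the pageserver, so this is illustration only:

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical helper: writes a 'B' message into 'out', returns its size. */
static size_t
frame_begin_redo(char *out, uint8_t forknum,
				 uint32_t spcNode, uint32_t dbNode, uint32_t relNode,
				 uint32_t blkno)
{
	uint32_t	vals[4] = {spcNode, dbNode, relNode, blkno};
	uint32_t	len;
	size_t		off = 0;

	out[off++] = 'B';				/* message type */
	len = htonl(4 + 1 + 4 * 4);		/* length counts itself, not the type byte */
	memcpy(out + off, &len, 4);
	off += 4;
	out[off++] = (char) forknum;	/* ForkNumber as a single byte */
	for (int i = 0; i < 4; i++)
	{
		uint32_t	v = htonl(vals[i]);	/* network byte order, as pq_getmsgint expects */

		memcpy(out + off, &v, 4);
		off += 4;
	}
	return off;						/* 22 bytes total */
}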

poetry.lock

@@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0"
psycopg2-binary = ">=2.8.4"
[package.extras]
sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"]
sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
[[package]]
name = "allure-pytest"
@@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"]
docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"]
tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"]
tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
[[package]]
name = "aws-sam-translator"
@@ -514,14 +514,6 @@ python-versions = ">=3.7"
[package.dependencies]
typing-extensions = ">=4.1.0"
[[package]]
name = "cached-property"
version = "1.5.2"
description = "A decorator for caching properties in classes."
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "certifi"
version = "2022.6.15"
@@ -568,7 +560,7 @@ optional = false
python-versions = ">=3.6.0"
[package.extras]
unicode_backport = ["unicodedata2"]
unicode-backport = ["unicodedata2"]
[[package]]
name = "click"
@@ -601,7 +593,7 @@ python-versions = ">=3.6"
cffi = ">=1.12"
[package.extras]
docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"]
docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"]
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"]
sdist = ["setuptools_rust (>=0.11.4)"]
@@ -746,9 +738,9 @@ python-versions = ">=3.6.1,<4.0"
[package.extras]
colors = ["colorama (>=0.4.3,<0.5.0)"]
pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
pipfile-deprecated-finder = ["pipreqs", "requirementslib"]
plugins = ["setuptools"]
requirements_deprecated_finder = ["pip-api", "pipreqs"]
requirements-deprecated-finder = ["pip-api", "pipreqs"]
[[package]]
name = "itsdangerous"
@@ -823,7 +815,7 @@ python-versions = ">=2.7"
[package.extras]
docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"]
testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"]
"testing.libs" = ["simplejson", "ujson", "yajl"]
testing-libs = ["simplejson", "ujson", "yajl"]
[[package]]
name = "jsonpointer"
@@ -844,11 +836,12 @@ python-versions = "*"
[package.dependencies]
attrs = ">=17.4.0"
pyrsistent = ">=0.14.0"
setuptools = "*"
six = ">=1.11.0"
[package.extras]
format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"]
format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
[[package]]
name = "junit-xml"
@@ -908,6 +901,7 @@ pytz = "*"
PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
requests = ">=2.5"
responses = ">=0.9.0"
setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
werkzeug = ">=0.5,<2.2.0"
xmltodict = "*"
@@ -1016,6 +1010,7 @@ python-versions = ">=3.7.0,<4.0.0"
jsonschema = ">=3.2.0,<5.0.0"
openapi-schema-validator = ">=0.2.0,<0.3.0"
PyYAML = ">=5.1"
setuptools = "*"
[package.extras]
requests = ["requests"]
@@ -1348,7 +1343,7 @@ urllib3 = ">=1.21.1,<1.27"
[package.extras]
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "responses"
@@ -1402,6 +1397,19 @@ python-versions = ">= 2.7"
attrs = "*"
pbr = "*"
[[package]]
name = "setuptools"
version = "65.5.0"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
category = "main"
optional = false
python-versions = ">=3.7"
[package.extras]
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"
version = "1.16.0"
@@ -1468,6 +1476,14 @@ category = "main"
optional = false
python-versions = ">=3.7,<4.0"
[[package]]
name = "types-toml"
version = "0.10.8"
description = "Typing stubs for toml"
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "types-urllib3"
version = "1.26.17"
@@ -1552,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975"
content-hash = "9352a89d49d34807f6a58f6c3f898acbd8cf3570e0f45ede973673644bde4d0e"
[metadata.files]
aiopg = [
@@ -1647,10 +1663,6 @@ botocore-stubs = [
{file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"},
{file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"},
]
cached-property = [
{file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"},
{file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"},
]
certifi = [
{file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"},
{file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"},
@@ -1966,6 +1978,7 @@ prometheus-client = [
psycopg2-binary = [
{file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
@@ -1999,6 +2012,7 @@ psycopg2-binary = [
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
@@ -2010,6 +2024,7 @@ psycopg2-binary = [
{file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
@@ -2026,18 +2041,7 @@ py = [
{file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
]
pyasn1 = [
{file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
{file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
{file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
{file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
{file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
{file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
{file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
{file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
{file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
{file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
{file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
{file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
{file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
]
pycodestyle = [
@@ -2147,6 +2151,13 @@ pyyaml = [
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
{file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
{file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
{file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
{file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
{file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
{file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
{file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
@@ -2194,6 +2205,10 @@ sarif-om = [
{file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"},
{file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"},
]
setuptools = [
{file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
{file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},
]
six = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
@@ -2222,6 +2237,10 @@ types-s3transfer = [
{file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"},
{file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"},
]
types-toml = [
{file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"},
{file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"},
]
types-urllib3 = [
{file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"},
{file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"},


@@ -11,7 +11,6 @@ use crate::{
compute, http, mgmt, stream, url,
waiters::{self, Waiter, Waiters},
};
use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
@@ -20,15 +19,6 @@ use tracing::{info, warn};
static CPLANE_WAITERS: Lazy<Waiters<mgmt::ComputeReady>> = Lazy::new(Default::default);
static AUTH_METHOD_USED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_auth_method_used_total",
"Number of authentication requests served.",
&["method_name"],
)
.unwrap()
});
/// Give caller an opportunity to wait for the cloud's reply.
pub async fn with_waiter<R, T, E>(
psql_session_id: impl Into<String>,
@@ -182,11 +172,8 @@ impl BackendType<'_, ClientCredentials<'_>> {
// support SNI or other means of passing the project name.
// We now expect to see a very specific payload in the place of password.
if creds.project().is_none() {
AUTH_METHOD_USED_COUNTER
.with_label_values(&["password_hack"])
.inc();
warn!("project name not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
.await?
@@ -223,10 +210,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
let res = match self {
Console(endpoint, creds) => {
AUTH_METHOD_USED_COUNTER
.with_label_values(&["console"])
.inc();
info!(
user = creds.user,
project = creds.project(),
@@ -237,10 +220,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
.await
}
Postgres(endpoint, creds) => {
AUTH_METHOD_USED_COUNTER
.with_label_values(&["postgres"])
.inc();
info!("performing mock authentication using a local postgres instance");
postgres::Api::new(&endpoint, &creds)
.handle_user(client)
@@ -248,7 +227,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
}
// NOTE: this auth backend doesn't use client credentials.
Link(url) => {
AUTH_METHOD_USED_COUNTER.with_label_values(&["link"]).inc();
info!("performing link authentication");
link::handle_user(&url, client).await
}

Some files were not shown because too many files have changed in this diff.