Use access counter for file cache pages

Use access counter for file cache pages
Cache reconstructed pages on disk
2026-03-05 17:30:38 +00:00 · 2022-11-04 12:55:21 +02:00 · 2022-11-04 12:22:47 +02:00 · 2022-11-03 19:57:27 +02:00 · 2022-11-03 19:01:03 +02:00 · 2022-10-31 23:00:54 +03:00
83 changed files with 3958 additions and 1455 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -73,6 +73,13 @@ runs:
      shell: bash -euxo pipefail {0}
      run: ./scripts/pysync

+    - name: Download compatibility snapshot for Postgres 14
+      uses: ./.github/actions/download
+      with:
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
+        path: /tmp/compatibility_snapshot_pg14
+        prefix: latest
+
    - name: Run pytest
      env:
        NEON_BIN: /tmp/neon/bin
@@ -80,6 +87,8 @@ runs:
        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
+        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
+        ALLOW_BREAKING_CHANGES: ${{ contains(github.event.pull_request.labels.*.name, 'breaking changes') }}
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
@@ -154,6 +163,15 @@ runs:
          scripts/generate_and_push_perf_report.sh
        fi

+    - name: Upload compatibility snapshot for Postgres 14
+      if: github.ref_name == 'release'
+      uses: ./.github/actions/upload
+      with:
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
+        # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
+        path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
+        prefix: latest
+
    - name: Create Allure report
      if: always()
      uses: ./.github/actions/allure-report
--- a/.github/ansible/neon-stress.hosts.yaml
+++ b/.github/ansible/neon-stress.hosts.yaml
@@ -3,7 +3,6 @@ storage:
    bucket_name: neon-storage-ireland
    bucket_region: eu-west-1
    console_mgmt_base_url: http://neon-stress-console.local
-    env_name: neon-stress
    etcd_endpoints: neon-stress-etcd.local:2379
    safekeeper_enable_s3_offload: 'false'
    pageserver_config_stub:
@@ -12,6 +11,7 @@ storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "{{ inventory_hostname }}"
+    safekeeper_s3_prefix: neon-stress/wal
    hostname_suffix: ".local"
    remote_user: admin
  children:
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -1,7 +1,6 @@
 ---
 storage:
  vars:
-    env_name: prod-1
    console_mgmt_base_url: http://console-release.local
    bucket_name: zenith-storage-oregon
    bucket_region: us-west-2
@@ -12,6 +11,7 @@ storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "{{ inventory_hostname }}"
+    safekeeper_s3_prefix: prod-1/wal
    hostname_suffix: ".local"
    remote_user: admin

--- a/.github/ansible/staging.hosts.yaml
+++ b/.github/ansible/staging.hosts.yaml
@@ -3,7 +3,6 @@ storage:
    bucket_name: zenith-staging-storage-us-east-1
    bucket_region: us-east-1
    console_mgmt_base_url: http://console-staging.local
-    env_name: us-stage
    etcd_endpoints: zenith-us-stage-etcd.local:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
@@ -11,6 +10,7 @@ storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "{{ inventory_hostname }}"
+    safekeeper_s3_prefix: us-stage/wal
    hostname_suffix: ".local"
    remote_user: admin

--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -3,7 +3,6 @@ storage:
    bucket_name: neon-staging-storage-us-east-2
    bucket_region: us-east-2
    console_mgmt_base_url: http://console-staging.local
-    env_name: us-stage
    etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
@@ -11,6 +10,7 @@ storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
    hostname_suffix: ""
    remote_user: ssm-user
    ansible_aws_ssm_region: us-east-2
--- a/.github/ansible/systemd/safekeeper.service
+++ b/.github/ansible/systemd/safekeeper.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=safekeeper
 Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-staging.local/management/api/v2"
+  domain: "*.us-east-2.aws.neon.build"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: dev
+  zenith_region: us-east-2
+  zenith_region_slug: us-east-2
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -481,6 +481,7 @@ jobs:

  neon-image:
    runs-on: dev
+    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

    steps:
@@ -494,10 +495,11 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build neon
-        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
+        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}

  compute-tools-image:
    runs-on: dev
+    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

    steps:
@@ -508,11 +510,12 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build compute tools
-        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
+        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}

  compute-node-image:
    runs-on: dev
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
+    needs: [ tag ]
    steps:
      - name: Checkout
        uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -527,11 +530,12 @@ jobs:
        # cloud repo depends on this image name, thus duplicating it
        # remove compute-node when cloud repo is updated
      - name: Kaniko build compute node with extensions v14 (compatibility)
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
+        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}

  compute-node-image-v14:
    runs-on: dev
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
+    needs: [ tag ]
    steps:
      - name: Checkout
        uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -543,12 +547,13 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build compute node with extensions v14
-        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
+        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}


  compute-node-image-v15:
    runs-on: dev
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
+    needs: [ tag ]
    steps:
      - name: Checkout
        uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -560,11 +565,11 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build compute node with extensions v15
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
+        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}

  promote-images:
    runs-on: dev
-    needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
    if: github.event_name != 'workflow_dispatch'
    container: amazon/aws-cli
    strategy:
@@ -577,8 +582,9 @@ jobs:

    steps:
      - name: Promote image to latest
-        run:
-          MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
+        run: |
+          export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text)
+          aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"

  push-docker-hub:
    runs-on: dev
@@ -597,19 +603,19 @@ jobs:
          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json

      - name: Pull neon image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon

      - name: Pull compute tools image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools

      - name: Pull compute node image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node

      - name: Pull compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14

      - name: Pull compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15

      - name: Pull rust image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
@@ -619,11 +625,11 @@ jobs:
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
        run: |
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -819,3 +825,52 @@ jobs:
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+
+  deploy-proxy-new:
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
+    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    if: |
+      (github.ref_name == 'main') &&
+      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner
+
+      - name: Re-deploy proxy
+        run: |
+          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+
+  promote-compatibility-test-snapshot:
+    runs-on: dev
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+    needs: [ deploy, deploy-proxy ]
+    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
+    steps:
+      - name: Promote compatibility snapshot for the release
+        shell: bash -euxo pipefail {0}
+        env:
+          BUCKET: neon-github-public-dev
+          PREFIX: artifacts/latest
+        run: |
+          for build_type in debug release; do
+            OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
+            NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
+
+            time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+          done
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -894,19 +894,6 @@ dependencies = [
 "syn",
 ]

-[[package]]
-name = "dashmap"
-version = "5.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
-dependencies = [
- "cfg-if",
- "hashbrown",
- "lock_api",
- "once_cell",
- "parking_lot_core 0.9.3",
-]
-
 [[package]]
 name = "data-encoding"
 version = "2.3.2"
@@ -2154,7 +2141,6 @@ dependencies = [
 "criterion",
 "crossbeam-utils",
 "daemonize",
- "dashmap",
 "etcd_broker",
 "fail",
 "futures",
@@ -2184,6 +2170,7 @@ dependencies = [
 "serde_json",
 "serde_with",
 "signal-hook",
+ "svg_fmt",
 "tar",
 "tempfile",
 "thiserror",
@@ -2202,7 +2189,10 @@ dependencies = [
 name = "pageserver_api"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
+ "bytes",
 "const_format",
+ "postgres_ffi",
 "serde",
 "serde_with",
 "utils",
@@ -3475,6 +3465,12 @@ version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"

+[[package]]
+name = "svg_fmt"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
+
 [[package]]
 name = "symbolic-common"
 version = "8.8.0"
@@ -3946,6 +3942,16 @@ dependencies = [
 "tracing-core",
 ]

+[[package]]
+name = "tracing-serde"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
+dependencies = [
+ "serde",
+ "tracing-core",
+]
+
 [[package]]
 name = "tracing-subscriber"
 version = "0.3.16"
@@ -3956,12 +3962,15 @@ dependencies = [
 "nu-ansi-term",
 "once_cell",
 "regex",
+ "serde",
+ "serde_json",
 "sharded-slab",
 "smallvec",
 "thread_local",
 "tracing",
 "tracing-core",
 "tracing-log",
+ "tracing-serde",
 ]

 [[package]]
@@ -4056,6 +4065,8 @@ dependencies = [
 "serde_json",
 "serde_with",
 "signal-hook",
+ "strum",
+ "strum_macros",
 "tempfile",
 "thiserror",
 "tokio",
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,7 +44,7 @@ COPY . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
+&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \
    && cachepot -s

 # Build final image
@@ -65,6 +65,7 @@ RUN set -e \

 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir   /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin

--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -1,24 +1,26 @@
-ARG TAG=pinned
-# apparently, ARGs don't get replaced in RUN commands in kaniko
-# ARG POSTGIS_VERSION=3.3.0
-# ARG PLV8_VERSION=3.1.4
-# ARG PG_VERSION=v14
+#
+# This file is identical to the Dockerfile.compute-node-v15 file
+# except for the version of Postgres that is built.
+#

+ARG TAG=pinned
+
+#########################################################################################
 #
 # Layer "build-deps"
 #
+#########################################################################################
 FROM debian:bullseye-slim AS build-deps
-RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
-    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
-    apt update
 RUN apt update &&  \
-    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-    libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
+    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
+    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config

+#########################################################################################
 #
 # Layer "pg-build"
 # Build Postgres from the neon postgres repository.
 #
+#########################################################################################
 FROM build-deps AS pg-build
 COPY vendor/postgres-v14 postgres
 RUN cd postgres && \
@@ -29,22 +31,20 @@ RUN cd postgres && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install

+#########################################################################################
 #
 # Layer "postgis-build"
 # Build PostGIS from the upstream PostGIS mirror.
 #
-# PostGIS compiles against neon postgres sources without changes. Perhaps we
-# could even use the upstream binaries, compiled against vanilla Postgres, but
-# it would require some investigation to check that it works, and also keeps
-# working in the future. So for now, we compile our own binaries.
+#########################################################################################
 FROM build-deps AS postgis-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc

-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
-    tar xvzf postgis-3.3.0.tar.gz && \
-    cd postgis-3.3.0 && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
+    tar xvzf postgis-3.3.1.tar.gz && \
+    cd postgis-3.3.1 && \
    ./autogen.sh && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    ./configure && \
@@ -57,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control

+#########################################################################################
 #
 # Layer "plv8-build"
 # Build plv8
 #
+#########################################################################################
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
+    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils

-# https://github.com/plv8/plv8/issues/475
-# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
-RUN apt update && \
-    apt install -y --no-install-recommends -t testing binutils
+# https://github.com/plv8/plv8/issues/475:
+#   v8 uses gold for linking and sets `--thread-count=4` which breaks
+#   gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
+# Install a newer gold version manually, because the debian-testing binutils package
+# upgrades libc, which in turn breaks other extensions built against the non-testing libc.
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
+    tar xvzf binutils-2.38.tar.gz && \
+    cd binutils-2.38 && \
+    cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
+    cd ../bfd && ./configure && make bfdver.h && \
+    cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
+    cp /usr/local/bin/ld.gold /usr/bin/gold

 # Sed is used to patch for https://github.com/plv8/plv8/issues/503
 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -77,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
    cd plv8-3.1.4 && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
    rm -rf /plv8-* && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control

+#########################################################################################
 #
 # Layer "h3-pg-build"
 # Build h3_pg
 #
+#########################################################################################
 FROM build-deps AS h3-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # packaged cmake is too old
-RUN apt update && \
-    apt install -y --no-install-recommends -t testing cmake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
+      -q -O /tmp/cmake-install.sh \
+      && chmod u+x /tmp/cmake-install.sh \
+      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
+      && rm /tmp/cmake-install.sh

 RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
    tar xvzf h3.tgz  && \
@@ -110,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control

+#########################################################################################
 #
 # Layer "neon-pg-ext-build"
 # compile neon extensions
 #
+#########################################################################################
 FROM build-deps AS neon-pg-ext-build
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -128,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        -C pgxn/neon \
        -s install

+#########################################################################################
+#
 # Compile and run the Neon-specific `compute_ctl` binary
+#
+#########################################################################################
 FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
 RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto

+#########################################################################################
 #
 # Clean up postgres folder before inclusion
 #
+#########################################################################################
 FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql

@@ -155,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

+#########################################################################################
 #
 # Final layer
 # Put it all together into the final image
 #
+#########################################################################################
 FROM debian:bullseye-slim
 # Add user postgres
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -175,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libreadline8 for psql
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
-# GLIBC 2.34 for plv8.
-#     Debian bullseye provides GLIBC 2.31, so we install the library from testing
 #
 # Lastly, link compute_ctl into zenith_ctl while we're at it,
 # so that we don't need to put this in another layer.
@@ -189,12 +212,6 @@ RUN apt update &&  \
        libproj19 \
        libprotobuf-c1 && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
-    echo "Installing GLIBC 2.34" && \
-    echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
-    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
-    apt update && \
-    apt install -y --no-install-recommends -t testing libc6 && \
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl

 USER postgres
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -4,26 +4,23 @@
 #

 ARG TAG=pinned
-# apparently, ARGs don't get replaced in RUN commands in kaniko
-# ARG POSTGIS_VERSION=3.3.1
-# ARG PLV8_VERSION=3.1.4
-# ARG PG_VERSION=v15

+#########################################################################################
 #
 # Layer "build-deps"
 #
+#########################################################################################
 FROM debian:bullseye-slim AS build-deps
-RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
-    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
-    apt update
 RUN apt update &&  \
-    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-    libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
+    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
+    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config

+#########################################################################################
 #
 # Layer "pg-build"
 # Build Postgres from the neon postgres repository.
 #
+#########################################################################################
 FROM build-deps AS pg-build
 COPY vendor/postgres-v15 postgres
 RUN cd postgres && \
@@ -34,14 +31,12 @@ RUN cd postgres && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install

+#########################################################################################
 #
 # Layer "postgis-build"
 # Build PostGIS from the upstream PostGIS mirror.
 #
-# PostGIS compiles against neon postgres sources without changes. Perhaps we
-# could even use the upstream binaries, compiled against vanilla Postgres, but
-# it would require some investigation to check that it works, and also keeps
-# working in the future. So for now, we compile our own binaries.
+#########################################################################################
 FROM build-deps AS postgis-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
@@ -62,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control

+#########################################################################################
 #
 # Layer "plv8-build"
 # Build plv8
 #
+#########################################################################################
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
+    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils

-# https://github.com/plv8/plv8/issues/475
-# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
-RUN apt update && \
-    apt install -y --no-install-recommends -t testing binutils
+# https://github.com/plv8/plv8/issues/475:
+#   v8 uses gold for linking and sets `--thread-count=4` which breaks
+#   gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
+# Install newer gold version manually as debian-testing binutils version updates
+# libc version, which in turn breaks other extension built against non-testing libc.
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
+    tar xvzf binutils-2.38.tar.gz && \
+    cd binutils-2.38 && \
+    cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
+    cd ../bfd && ./configure && make bfdver.h && \
+    cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
+    cp /usr/local/bin/ld.gold /usr/bin/gold

 # Sed is used to patch for https://github.com/plv8/plv8/issues/503
 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -82,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
    cd plv8-3.1.4 && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
    rm -rf /plv8-* && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control

+#########################################################################################
 #
 # Layer "h3-pg-build"
 # Build h3_pg
 #
+#########################################################################################
 FROM build-deps AS h3-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # packaged cmake is too old
-RUN apt update && \
-    apt install -y --no-install-recommends -t testing cmake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
+      -q -O /tmp/cmake-install.sh \
+      && chmod u+x /tmp/cmake-install.sh \
+      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
+      && rm /tmp/cmake-install.sh

 RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
    tar xvzf h3.tgz  && \
@@ -115,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control

+#########################################################################################
 #
 # Layer "neon-pg-ext-build"
 # compile neon extensions
 #
+#########################################################################################
 FROM build-deps AS neon-pg-ext-build
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -133,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        -C pgxn/neon \
        -s install

+#########################################################################################
+#
 # Compile and run the Neon-specific `compute_ctl` binary
+#
+#########################################################################################
 FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
 RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto

+#########################################################################################
 #
 # Clean up postgres folder before inclusion
 #
+#########################################################################################
 FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql

@@ -160,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

+#########################################################################################
 #
 # Final layer
 # Put it all together into the final image
 #
+#########################################################################################
 FROM debian:bullseye-slim
 # Add user postgres
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -180,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libreadline8 for psql
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
-# GLIBC 2.34 for plv8.
-#     Debian bullseye provides GLIBC 2.31, so we install the library from testing
 #
 # Lastly, link compute_ctl into zenith_ctl while we're at it,
 # so that we don't need to put this in another layer.
@@ -194,12 +212,6 @@ RUN apt update &&  \
        libproj19 \
        libprotobuf-c1 && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
-    echo "Installing GLIBC 2.34" && \
-    echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
-    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
-    apt update && \
-    apt install -y --no-install-recommends -t testing libc6 && \
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl

 USER postgres
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -424,8 +424,29 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
        db_client.simple_query(&alter_query)?;

        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
-        // This is needed since postgres 15, where this privilege is removed by default.
-        let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string();
+        // This is needed because, since postgres 15, this privilege is no longer granted by default.
+        let grant_query = "DO $$\n\
+                BEGIN\n\
+                    IF EXISTS(\n\
+                        SELECT nspname\n\
+                        FROM pg_catalog.pg_namespace\n\
+                        WHERE nspname = 'public'\n\
+                    ) AND\n\
+                    current_setting('server_version_num')::int/10000 >= 15\n\
+                    THEN\n\
+                        IF EXISTS(\n\
+                            SELECT rolname\n\
+                            FROM pg_catalog.pg_roles\n\
+                            WHERE rolname = 'web_access'\n\
+                        )\n\
+                        THEN\n\
+                            GRANT CREATE ON SCHEMA public TO web_access;\n\
+                        END IF;\n\
+                    END IF;\n\
+                END\n\
+            $$;"
+        .to_string();
+
        info!("grant query for db {} : {}", &db.name, &grant_query);
        db_client.simple_query(&grant_query)?;
    }
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -183,18 +183,18 @@ impl PostgresNode {
    }

    fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
-        let pg_path = self.env.pg_bin_dir(pg_version).join("postgres");
+        let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
        let mut cmd = Command::new(&pg_path);

        cmd.arg("--sync-safekeepers")
            .env_clear()
            .env(
                "LD_LIBRARY_PATH",
-                self.env.pg_lib_dir(pg_version).to_str().unwrap(),
+                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
            )
            .env(
                "DYLD_LIBRARY_PATH",
-                self.env.pg_lib_dir(pg_version).to_str().unwrap(),
+                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
            )
            .env("PGDATA", self.pgdata().to_str().unwrap())
            .stdout(Stdio::piped())
@@ -282,9 +282,7 @@ impl PostgresNode {
    fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
-        // wal_log_hints is mandatory when running against pageserver (see gh issue#192)
-        // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
-        conf.append("wal_log_hints", "on");
+        conf.append("wal_log_hints", "off");
        conf.append("max_replication_slots", "10");
        conf.append("hot_standby", "on");
        conf.append("shared_buffers", "1MB");
@@ -422,7 +420,7 @@ impl PostgresNode {
    }

    fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
-        let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl");
+        let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
        let mut cmd = Command::new(pg_ctl_path);
        cmd.args(
            [
@@ -440,11 +438,11 @@ impl PostgresNode {
        .env_clear()
        .env(
            "LD_LIBRARY_PATH",
-            self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
+            self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
        )
        .env(
            "DYLD_LIBRARY_PATH",
-            self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
+            self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
        );
        if let Some(token) = auth_token {
            cmd.env("ZENITH_AUTH_TOKEN", token);
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -52,6 +52,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
            // size smaller. Our test etcd clusters are very small.
            // See https://github.com/etcd-io/etcd/issues/7910
            "--quota-backend-bytes=100000000".to_string(),
+            // etcd doesn't compact (vacuum) with default settings,
+            // enable it to prevent space exhaustion.
+            "--auto-compaction-mode=revision".to_string(),
+            "--auto-compaction-retention=1".to_string(),
        ])
        .stdout(Stdio::from(etcd_stdout_file))
        .stderr(Stdio::from(etcd_stderr_file))
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -201,28 +201,28 @@ impl LocalEnv {
        self.pg_distrib_dir.clone()
    }

-    pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
+    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

        match pg_version {
-            14 => path.join(format!("v{pg_version}")),
-            15 => path.join(format!("v{pg_version}")),
-            _ => panic!("Unsupported postgres version: {}", pg_version),
+            14 => Ok(path.join(format!("v{pg_version}"))),
+            15 => Ok(path.join(format!("v{pg_version}"))),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

-    pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
+    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        match pg_version {
-            14 => self.pg_distrib_dir(pg_version).join("bin"),
-            15 => self.pg_distrib_dir(pg_version).join("bin"),
-            _ => panic!("Unsupported postgres version: {}", pg_version),
+            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
+            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
-    pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
+    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        match pg_version {
-            14 => self.pg_distrib_dir(pg_version).join("lib"),
-            15 => self.pg_distrib_dir(pg_version).join("lib"),
-            _ => panic!("Unsupported postgres version: {}", pg_version),
+            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
+            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

@@ -422,10 +422,10 @@ impl LocalEnv {
            "directory '{}' already exists. Perhaps already initialized?",
            base_path.display()
        );
-        if !self.pg_bin_dir(pg_version).join("postgres").exists() {
+        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
            bail!(
                "Can't find postgres binary at {}",
-                self.pg_bin_dir(pg_version).display()
+                self.pg_bin_dir(pg_version)?.display()
            );
        }
        for binary in ["pageserver", "safekeeper"] {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -123,7 +123,6 @@ impl SafekeeperNode {
                .args(&["--id", self.id.to_string().as_ref()])
                .args(&["--listen-pg", &listen_pg])
                .args(&["--listen-http", &listen_http])
-                .args(&["--recall", "1 second"])
                .arg("--daemonize"),
        );
        if !self.conf.sync {
--- a/docker-compose/compute/shell/compute.sh
+++ b/docker-compose/compute/shell/compute.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Entrypoint of the docker-compose "compute" container: wait for the pageserver, create a tenant and a timeline through its HTTP API, fill in the spec file template, then start compute_ctl.
+set -eux
+
+PG_VERSION=${PG_VERSION:-14}
+SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
+SPEC_FILE=/tmp/spec.json
+
+echo "Waiting pageserver become ready."
+while ! nc -z pageserver 6400; do
+     sleep 1;
+done
+echo "Page server is ready."
+
+echo "Create a tenant and timeline"
+# Use -s only: -b/--cookie takes an argument, so a combined "-sb" would swallow the next option ("-X") as cookie data.
+PARAMS=(
+     -s
+     -X POST
+     -H "Content-Type: application/json"
+     -d "{}"
+     http://pageserver:9898/v1/tenant/
+)
+tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
+
+PARAMS=(
+     -s
+     -X POST
+     -H "Content-Type: application/json"
+     -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
+     "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
+)
+result=$(curl "${PARAMS[@]}")
+echo "${result}" | jq .
+
+echo "Overwrite tenant id and timeline id in spec file"
+tenant_id=$(echo "${result}" | jq -r .tenant_id)
+timeline_id=$(echo "${result}" | jq -r .timeline_id)
+
+sed "s/TENANT_ID/${tenant_id}/" "${SPEC_FILE_ORG}" > "${SPEC_FILE}"
+sed -i "s/TIMELINE_ID/${timeline_id}/" "${SPEC_FILE}"
+cat "${SPEC_FILE}"
+
+echo "Start compute node"
+/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
+     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
+     -b /usr/local/bin/postgres                              \
+     -S "${SPEC_FILE}"
--- a/docker-compose/compute/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute/var/db/postgres/specs/spec.json
@@ -0,0 +1,141 @@
+{
+    "format_version": 1.0,
+
+    "timestamp": "2022-10-12T18:00:00.000Z",
+    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
+
+    "cluster": {
+        "cluster_id": "docker_compose",
+        "name": "docker_compose_test",
+        "state": "restarted",
+        "roles": [
+            {
+                "name": "cloud_admin",
+                "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
+                "options": null
+            }
+        ],
+        "databases": [
+        ],
+        "settings": [
+            {
+                "name": "fsync",
+                "value": "off",
+                "vartype": "bool"
+            },
+            {
+                "name": "wal_level",
+                "value": "replica",
+                "vartype": "enum"
+            },
+            {
+                "name": "hot_standby",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "wal_log_hints",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "log_connections",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "port",
+                "value": "55433",
+                "vartype": "integer"
+            },
+            {
+                "name": "shared_buffers",
+                "value": "1MB",
+                "vartype": "string"
+            },
+            {
+                "name": "max_connections",
+                "value": "100",
+                "vartype": "integer"
+            },
+            {
+                "name": "listen_addresses",
+                "value": "0.0.0.0",
+                "vartype": "string"
+            },
+            {
+                "name": "max_wal_senders",
+                "value": "10",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_replication_slots",
+                "value": "10",
+                "vartype": "integer"
+            },
+            {
+                "name": "wal_sender_timeout",
+                "value": "5s",
+                "vartype": "string"
+            },
+            {
+                "name": "wal_keep_size",
+                "value": "0",
+                "vartype": "integer"
+            },
+            {
+                "name": "password_encryption",
+                "value": "md5",
+                "vartype": "enum"
+            },
+            {
+                "name": "restart_after_crash",
+                "value": "off",
+                "vartype": "bool"
+            },
+            {
+                "name": "synchronous_standby_names",
+                "value": "walproposer",
+                "vartype": "string"
+            },
+            {
+                "name": "shared_preload_libraries",
+                "value": "neon",
+                "vartype": "string"
+            },
+            {
+                "name": "neon.safekeepers",
+                "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
+                "vartype": "string"
+            },
+            {
+                "name": "neon.timeline_id",
+                "value": "TIMELINE_ID",
+                "vartype": "string"
+            },
+            {
+                "name": "neon.tenant_id",
+                "value": "TENANT_ID",
+                "vartype": "string"
+            },
+            {
+                "name": "neon.pageserver_connstring",
+                "value": "host=pageserver port=6400",
+                "vartype": "string"
+            },
+            {
+                "name": "max_replication_write_lag",
+                "value": "500MB",
+                "vartype": "string"
+            },
+            {
+                "name": "max_replication_flush_lag",
+                "value": "10GB",
+                "vartype": "string"
+            }
+        ]
+    },
+
+    "delta_operations": [
+    ]
+}
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -0,0 +1,200 @@
+version: '3'
+
+services:
+  etcd:  # broker that the pageserver and the safekeepers connect to
+    image: quay.io/coreos/etcd:v3.5.4
+    ports:
+      - "2379:2379"
+      - "2380:2380"
+    environment:
+      # This significantly speeds up etcd, and we don't need data persistence there anyway.
+      ETCD_UNSAFE_NO_FSYNC: "1"
+    command:
+      - "etcd"
+      - "--auto-compaction-mode=revision"
+      - "--auto-compaction-retention=1"
+      - "--name=etcd-cluster"
+      - "--initial-cluster-state=new"
+      - "--initial-cluster-token=etcd-cluster-1"
+      - "--initial-cluster=etcd-cluster=http://etcd:2380"
+      - "--initial-advertise-peer-urls=http://etcd:2380"
+      - "--advertise-client-urls=http://etcd:2379"
+      - "--listen-client-urls=http://0.0.0.0:2379"
+      - "--listen-peer-urls=http://0.0.0.0:2380"
+      - "--quota-backend-bytes=134217728"  # 128 MB
+
+  minio:  # object storage; the pageserver and the safekeepers use it as remote storage
+    image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
+    ports:
+      - "9000:9000"
+      - "9001:9001"
+    environment:
+      - MINIO_ROOT_USER=minio
+      - MINIO_ROOT_PASSWORD=password
+    command: server /data --address :9000 --console-address ":9001"
+
+  minio_create_buckets:  # one-shot helper: waits for minio, then creates the "neon" bucket
+    image: minio/mc
+    environment:
+      - MINIO_ROOT_USER=minio
+      - MINIO_ROOT_PASSWORD=password
+    entrypoint:
+      - "/bin/sh"
+      - "-c"
+    command:
+      - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
+             echo 'Waiting to start minio...' && sleep 1;
+         done;
+         /usr/bin/mc mb minio/neon --region=eu-north-1;
+         exit 0;"
+    depends_on:
+      - minio
+
+  pageserver:
+    image: neondatabase/neon:${TAG:-latest}
+    environment:
+      - BROKER_ENDPOINT='http://etcd:2379'  # NOTE(review): quotes look deliberate (spliced into broker_endpoints=[...] below) — confirm
+      - AWS_ACCESS_KEY_ID=minio
+      - AWS_SECRET_ACCESS_KEY=password
+      # - RUST_BACKTRACE=1
+    ports:
+      # - "6400:6400"  # pg protocol handler
+      - "9898:9898"  # http endpoints
+    entrypoint:
+      - "/bin/sh"
+      - "-c"
+    command:
+      - "/usr/local/bin/pageserver -D /data/.neon/
+                                   -c \"broker_endpoints=[$$BROKER_ENDPOINT]\"
+                                   -c \"listen_pg_addr='0.0.0.0:6400'\"
+                                   -c \"listen_http_addr='0.0.0.0:9898'\"
+                                   -c \"remote_storage={endpoint='http://minio:9000',
+                                                        bucket_name='neon',
+                                                        bucket_region='eu-north-1',
+                                                        prefix_in_bucket='/pageserver/'}\""
+    depends_on:
+      - etcd
+      - minio_create_buckets
+
+  safekeeper1:
+    image: neondatabase/neon:${TAG:-latest}
+    environment:
+      - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
+      - SAFEKEEPER_ID=1
+      - BROKER_ENDPOINT=http://etcd:2379
+      - AWS_ACCESS_KEY_ID=minio
+      - AWS_SECRET_ACCESS_KEY=password
+      # - RUST_BACKTRACE=1
+    ports:
+      # - "5454:5454"  # pg protocol handler
+      - "7676:7676"  # http endpoints
+    entrypoint:
+      - "/bin/sh"
+      - "-c"
+    command:
+      - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
+                    --listen-http='0.0.0.0:7676'
+                    --id=$$SAFEKEEPER_ID
+                    --broker-endpoints=$$BROKER_ENDPOINT
+                    -D /data
+                    --remote-storage=\"{endpoint='http://minio:9000',
+                                        bucket_name='neon',
+                                        bucket_region='eu-north-1',
+                                        prefix_in_bucket='/safekeeper/'}\""
+    depends_on:
+      - etcd
+      - minio_create_buckets
+
+  safekeeper2:
+    image: neondatabase/neon:${TAG:-latest}
+    environment:
+      - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
+      - SAFEKEEPER_ID=2
+      - BROKER_ENDPOINT=http://etcd:2379
+      - AWS_ACCESS_KEY_ID=minio
+      - AWS_SECRET_ACCESS_KEY=password
+      # - RUST_BACKTRACE=1
+    ports:
+      # - "5454:5454"  # pg protocol handler
+      - "7677:7676"  # http endpoints
+    entrypoint:
+      - "/bin/sh"
+      - "-c"
+    command:
+      - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
+                    --listen-http='0.0.0.0:7676'
+                    --id=$$SAFEKEEPER_ID
+                    --broker-endpoints=$$BROKER_ENDPOINT
+                    -D /data
+                    --remote-storage=\"{endpoint='http://minio:9000',
+                                        bucket_name='neon',
+                                        bucket_region='eu-north-1',
+                                        prefix_in_bucket='/safekeeper/'}\""
+    depends_on:
+      - etcd
+      - minio_create_buckets
+
+  safekeeper3:
+    image: neondatabase/neon:${TAG:-latest}
+    environment:
+      - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
+      - SAFEKEEPER_ID=3
+      - BROKER_ENDPOINT=http://etcd:2379
+      - AWS_ACCESS_KEY_ID=minio
+      - AWS_SECRET_ACCESS_KEY=password
+      # - RUST_BACKTRACE=1
+    ports:
+      # - "5454:5454"  # pg protocol handler
+      - "7678:7676"  # http endpoints
+    entrypoint:
+      - "/bin/sh"
+      - "-c"
+    command:
+      - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
+                    --listen-http='0.0.0.0:7676'
+                    --id=$$SAFEKEEPER_ID
+                    --broker-endpoints=$$BROKER_ENDPOINT
+                    -D /data
+                    --remote-storage=\"{endpoint='http://minio:9000',
+                                        bucket_name='neon',
+                                        bucket_region='eu-north-1',
+                                        prefix_in_bucket='/safekeeper/'}\""
+    depends_on:
+      - etcd
+      - minio_create_buckets
+
+  compute:  # compute node, started through the mounted /shell/compute.sh script
+    build:
+      context: ./image/compute
+      args:
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
+        - http_proxy=$http_proxy
+        - https_proxy=$https_proxy
+    environment:
+      - PG_VERSION=${PG_VERSION:-14}
+      # - RUST_BACKTRACE=1
+    volumes:
+      - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute/shell/:/shell/
+    ports:
+      - "55433:55433"  # pg protocol handler
+      - "3080:3080"  # http endpoints
+    entrypoint:
+      - "/shell/compute.sh"
+    depends_on:
+      - safekeeper1
+      - safekeeper2
+      - safekeeper3
+      - pageserver
+
+  compute_is_ready:  # polls pg_isready until the compute node accepts connections
+    image: postgres:latest
+    entrypoint:
+      - "/bin/bash"
+      - "-c"
+    command:
+      - "until pg_isready -h compute -p 55433 ; do
+            echo 'Waiting to start compute...' && sleep 1;
+         done"
+    depends_on:
+      - compute
--- a/docker-compose/image/compute/Dockerfile
+++ b/docker-compose/image/compute/Dockerfile
@@ -0,0 +1,10 @@
+ARG COMPUTE_IMAGE=compute-node-v14:latest
+FROM neondatabase/${COMPUTE_IMAGE}
+
+# curl, jq and netcat are used by the compute.sh entrypoint script;
+# installing them requires root, so switch users only for this step.
+USER root
+RUN apt-get update && apt-get install -y curl jq netcat
+
+# Run the container as the unprivileged postgres user again.
+USER postgres
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -80,4 +80,6 @@
 - [015-storage-messaging](rfcs/015-storage-messaging.md)
 - [016-connection-routing](rfcs/016-connection-routing.md)
 - [017-timeline-data-management](rfcs/017-timeline-data-management.md)
+- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
+- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
 - [cluster-size-limits](rfcs/cluster-size-limits.md)
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -18,3 +18,67 @@ We build all images after a successful `release` tests run and push automaticall
 1. `neondatabase/compute-tools` and `neondatabase/compute-node`

 2. `neondatabase/neon`
+
+## Docker Compose example
+
+You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
+
+- etcd x 1
+- pageserver x 1
+- safekeeper x 3
+- compute x 1
+- MinIO x 1        # This is Amazon S3 compatible object storage
+
+### How to use
+
+1. create containers
+
+You can specify the version of the neon cluster using the following environment variables.
+- PG_VERSION: postgres version for compute (default is 14)
+- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
+```
+$ cd docker-compose
+$ docker-compose down   # remove the containers if they exist
+$ PG_VERSION=15 TAG=2221 docker-compose up --build -d  # You can specify the postgres and image version
+Creating network "dockercompose_default" with the default driver
+Creating dockercompose_etcd3_1 ...
+(...omit...)
+```
+
+2. connect compute node
+```
+$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
+$ psql -h localhost -p 55433 -U cloud_admin
+postgres=# CREATE TABLE t(key int primary key, value text);
+CREATE TABLE
+postgres=# insert into t values(1,1);
+INSERT 0 1
+postgres=# select * from t;
+ key | value
+-----+-------
+   1 | 1
+(1 row)
+```
+
+3. If you want to see the log, you can use `docker-compose logs` command.
+```
+# check the container name you want to see
+$ docker ps
+CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                                                                  NAMES
+d6968a5ae912   dockercompose_compute                              "/shell/compute.sh"      5 minutes ago   Up 5 minutes   0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp                                                                                       dockercompose_compute_1
+(...omit...)
+
+$ docker logs -f dockercompose_compute_1
+2022-10-21 06:15:48.757 GMT [56] LOG:  connection authorized: user=cloud_admin database=postgres application_name=psql
+2022-10-21 06:17:00.307 GMT [56] LOG:  [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
+(...omit...)
+```
+
+4. If you want to see the durable data in MinIO, which is S3-compatible storage
+
+Access http://localhost:9001 and sign in.
+
+- Username: `minio`
+- Password: `password`
+
+You can see durable pages and WAL data in `neon` bucket.
--- a/docs/rfcs/019-tenant-timeline-lifecycles.md
+++ b/docs/rfcs/019-tenant-timeline-lifecycles.md
@@ -0,0 +1,91 @@
+# Managing Tenant and Timeline lifecycles
+
+## Summary
+
+The pageserver has a Tenant object in memory for each tenant it manages, and a
+Timeline for each timeline. There are a lot of tasks that operate on the tenants
+and timelines with references to those objects. We have some mechanisms to track
+which tasks are operating on each Tenant and Timeline, and to request them to
+shutdown when a tenant or timeline is deleted, but it does not cover all uses,
+and as a result we have many race conditions around tenant/timeline shutdown.
+
+## Motivation
+
+We have a bunch of race conditions that can produce weird errors and can be hard
+to track down.
+
+## Non Goals
+
+This RFC only covers the problem of ensuring that a task/thread isn't operating
+on a Tenant or Timeline. It does not cover what states, aside from Active and
+non-Active, each Tenant and Timeline should have, or when exactly the transitions
+should happen.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+Pageserver. Although I wonder if the safekeeper should have a similar mechanism.
+
+## Current situation
+
+Most pageserver tasks are managed by task_mgr.rs:
+
+- LibpqEndpointListener
+- HttpEndPointListener
+- WalReceiverManager and -Connection
+- GarbageCollector and Compaction
+- InitialLogicalSizeCalculation
+
+In addition to those tasks, the walreceiver performs some direct tokio::spawn
+calls to spawn tasks that are not registered with 'task_mgr'. And all of these
+tasks can spawn extra operations with tokio spawn_blocking.
+
+Whenever a tenant or timeline is removed from the system, by pageserver
+shutdown, delete_timeline or tenant-detach operation, we rely on the task
+registry in 'task_mgr.rs' to wait until there are no tasks operating on the
+tenant or timeline, before its Tenant/Timeline object is removed. That relies on
+each task to register itself with the tenant/timeline ID in
+'task_mgr.rs'. However, there are many gaps in that. For example,
+GarbageCollection and Compaction tasks are registered with the tenant, but when
+they proceed to operate on a particular timeline of the tenant, they don't
+register with timeline ID. Because of that, the timeline can be deleted while GC
+or compaction is running on it, causing failures in the GC or compaction (see
+https://github.com/neondatabase/neon/issues/2442).
+
+Another problem is that the task registry only works for tokio Tasks. There is
+no way to register a piece of code that runs inside spawn_blocking(), for
+example.
+
+## Proposed implementation
+
+This "voluntary" registration of tasks is fragile. Let's use Rust language features
+to enforce that a tenant/timeline cannot be removed from the system when there is
+still some code operating on it.
+
+Let's introduce new Guard objects for Tenant and Timeline, and do all actions through
+the Guard object. Something like:
+
+TenantActiveGuard: Guard object over Arc<Tenant>. When you acquire the guard,
+the code checks that the tenant is in Active state. If it's not, you get an
+error. You can change the state of the tenant to Stopping while there are
+TenantActiveGuard objects still on it, to prevent new TenantActiveGuards from
+being acquired, but the Tenant cannot be removed until all the guards are gone.
+
+TenantMaintenanceGuard: Like TenantActiveGuard, but can be held even when the
+tenant is not in Active state. Used for operations like attach/detach. Perhaps
+allow only one such guard on a Tenant at a time.
+
+Similarly for Timelines. We don't currently have a "state" on Timeline, but I think
+we need at least two states: Active and Stopping. The Stopping state is used at
+deletion, to prevent new TimelineActiveGuards from appearing, while you wait for
+existing TimelineActiveGuards to die out.
+
+The shutdown-signaling, using shutdown_watcher() and is_shutdown_requested(),
+probably also needs changes to deal with the new Guards. The rule is that if you
+have a TenantActiveGuard, and the tenant's state changes from Active to
+Stopping, the is_shutdown_requested() function should return true, and
+shutdown_watcher() future should return.
+
+This signaling doesn't necessarily need to cover all cases. For example, if you
+have a block of code in spawn_blocking(), it might be acceptable if
+is_shutdown_requested() doesn't return true even though the tenant is in
+Stopping state, as long as the code finishes reasonably fast.
--- a/libs/etcd_broker/src/subscription_value.rs
+++ b/libs/etcd_broker/src/subscription_value.rs
@@ -29,6 +29,9 @@ pub struct SkTimelineInfo {
    #[serde_as(as = "Option<DisplayFromStr>")]
    #[serde(default)]
    pub peer_horizon_lsn: Option<Lsn>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub local_start_lsn: Option<Lsn>,
    /// A connection string to use for WAL receiving.
    #[serde(default)]
    pub safekeeper_connstr: Option<String>,
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -7,6 +7,9 @@ edition = "2021"
 serde = { version = "1.0", features = ["derive"] }
 serde_with = "2.0"
 const_format = "0.2.21"
+anyhow = { version = "1.0", features = ["backtrace"] }
+bytes = "1.0.1"

 utils = { path = "../utils" }
+postgres_ffi = { path = "../postgres_ffi" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -2,6 +2,7 @@ use const_format::formatcp;

 /// Public API types
 pub mod models;
+pub mod reltag;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -7,6 +7,10 @@ use utils::{
    lsn::Lsn,
 };

+use crate::reltag::RelTag;
+use anyhow::bail;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+
 /// A state of a tenant in pageserver's memory.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TenantState {
@@ -19,6 +23,22 @@ pub enum TenantState {
    Broken,
 }

+/// The state of a timeline in pageserver's memory.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub enum TimelineState {
+    /// The timeline is fully operational and its background jobs are running.
+    Active,
+    /// The timeline is recognized by the pageserver, but is not yet ready to operate.
+    /// This status indicates that the timeline could eventually go back to Active automatically:
+    /// for example, if the owning tenant becomes Active again.
+    Suspended,
+    /// The timeline is recognized by the pageserver, but is not yet ready to operate and is not
+    /// allowed to become Active automatically after certain events: only a management call can
+    /// change this status.
+    Paused,
+    /// The timeline is recognized by the pageserver, but is no longer used for any operations
+    /// because it failed to get activated.
+    Broken,
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
@@ -160,6 +180,8 @@ pub struct TimelineInfo {
    pub remote_consistent_lsn: Option<Lsn>,
    pub awaits_download: bool,

+    pub state: TimelineState,
+
    // Some of the above fields are duplicated in 'local' and 'remote', for backwards-
    // compatility with older clients.
    pub local: LocalTimelineInfo,
@@ -201,3 +223,160 @@ pub struct FailpointConfig {
 pub struct TimelineGcRequest {
    pub gc_horizon: Option<u64>,
 }
+
+/// Request messages of the pagestream protocol.
+///
+/// `PagestreamFeMessage::parse` selects the variant from the first byte of the
+/// message body: 0 = `Exists`, 1 = `Nblocks`, 2 = `GetPage`, 3 = `DbSize`.
+// Wrapped in libpq CopyData
+pub enum PagestreamFeMessage {
+    Exists(PagestreamExistsRequest),
+    Nblocks(PagestreamNblocksRequest),
+    GetPage(PagestreamGetPageRequest),
+    DbSize(PagestreamDbSizeRequest),
+}
+
+/// Response messages of the pagestream protocol.
+///
+/// `PagestreamBeMessage::serialize` writes a one-byte tag followed by the payload:
+/// 100 = `Exists`, 101 = `Nblocks`, 102 = `GetPage`, 103 = `Error`, 104 = `DbSize`.
+// Wrapped in libpq CopyData
+pub enum PagestreamBeMessage {
+    Exists(PagestreamExistsResponse),
+    Nblocks(PagestreamNblocksResponse),
+    GetPage(PagestreamGetPageResponse),
+    Error(PagestreamErrorResponse),
+    DbSize(PagestreamDbSizeResponse),
+}
+
+/// Payload of a request with tag 0 (`PagestreamFeMessage::Exists`).
+///
+/// Decoded by `PagestreamFeMessage::parse` in field order: one byte `latest`
+/// (non-zero means `true`), a `u64` LSN, then the `RelTag` fields
+/// (`spcnode`, `dbnode`, `relnode` as `u32`, `forknum` as `u8`).
+#[derive(Debug)]
+pub struct PagestreamExistsRequest {
+    pub latest: bool,
+    pub lsn: Lsn,
+    pub rel: RelTag,
+}
+
+/// Payload of a request with tag 1 (`PagestreamFeMessage::Nblocks`).
+///
+/// Decoded by `PagestreamFeMessage::parse` in field order: one byte `latest`
+/// (non-zero means `true`), a `u64` LSN, then the `RelTag` fields
+/// (`spcnode`, `dbnode`, `relnode` as `u32`, `forknum` as `u8`).
+#[derive(Debug)]
+pub struct PagestreamNblocksRequest {
+    pub latest: bool,
+    pub lsn: Lsn,
+    pub rel: RelTag,
+}
+
+/// Payload of a request with tag 2 (`PagestreamFeMessage::GetPage`).
+///
+/// Decoded by `PagestreamFeMessage::parse` in field order: one byte `latest`
+/// (non-zero means `true`), a `u64` LSN, the `RelTag` fields
+/// (`spcnode`, `dbnode`, `relnode` as `u32`, `forknum` as `u8`), then a `u32` `blkno`.
+#[derive(Debug)]
+pub struct PagestreamGetPageRequest {
+    pub latest: bool,
+    pub lsn: Lsn,
+    pub rel: RelTag,
+    pub blkno: u32,
+}
+
+/// Payload of a request with tag 3 (`PagestreamFeMessage::DbSize`).
+///
+/// Decoded by `PagestreamFeMessage::parse` in field order: one byte `latest`
+/// (non-zero means `true`), a `u64` LSN, then a `u32` `dbnode`.
+#[derive(Debug)]
+pub struct PagestreamDbSizeRequest {
+    pub latest: bool,
+    pub lsn: Lsn,
+    pub dbnode: u32,
+}
+
+/// Payload of `PagestreamBeMessage::Exists`; serialized as tag 100 followed by
+/// `exists` as a single byte (0 or 1).
+#[derive(Debug)]
+pub struct PagestreamExistsResponse {
+    pub exists: bool,
+}
+
+/// Payload of `PagestreamBeMessage::Nblocks`; serialized as tag 101 followed by
+/// `n_blocks` as a `u32`.
+#[derive(Debug)]
+pub struct PagestreamNblocksResponse {
+    pub n_blocks: u32,
+}
+
+/// Payload of `PagestreamBeMessage::GetPage`; serialized as tag 102 followed by
+/// the raw bytes of `page` (no length prefix is written).
+#[derive(Debug)]
+pub struct PagestreamGetPageResponse {
+    pub page: Bytes,
+}
+
+/// Payload of `PagestreamBeMessage::Error`; serialized as tag 103 followed by
+/// the bytes of `message` and a terminating zero byte.
+#[derive(Debug)]
+pub struct PagestreamErrorResponse {
+    pub message: String,
+}
+
+/// Payload of `PagestreamBeMessage::DbSize`; serialized as tag 104 followed by
+/// `db_size` as an `i64`.
+#[derive(Debug)]
+pub struct PagestreamDbSizeResponse {
+    pub db_size: i64,
+}
+
+impl PagestreamFeMessage {
+    /// Decodes one request message from `body`.
+    ///
+    /// The first byte is the message tag (these correspond to the NeonMessageTag
+    /// enum in pagestore_client.h); the fixed-size payload follows, with
+    /// multi-byte integers read through `Buf::get_*`:
+    ///
+    /// - tag 0 (`Exists`) and tag 1 (`Nblocks`): `latest: u8`, `lsn: u64`,
+    ///   `spcnode: u32`, `dbnode: u32`, `relnode: u32`, `forknum: u8`
+    /// - tag 2 (`GetPage`): same as above, followed by `blkno: u32`
+    /// - tag 3 (`DbSize`): `latest: u8`, `lsn: u64`, `dbnode: u32`
+    ///
+    /// Bytes following the payload are ignored.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `body` is empty, if it is shorter than the payload
+    /// required by its tag, or if the tag is unknown. (`Buf::get_*` panics when
+    /// the buffer is too short, so the length is validated up front.)
+    pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
+        // Payload sizes in bytes, not counting the tag byte.
+        const REL_REQUEST_LEN: usize = 1 + 8 + (4 + 4 + 4 + 1); // latest + lsn + RelTag
+        const GET_PAGE_REQUEST_LEN: usize = REL_REQUEST_LEN + 4; // ... + blkno
+        const DB_SIZE_REQUEST_LEN: usize = 1 + 8 + 4; // latest + lsn + dbnode
+
+        // Rejects a payload that is too short for the subsequent `get_*` calls.
+        fn check_len(body: &Bytes, needed: usize, msg_tag: u8) -> anyhow::Result<()> {
+            if body.remaining() < needed {
+                bail!(
+                    "smgr message with tag {} is too short: expected {} payload bytes, got {}",
+                    msg_tag,
+                    needed,
+                    body.remaining()
+                );
+            }
+            Ok(())
+        }
+
+        if !body.has_remaining() {
+            bail!("empty smgr message: missing message tag");
+        }
+
+        // TODO: consider using protobuf or serde bincode for less error prone
+        // serialization.
+        let msg_tag = body.get_u8();
+        match msg_tag {
+            0 => {
+                check_len(&body, REL_REQUEST_LEN, msg_tag)?;
+                // Struct fields are evaluated in source order, which is the wire order.
+                Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
+                    latest: body.get_u8() != 0,
+                    lsn: Lsn::from(body.get_u64()),
+                    rel: RelTag {
+                        spcnode: body.get_u32(),
+                        dbnode: body.get_u32(),
+                        relnode: body.get_u32(),
+                        forknum: body.get_u8(),
+                    },
+                }))
+            }
+            1 => {
+                check_len(&body, REL_REQUEST_LEN, msg_tag)?;
+                Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                    latest: body.get_u8() != 0,
+                    lsn: Lsn::from(body.get_u64()),
+                    rel: RelTag {
+                        spcnode: body.get_u32(),
+                        dbnode: body.get_u32(),
+                        relnode: body.get_u32(),
+                        forknum: body.get_u8(),
+                    },
+                }))
+            }
+            2 => {
+                check_len(&body, GET_PAGE_REQUEST_LEN, msg_tag)?;
+                Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                    latest: body.get_u8() != 0,
+                    lsn: Lsn::from(body.get_u64()),
+                    rel: RelTag {
+                        spcnode: body.get_u32(),
+                        dbnode: body.get_u32(),
+                        relnode: body.get_u32(),
+                        forknum: body.get_u8(),
+                    },
+                    blkno: body.get_u32(),
+                }))
+            }
+            3 => {
+                check_len(&body, DB_SIZE_REQUEST_LEN, msg_tag)?;
+                Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                    latest: body.get_u8() != 0,
+                    lsn: Lsn::from(body.get_u64()),
+                    dbnode: body.get_u32(),
+                }))
+            }
+            _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
+        }
+    }
+}
+
+impl PagestreamBeMessage {
+    /// Encodes this response as a one-byte tag followed by its payload.
+    ///
+    /// The tag values (100..=104) are the ones from pagestore_client.h. An
+    /// `Error` payload is the message bytes followed by a zero terminator; a
+    /// `GetPage` payload is the raw page bytes without a length prefix.
+    pub fn serialize(&self) -> Bytes {
+        let mut buf = BytesMut::new();
+
+        match self {
+            Self::Exists(resp) => {
+                // tag from pagestore_client.h
+                buf.put_u8(100);
+                buf.put_u8(u8::from(resp.exists));
+            }
+            Self::Nblocks(resp) => {
+                // tag from pagestore_client.h
+                buf.put_u8(101);
+                buf.put_u32(resp.n_blocks);
+            }
+            Self::GetPage(resp) => {
+                // tag from pagestore_client.h
+                buf.put_u8(102);
+                buf.put_slice(&resp.page);
+            }
+            Self::Error(resp) => {
+                // tag from pagestore_client.h
+                buf.put_u8(103);
+                buf.put_slice(resp.message.as_bytes());
+                // null terminator
+                buf.put_u8(0);
+            }
+            Self::DbSize(resp) => {
+                // tag from pagestore_client.h
+                buf.put_u8(104);
+                buf.put_i64(resp.db_size);
+            }
+        }
+
+        buf.freeze()
+    }
+}
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -37,22 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
 });

 impl Conf {
-    pub fn pg_distrib_dir(&self) -> PathBuf {
+    pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

        match self.pg_version {
-            14 => path.join(format!("v{}", self.pg_version)),
-            15 => path.join(format!("v{}", self.pg_version)),
-            _ => panic!("Unsupported postgres version: {}", self.pg_version),
+            14 => Ok(path.join(format!("v{}", self.pg_version))),
+            15 => Ok(path.join(format!("v{}", self.pg_version))),
+            _ => bail!("Unsupported postgres version: {}", self.pg_version),
        }
    }

-    fn pg_bin_dir(&self) -> PathBuf {
-        self.pg_distrib_dir().join("bin")
+    fn pg_bin_dir(&self) -> anyhow::Result<PathBuf> {
+        Ok(self.pg_distrib_dir()?.join("bin"))
    }

-    fn pg_lib_dir(&self) -> PathBuf {
-        self.pg_distrib_dir().join("lib")
+    fn pg_lib_dir(&self) -> anyhow::Result<PathBuf> {
+        Ok(self.pg_distrib_dir()?.join("lib"))
    }

    pub fn wal_dir(&self) -> PathBuf {
@@ -60,12 +60,12 @@ impl Conf {
    }

    fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
-        let path = self.pg_bin_dir().join(command);
+        let path = self.pg_bin_dir()?.join(command);
        ensure!(path.exists(), "Command {:?} does not exist", path);
        let mut cmd = Command::new(path);
        cmd.env_clear()
-            .env("LD_LIBRARY_PATH", self.pg_lib_dir())
-            .env("DYLD_LIBRARY_PATH", self.pg_lib_dir());
+            .env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
+            .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?);
        Ok(cmd)
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -16,7 +16,7 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tracing::*;
-use utils::crashsafe_dir::path_with_suffix_extension;
+use utils::crashsafe::path_with_suffix_extension;

 use crate::{Download, DownloadError, RemoteObjectId};

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -19,7 +19,7 @@ thiserror = "1.0"
 tokio = { version = "1.17", features = ["macros"]}
 tokio-rustls = "0.23"
 tracing = "0.1"
-tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
 nix = "0.25"
 signal-hook = "0.3.10"
 rand = "0.8.3"
@@ -30,6 +30,8 @@ rustls-split = "0.3.0"
 git-version = "0.3.5"
 serde_with = "2.0"
 once_cell = "1.13.0"
+strum = "0.24"
+strum_macros = "0.24"


 metrics = { path = "../metrics" }
--- a/libs/utils/src/crashsafe_dir.rs
+++ b/libs/utils/src/crashsafe_dir.rs
@@ -12,16 +12,8 @@ pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
    let path = path.as_ref();

    fs::create_dir(path)?;
-    File::open(path)?.sync_all()?;
-
-    if let Some(parent) = path.parent() {
-        File::open(parent)?.sync_all()
-    } else {
-        Err(io::Error::new(
-            io::ErrorKind::InvalidInput,
-            "can't find parent",
-        ))
-    }
+    fsync_file_and_parent(path)?;
+    Ok(())
 }

 /// Similar to [`std::fs::create_dir_all`], except we fsync all
@@ -65,12 +57,12 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {

    // Fsync the created directories from child to parent.
    for &path in dirs_to_create.iter() {
-        File::open(path)?.sync_all()?;
+        fsync(path)?;
    }

    // If we created any new directories, fsync the parent.
    if !dirs_to_create.is_empty() {
-        File::open(path)?.sync_all()?;
+        fsync(path)?;
    }

    Ok(())
@@ -92,6 +84,33 @@ pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str)
        .with_extension(new_extension.as_ref())
 }

+/// Fsyncs `file_path` and then the directory that contains it.
+///
+/// # Errors
+///
+/// Returns an error of kind `Other` if `file_path` has no parent (e.g. it is
+/// a root or an empty path), or the error from `fsync` if either the path or
+/// its parent cannot be opened or synced.
+pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
+    let parent = file_path.parent().ok_or_else(|| {
+        io::Error::new(
+            io::ErrorKind::Other,
+            format!("File {file_path:?} has no parent"),
+        )
+    })?;
+    // `Path::parent` yields an empty path for a bare relative name such as
+    // "foo". That denotes the current directory, but opening "" fails, so
+    // sync "." instead.
+    let parent = if parent.as_os_str().is_empty() {
+        Path::new(".")
+    } else {
+        parent
+    };
+
+    fsync(file_path)?;
+    fsync(parent)?;
+    Ok(())
+}
+
+/// Opens `path` and calls `sync_all` on it.
+///
+/// Any failure keeps its original `io::ErrorKind`; the message names the
+/// failing step (open or sync) and is then prefixed with
+/// "Failed to fsync file <path>".
+pub fn fsync(path: &Path) -> io::Result<()> {
+    // Outer context applied to whichever step fails.
+    let fsync_context =
+        |e: io::Error| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}"));
+
+    let file = File::open(path)
+        .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}")))
+        .map_err(fsync_context)?;
+
+    file.sync_all()
+        .map_err(|e| {
+            io::Error::new(
+                e.kind(),
+                format!("Failed to sync file {path:?} data and metadata: {e}"),
+            )
+        })
+        .map_err(fsync_context)
+}
+
 #[cfg(test)]
 mod tests {
    use tempfile::tempdir;
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -75,6 +75,12 @@ impl From<[u8; 16]> for Id {
    }
 }

+/// Interprets the 16 bytes of an `Id` as a little-endian `u128`.
+impl From<Id> for u128 {
+    fn from(id: Id) -> Self {
+        let Id(raw_bytes) = id;
+        Self::from_le_bytes(raw_bytes)
+    }
+}
+
 impl fmt::Display for Id {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.hex_encode())
@@ -136,6 +142,12 @@ macro_rules! id_newtype {
            }
        }

+        // Converts the newtype into a `u128` by delegating to the `u128::from`
+        // conversion of the wrapped value (`id.0`).
+        impl From<$t> for u128 {
+            fn from(id: $t) -> Self {
+                u128::from(id.0)
+            }
+        }
+
        impl fmt::Display for $t {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                self.0.fmt(f)
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -22,8 +22,8 @@ pub mod pq_proto;
 // dealing with connstring parsing and handy access to it's parts
 pub mod connstring;

-// helper functions for creating and fsyncing directories/trees
-pub mod crashsafe_dir;
+// helper functions for creating and fsyncing
+pub mod crashsafe;

 // common authentication routines
 pub mod auth;
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,11 +1,35 @@
 use std::{
    fs::{File, OpenOptions},
    path::Path,
+    str::FromStr,
 };

 use anyhow::{Context, Result};
+use strum_macros::{EnumString, EnumVariantNames};

-pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
+/// Output format of the log lines produced by the logger set up in `init`.
+///
+/// The `strum` derives with `serialize_all = "snake_case"` provide `FromStr`
+/// and the `VARIANTS` list used by `LogFormat::from_config`.
+#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
+#[strum(serialize_all = "snake_case")]
+pub enum LogFormat {
+    // `init` uses the subscriber's default formatter for this variant.
+    Plain,
+    // `init` switches the subscriber to `.json()` output for this variant.
+    Json,
+}
+
+impl LogFormat {
+    /// Parses a log format name taken from the configuration.
+    ///
+    /// On failure the error carries a context message listing the accepted
+    /// variant names.
+    pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
+        use strum::VariantNames;
+
+        let parsed = LogFormat::from_str(s);
+        // The context message is built lazily, only when parsing failed.
+        parsed.with_context(|| {
+            format!(
+                "Unrecognized log format. Please specify one of: {:?}",
+                LogFormat::VARIANTS
+            )
+        })
+    }
+}
+pub fn init(
+    log_filename: impl AsRef<Path>,
+    daemonize: bool,
+    log_format: LogFormat,
+) -> Result<File> {
    // Don't open the same file for output multiple times;
    // the different fds could overwrite each other's output.
    let log_file = OpenOptions::new()
@@ -21,22 +45,50 @@ pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));

+    let x: File = log_file.try_clone().unwrap();
    let base_logger = tracing_subscriber::fmt()
        .with_env_filter(env_filter)
-        .with_target(false) // don't include event targets
-        .with_ansi(false); // don't use colors in log file;
+        .with_target(false)
+        .with_ansi(false)
+        .with_writer(move || -> Box<dyn std::io::Write> {
+            // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
+            // if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
+            // for example to be in line with docker log command which expects logs coming from stdout
+            if daemonize {
+                Box::new(x.try_clone().unwrap())
+            } else {
+                Box::new(std::io::stdout())
+            }
+        });

-    // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
-    // if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
-    // for example to be in line with docker log command which expects logs comimg from stdout
-    if daemonize {
-        let x = log_file.try_clone().unwrap();
-        base_logger
-            .with_writer(move || x.try_clone().unwrap())
-            .init();
-    } else {
-        base_logger.init();
+    match log_format {
+        LogFormat::Json => base_logger.json().init(),
+        LogFormat::Plain => base_logger.init(),
    }

    Ok(log_file)
 }
+
+// #[cfg(test)]
+// Due to global logger, can't run tests in same process.
+// So until there's a non-global one, the tests are in ../tests/ as separate files.
+//
+// `test_init_file_logger!(log_level, log_format)` sets `RUST_LOG` to `log_level`,
+// initializes the global logger (with `daemonize = true`, so output goes to the
+// log file) in the format named by `log_format`, and evaluates to a read-only
+// `File` handle on that log file.
+//
+// Panics (via `unwrap`) if the temp dir cannot be created, the format name does
+// not parse, the logger cannot be initialized, or the log file cannot be reopened.
+#[macro_export(local_inner_macros)]
+macro_rules! test_init_file_logger {
+    ($log_level:expr, $log_format:expr) => {{
+        use std::str::FromStr;
+        std::env::set_var("RUST_LOG", $log_level);
+
+        // NOTE(review): `tmp_dir` is a local of this block, so it is dropped when the
+        // macro expression finishes; presumably the already-open file handles keep
+        // working after the directory is removed — confirm on non-Unix platforms.
+        let tmp_dir = tempfile::TempDir::new().unwrap();
+        let log_file_path = tmp_dir.path().join("logfile");
+
+        let log_format = $crate::logging::LogFormat::from_str($log_format).unwrap();
+        let _log_file = $crate::logging::init(&log_file_path, true, log_format).unwrap();
+
+        // Separate read-only handle so the caller reads the log from the beginning.
+        let log_file = std::fs::OpenOptions::new()
+            .read(true)
+            .open(&log_file_path)
+            .unwrap();
+
+        log_file
+    }};
+}
--- a/libs/utils/tests/logger_json_test.rs
+++ b/libs/utils/tests/logger_json_test.rs
@@ -0,0 +1,36 @@
+// This could be in ../src/logging.rs but since the logger is global, these
+// can't be run in threads of the same process
+use std::fs::File;
+use std::io::{BufRead, BufReader, Lines};
+use tracing::*;
+use utils::test_init_file_logger;
+
+/// Wraps `file` in a `BufReader` and returns an iterator over its lines.
+fn read_lines(file: File) -> Lines<BufReader<File>> {
+    let reader = BufReader::new(file);
+    reader.lines()
+}
+
+/// With `RUST_LOG=info` and the JSON format, every line written to the log file
+/// must be a JSON object carrying the custom field and the message, and no
+/// TRACE or DEBUG event may appear.
+#[test]
+fn test_json_format_has_message_and_custom_field() {
+    std::env::set_var("RUST_LOG", "info");
+
+    let log_file = test_init_file_logger!("info", "json");
+
+    let custom_field: &str = "hi";
+    trace!(custom = %custom_field, "test log message");
+    debug!(custom = %custom_field, "test log message");
+    info!(custom = %custom_field, "test log message");
+    warn!(custom = %custom_field, "test log message");
+    error!(custom = %custom_field, "test log message");
+
+    let mut checked_lines = 0;
+    for line in read_lines(log_file) {
+        let content = line.unwrap();
+        let json_object = serde_json::from_str::<serde_json::Value>(&content).unwrap();
+
+        assert_eq!(json_object["fields"]["custom"], "hi");
+        assert_eq!(json_object["fields"]["message"], "test log message");
+
+        assert_ne!(json_object["level"], "TRACE");
+        assert_ne!(json_object["level"], "DEBUG");
+
+        checked_lines += 1;
+    }
+
+    // Without this, the test would pass vacuously if nothing reached the log file.
+    assert!(
+        checked_lines > 0,
+        "expected at least one log line in the log file"
+    );
+}
--- a/libs/utils/tests/logger_plain_test.rs
+++ b/libs/utils/tests/logger_plain_test.rs
@@ -0,0 +1,36 @@
+// This could be in ../src/logging.rs but since the logger is global, these
+// can't be run in threads of the same process
+use std::fs::File;
+use std::io::{BufRead, BufReader, Lines};
+use tracing::*;
+use utils::test_init_file_logger;
+
+/// Wraps `file` in a `BufReader` and returns an iterator over its lines.
+fn read_lines(file: File) -> Lines<BufReader<File>> {
+    let reader = BufReader::new(file);
+    reader.lines()
+}
+
+/// With `RUST_LOG=warn` and the plain format, every line written to the log
+/// file must not be JSON, must carry the custom field and the message, and no
+/// TRACE, DEBUG or INFO event may appear.
+#[test]
+fn test_plain_format_has_message_and_custom_field() {
+    std::env::set_var("RUST_LOG", "warn");
+
+    let log_file = test_init_file_logger!("warn", "plain");
+
+    let custom_field: &str = "hi";
+    trace!(custom = %custom_field, "test log message");
+    debug!(custom = %custom_field, "test log message");
+    info!(custom = %custom_field, "test log message");
+    warn!(custom = %custom_field, "test log message");
+    error!(custom = %custom_field, "test log message");
+
+    let mut checked_lines = 0;
+    for line in read_lines(log_file) {
+        let content = line.unwrap();
+        // A plain-format line must not parse as JSON.
+        serde_json::from_str::<serde_json::Value>(&content).unwrap_err();
+        assert!(content.contains("custom=hi"));
+        assert!(content.contains("test log message"));
+
+        assert!(!content.contains("TRACE"));
+        assert!(!content.contains("DEBUG"));
+        assert!(!content.contains("INFO"));
+
+        checked_lines += 1;
+    }
+
+    // Without this, the test would pass vacuously if nothing reached the log file.
+    assert!(
+        checked_lines > 0,
+        "expected at least one log line in the log file"
+    );
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -67,7 +67,7 @@ remote_storage = { path = "../libs/remote_storage" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
 close_fds = "0.3.2"
 walkdir = "2.3.2"
-dashmap = "5.4.0"
+svg_fmt = "0.4.1"

 [dev-dependencies]
 criterion = "0.4"
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -22,8 +22,8 @@ use std::time::SystemTime;
 use tar::{Builder, EntryType, Header};
 use tracing::*;

-use crate::reltag::{RelTag, SlruKind};
 use crate::tenant::Timeline;
+use pageserver_api::reltag::{RelTag, SlruKind};

 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
--- a/pageserver/src/bin/draw_timeline_dir.rs
+++ b/pageserver/src/bin/draw_timeline_dir.rs
@@ -0,0 +1,150 @@
+//! A tool for visualizing the arrangement of layerfiles within a timeline.
+//!
+//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in
+//! page-lsn space, where every delta layer is a rectangle and every image layer is a
+//! thick line. Legend:
+//! - The x axis (left to right) represents page index.
+//! - The y axis represents LSN, growing upwards.
+//!
+//! Coordinates in both axis are compressed for better readability.
+//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
+//!
+//! Example use:
+//! ```
+//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
+//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
+//! $ firefox out.svg
+//! ```
+//!
+//! This API was chosen so that we can easily work with filenames extracted from ssh,
+//! or from pageserver log files.
+//!
+//! TODO Consider shipping this as a grafana panel plugin:
+//!      https://grafana.com/tutorials/build-a-panel-plugin/
+use anyhow::Result;
+use pageserver::repository::Key;
+use std::cmp::Ordering;
+use std::io::{self, BufRead};
+use std::{
+    collections::{BTreeMap, BTreeSet},
+    ops::Range,
+};
+use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke};
+use utils::{lsn::Lsn, project_git_version};
+
+project_git_version!(GIT_VERSION);
+
+// Coordinate compression: assign every distinct value its rank, i.e. the
+// index it would have in a sorted, deduplicated list of all the values.
+fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T, usize> {
+    // A BTreeSet both deduplicates and iterates in ascending order.
+    let unique_sorted: BTreeSet<T> = coords.into_iter().collect();
+
+    unique_sorted
+        .into_iter()
+        .enumerate()
+        .map(|(rank, value)| (value, rank))
+        .collect()
+}
+
+// Splits a layer file name of the form `<key>-<key>__<lsn>[-<lsn>]` into its
+// key range and LSN range. A name with a single LSN yields an empty LSN range
+// (start == end). Panics if the name does not have this shape or a component
+// is not valid hex.
+fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
+    let parts: Vec<&str> = name.split("__").collect();
+    let key_parts: Vec<&str> = parts[0].split('-').collect();
+    let lsn_parts: Vec<&str> = parts[1].split('-').collect();
+
+    // With only one LSN present, use it as both ends of the range.
+    let lsn_end_idx = if lsn_parts.len() == 1 { 0 } else { 1 };
+
+    let key_start = Key::from_hex(key_parts[0]).unwrap();
+    let key_end = Key::from_hex(key_parts[1]).unwrap();
+    let lsn_start = Lsn::from_hex(lsn_parts[0]).unwrap();
+    let lsn_end = Lsn::from_hex(lsn_parts[lsn_end_idx]).unwrap();
+
+    (key_start..key_end, lsn_start..lsn_end)
+}
+
+/// Reads layer file names from stdin (one per line), prints an SVG drawing of
+/// them to stdout and the number of image and delta layers to stderr.
+///
+/// Key and LSN coordinates are compressed to their ranks before drawing. A
+/// layer whose LSN range is empty is drawn as a thin filled bar and counted as
+/// an image; every other layer is drawn as an outlined rectangle and counted
+/// as a delta.
+///
+/// Returns an error if reading stdin fails. Panics on a malformed file name
+/// (see `parse_filename`) or on an empty/inverted key range or inverted LSN range.
+fn main() -> Result<()> {
+    // Parse layer filenames from stdin
+    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
+    let stdin = io::stdin();
+    for line in stdin.lock().lines() {
+        // Report a failed read through the returned `Result` instead of panicking.
+        let line = line?;
+        ranges.push(parse_filename(&line));
+    }
+
+    // Collect all coordinates
+    let mut keys: Vec<Key> = vec![];
+    let mut lsns: Vec<Lsn> = vec![];
+    for (keyr, lsnr) in &ranges {
+        keys.push(keyr.start);
+        keys.push(keyr.end);
+        lsns.push(lsnr.start);
+        lsns.push(lsnr.end);
+    }
+
+    // Analyze
+    let key_map = build_coordinate_compression_map(keys);
+    let lsn_map = build_coordinate_compression_map(lsns);
+
+    // Initialize stats
+    let mut num_deltas = 0;
+    let mut num_images = 0;
+
+    // Draw
+    let stretch = 3.0; // Stretch out vertically for better visibility
+    println!(
+        "{}",
+        BeginSvg {
+            w: key_map.len() as f32,
+            h: stretch * lsn_map.len() as f32
+        }
+    );
+    // The y axis is flipped (LSN grows upwards), so every rectangle needs the total height.
+    let lsn_max = lsn_map.len();
+    for (keyr, lsnr) in &ranges {
+        let key_start = *key_map.get(&keyr.start).unwrap();
+        let key_end = *key_map.get(&keyr.end).unwrap();
+
+        // Validate before subtracting: the coordinates are `usize`, so an inverted
+        // range would otherwise overflow before reaching this message.
+        if key_start >= key_end {
+            panic!("Invalid key range {}-{}", key_start, key_end);
+        }
+        let key_diff = key_end - key_start;
+
+        let lsn_start = *lsn_map.get(&lsnr.start).unwrap();
+        let lsn_end = *lsn_map.get(&lsnr.end).unwrap();
+
+        // Fill in and thicken rectangle if it's an
+        // image layer so that we can see it.
+        let (lsn_diff, margin, lsn_offset, fill): (f32, f32, f32, Fill) =
+            match lsn_start.cmp(&lsn_end) {
+                Ordering::Less => {
+                    num_deltas += 1;
+                    let height = (lsn_end - lsn_start) as f32;
+                    // Height-dependent margin to disambiguate overlapping deltas
+                    (height, 0.05 * height, 0.0, Fill::None)
+                }
+                Ordering::Equal => {
+                    num_images += 1;
+                    let height: f32 = 0.3;
+                    (height, 0.05, -height / 2.0, Fill::Color(rgb(0, 0, 0)))
+                }
+                Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
+            };
+
+        println!(
+            "    {}",
+            rectangle(
+                key_start as f32 + stretch * margin,
+                stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
+                key_diff as f32 - stretch * 2.0 * margin,
+                stretch * (lsn_diff - 2.0 * margin)
+            )
+            .fill(fill)
+            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
+            .border_radius(0.4)
+        );
+    }
+    println!("{}", EndSvg);
+
+    eprintln!("num_images: {}", num_images);
+    eprintln!("num_deltas: {}", num_deltas);
+
+    Ok(())
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,7 +14,7 @@ use metrics::set_build_info_metric;

 use pageserver::{
    config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, profiling, task_mgr,
+    http, page_cache, page_image_cache, page_service, profiling, task_mgr,
    task_mgr::TaskKind,
    task_mgr::{
        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
@@ -87,7 +87,7 @@ fn main() -> anyhow::Result<()> {

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
-        utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| {
+        utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
            format!(
                "Failed to create tenants root dir at '{}'",
                tenants_path.display()
@@ -101,6 +101,7 @@ fn main() -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
    page_cache::init(conf.page_cache_size);
+    page_image_cache::init(64 * conf.page_cache_size); // temporary hack for benchmarking

    start_pageserver(conf, daemonize).context("Failed to start pageserver")?;

@@ -199,7 +200,7 @@ fn initialize_config(

 fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
    // Initialize logger
-    let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
+    let log_file = logging::init(LOG_FILE_NAME, daemonize, conf.log_format)?;

    info!("version: {}", version());

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -7,6 +7,7 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use remote_storage::RemoteStorageConfig;
 use std::env;
+use utils::crashsafe::path_with_suffix_extension;

 use std::path::{Path, PathBuf};
 use std::str::FromStr;
@@ -16,6 +17,7 @@ use toml_edit::{Document, Item};
 use url::Url;
 use utils::{
    id::{NodeId, TenantId, TimelineId},
+    logging::LogFormat,
    postgres_backend::AuthType,
 };

@@ -24,6 +26,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt};

 /// The name of the metadata file pageserver creates per timeline.
 pub const METADATA_FILE_NAME: &str = "metadata";
+pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
 const TENANT_CONFIG_NAME: &str = "config";

 pub mod defaults {
@@ -43,6 +46,8 @@ pub mod defaults {
    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;

+    pub const DEFAULT_LOG_FORMAT: &str = "plain";
+
    ///
    /// Default built-in configuration file.
    ///
@@ -61,6 +66,7 @@ pub mod defaults {
 # initial superuser role name to use when creating a new tenant
 #initial_superuser_name = '{DEFAULT_SUPERUSER}'

+#log_format = '{DEFAULT_LOG_FORMAT}'
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -124,6 +130,8 @@ pub struct PageServerConf {

    /// Etcd broker endpoints to connect to.
    pub broker_endpoints: Vec<Url>,
+
+    pub log_format: LogFormat,
 }

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -190,6 +198,8 @@ struct PageServerConfigBuilder {
    profiling: BuilderValue<ProfilingConfig>,
    broker_etcd_prefix: BuilderValue<String>,
    broker_endpoints: BuilderValue<Vec<Url>>,
+
+    log_format: BuilderValue<LogFormat>,
 }

 impl Default for PageServerConfigBuilder {
@@ -217,6 +227,7 @@ impl Default for PageServerConfigBuilder {
            profiling: Set(ProfilingConfig::Disabled),
            broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
            broker_endpoints: Set(Vec::new()),
+            log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
        }
    }
 }
@@ -289,6 +300,10 @@ impl PageServerConfigBuilder {
        self.profiling = BuilderValue::Set(profiling)
    }

+    pub fn log_format(&mut self, log_format: LogFormat) {
+        self.log_format = BuilderValue::Set(log_format)
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let broker_endpoints = self
            .broker_endpoints
@@ -333,6 +348,7 @@ impl PageServerConfigBuilder {
            broker_etcd_prefix: self
                .broker_etcd_prefix
                .ok_or(anyhow!("missing broker_etcd_prefix"))?,
+            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
        })
    }
 }
@@ -364,6 +380,17 @@ impl PageServerConf {
        self.timelines_path(tenant_id).join(timeline_id.to_string())
    }

+    /// Path of the "uninit" mark for the given timeline: the timeline's path
+    /// (see `timeline_path`) with `TIMELINE_UNINIT_MARK_SUFFIX` added by
+    /// `path_with_suffix_extension`.
+    pub fn timeline_uninit_mark_file_path(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> PathBuf {
+        path_with_suffix_extension(
+            self.timeline_path(&timeline_id, &tenant_id),
+            TIMELINE_UNINIT_MARK_SUFFIX,
+        )
+    }
+
    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
    pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -374,28 +401,28 @@ impl PageServerConf {
    //
    // Postgres distribution paths
    //
-    pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
+    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

        match pg_version {
-            14 => path.join(format!("v{pg_version}")),
-            15 => path.join(format!("v{pg_version}")),
-            _ => panic!("Unsupported postgres version: {}", pg_version),
+            14 => Ok(path.join(format!("v{pg_version}"))),
+            15 => Ok(path.join(format!("v{pg_version}"))),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

-    pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
+    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        match pg_version {
-            14 => self.pg_distrib_dir(pg_version).join("bin"),
-            15 => self.pg_distrib_dir(pg_version).join("bin"),
-            _ => panic!("Unsupported postgres version: {}", pg_version),
+            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
+            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
-    pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
+    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        match pg_version {
-            14 => self.pg_distrib_dir(pg_version).join("lib"),
-            15 => self.pg_distrib_dir(pg_version).join("lib"),
-            _ => panic!("Unsupported postgres version: {}", pg_version),
+            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
+            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

@@ -446,6 +473,9 @@ impl PageServerConf {
                        })
                        .collect::<anyhow::Result<_>>()?,
                ),
+                "log_format" => builder.log_format(
+                    LogFormat::from_config(&parse_toml_string(key, item)?)?
+                ),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -558,6 +588,7 @@ impl PageServerConf {
            default_tenant_conf: TenantConf::dummy_conf(),
            broker_endpoints: Vec::new(),
            broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
+            log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
        }
    }
 }
@@ -652,6 +683,8 @@ max_file_descriptors = 333
 initial_superuser_name = 'zzzz'
 id = 10

+log_format = 'json'
+
 "#;

    #[test]
@@ -691,6 +724,7 @@ id = 10
                    .parse()
                    .expect("Failed to parse a valid broker endpoint URL")],
                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
+                log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -735,6 +769,7 @@ id = 10
                    .parse()
                    .expect("Failed to parse a valid broker endpoint URL")],
                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
+                log_format: LogFormat::Json,
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -618,6 +618,7 @@ components:
        - last_record_lsn
        - disk_consistent_lsn
        - awaits_download
+        - state
      properties:
        timeline_id:
          type: string
@@ -660,6 +661,8 @@ components:
          type: integer
        awaits_download:
          type: boolean
+        state:
+          type: string

        # These 'local' and 'remote' fields just duplicate some of the fields
        # above. They are kept for backwards-compatibility. They can be removed,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -129,6 +129,7 @@ async fn build_timeline_info(
        }
    };
    let current_physical_size = Some(timeline.get_physical_size());
+    let state = timeline.current_state();

    let info = TimelineInfo {
        tenant_id: timeline.tenant_id,
@@ -158,6 +159,7 @@ async fn build_timeline_info(

        remote_consistent_lsn,
        awaits_download,
+        state,

        // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
        // with the control plane.
@@ -294,7 +296,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body

    let timeline_info = async {
        let timeline = tokio::task::spawn_blocking(move || {
-            tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
+            tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id, false)
        })
        .await
        .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
@@ -331,14 +333,13 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
    let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);

    let timeline = tenant_mgr::get_tenant(tenant_id, true)
-        .and_then(|tenant| tenant.get_timeline(timeline_id))
-        .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
+        .and_then(|tenant| tenant.get_timeline(timeline_id, true))
        .map_err(ApiError::NotFound)?;
    let result = match timeline
        .find_lsn_for_timestamp(timestamp_pg)
        .map_err(ApiError::InternalServerError)?
    {
-        LsnForTimestamp::Present(lsn) => format!("{}", lsn),
+        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
        LsnForTimestamp::Future(_lsn) => "future".into(),
        LsnForTimestamp::Past(_lsn) => "past".into(),
        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
@@ -781,11 +782,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
 }

 // Run GC immediately on given timeline.
-// FIXME: This is just for tests. See test_runner/regress/test_gc.py.
-// This probably should require special authentication or a global flag to
-// enable, I don't think we want to or need to allow regular clients to invoke
-// GC.
-//     @hllinnaka in commits ec44f4b29, 3aca717f3
 #[cfg(feature = "testing")]
 async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
@@ -793,16 +789,16 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
    check_permission(&request, Some(tenant_id))?;

    // FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
-    let repo = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
+    let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

    let _span_guard =
        info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
-    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon());
+    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());

    // Use tenant's pitr setting
-    let pitr = repo.get_pitr_interval();
-    let result = repo
+    let pitr = tenant.get_pitr_interval();
+    let result = tenant
        .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
        // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
        // better once the types support it.
@@ -811,19 +807,15 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
 }

 // Run compaction immediately on given timeline.
-// FIXME This is just for tests. Don't expect this to be exposed to
-// the users or the api.
-//     @dhammika in commit a0781f229
 #[cfg(feature = "testing")]
 async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
-    let timeline = repo
-        .get_timeline(timeline_id)
-        .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
+    let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
        .map_err(ApiError::NotFound)?;
    timeline.compact().map_err(ApiError::InternalServerError)?;

@@ -837,10 +829,9 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
-    let timeline = repo
-        .get_timeline(timeline_id)
-        .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
+    let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
        .map_err(ApiError::NotFound)?;
    timeline
        .checkpoint(CheckpointConfig::Forced)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -12,10 +12,10 @@ use tracing::*;
 use walkdir::WalkDir;

 use crate::pgdatadir_mapping::*;
-use crate::reltag::{RelTag, SlruKind};
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
 use crate::walrecord::DecodedWALRecord;
+use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::*;
 use postgres_ffi::waldecoder::WalStreamDecoder;
@@ -43,19 +43,19 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
 /// The code that deals with the checkpoint would not work right if the
 /// cluster was not shut down cleanly.
 pub fn import_timeline_from_postgres_datadir(
-    path: &Path,
    tline: &Timeline,
-    lsn: Lsn,
+    pgdata_path: &Path,
+    pgdata_lsn: Lsn,
 ) -> Result<()> {
    let mut pg_control: Option<ControlFileData> = None;

    // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
    // Then fishing out pg_control would be unnecessary
-    let mut modification = tline.begin_modification(lsn);
+    let mut modification = tline.begin_modification(pgdata_lsn);
    modification.init_empty()?;

    // Import all but pg_wal
-    let all_but_wal = WalkDir::new(path)
+    let all_but_wal = WalkDir::new(pgdata_path)
        .into_iter()
        .filter_entry(|entry| !entry.path().ends_with("pg_wal"));
    for entry in all_but_wal {
@@ -63,7 +63,7 @@ pub fn import_timeline_from_postgres_datadir(
        let metadata = entry.metadata().expect("error getting dir entry metadata");
        if metadata.is_file() {
            let absolute_path = entry.path();
-            let relative_path = absolute_path.strip_prefix(path)?;
+            let relative_path = absolute_path.strip_prefix(pgdata_path)?;

            let file = File::open(absolute_path)?;
            let len = metadata.len() as usize;
@@ -84,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir(
        "Postgres cluster was not shut down cleanly"
    );
    ensure!(
-        pg_control.checkPointCopy.redo == lsn.0,
+        pg_control.checkPointCopy.redo == pgdata_lsn.0,
        "unexpected checkpoint REDO pointer"
    );

@@ -92,10 +92,10 @@ pub fn import_timeline_from_postgres_datadir(
    // this reads the checkpoint record itself, advancing the tip of the timeline to
    // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
    import_wal(
-        &path.join("pg_wal"),
+        &pgdata_path.join("pg_wal"),
        tline,
        Lsn(pg_control.checkPointCopy.redo),
-        lsn,
+        pgdata_lsn,
    )?;

    Ok(())
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -5,10 +5,10 @@ pub mod import_datadir;
 pub mod keyspace;
 pub mod metrics;
 pub mod page_cache;
+pub mod page_image_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod profiling;
-pub mod reltag;
 pub mod repository;
 pub mod storage_sync;
 pub mod task_mgr;
@@ -46,6 +46,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

 pub const LOG_FILE_NAME: &str = "pageserver.log";

+static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
+
 /// Config for the Repository checkpointer
 #[derive(Debug, Clone, Copy)]
 pub enum CheckpointConfig {
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -36,9 +36,8 @@
 //! mapping is automatically removed and the slot is marked free.
 //!

-use dashmap::mapref::entry::Entry;
-use dashmap::DashMap;
 use std::{
+    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
    sync::{
        atomic::{AtomicU8, AtomicUsize, Ordering},
@@ -109,10 +108,10 @@ enum CacheKey {
 }

 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
-struct MaterializedPageHashKey {
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    key: Key,
+pub struct MaterializedPageHashKey {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub key: Key,
 }

 #[derive(Clone)]
@@ -169,11 +168,18 @@ impl Slot {
 pub struct PageCache {
    /// This contains the mapping from the cache key to buffer slot that currently
    /// contains the page, if any.
-    materialized_page_map: DashMap<MaterializedPageHashKey, Vec<Version>>,
+    ///
+    /// TODO: This is protected by a single lock. If that becomes a bottleneck,
+    /// this HashMap can be replaced with a more concurrent version, there are
+    /// plenty of such crates around.
+    ///
+    /// If you add support for caching different kinds of objects, each object kind
+    /// can have a separate mapping map, next to this field.
+    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    ephemeral_page_map: DashMap<(u64, u32), usize>,
+    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,

-    immutable_page_map: DashMap<(u64, u32), usize>,
+    immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -610,7 +616,7 @@ impl PageCache {
    fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
        match cache_key {
            CacheKey::MaterializedPage { hash_key, lsn } => {
-                let map = &self.materialized_page_map;
+                let map = self.materialized_page_map.read().unwrap();
                let versions = map.get(hash_key)?;

                let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
@@ -623,11 +629,11 @@ impl PageCache {
                Some(version.slot_idx)
            }
            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = &self.ephemeral_page_map;
+                let map = self.ephemeral_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let map = &self.immutable_page_map;
+                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
            }
        }
@@ -640,7 +646,7 @@ impl PageCache {
    fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
        match key {
            CacheKey::MaterializedPage { hash_key, lsn } => {
-                let map = &self.materialized_page_map;
+                let map = self.materialized_page_map.read().unwrap();
                let versions = map.get(hash_key)?;

                if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
@@ -650,11 +656,11 @@ impl PageCache {
                }
            }
            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = &self.ephemeral_page_map;
+                let map = self.ephemeral_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let map = &self.immutable_page_map;
+                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
            }
        }
@@ -669,7 +675,7 @@ impl PageCache {
                hash_key: old_hash_key,
                lsn: old_lsn,
            } => {
-                let map = &self.materialized_page_map;
+                let mut map = self.materialized_page_map.write().unwrap();
                if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
                    let versions = old_entry.get_mut();

@@ -684,12 +690,12 @@ impl PageCache {
                }
            }
            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = &self.ephemeral_page_map;
+                let mut map = self.ephemeral_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let map = &self.immutable_page_map;
+                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
            }
@@ -707,8 +713,8 @@ impl PageCache {
                hash_key: new_key,
                lsn: new_lsn,
            } => {
-                let map = &self.materialized_page_map;
-                let mut versions = map.entry(new_key.clone()).or_default();
+                let mut map = self.materialized_page_map.write().unwrap();
+                let versions = map.entry(new_key.clone()).or_default();
                match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
                    Ok(version_idx) => Some(versions[version_idx].slot_idx),
                    Err(version_idx) => {
@@ -724,7 +730,7 @@ impl PageCache {
                }
            }
            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = &self.ephemeral_page_map;
+                let mut map = self.ephemeral_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
@@ -734,7 +740,7 @@ impl PageCache {
                }
            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let map = &self.immutable_page_map;
+                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
--- a/pageserver/src/page_image_cache.rs
+++ b/pageserver/src/page_image_cache.rs
@@ -0,0 +1,345 @@
+//!
+//! Global page image cache
+//!
+//! Unlike page_cache, it holds only the most recent version of each reconstructed page image,
+//! and it uses an invalidation mechanism to avoid layer map lookups.
+
+use crate::page_cache::MaterializedPageHashKey;
+use crate::pgdatadir_mapping::{rel_block_to_key, BlockNumber};
+use crate::repository::Key;
+use crate::tenant::Timeline;
+use crate::virtual_file::VirtualFile;
+use anyhow::{bail, Result};
+use bytes::Bytes;
+use once_cell::sync::OnceCell;
+use pageserver_api::reltag::RelTag;
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
+use std::os::unix::fs::FileExt;
+use std::sync::{Arc, Condvar, Mutex};
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+static PAGE_CACHE: OnceCell<Mutex<PageImageCache>> = OnceCell::new();
+const TEST_PAGE_CACHE_SIZE: usize = 50;
+pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
+
+// State of a single cache slot.
+enum PageImageState {
+    Vacant,                        // entry is not used
+    Loaded(bool),                  // load has finished: `true` if it succeeded, `false` if it failed
+    Loading(Option<Arc<Condvar>>), // page is being loaded; the Condvar is created on demand when some thread needs to wait for load completion
+}
+
+// One cache slot. Slots refer to each other by index into `PageImageCache::pages`.
+struct CacheEntry {
+    // Identity (tenant, timeline, key) of the page held in this slot
+    key: MaterializedPageHashKey,
+
+    // next+prev form the doubly-linked LRU list; next alone is also used to chain
+    // the singly-linked list of free entries
+    next: usize,
+    prev: usize,
+
+    collision: usize, // singly-linked hash collision chain; 0 terminates the chain
+
+    access_count: u32, // number of `lookup` calls currently reading this page (pin count)
+    state: PageImageState,
+}
+
+// Cache bookkeeping kept in memory; the page contents themselves live in `file`.
+pub struct PageImageCache {
+    free_list: usize, // head of the singly-linked list of free entries; 0 means no free entries
+    pages: Vec<CacheEntry>, // pages[0] is the LRU list head, the other entries are cache slots
+    hash_table: Vec<usize>, // indexes in pages array; 0 marks an empty bucket
+    file: Arc<VirtualFile>, // backing file: slot `index` is stored at offset index * PAGE_SZ
+}
+
+///
+/// Initialize the global page image cache, sized by `size`.
+/// This must be called once at page server startup; calling it again panics.
+///
+pub fn init(size: usize) {
+    let cache = Mutex::new(PageImageCache::new(size));
+    assert!(
+        PAGE_CACHE.set(cache).is_ok(),
+        "page cache already initialized"
+    );
+}
+
+///
+/// Get a handle to the global page image cache.
+///
+/// Panics outside of unit tests if `init()` has not been called yet.
+///
+pub fn get() -> &'static Mutex<PageImageCache> {
+    if !cfg!(test) {
+        return PAGE_CACHE.get().expect("page cache not initialized");
+    }
+    //
+    // Unit tests never go through page server startup, so nobody calls
+    // page_image_cache::init() for them. Lazily create a tiny cache instead,
+    // so that the cache is usable in unit tests.
+    //
+    PAGE_CACHE.get_or_init(|| Mutex::new(PageImageCache::new(TEST_PAGE_CACHE_SIZE)))
+}
+
+// Hash `t` with the standard library's `DefaultHasher` and truncate the result to `usize`.
+fn hash<T: Hash>(t: &T) -> usize {
+    let mut hasher = DefaultHasher::new();
+    t.hash(&mut hasher);
+    let digest: u64 = hasher.finish();
+    digest as usize
+}
+
+impl PageImageCache {
+    // Create a cache with `size` slots, backed by a freshly truncated file "page.cache"
+    // (relative path).
+    //
+    // Layout of `pages`: index 0 is the sentinel head of the circular LRU list,
+    // indexes 1..=size are the cache slots, which all start out on the free list.
+    // Index 0 therefore doubles as the "null" value in `free_list`, `hash_table`
+    // and the `collision` chains.
+    //
+    // Panics if `size` is zero or if the backing file cannot be opened.
+    fn new(size: usize) -> Self {
+        assert!(size > 0, "page image cache size must be positive");
+        let mut pages: Vec<CacheEntry> = Vec::with_capacity(size + 1);
+        let hash_table = vec![0usize; size];
+        let file = Arc::new(
+            VirtualFile::open_with_options(
+                &std::path::PathBuf::from("page.cache"),
+                std::fs::OpenOptions::new()
+                    .read(true)
+                    .write(true)
+                    .create(true)
+                    .truncate(true),
+            )
+            .unwrap(),
+        );
+        // Dummy key
+        let dummy_key = MaterializedPageHashKey {
+            key: Key::MIN,
+            tenant_id: TenantId::from([0u8; 16]),
+            timeline_id: TimelineId::from([0u8; 16]),
+        };
+
+        // LRU list head
+        pages.push(CacheEntry {
+            key: dummy_key.clone(),
+            next: 0,
+            prev: 0,
+            access_count: 0,
+            collision: 0,
+            state: PageImageState::Vacant,
+        });
+
+        // Construct the free list: iteration `i` creates slot `i + 1`, which points
+        // at slot `i + 2`. The last slot (index `size`) terminates the list with 0,
+        // so every slot is reachable and no entry points past the end of `pages`.
+        for i in 0..size {
+            pages.push(CacheEntry {
+                key: dummy_key.clone(),
+                next: if i + 1 < size { i + 2 } else { 0 },
+                prev: 0,
+                access_count: 0,
+                collision: 0,
+                state: PageImageState::Vacant,
+            });
+        }
+
+        PageImageCache {
+            free_list: 1,
+            pages,
+            hash_table,
+            file,
+        }
+    }
+
+    // Unlink `index` from the doubly-linked LRU list. The entry's own next/prev
+    // are left unchanged.
+    fn unlink(&mut self, index: usize) {
+        let next = self.pages[index].next;
+        let prev = self.pages[index].prev;
+        self.pages[next].prev = prev;
+        self.pages[prev].next = next;
+    }
+
+    // Link `index` into the doubly-linked LRU list right after `after`
+    // (`after == 0` puts it at the head of the list).
+    fn link_after(&mut self, after: usize, index: usize) {
+        let next = self.pages[after].next;
+        self.pages[index].prev = after;
+        self.pages[index].next = next;
+        self.pages[next].prev = index;
+        self.pages[after].next = index;
+    }
+
+    // Make the entry point at itself, so that `is_empty(index)` returns true.
+    fn prune(&mut self, index: usize) {
+        self.pages[index].prev = index;
+        self.pages[index].next = index;
+    }
+
+    // True if the entry points at itself, i.e. it was `prune`d and not re-linked
+    // or marked for deletion since.
+    fn is_empty(&self, index: usize) -> bool {
+        self.pages[index].next == index
+    }
+}
+
+// Remove entry from cache: used on page invalidation or when a relation is dropped.
+// Does nothing if there is no entry for the given key.
+pub fn remove(key: Key, tenant_id: TenantId, timeline_id: TimelineId) {
+    let key = MaterializedPageHashKey {
+        key,
+        tenant_id,
+        timeline_id,
+    };
+    let this = get();
+    let mut cache = this.lock().unwrap();
+    let h = hash(&key) % cache.hash_table.len();
+    let mut index = cache.hash_table[h];
+    // Predecessor of `index` in the collision chain; 0 means `index` is the bucket head
+    let mut prev = 0usize;
+    while index != 0 {
+        if cache.pages[index].key == key {
+            if !cache.is_empty(index) {
+                // NOTE(review): a page pinned by a concurrent `lookup` (access_count > 0)
+                // has already been unlinked from the LRU list but still has next != index,
+                // so it appears to take this branch as well - confirm that unlinking it
+                // again and putting it on the free list is safe in that case.
+                cache.pages[index].state = PageImageState::Vacant;
+                // Remove from LRU list
+                cache.unlink(index);
+                // Insert entry in free list
+                cache.pages[index].next = cache.free_list;
+                cache.free_list = index;
+            } else {
+                // Page is in the process of loading: we cannot remove it right now,
+                // so just mark it for deletion; the loading thread in `lookup`
+                // returns the entry to the free list once the load completes
+                cache.pages[index].next = 0; // make is_empty == false
+            }
+            // Remove from hash table
+            if prev == 0 {
+                cache.hash_table[h] = cache.pages[index].collision;
+            } else {
+                cache.pages[prev].collision = cache.pages[index].collision;
+            }
+            break;
+        }
+        prev = index;
+        index = cache.pages[index].collision;
+    }
+    // It's Ok if image not found
+}
+
+// Find or load page image in the cache.
+//
+// On a hit, the image is read back from the cache file. On a miss, the page is
+// reconstructed with `timeline.get_rel_page_at_lsn(rel, blkno, lsn, true)` and
+// written to the cache file. While one thread is loading a page, other threads
+// asking for the same page wait for it instead of loading it again.
+//
+// Returns the page image, or an error if
+//   - reconstructing the page failed (only the loading thread gets the original
+//     error; later hits on the failed entry get "page loading failed earlier"),
+//   - reading from or writing to the cache file failed.
+//
+// The cache lock is never held across file I/O or page reconstruction.
+pub fn lookup(timeline: &Timeline, rel: RelTag, blkno: BlockNumber, lsn: Lsn) -> Result<Bytes> {
+    let key = MaterializedPageHashKey {
+        key: rel_block_to_key(rel, blkno),
+        tenant_id: timeline.tenant_id,
+        timeline_id: timeline.timeline_id,
+    };
+    let this = get();
+    let mut cache = this.lock().unwrap();
+    let h = hash(&key) % cache.hash_table.len();
+
+    'lookup: loop {
+        let mut index = cache.hash_table[h];
+        while index != 0 {
+            if cache.pages[index].key == key {
+                // cache hit
+                match &cache.pages[index].state {
+                    PageImageState::Loaded(success) => {
+                        if *success {
+                            // Pin page: while pinned it is off the LRU list and
+                            // so cannot be chosen as an eviction victim
+                            if cache.pages[index].access_count == 0 {
+                                cache.unlink(index);
+                            }
+                            cache.pages[index].access_count += 1;
+                            let file = cache.file.clone();
+                            drop(cache);
+                            let mut buf = [0u8; PAGE_SZ];
+                            // Do not propagate a read error yet: the page must be
+                            // unpinned first, otherwise it would stay pinned and
+                            // off the LRU list forever
+                            let read_result =
+                                file.read_exact_at(&mut buf, index as u64 * PAGE_SZ as u64);
+                            cache = this.lock().unwrap();
+                            assert!(cache.pages[index].access_count > 0);
+                            cache.pages[index].access_count -= 1;
+                            if cache.pages[index].access_count == 0 {
+                                // Move to the head of LRU list
+                                cache.link_after(0, index);
+                            }
+                            read_result?;
+                            return Ok(Bytes::from(buf.to_vec()));
+                        } else {
+                            return Err(anyhow::anyhow!("page loading failed earlier"));
+                        }
+                    }
+                    PageImageState::Loading(event) => {
+                        // Create event on which to sleep if not yet assigned
+                        let cv = match event {
+                            None => {
+                                let cv = Arc::new(Condvar::new());
+                                cache.pages[index].state =
+                                    PageImageState::Loading(Some(cv.clone()));
+                                cv
+                            }
+                            Some(cv) => cv.clone(),
+                        };
+                        cache = cv.wait(cache).unwrap();
+                        // Retry lookup
+                        continue 'lookup;
+                    }
+                    PageImageState::Vacant => bail!("Vacant entry is not expected here"),
+                };
+            }
+            index = cache.pages[index].collision;
+        }
+        let file = cache.file.clone();
+        // Cache miss
+        index = cache.free_list;
+        if index == 0 {
+            // no free items
+            let victim = cache.pages[0].prev; // take least recently used element from the tail of LRU list
+            assert!(victim != 0);
+            assert!(cache.pages[victim].access_count == 0);
+            // Remove victim from hash table
+            let h = hash(&cache.pages[victim].key) % cache.hash_table.len();
+            index = cache.hash_table[h];
+            let mut prev = 0usize;
+            while index != victim {
+                assert!(index != 0);
+                prev = index;
+                index = cache.pages[index].collision;
+            }
+            if prev == 0 {
+                cache.hash_table[h] = cache.pages[victim].collision;
+            } else {
+                cache.pages[prev].collision = cache.pages[victim].collision;
+            }
+            // and from LRU list
+            cache.unlink(victim);
+
+            index = victim;
+        } else {
+            // Use next free item
+            cache.free_list = cache.pages[index].next;
+        }
+        // Make is_empty(index) == true. If the entry is removed while it is being loaded,
+        // `remove` updates it so that !is_empty(index)
+        cache.prune(index);
+
+        // Insert in hash table
+        cache.pages[index].collision = cache.hash_table[h];
+        cache.hash_table[h] = index;
+
+        cache.pages[index].key = key;
+        cache.pages[index].state = PageImageState::Loading(None);
+        drop(cache); //release lock
+
+        // Load page and store it in the cache file. A failed write is treated like a
+        // failed load rather than returned immediately: the entry must leave the
+        // Loading state and waiters must be woken up, otherwise they would block forever.
+        let result = match timeline.get_rel_page_at_lsn(rel, blkno, lsn, true) {
+            Ok(page) => match file.write_all_at(&page, index as u64 * PAGE_SZ as u64) {
+                Ok(()) => Ok(page),
+                Err(e) => Err(anyhow::Error::from(e)),
+            },
+            Err(e) => Err(e),
+        };
+        let success = result.is_ok();
+        cache = this.lock().unwrap();
+        if let PageImageState::Loading(event) = &cache.pages[index].state {
+            // Are there some waiting threads?
+            if let Some(cv) = event {
+                // If so, then wake them up. They can only proceed once this thread
+                // releases the cache lock, i.e. after the state update below.
+                cv.notify_all();
+            }
+        } else {
+            bail!("Loading state is expected");
+        }
+        if cache.is_empty(index) {
+            // Entry was not marked as deleted while loading: publish the outcome
+            cache.pages[index].state = PageImageState::Loaded(success);
+            // Link the page to the head of LRU list
+            cache.link_after(0, index);
+        } else {
+            // Entry was removed while loading (it is already out of the hash table)
+            cache.pages[index].state = PageImageState::Vacant;
+            // Return page to free list
+            cache.pages[index].next = cache.free_list;
+            cache.free_list = index;
+        }
+        // only the first one gets the full error from `get_rel_page_at_lsn`
+        return result;
+    }
+}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,8 +10,15 @@
 //

 use anyhow::{bail, ensure, Context, Result};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
+use bytes::Bytes;
 use futures::{Stream, StreamExt};
+use pageserver_api::models::{
+    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
+    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
+    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
+    PagestreamNblocksRequest, PagestreamNblocksResponse,
+};
+
 use std::io;
 use std::net::TcpListener;
 use std::str;
@@ -32,10 +39,10 @@ use utils::{

 use crate::basebackup;
 use crate::config::{PageServerConf, ProfilingConfig};
-use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
+use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
+use crate::page_image_cache;
 use crate::profiling::profpoint_start;
-use crate::reltag::RelTag;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::Timeline;
@@ -45,163 +52,6 @@ use crate::CheckpointConfig;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-// Wrapped in libpq CopyData
-enum PagestreamFeMessage {
-    Exists(PagestreamExistsRequest),
-    Nblocks(PagestreamNblocksRequest),
-    GetPage(PagestreamGetPageRequest),
-    DbSize(PagestreamDbSizeRequest),
-}
-
-// Wrapped in libpq CopyData
-enum PagestreamBeMessage {
-    Exists(PagestreamExistsResponse),
-    Nblocks(PagestreamNblocksResponse),
-    GetPage(PagestreamGetPageResponse),
-    Error(PagestreamErrorResponse),
-    DbSize(PagestreamDbSizeResponse),
-}
-
-#[derive(Debug)]
-struct PagestreamExistsRequest {
-    latest: bool,
-    lsn: Lsn,
-    rel: RelTag,
-}
-
-#[derive(Debug)]
-struct PagestreamNblocksRequest {
-    latest: bool,
-    lsn: Lsn,
-    rel: RelTag,
-}
-
-#[derive(Debug)]
-struct PagestreamGetPageRequest {
-    latest: bool,
-    lsn: Lsn,
-    rel: RelTag,
-    blkno: u32,
-}
-
-#[derive(Debug)]
-struct PagestreamDbSizeRequest {
-    latest: bool,
-    lsn: Lsn,
-    dbnode: u32,
-}
-
-#[derive(Debug)]
-struct PagestreamExistsResponse {
-    exists: bool,
-}
-
-#[derive(Debug)]
-struct PagestreamNblocksResponse {
-    n_blocks: u32,
-}
-
-#[derive(Debug)]
-struct PagestreamGetPageResponse {
-    page: Bytes,
-}
-
-#[derive(Debug)]
-struct PagestreamErrorResponse {
-    message: String,
-}
-
-#[derive(Debug)]
-struct PagestreamDbSizeResponse {
-    db_size: i64,
-}
-
-impl PagestreamFeMessage {
-    fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
-        // TODO these gets can fail
-
-        // these correspond to the NeonMessageTag enum in pagestore_client.h
-        //
-        // TODO: consider using protobuf or serde bincode for less error prone
-        // serialization.
-        let msg_tag = body.get_u8();
-        match msg_tag {
-            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
-                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
-                },
-            })),
-            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
-                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
-                },
-            })),
-            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
-                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
-                },
-                blkno: body.get_u32(),
-            })),
-            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
-                dbnode: body.get_u32(),
-            })),
-            _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
-        }
-    }
-}
-
-impl PagestreamBeMessage {
-    fn serialize(&self) -> Bytes {
-        let mut bytes = BytesMut::new();
-
-        match self {
-            Self::Exists(resp) => {
-                bytes.put_u8(100); /* tag from pagestore_client.h */
-                bytes.put_u8(resp.exists as u8);
-            }
-
-            Self::Nblocks(resp) => {
-                bytes.put_u8(101); /* tag from pagestore_client.h */
-                bytes.put_u32(resp.n_blocks);
-            }
-
-            Self::GetPage(resp) => {
-                bytes.put_u8(102); /* tag from pagestore_client.h */
-                bytes.put(&resp.page[..]);
-            }
-
-            Self::Error(resp) => {
-                bytes.put_u8(103); /* tag from pagestore_client.h */
-                bytes.put(resp.message.as_bytes());
-                bytes.put_u8(0); // null terminator
-            }
-            Self::DbSize(resp) => {
-                bytes.put_u8(104); /* tag from pagestore_client.h */
-                bytes.put_i64(resp.db_size);
-            }
-        }
-
-        bytes.into()
-    }
-}
-
 fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
    async_stream::try_stream! {
        loop {
@@ -500,11 +350,8 @@ impl PageServerHandler {
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
-        let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline(
-            timeline_id,
-            base_lsn,
-            pg_version,
-        )?;
+        let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
+        let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?;

        // TODO mark timeline as not ready until it reaches end_lsn.
        // We might have some wal to import as well, and we should prevent compute
@@ -527,7 +374,8 @@ impl PageServerHandler {
        // - use block_in_place()
        let mut copyin_stream = Box::pin(copyin_stream(pgb));
        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-        tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?;
+        tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
+        timeline.initialize()?;

        // Drain the rest of the Copy data
        let mut bytes_after_tar = 0;
@@ -544,12 +392,6 @@ impl PageServerHandler {
        // It wouldn't work if base came from vanilla postgres though,
        // since we discard some log files.

-        // Flush data to disk, then upload to s3
-        info!("flushing layers");
-        timeline.checkpoint(CheckpointConfig::Flush)?;
-
-        timeline.launch_wal_receiver()?;
-
        info!("done");
        Ok(())
    }
@@ -740,8 +582,12 @@ impl PageServerHandler {
        // current profiling is based on a thread-local variable, so it doesn't work
        // across awaits
        let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
-        let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;

+        let page = if req.latest {
+            page_image_cache::lookup(timeline, req.rel, req.blkno, lsn)
+        } else {
+            timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, false)
+        }?;
        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
        }))
@@ -1068,7 +914,8 @@ impl postgres_backend_async::Handler for PageServerHandler {
 }

 fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result<Arc<Timeline>> {
-    tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id))
+    tenant_mgr::get_tenant(tenant_id, true)
+        .and_then(|tenant| tenant.get_timeline(timeline_id, true))
 }

 ///
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -7,12 +7,12 @@
 //! Clarify that)
 //!
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::reltag::{RelTag, SlruKind};
 use crate::repository::*;
 use crate::tenant::Timeline;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{bail, ensure, Result};
 use bytes::{Buf, Bytes};
+use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, TimestampTz, TransactionId};
@@ -1179,7 +1179,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
 }

-fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
@@ -1373,6 +1373,17 @@ fn is_rel_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0
 }

+/// Returns true if `key` has the relation-block shape (`field1 == 0x00` and a
+/// non-zero `field4`), its `field5` equals `FSM_FORKNUM`, and its `field6` is
+/// not `0xffffffff`.
+// NOTE(review): `field6 == 0xffffffff` presumably marks a non-block key — confirm.
+pub fn is_rel_fsm_block_key(key: Key) -> bool {
+    let has_rel_block_shape = key.field1 == 0x00 && key.field4 != 0;
+    let is_fsm_fork = key.field5 == FSM_FORKNUM;
+    has_rel_block_shape && is_fsm_fork && key.field6 != 0xffffffff
+}
+
+/// Returns true if `key` has the relation-block shape (`field1 == 0x00` and a
+/// non-zero `field4`), its `field5` equals `VISIBILITYMAP_FORKNUM`, and its
+/// `field6` is not `0xffffffff`.
+// NOTE(review): `field6 == 0xffffffff` presumably marks a non-block key — confirm.
+pub fn is_rel_vm_block_key(key: Key) -> bool {
+    let has_rel_block_shape = key.field1 == 0x00 && key.field4 != 0;
+    let is_vm_fork = key.field5 == VISIBILITYMAP_FORKNUM;
+    has_rel_block_shape && is_vm_fork && key.field6 != 0xffffffff
+}
+
 pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
    Ok(match key.field1 {
        0x01 => {
@@ -1403,7 +1414,9 @@ pub fn create_test_timeline(
    timeline_id: utils::id::TimelineId,
    pg_version: u32,
 ) -> Result<std::sync::Arc<Timeline>> {
-    let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?;
+    let tline = tenant
+        .create_empty_timeline(timeline_id, Lsn(8), pg_version)?
+        .initialize()?;
    let mut m = tline.begin_modification(Lsn(8));
    m.init_empty()?;
    m.commit()?;
--- a/pageserver/src/storage_sync/download.rs
+++ b/pageserver/src/storage_sync/download.rs
@@ -22,7 +22,7 @@ use crate::{
    TEMP_FILE_SUFFIX,
 };
 use utils::{
-    crashsafe_dir::path_with_suffix_extension,
+    crashsafe::path_with_suffix_extension,
    id::{TenantId, TenantTimelineId, TimelineId},
 };

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,10 +1,12 @@
 //!

-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context};
 use bytes::Bytes;
 use fail::fail_point;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
+use pageserver_api::models::TimelineState;
+use tokio::sync::watch;
 use tokio::task::spawn_blocking;
 use tracing::*;

@@ -32,10 +34,12 @@ use crate::tenant::{
 use crate::config::{PageServerConf, METADATA_FILE_NAME};
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::TimelineMetrics;
+use crate::page_image_cache;
 use crate::pgdatadir_mapping::BlockNumber;
 use crate::pgdatadir_mapping::LsnForTimestamp;
-use crate::reltag::RelTag;
+use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::tenant_config::TenantConfOpt;
+use pageserver_api::reltag::RelTag;

 use postgres_ffi::to_pg_timestamp;
 use utils::{
@@ -52,6 +56,7 @@ use crate::task_mgr::TaskKind;
 use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task};
 use crate::walredo::WalRedoManager;
 use crate::CheckpointConfig;
+use crate::ZERO_PAGE;
 use crate::{
    page_cache,
    storage_sync::{self, index::LayerFileMetadata},
@@ -158,6 +163,8 @@ pub struct Timeline {

    /// Relation size cache
    pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
+
+    state: watch::Sender<TimelineState>,
 }

 /// Internal structure to hold all data needed for logical size calculation.
@@ -305,10 +312,6 @@ pub struct GcInfo {

 /// Public interface functions
 impl Timeline {
-    //------------------------------------------------------------------------------
-    // Public GET functions
-    //------------------------------------------------------------------------------
-
    /// Get the LSN where this branch was created
    pub fn get_ancestor_lsn(&self) -> Lsn {
        self.ancestor_lsn
@@ -418,9 +421,11 @@ impl Timeline {
    /// those functions with an LSN that has been processed yet is an error.
    ///
    pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
+        anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
+
        // This should never be called from the WAL receiver, because that could lead
        // to a deadlock.
-        ensure!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection),
            "wait_lsn cannot be called in WAL receiver"
        );
@@ -443,7 +448,7 @@ impl Timeline {
        &self,
        lsn: Lsn,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        ensure!(
            lsn >= **latest_gc_cutoff_lsn,
            "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
@@ -453,12 +458,6 @@ impl Timeline {
        Ok(())
    }

-    //------------------------------------------------------------------------------
-    // Public PUT functions, to update the repository with new page versions.
-    //
-    // These are called by the WAL receiver to digest WAL records.
-    //------------------------------------------------------------------------------
-
    /// Flush to disk all data that was written with the put_* functions
    ///
    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
@@ -477,6 +476,91 @@ impl Timeline {
        }
    }

+    pub fn compact(&self) -> anyhow::Result<()> {
+        let last_record_lsn = self.get_last_record_lsn();
+
+        // Last record Lsn could be zero in case the timeline was just created
+        if !last_record_lsn.is_valid() {
+            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
+            return Ok(());
+        }
+
+        //
+        // High level strategy for compaction / image creation:
+        //
+        // 1. First, calculate the desired "partitioning" of the
+        // currently in-use key space. The goal is to partition the
+        // key space into roughly fixed-size chunks, but also take into
+        // account any existing image layers, and try to align the
+        // chunk boundaries with the existing image layers to avoid
+        // too much churn. Also try to align chunk boundaries with
+        // relation boundaries.  In principle, we don't know about
+        // relation boundaries here, we just deal with key-value
+        // pairs, and the code in pgdatadir_mapping.rs knows how to
+        // map relations into key-value pairs. But in practice we know
+        // that 'field6' is the block number, and the fields 1-5
+        // identify a relation. This is just an optimization,
+        // though.
+        //
+        // 2. Once we know the partitioning, for each partition,
+        // decide if it's time to create a new image layer. The
+        // criterion is: has there been too much "churn" since the last
+        // image layer? "Churn" is a fuzzy concept; it's a
+        // combination of too many delta files, or too much WAL in
+        // total in the delta file. Or perhaps: if creating an image
+        // file would allow to delete some older files.
+        //
+        // 3. After that, we compact all level0 delta files if there
+        // are too many of them.  While compacting, we also garbage
+        // collect any page versions that are no longer needed because
+        // of the new image layers we created in step 2.
+        //
+        // TODO: This high level strategy hasn't been implemented yet.
+        // Below are functions compact_level0() and create_image_layers()
+        // but they are a bit ad hoc and don't quite work like it's explained
+        // above. Rewrite it.
+        let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
+
+        let target_file_size = self.get_checkpoint_distance();
+
+        // Define partitioning schema if needed
+
+        match self.repartition(
+            self.get_last_record_lsn(),
+            self.get_compaction_target_size(),
+        ) {
+            Ok((partitioning, lsn)) => {
+                // 2. Create new image layers for partitions that have been modified
+                // "enough".
+                let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
+                if !layer_paths_to_upload.is_empty()
+                    && self.upload_layers.load(atomic::Ordering::Relaxed)
+                {
+                    storage_sync::schedule_layer_upload(
+                        self.tenant_id,
+                        self.timeline_id,
+                        layer_paths_to_upload,
+                        None,
+                    );
+                }
+
+                // 3. Compact
+                let timer = self.metrics.compact_time_histo.start_timer();
+                self.compact_level0(target_file_size)?;
+                timer.stop_and_record();
+            }
+            Err(err) => {
+                // No partitioning? This is normal if the timeline was just created
+                // as an empty timeline. Also in unit tests, when we use the timeline
+                // as a simple key-value store, ignoring the datadir layout. Log the
+                // error but continue.
+                error!("could not compact, repartitioning keyspace failed: {err:?}");
+            }
+        };
+
+        Ok(())
+    }
+
    /// Mutate the timeline with a [`TimelineWriter`].
    pub fn writer(&self) -> TimelineWriter<'_> {
        TimelineWriter {
@@ -484,6 +568,109 @@ impl Timeline {
            _write_guard: self.write_lock.lock().unwrap(),
        }
    }
+
+    /// Retrieve the current logical size of the timeline.
+    ///
+    /// The returned size may lag behind the actual number if the initial size calculation
+    /// has not run yet; when the size is approximate and `initial_part_end` is set, this call tries to spawn it.
+    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
+        let current_size = self.current_logical_size.current_size()?;
+        debug!("Current size: {current_size:?}");
+
+        let size = current_size.size();
+        if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
+            (current_size, self.current_logical_size.initial_part_end)
+        {
+            self.try_spawn_size_init_task(init_lsn);
+        }
+
+        Ok(size)
+    }
+
+    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
+    /// the in-memory layer, and initiate flushing it if so.
+    ///
+    /// Also flush after a period of time without new data -- it helps
+    /// safekeepers to regard pageserver as caught up and suspend activity.
+    pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
+        let last_lsn = self.get_last_record_lsn();
+        let layers = self.layers.read().unwrap();
+        if let Some(open_layer) = &layers.open_layer {
+            let open_layer_size = open_layer.size()?;
+            drop(layers);
+            let last_freeze_at = self.last_freeze_at.load();
+            let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
+            let distance = last_lsn.widening_sub(last_freeze_at);
+            // Checkpointing the open layer can be triggered by layer size or LSN range.
+            // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
+            // we want to stay below that with a big margin.  The LSN distance determines how
+            // much WAL the safekeepers need to store.
+            if distance >= self.get_checkpoint_distance().into()
+                || open_layer_size > self.get_checkpoint_distance()
+                || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
+            {
+                info!(
+                    "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
+                    distance,
+                    open_layer_size,
+                    last_freeze_ts.elapsed()
+                );
+
+                self.freeze_inmem_layer(true);
+                self.last_freeze_at.store(last_lsn);
+                *(self.last_freeze_ts.write().unwrap()) = Instant::now();
+
+                // Launch a task to flush the frozen layer to disk, unless
+                // a task was already running. (If the task was running
+                // at the time that we froze the layer, it must've seen
+                // the layer we just froze before it exited; see comments
+                // in flush_frozen_layers())
+                if let Ok(guard) = self.layer_flush_lock.try_lock() {
+                    drop(guard);
+                    let self_clone = Arc::clone(self);
+                    task_mgr::spawn(
+                        task_mgr::BACKGROUND_RUNTIME.handle(),
+                        task_mgr::TaskKind::LayerFlushTask,
+                        Some(self.tenant_id),
+                        Some(self.timeline_id),
+                        "layer flush task",
+                        false,
+                        async move { self_clone.flush_frozen_layers(false) },
+                    );
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Move the timeline to `new_state`, skipping transitions that are not allowed.
+    ///
+    /// The update is ignored (and logged) when `new_state` equals the current state,
+    /// when the current state is `Broken`, or when `Active` is requested while the
+    /// current state is `Paused`. Any other transition is published on the `state`
+    /// watch channel.
+    pub fn set_state(&self, new_state: TimelineState) {
+        match (self.current_state(), new_state) {
+            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
+                debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
+            }
+            (TimelineState::Broken, _) => {
+                // This is the timeline state machine, so the log must name the timeline,
+                // not the tenant.
+                error!("Ignoring state update {new_state:?} for broken timeline");
+            }
+            (TimelineState::Paused, TimelineState::Active) => {
+                debug!("Not activating a paused timeline");
+            }
+            (_, new_state) => {
+                self.state.send_replace(new_state);
+            }
+        }
+    }
+
+    /// Returns a copy of the latest value held in the `state` watch channel.
+    pub fn current_state(&self) -> TimelineState {
+        let latest = self.state.borrow();
+        *latest
+    }
+
+    /// Returns true when the current state is `TimelineState::Active`.
+    pub fn is_active(&self) -> bool {
+        matches!(self.current_state(), TimelineState::Active)
+    }
+
+    /// Creates a new receiver subscribed to the `state` watch channel.
+    pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
+        watch::Sender::subscribe(&self.state)
+    }
 }

 // Private functions
@@ -527,7 +714,7 @@ impl Timeline {
    ///
    /// Loads the metadata for the timeline into memory, but not the layer map.
    #[allow(clippy::too_many_arguments)]
-    pub fn new(
+    pub(super) fn new(
        conf: &'static PageServerConf,
        tenant_conf: Arc<RwLock<TenantConfOpt>>,
        metadata: TimelineMetadata,
@@ -537,8 +724,9 @@ impl Timeline {
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        upload_layers: bool,
        pg_version: u32,
-    ) -> Timeline {
+    ) -> Self {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
+        let (state, _) = watch::channel(TimelineState::Suspended);

        let mut result = Timeline {
            conf,
@@ -595,16 +783,17 @@ impl Timeline {

            last_received_wal: Mutex::new(None),
            rel_size_cache: RwLock::new(HashMap::new()),
+            state,
        };
        result.repartition_threshold = result.get_checkpoint_distance() / 10;
        result
    }

-    pub fn launch_wal_receiver(self: &Arc<Self>) -> anyhow::Result<()> {
+    pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
        if !is_etcd_client_initialized() {
            if cfg!(test) {
                info!("not launching WAL receiver because etcd client hasn't been initialized");
-                return Ok(());
+                return;
            } else {
                panic!("etcd client not initialized");
            }
@@ -632,16 +821,14 @@ impl Timeline {
            walreceiver_connect_timeout,
            lagging_wal_timeout,
            max_lsn_wal_lag,
-        )?;
-
-        Ok(())
+        );
    }

    ///
    /// Scan the timeline directory to populate the layer map.
    /// Returns all timeline-related files that were found and loaded.
    ///
-    pub fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
+    pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
        let mut layers = self.layers.write().unwrap();
        let mut num_layers = 0;

@@ -727,33 +914,13 @@ impl Timeline {
        Ok(())
    }

-    pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
+    pub(super) fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
        self.layer_removal_cs
            .try_lock()
            .map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
    }

-    /// Retrieve current logical size of the timeline.
-    ///
-    /// The size could be lagging behind the actual number, in case
-    /// the initial size calculation has not been run (gets triggered on the first size access).
-    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
-        let current_size = self.current_logical_size.current_size()?;
-        debug!("Current size: {current_size:?}");
-
-        let size = current_size.size();
-        if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
-            (current_size, self.current_logical_size.initial_part_end)
-        {
-            self.try_spawn_size_init_task(init_lsn);
-        }
-
-        Ok(size)
-    }
-
    fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn) {
-        let timeline_id = self.timeline_id;
-
        // Atomically check if the timeline size calculation had already started.
        // If the flag was not already set, this sets it.
        if !self
@@ -770,17 +937,42 @@ impl Timeline {
                "initial size calculation",
                false,
                async move {
-                    let calculated_size = self_clone.calculate_logical_size(init_lsn)?;
-                    let result = spawn_blocking(move || {
-                        self_clone.current_logical_size.initial_logical_size.set(calculated_size)
-                    }).await?;
-                    match result {
-                        Ok(()) => info!("Successfully calculated initial logical size"),
-                        Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
+                    let mut timeline_state_updates = self_clone.subscribe_for_state_updates();
+                    let self_calculation = Arc::clone(&self_clone);
+                    tokio::select! {
+                        calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => {
+                            let calculated_size = calculation_result
+                                .context("Failed to spawn calculation result task")?
+                                .context("Failed to calculate logical size")?;
+                            match self_clone.current_logical_size.initial_logical_size.set(calculated_size) {
+                                Ok(()) => info!("Successfully calculated initial logical size"),
+                                Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
+                            }
+                            Ok(())
+                        },
+                        new_event = async {
+                            loop {
+                                match timeline_state_updates.changed().await {
+                                    Ok(()) => {
+                                        let new_state = *timeline_state_updates.borrow();
+                                        match new_state {
+                                            // we're running this job for active timelines only
+                                            TimelineState::Active => continue,
+                                            TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return Some(new_state),
+                                        }
+                                    }
+                                    Err(_sender_dropped_error) => return None,
+                                }
+                            }
+                        } => {
+                            match new_event {
+                                Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"),
+                                None => info!("Timeline dropped state updates sender, stopping init size calculation"),
+                            }
+                            Ok(())
+                        },
                    }
-                    Ok(())
-                }
-                .instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id))
+                }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)),
            );
        }
    }
@@ -971,7 +1163,7 @@ impl Timeline {
        Some((lsn, img))
    }

-    fn get_ancestor_timeline(&self) -> Result<Arc<Timeline>> {
+    fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
        let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
            format!(
                "Ancestor is missing. Timeline id: {} Ancestor id {:?}",
@@ -1030,14 +1222,14 @@ impl Timeline {
        Ok(layer)
    }

-    fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+    fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
        //info!("PUT: key {} at {}", key, lsn);
        let layer = self.get_layer_for_write(lsn)?;
        layer.put_value(key, lsn, val)?;
        Ok(())
    }

-    fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
+    fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
        let layer = self.get_layer_for_write(lsn)?;
        layer.put_tombstone(key_range, lsn)?;

@@ -1076,64 +1268,6 @@ impl Timeline {
        drop(layers);
    }

-    ///
-    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
-    /// the in-memory layer, and initiate flushing it if so.
-    ///
-    /// Also flush after a period of time without new data -- it helps
-    /// safekeepers to regard pageserver as caught up and suspend activity.
-    ///
-    pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> Result<()> {
-        let last_lsn = self.get_last_record_lsn();
-        let layers = self.layers.read().unwrap();
-        if let Some(open_layer) = &layers.open_layer {
-            let open_layer_size = open_layer.size()?;
-            drop(layers);
-            let last_freeze_at = self.last_freeze_at.load();
-            let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
-            let distance = last_lsn.widening_sub(last_freeze_at);
-            // Checkpointing the open layer can be triggered by layer size or LSN range.
-            // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
-            // we want to stay below that with a big margin.  The LSN distance determines how
-            // much WAL the safekeepers need to store.
-            if distance >= self.get_checkpoint_distance().into()
-                || open_layer_size > self.get_checkpoint_distance()
-                || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
-            {
-                info!(
-                    "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
-                    distance,
-                    open_layer_size,
-                    last_freeze_ts.elapsed()
-                );
-
-                self.freeze_inmem_layer(true);
-                self.last_freeze_at.store(last_lsn);
-                *(self.last_freeze_ts.write().unwrap()) = Instant::now();
-
-                // Launch a task to flush the frozen layer to disk, unless
-                // a task was already running. (If the task was running
-                // at the time that we froze the layer, it must've seen the
-                // the layer we just froze before it exited; see comments
-                // in flush_frozen_layers())
-                if let Ok(guard) = self.layer_flush_lock.try_lock() {
-                    drop(guard);
-                    let self_clone = Arc::clone(self);
-                    task_mgr::spawn(
-                        task_mgr::BACKGROUND_RUNTIME.handle(),
-                        task_mgr::TaskKind::LayerFlushTask,
-                        Some(self.tenant_id),
-                        Some(self.timeline_id),
-                        "layer flush task",
-                        false,
-                        async move { self_clone.flush_frozen_layers(false) },
-                    );
-                }
-            }
-        }
-        Ok(())
-    }
-
    /// Flush all frozen layers to disk.
    ///
    /// Only one task at a time can be doing layer-flushing for a
@@ -1141,7 +1275,7 @@ impl Timeline {
    /// currently doing the flushing, this function will wait for it
    /// to finish. If 'wait' is false, this function will return
    /// immediately instead.
-    fn flush_frozen_layers(&self, wait: bool) -> Result<()> {
+    fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
        let flush_lock_guard = if wait {
            self.layer_flush_lock.lock().unwrap()
        } else {
@@ -1180,7 +1314,7 @@ impl Timeline {
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
+    fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -1238,7 +1372,7 @@ impl Timeline {
        &self,
        disk_consistent_lsn: Lsn,
        layer_paths_to_upload: HashMap<PathBuf, LayerFileMetadata>,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        // We can only save a valid 'prev_record_lsn' value on disk if we
        // flushed *all* in-memory changes to disk. We only track
        // 'prev_record_lsn' in memory for the latest processed record, so we
@@ -1283,7 +1417,7 @@ impl Timeline {
            false,
        )?;

-        if self.upload_layers.load(atomic::Ordering::Relaxed) {
+        if self.can_upload_layers() {
            storage_sync::schedule_layer_upload(
                self.tenant_id,
                self.timeline_id,
@@ -1299,7 +1433,7 @@ impl Timeline {
    fn create_delta_layer(
        &self,
        frozen_layer: &InMemoryLayer,
-    ) -> Result<(PathBuf, LayerFileMetadata)> {
+    ) -> anyhow::Result<(PathBuf, LayerFileMetadata)> {
        // Write it out
        let new_delta = frozen_layer.write_to_disk()?;
        let new_delta_path = new_delta.path();
@@ -1334,92 +1468,7 @@ impl Timeline {
        Ok((new_delta_path, LayerFileMetadata::new(sz)))
    }

-    pub fn compact(&self) -> anyhow::Result<()> {
-        let last_record_lsn = self.get_last_record_lsn();
-
-        // Last record Lsn could be zero in case the timelie was just created
-        if !last_record_lsn.is_valid() {
-            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
-            return Ok(());
-        }
-
-        //
-        // High level strategy for compaction / image creation:
-        //
-        // 1. First, calculate the desired "partitioning" of the
-        // currently in-use key space. The goal is to partition the
-        // key space into roughly fixed-size chunks, but also take into
-        // account any existing image layers, and try to align the
-        // chunk boundaries with the existing image layers to avoid
-        // too much churn. Also try to align chunk boundaries with
-        // relation boundaries.  In principle, we don't know about
-        // relation boundaries here, we just deal with key-value
-        // pairs, and the code in pgdatadir_mapping.rs knows how to
-        // map relations into key-value pairs. But in practice we know
-        // that 'field6' is the block number, and the fields 1-5
-        // identify a relation. This is just an optimization,
-        // though.
-        //
-        // 2. Once we know the partitioning, for each partition,
-        // decide if it's time to create a new image layer. The
-        // criteria is: there has been too much "churn" since the last
-        // image layer? The "churn" is fuzzy concept, it's a
-        // combination of too many delta files, or too much WAL in
-        // total in the delta file. Or perhaps: if creating an image
-        // file would allow to delete some older files.
-        //
-        // 3. After that, we compact all level0 delta files if there
-        // are too many of them.  While compacting, we also garbage
-        // collect any page versions that are no longer needed because
-        // of the new image layers we created in step 2.
-        //
-        // TODO: This high level strategy hasn't been implemented yet.
-        // Below are functions compact_level0() and create_image_layers()
-        // but they are a bit ad hoc and don't quite work like it's explained
-        // above. Rewrite it.
-        let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
-
-        let target_file_size = self.get_checkpoint_distance();
-
-        // Define partitioning schema if needed
-
-        match self.repartition(
-            self.get_last_record_lsn(),
-            self.get_compaction_target_size(),
-        ) {
-            Ok((partitioning, lsn)) => {
-                // 2. Create new image layers for partitions that have been modified
-                // "enough".
-                let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
-                if !layer_paths_to_upload.is_empty()
-                    && self.upload_layers.load(atomic::Ordering::Relaxed)
-                {
-                    storage_sync::schedule_layer_upload(
-                        self.tenant_id,
-                        self.timeline_id,
-                        layer_paths_to_upload,
-                        None,
-                    );
-                }
-
-                // 3. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(target_file_size)?;
-                timer.stop_and_record();
-            }
-            Err(err) => {
-                // no partitioning? This is normal, if the timeline was just created
-                // as an empty timeline. Also in unit tests, when we use the timeline
-                // as a simple key-value store, ignoring the datadir layout. Log the
-                // error but continue.
-                error!("could not compact, repartitioning keyspace failed: {err:?}");
-            }
-        };
-
-        Ok(())
-    }
-
-    fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> {
+    fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> {
        let mut partitioning_guard = self.partitioning.lock().unwrap();
        if partitioning_guard.1 == Lsn(0)
            || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
@@ -1433,7 +1482,7 @@ impl Timeline {
    }

    // Is it time to create a new image layer for the given partition?
-    fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<bool> {
+    fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
        let layers = self.layers.read().unwrap();

        for part_range in &partition.ranges {
@@ -1478,7 +1527,7 @@ impl Timeline {
        partitioning: &KeyPartitioning,
        lsn: Lsn,
        force: bool,
-    ) -> Result<HashMap<PathBuf, LayerFileMetadata>> {
+    ) -> anyhow::Result<HashMap<PathBuf, LayerFileMetadata>> {
        let timer = self.metrics.create_images_time_histo.start_timer();
        let mut image_layers: Vec<ImageLayer> = Vec::new();
        for partition in partitioning.parts.iter() {
@@ -1496,7 +1545,32 @@ impl Timeline {
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
-                        let img = self.get(key, lsn)?;
+                        let img = match self.get(key, lsn) {
+                            Ok(img) => img,
+                            Err(err) => {
+                                // If we fail to reconstruct a VM or FSM page, we can zero the
+                                // page without losing any actual user data. That seems better
+                                // than failing repeatedly and getting stuck.
+                                //
+                                // We had a bug at one point, where we truncated the FSM and VM
+                                // in the pageserver, but the Postgres didn't know about that
+                                // and continued to generate incremental WAL records for pages
+                                // that didn't exist in the pageserver. Trying to replay those
+                                // WAL records failed to find the previous image of the page.
+                                // This special case allows us to recover from that situation.
+                                // See https://github.com/neondatabase/neon/issues/2601.
+                                //
+                                // Unfortunately we cannot do this for the main fork, or for
+                                // any metadata keys, as that would lead to actual data
+                                // loss.
+                                if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
+                                    warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
+                                    ZERO_PAGE.clone()
+                                } else {
+                                    return Err(err);
+                                }
+                            }
+                        };
                        image_layer_writer.put_image(key, &img)?;
                        key = key.next();
                    }
@@ -1546,7 +1620,7 @@ impl Timeline {
    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
    /// as Level 1 files.
    ///
-    fn compact_level0(&self, target_file_size: u64) -> Result<()> {
+    fn compact_level0(&self, target_file_size: u64) -> anyhow::Result<()> {
        let layers = self.layers.read().unwrap();
        let mut level0_deltas = layers.get_level0_deltas()?;
        drop(layers);
@@ -1813,7 +1887,7 @@ impl Timeline {
        }
        drop(layers);

-        if self.upload_layers.load(atomic::Ordering::Relaxed) {
+        if self.can_upload_layers() {
            storage_sync::schedule_layer_upload(
                self.tenant_id,
                self.timeline_id,
@@ -1856,12 +1930,12 @@ impl Timeline {
    ///
    /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
    /// whether a record is needed for PITR.
-    pub fn update_gc_info(
+    pub(super) fn update_gc_info(
        &self,
        retain_lsns: Vec<Lsn>,
        cutoff_horizon: Lsn,
        pitr: Duration,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        let mut gc_info = self.gc_info.write().unwrap();

        gc_info.horizon_cutoff = cutoff_horizon;
@@ -1916,8 +1990,8 @@ impl Timeline {
    /// within a layer file. We can only remove the whole file if it's fully
    /// obsolete.
    ///
-    pub fn gc(&self) -> Result<GcResult> {
-        let mut result: GcResult = Default::default();
+    pub(super) fn gc(&self) -> anyhow::Result<GcResult> {
+        let mut result: GcResult = GcResult::default();
        let now = SystemTime::now();

        fail_point!("before-timeline-gc");
@@ -1959,10 +2033,10 @@ impl Timeline {
                new_gc_cutoff
            );
            write_guard.store_and_unlock(new_gc_cutoff).wait();
-
-            // Persist metadata file
-            self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
        }
+        // Persist the new GC cutoff value in the metadata file, before
+        // we actually remove anything.
+        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;

        info!("GC starting");

@@ -2089,18 +2163,15 @@ impl Timeline {
        }

        info!(
-            "GC completed removing {} layers, cuttof {}",
+            "GC completed removing {} layers, cutoff {}",
            result.layers_removed, new_gc_cutoff
        );
+
        if result.layers_removed != 0 {
-            fail_point!("gc-before-save-metadata", |_| {
-                info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point");
-                std::process::abort();
-            });
-            return Ok(result);
+            fail_point!("after-timeline-gc-removed-layers");
        }

-        if self.upload_layers.load(atomic::Ordering::Relaxed) {
+        if self.can_upload_layers() {
            storage_sync::schedule_layer_delete(
                self.tenant_id,
                self.timeline_id,
@@ -2189,6 +2260,11 @@ impl Timeline {
            }
        }
    }
+
+    fn can_upload_layers(&self) -> bool {
+        self.upload_layers.load(atomic::Ordering::Relaxed)
+            && self.current_state() != TimelineState::Broken
+    }
 }

 /// Helper function for get_reconstruct_data() to add the path of layers traversed
@@ -2239,11 +2315,12 @@ impl<'a> TimelineWriter<'a> {
    ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
-    pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
+    pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
+        page_image_cache::remove(key, self.tenant_id, self.timeline_id);
        self.tl.put_value(key, lsn, value)
    }

-    pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
+    pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
        self.tl.put_tombstone(key_range, lsn)
    }

--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -12,7 +12,7 @@ use tracing::*;

 use remote_storage::GenericRemoteStorage;

-use crate::config::{PageServerConf, METADATA_FILE_NAME};
+use crate::config::{PageServerConf, METADATA_FILE_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
 use crate::http::models::TenantInfo;
 use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
 use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
@@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt;
 use crate::walredo::PostgresRedoManager;
 use crate::TEMP_FILE_SUFFIX;

-use utils::crashsafe_dir::{self, path_with_suffix_extension};
+use utils::crashsafe::{self, path_with_suffix_extension};
 use utils::id::{TenantId, TimelineId};

 mod tenants_state {
@@ -265,58 +265,98 @@ fn create_tenant_files(
        temporary_tenant_dir.display()
    );

-    let temporary_tenant_timelines_dir = rebase_directory(
-        &conf.timelines_path(&tenant_id),
-        &target_tenant_directory,
-        &temporary_tenant_dir,
-    )?;
-    let temporary_tenant_config_path = rebase_directory(
-        &conf.tenant_config_path(tenant_id),
-        &target_tenant_directory,
-        &temporary_tenant_dir,
-    )?;
-
    // top-level dir may exist if we are creating it through CLI
-    crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| {
+    crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
        format!(
            "could not create temporary tenant directory {}",
            temporary_tenant_dir.display()
        )
    })?;
-    // first, create a config in the top-level temp directory, fsync the file
-    Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?;
-    // then, create a subdirectory in the top-level temp directory, fsynced
-    crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
+
+    let creation_result = try_create_target_tenant_dir(
+        conf,
+        tenant_conf,
+        tenant_id,
+        &temporary_tenant_dir,
+        &target_tenant_directory,
+    );
+
+    if creation_result.is_err() {
+        error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
+        if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
+            error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
+        } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
+            error!(
+                "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
+            )
+        }
+    }
+
+    creation_result
+}
+
+fn try_create_target_tenant_dir(
+    conf: &'static PageServerConf,
+    tenant_conf: TenantConfOpt,
+    tenant_id: TenantId,
+    temporary_tenant_dir: &Path,
+    target_tenant_directory: &Path,
+) -> Result<(), anyhow::Error> {
+    let temporary_tenant_timelines_dir = rebase_directory(
+        &conf.timelines_path(&tenant_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?;
+    let temporary_tenant_config_path = rebase_directory(
+        &conf.tenant_config_path(tenant_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?;
+
+    Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context(
+        || {
+            format!(
+                "Failed to write tenant {} config to {}",
+                tenant_id,
+                temporary_tenant_config_path.display()
+            )
+        },
+    )?;
+    crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
        format!(
-            "could not create temporary tenant timelines directory {}",
+            "could not create tenant {} temporary timelines directory {}",
+            tenant_id,
            temporary_tenant_timelines_dir.display()
        )
    })?;
-
    fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
        anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
    });

-    // move-rename tmp directory with all files synced into a permanent directory, fsync its parent
-    fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| {
+    fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| {
        format!(
-            "failed to move temporary tenant directory {} into the permanent one {}",
+            "failed to move tenant {} temporary directory {} into the permanent one {}",
+            tenant_id,
            temporary_tenant_dir.display(),
            target_tenant_directory.display()
        )
    })?;
    let target_dir_parent = target_tenant_directory.parent().with_context(|| {
        format!(
-            "Failed to get tenant dir parent for {}",
+            "Failed to get tenant {} dir parent for {}",
+            tenant_id,
            target_tenant_directory.display()
        )
    })?;
-    fs::File::open(target_dir_parent)?.sync_all()?;
-
-    info!(
-        "created tenant directory structure in {}",
-        target_tenant_directory.display()
-    );
+    crashsafe::fsync(target_dir_parent).with_context(|| {
+        format!(
+            "Failed to fsync renamed directory's parent {} for tenant {}",
+            target_dir_parent.display(),
+            tenant_id,
+        )
+    })?;

    Ok(())
 }
@@ -602,6 +642,15 @@ fn is_temporary(path: &Path) -> bool {
    }
 }

+fn is_uninit_mark(path: &Path) -> bool {
+    match path.file_name() {
+        Some(name) => name
+            .to_string_lossy()
+            .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
+        None => false,
+    }
+}
+
 fn collect_timelines_for_tenant(
    config: &'static PageServerConf,
    tenant_path: &Path,
@@ -644,28 +693,74 @@ fn collect_timelines_for_tenant(
                            e
                        );
                    }
+                } else if is_uninit_mark(&timeline_dir) {
+                    let timeline_uninit_mark_file = &timeline_dir;
+                    info!(
+                        "Found an uninit mark file {}, removing the timeline and its uninit mark",
+                        timeline_uninit_mark_file.display()
+                    );
+                    let timeline_id = timeline_uninit_mark_file
+                        .file_stem()
+                        .and_then(OsStr::to_str)
+                        .unwrap_or_default()
+                        .parse::<TimelineId>()
+                        .with_context(|| {
+                            format!(
+                                "Could not parse timeline id out of the timeline uninit mark name {}",
+                                timeline_uninit_mark_file.display()
+                            )
+                        })?;
+                    let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
+                    if let Err(e) =
+                        remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
+                    {
+                        error!("Failed to clean up uninit marked timeline: {e:?}");
+                    }
                } else {
-                    match collect_timeline_files(&timeline_dir) {
-                        Ok((timeline_id, metadata, timeline_files)) => {
-                            tenant_timelines.insert(
-                                timeline_id,
-                                TimelineLocalFiles::collected(metadata, timeline_files),
-                            );
+                    let timeline_id = timeline_dir
+                        .file_name()
+                        .and_then(OsStr::to_str)
+                        .unwrap_or_default()
+                        .parse::<TimelineId>()
+                        .with_context(|| {
+                            format!(
+                                "Could not parse timeline id out of the timeline dir name {}",
+                                timeline_dir.display()
+                            )
+                        })?;
+                    let timeline_uninit_mark_file =
+                        config.timeline_uninit_mark_file_path(tenant_id, timeline_id);
+                    if timeline_uninit_mark_file.exists() {
+                        info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark");
+                        if let Err(e) = remove_timeline_and_uninit_mark(
+                            &timeline_dir,
+                            &timeline_uninit_mark_file,
+                        ) {
+                            error!("Failed to clean up uninit marked timeline: {e:?}");
                        }
-                        Err(e) => {
-                            error!(
-                                "Failed to process timeline dir contents at '{}', reason: {:?}",
-                                timeline_dir.display(),
-                                e
-                            );
-                            match remove_if_empty(&timeline_dir) {
-                                Ok(true) => info!(
-                                    "Removed empty timeline directory {}",
-                                    timeline_dir.display()
-                                ),
-                                Ok(false) => (),
-                                Err(e) => {
-                                    error!("Failed to remove empty timeline directory: {e:?}")
+                    } else {
+                        match collect_timeline_files(&timeline_dir) {
+                            Ok((metadata, timeline_files)) => {
+                                tenant_timelines.insert(
+                                    timeline_id,
+                                    TimelineLocalFiles::collected(metadata, timeline_files),
+                                );
+                            }
+                            Err(e) => {
+                                error!(
+                                    "Failed to process timeline dir contents at '{}', reason: {:?}",
+                                    timeline_dir.display(),
+                                    e
+                                );
+                                match remove_if_empty(&timeline_dir) {
+                                    Ok(true) => info!(
+                                        "Removed empty timeline directory {}",
+                                        timeline_dir.display()
+                                    ),
+                                    Ok(false) => (),
+                                    Err(e) => {
+                                        error!("Failed to remove empty timeline directory: {e:?}")
+                                    }
                                }
                            }
                        }
@@ -688,24 +783,41 @@ fn collect_timelines_for_tenant(
    Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
 }

+fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
+    fs::remove_dir_all(&timeline_dir)
+        .or_else(|e| {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                // we can leave the uninit mark without a timeline dir,
+                // just remove the mark then
+                Ok(())
+            } else {
+                Err(e)
+            }
+        })
+        .with_context(|| {
+            format!(
+                "Failed to remove unit marked timeline directory {}",
+                timeline_dir.display()
+            )
+        })?;
+    fs::remove_file(&uninit_mark).with_context(|| {
+        format!(
+            "Failed to remove timeline uninit mark file {}",
+            uninit_mark.display()
+        )
+    })?;
+
+    Ok(())
+}
+
 // discover timeline files and extract timeline metadata
 //  NOTE: ephemeral files are excluded from the list
 fn collect_timeline_files(
    timeline_dir: &Path,
-) -> anyhow::Result<(
-    TimelineId,
-    TimelineMetadata,
-    HashMap<PathBuf, LayerFileMetadata>,
-)> {
+) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> {
    let mut timeline_files = HashMap::new();
    let mut timeline_metadata_path = None;

-    let timeline_id = timeline_dir
-        .file_name()
-        .and_then(OsStr::to_str)
-        .unwrap_or_default()
-        .parse::<TimelineId>()
-        .context("Could not parse timeline id out of the timeline dir name")?;
    let timeline_dir_entries =
        fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
    for entry in timeline_dir_entries {
@@ -754,5 +866,5 @@ fn collect_timeline_files(
        "Timeline has no ancestor and no layer files"
    );

-    Ok((timeline_id, metadata, timeline_files))
+    Ok((metadata, timeline_files))
 }
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant_tasks.rs
@@ -175,7 +175,7 @@ async fn wait_for_active_tenant(
                        }
                        state => {
                            debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}");
-                            tokio::time::sleep(wait).await;
+                            continue;
                        }
                    }
                }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -31,9 +31,10 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;

 use crate::pgdatadir_mapping::*;
-use crate::reltag::{RelTag, SlruKind};
 use crate::tenant::Timeline;
 use crate::walrecord::*;
+use crate::ZERO_PAGE;
+use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
@@ -43,8 +44,6 @@ use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;

-static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
-
 pub struct WalIngest<'a> {
    timeline: &'a Timeline,

--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -12,6 +12,7 @@
 use std::{
    collections::{hash_map, HashMap},
    num::NonZeroU64,
+    ops::ControlFlow,
    sync::Arc,
    time::Duration,
 };
@@ -26,7 +27,8 @@ use etcd_broker::{
    subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
    BrokerUpdate, Client,
 };
-use tokio::select;
+use pageserver_api::models::TimelineState;
+use tokio::{select, sync::watch};
 use tracing::*;

 use crate::{
@@ -47,7 +49,7 @@ pub fn spawn_connection_manager_task(
    wal_connect_timeout: Duration,
    lagging_wal_timeout: Duration,
    max_lsn_wal_lag: NonZeroU64,
-) -> anyhow::Result<()> {
+) {
    let mut etcd_client = get_etcd_client().clone();

    let tenant_id = timeline.tenant_id;
@@ -58,10 +60,7 @@ pub fn spawn_connection_manager_task(
        TaskKind::WalReceiverManager,
        Some(tenant_id),
        Some(timeline_id),
-        &format!(
-            "walreceiver for tenant {} timeline {}",
-            timeline.tenant_id, timeline.timeline_id
-        ),
+        &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
        false,
        async move {
            info!("WAL receiver broker started, connecting to etcd");
@@ -75,19 +74,21 @@ pub fn spawn_connection_manager_task(
                select! {
                    _ = task_mgr::shutdown_watcher() => {
                        info!("WAL receiver shutdown requested, shutting down");
-                        // Kill current connection, if any
-                        if let Some(wal_connection) = walreceiver_state.wal_connection.take()
-                        {
-                            wal_connection.connection_task.shutdown().await;
-                        }
+                        walreceiver_state.shutdown().await;
                        return Ok(());
                    },
-
-                    _ = connection_manager_loop_step(
+                    loop_step_result = connection_manager_loop_step(
                        &broker_loop_prefix,
                        &mut etcd_client,
                        &mut walreceiver_state,
-                    ) => {},
+                    ) => match loop_step_result {
+                        ControlFlow::Continue(()) => continue,
+                        ControlFlow::Break(()) => {
+                            info!("Connection manager loop ended, shutting down");
+                            walreceiver_state.shutdown().await;
+                            return Ok(());
+                        }
+                    },
                }
            }
        }
@@ -95,7 +96,6 @@ pub fn spawn_connection_manager_task(
            info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
        ),
    );
-    Ok(())
 }

 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
@@ -105,7 +105,17 @@ async fn connection_manager_loop_step(
    broker_prefix: &str,
    etcd_client: &mut Client,
    walreceiver_state: &mut WalreceiverState,
-) {
+) -> ControlFlow<(), ()> {
+    let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
+
+    match wait_for_active_timeline(&mut timeline_state_updates).await {
+        ControlFlow::Continue(()) => {}
+        ControlFlow::Break(()) => {
+            info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
+            return ControlFlow::Break(());
+        }
+    }
+
    let id = TenantTimelineId {
        tenant_id: walreceiver_state.timeline.tenant_id,
        timeline_id: walreceiver_state.timeline.timeline_id,
@@ -130,10 +140,12 @@ async fn connection_manager_loop_step(
        //  - change connection if the rules decide so, or if the current connection dies
        //  - receive updates from broker
        //      - this might change the current desired connection
+        //  - timeline state changes to something that does not allow walreceiver to run concurrently
        select! {
            broker_connection_result = &mut broker_subscription.watcher_handle => {
+                info!("Broker connection was closed from the other side, ending current broker loop step");
                cleanup_broker_connection(broker_connection_result, walreceiver_state);
-                return;
+                return ControlFlow::Continue(());
            },

            Some(wal_connection_update) = async {
@@ -186,11 +198,36 @@ async fn connection_manager_loop_step(
                            (&mut broker_subscription.watcher_handle).await,
                            walreceiver_state,
                        );
-                        return;
+                        return ControlFlow::Continue(());
                    }
                }
            },

+            new_event = async {
+                loop {
+                    match timeline_state_updates.changed().await {
+                        Ok(()) => {
+                            let new_state = walreceiver_state.timeline.current_state();
+                            match new_state {
+                                // we're already active as walreceiver, no need to reactivate
+                                TimelineState::Active => continue,
+                                TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state),
+                            }
+                        }
+                        Err(_sender_dropped_error) => return ControlFlow::Break(()),
+                    }
+                }
+            } => match new_event {
+                ControlFlow::Continue(new_state) => {
+                    info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates");
+                    return ControlFlow::Continue(());
+                }
+                ControlFlow::Break(()) => {
+                    info!("Timeline dropped state updates sender, stopping wal connection manager loop");
+                    return ControlFlow::Break(());
+                }
+            },
+
            _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
        }

@@ -217,6 +254,34 @@ async fn connection_manager_loop_step(
    }
 }

+/// Waits until the watched timeline state is [`TimelineState::Active`].
+///
+/// Returns `ControlFlow::Continue(())` as soon as the state is `Active`
+/// (immediately, if it already is when this function is called).
+/// Returns `ControlFlow::Break(())` if waiting for the next state change fails,
+/// which the code treats as the state updates sender having been dropped.
+async fn wait_for_active_timeline(
+    timeline_state_updates: &mut watch::Receiver<TimelineState>,
+) -> ControlFlow<(), ()> {
+    // Fast path: check the value currently held by the channel before waiting for any change.
+    let current_state = *timeline_state_updates.borrow();
+    if current_state == TimelineState::Active {
+        return ControlFlow::Continue(());
+    }
+
+    // Otherwise, re-check the state after every change notification until it becomes active.
+    loop {
+        match timeline_state_updates.changed().await {
+            Ok(()) => {
+                let new_state = *timeline_state_updates.borrow();
+                match new_state {
+                    TimelineState::Active => {
+                        debug!("Timeline state changed to active, continuing the walreceiver connection manager");
+                        return ControlFlow::Continue(());
+                    }
+                    // Any other state only gets logged; keep waiting for the next update.
+                    state => {
+                        debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}");
+                        continue;
+                    }
+                }
+            }
+            // No more updates can be received, so tell the caller to stop instead of waiting forever.
+            Err(_sender_dropped_error) => return ControlFlow::Break(()),
+        }
+    }
+}
+
 fn cleanup_broker_connection(
    broker_connection_result: Result<Result<(), etcd_broker::BrokerError>, tokio::task::JoinError>,
    walreceiver_state: &mut WalreceiverState,
@@ -724,6 +789,12 @@ impl WalreceiverState {
            self.wal_connection_retries.remove(&node_id);
        }
    }
+
+    /// Shuts down the current WAL connection task, if there is one.
+    ///
+    /// Takes `self` by value, so the state cannot be used again after this call.
+    async fn shutdown(mut self) {
+        // `take()` moves the connection out (leaving `None`) so its task can be shut down and awaited.
+        if let Some(wal_connection) = self.wal_connection.take() {
+            wal_connection.connection_task.shutdown().await;
+        }
+    }
 }

 #[derive(Debug, PartialEq, Eq)]
@@ -802,6 +873,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
                        safekeeper_connstr: None,
                    },
                    etcd_version: 0,
@@ -818,6 +890,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some("no commit_lsn".to_string()),
                    },
                    etcd_version: 0,
@@ -834,6 +908,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
                        safekeeper_connstr: Some("no commit_lsn".to_string()),
                    },
                    etcd_version: 0,
@@ -850,6 +925,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
                        safekeeper_connstr: None,
                    },
                    etcd_version: 0,
@@ -909,6 +985,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -925,6 +1003,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some("not advanced Lsn".to_string()),
                    },
                    etcd_version: 0,
@@ -941,6 +1021,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some("not enough advanced Lsn".to_string()),
                    },
                    etcd_version: 0,
@@ -975,6 +1057,8 @@ mod tests {
                    backup_lsn: None,
                    remote_consistent_lsn: None,
                    peer_horizon_lsn: None,
+                    local_start_lsn: None,
+
                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                },
                etcd_version: 0,
@@ -1007,6 +1091,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some("smaller commit_lsn".to_string()),
                    },
                    etcd_version: 0,
@@ -1023,6 +1109,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1039,6 +1127,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: None,
                    },
                    etcd_version: 0,
@@ -1084,6 +1174,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1100,6 +1192,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1169,6 +1263,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1185,6 +1281,8 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
+                        local_start_lsn: None,
+
                        safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()),
                    },
                    etcd_version: 0,
@@ -1256,6 +1354,8 @@ mod tests {
                    backup_lsn: None,
                    remote_consistent_lsn: None,
                    peer_horizon_lsn: None,
+                    local_start_lsn: None,
+
                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                },
                etcd_version: 0,
@@ -1327,6 +1427,8 @@ mod tests {
                    backup_lsn: None,
                    remote_consistent_lsn: None,
                    peer_horizon_lsn: None,
+                    local_start_lsn: None,
+
                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                },
                etcd_version: 0,
@@ -1374,7 +1476,9 @@ mod tests {
            timeline: harness
                .load()
                .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION)
-                .expect("Failed to create an empty timeline for dummy wal connection manager"),
+                .expect("Failed to create an empty timeline for dummy wal connection manager")
+                .initialize()
+                .unwrap(),
            wal_connect_timeout: Duration::from_secs(1),
            lagging_wal_timeout: Duration::from_secs(1),
            max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -35,7 +35,7 @@ use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::crashsafe_dir::path_with_suffix_extension;
+use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

 use crate::metrics::{
@@ -43,10 +43,10 @@ use crate::metrics::{
    WAL_REDO_WAIT_TIME,
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
-use crate::reltag::{RelTag, SlruKind};
 use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;
 use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
+use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
 use postgres_ffi::v14::nonrelfile_utils::{
@@ -610,13 +610,26 @@ impl PostgresRedoProcess {
            );
            fs::remove_dir_all(&datadir)?;
        }
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).map_err(|e| {
+            Error::new(
+                ErrorKind::Other,
+                format!("incorrect pg_bin_dir path: {}", e),
+            )
+        })?;
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).map_err(|e| {
+            Error::new(
+                ErrorKind::Other,
+                format!("incorrect pg_lib_dir path: {}", e),
+            )
+        })?;
+
        info!("running initdb in {}", datadir.display());
-        let initdb = Command::new(conf.pg_bin_dir(pg_version).join("initdb"))
+        let initdb = Command::new(pg_bin_dir_path.join("initdb"))
            .args(&["-D", &datadir.to_string_lossy()])
            .arg("-N")
            .env_clear()
-            .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
-            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS
            .close_fds()
            .output()
            .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?;
@@ -642,14 +655,14 @@ impl PostgresRedoProcess {
        }

        // Start postgres itself
-        let mut child = Command::new(conf.pg_bin_dir(pg_version).join("postgres"))
+        let mut child = Command::new(pg_bin_dir_path.join("postgres"))
            .arg("--wal-redo")
            .stdin(Stdio::piped())
            .stderr(Stdio::piped())
            .stdout(Stdio::piped())
            .env_clear()
-            .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
-            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("PGDATA", &datadir)
            // The redo process is not trusted, so it runs in seccomp mode
            // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't
--- a/poetry.lock
+++ b/poetry.lock
@@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0"
 psycopg2-binary = ">=2.8.4"

 [package.extras]
-sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"]
+sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]

 [[package]]
 name = "allure-pytest"
@@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"]
 docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"]
 tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"]
-tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
+tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]

 [[package]]
 name = "aws-sam-translator"
@@ -514,14 +514,6 @@ python-versions = ">=3.7"
 [package.dependencies]
 typing-extensions = ">=4.1.0"

-[[package]]
-name = "cached-property"
-version = "1.5.2"
-description = "A decorator for caching properties in classes."
-category = "main"
-optional = false
-python-versions = "*"
-
 [[package]]
 name = "certifi"
 version = "2022.6.15"
@@ -568,7 +560,7 @@ optional = false
 python-versions = ">=3.6.0"

 [package.extras]
-unicode_backport = ["unicodedata2"]
+unicode-backport = ["unicodedata2"]

 [[package]]
 name = "click"
@@ -601,7 +593,7 @@ python-versions = ">=3.6"
 cffi = ">=1.12"

 [package.extras]
-docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"]
+docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"]
 docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
 pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"]
 sdist = ["setuptools_rust (>=0.11.4)"]
@@ -746,9 +738,9 @@ python-versions = ">=3.6.1,<4.0"

 [package.extras]
 colors = ["colorama (>=0.4.3,<0.5.0)"]
-pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
+pipfile-deprecated-finder = ["pipreqs", "requirementslib"]
 plugins = ["setuptools"]
-requirements_deprecated_finder = ["pip-api", "pipreqs"]
+requirements-deprecated-finder = ["pip-api", "pipreqs"]

 [[package]]
 name = "itsdangerous"
@@ -823,7 +815,7 @@ python-versions = ">=2.7"
 [package.extras]
 docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"]
 testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"]
-"testing.libs" = ["simplejson", "ujson", "yajl"]
+testing-libs = ["simplejson", "ujson", "yajl"]

 [[package]]
 name = "jsonpointer"
@@ -844,11 +836,12 @@ python-versions = "*"
 [package.dependencies]
 attrs = ">=17.4.0"
 pyrsistent = ">=0.14.0"
+setuptools = "*"
 six = ">=1.11.0"

 [package.extras]
 format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"]
-format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
+format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]

 [[package]]
 name = "junit-xml"
@@ -908,6 +901,7 @@ pytz = "*"
 PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
 requests = ">=2.5"
 responses = ">=0.9.0"
+setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
 sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
 werkzeug = ">=0.5,<2.2.0"
 xmltodict = "*"
@@ -1016,6 +1010,7 @@ python-versions = ">=3.7.0,<4.0.0"
 jsonschema = ">=3.2.0,<5.0.0"
 openapi-schema-validator = ">=0.2.0,<0.3.0"
 PyYAML = ">=5.1"
+setuptools = "*"

 [package.extras]
 requests = ["requests"]
@@ -1348,7 +1343,7 @@ urllib3 = ">=1.21.1,<1.27"

 [package.extras]
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
-use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"]
+use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

 [[package]]
 name = "responses"
@@ -1402,6 +1397,19 @@ python-versions = ">= 2.7"
 attrs = "*"
 pbr = "*"

+[[package]]
+name = "setuptools"
+version = "65.5.0"
+description = "Easily download, build, install, upgrade, and uninstall Python packages"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
+[package.extras]
+docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
+
 [[package]]
 name = "six"
 version = "1.16.0"
@@ -1468,6 +1476,14 @@ category = "main"
 optional = false
 python-versions = ">=3.7,<4.0"

+[[package]]
+name = "types-toml"
+version = "0.10.8"
+description = "Typing stubs for toml"
+category = "dev"
+optional = false
+python-versions = "*"
+
 [[package]]
 name = "types-urllib3"
 version = "1.26.17"
@@ -1552,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975"
+content-hash = "17cdbfe90f1b06dffaf24c3e076384ec08dd4a2dce5a05e50565f7364932eb2d"

 [metadata.files]
 aiopg = [
@@ -1647,10 +1663,6 @@ botocore-stubs = [
    {file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"},
    {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"},
 ]
-cached-property = [
-    {file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"},
-    {file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"},
-]
 certifi = [
    {file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"},
    {file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"},
@@ -2194,6 +2206,10 @@ sarif-om = [
    {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"},
    {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"},
 ]
+setuptools = [
+    {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
+    {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},
+]
 six = [
    {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
@@ -2222,6 +2238,10 @@ types-s3transfer = [
    {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"},
    {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"},
 ]
+types-toml = [
+    {file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"},
+    {file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"},
+]
 types-urllib3 = [
    {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"},
    {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"},
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,6 @@ requests = "^2.26.0"
 pytest-xdist = "^2.3.0"
 asyncpg = "^0.24.0"
 aiopg = "^1.3.1"
-cached-property = "^1.5.2"
 Jinja2 = "^3.0.2"
 types-requests = "^2.28.5"
 types-psycopg2 = "^2.9.18"
@@ -29,12 +28,14 @@ Werkzeug = "2.1.2"
 pytest-order = "^1.0.1"
 allure-pytest = "^2.10.0"
 pytest-asyncio = "^0.19.0"
+toml = "^0.10.2"

 [tool.poetry.dev-dependencies]
 flake8 = "^5.0.4"
 mypy = "==0.971"
 black = "^22.6.0"
 isort = "^5.10.1"
+types-toml = "^0.10.8"

 [build-system]
 requires = ["poetry-core>=1.0.0"]
@@ -74,7 +75,6 @@ strict = true
 [[tool.mypy.overrides]]
 module = [
    "asyncpg.*",
-    "cached_property.*",
    "pg8000.*",
 ]
 ignore_missing_imports = true
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -21,7 +21,8 @@ use metrics::set_build_info_metric;
 use safekeeper::broker;
 use safekeeper::control_file;
 use safekeeper::defaults::{
-    DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
+    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
+    DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
 };
 use safekeeper::http;
 use safekeeper::remove_wal;
@@ -31,8 +32,12 @@ use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use utils::auth::JwtAuth;
 use utils::{
-    http::endpoint, id::NodeId, logging, project_git_version, shutdown::exit_now, signals,
-    tcp_listener,
+    http::endpoint,
+    id::NodeId,
+    logging::{self, LogFormat},
+    project_git_version,
+    shutdown::exit_now,
+    signals, tcp_listener,
 };

 const LOCK_FILE_NAME: &str = "safekeeper.lock";
@@ -72,10 +77,6 @@ fn main() -> anyhow::Result<()> {
        conf.listen_http_addr = addr.to_string();
    }

-    if let Some(recall) = arg_matches.get_one::<String>("recall") {
-        conf.recall_period = humantime::parse_duration(recall)?;
-    }
-
    let mut given_id = None;
    if let Some(given_id_str) = arg_matches.get_one::<String>("id") {
        given_id = Some(NodeId(
@@ -93,6 +94,16 @@ fn main() -> anyhow::Result<()> {
        conf.broker_etcd_prefix = prefix.to_string();
    }

+    if let Some(heartbeat_timeout_str) = arg_matches.get_one::<String>("heartbeat-timeout") {
+        conf.heartbeat_timeout =
+            humantime::parse_duration(heartbeat_timeout_str).with_context(|| {
+                format!(
+                    "failed to parse heartbeat-timeout {}",
+                    heartbeat_timeout_str
+                )
+            })?;
+    }
+
    if let Some(backup_threads) = arg_matches.get_one::<String>("wal-backup-threads") {
        conf.backup_runtime_threads = backup_threads
            .parse()
@@ -105,6 +116,14 @@ fn main() -> anyhow::Result<()> {
        let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again
        conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?);
    }
+    if let Some(max_offloader_lag_str) = arg_matches.get_one::<String>("max-offloader-lag") {
+        conf.max_offloader_lag_bytes = max_offloader_lag_str.parse().with_context(|| {
+            format!(
+                "failed to parse max offloader lag {}",
+                max_offloader_lag_str
+            )
+        })?;
+    }
    // Seems like there is no better way to accept bool values explicitly in clap.
    conf.wal_backup_enabled = arg_matches
        .get_one::<String>("enable-wal-backup")
@@ -116,11 +135,15 @@ fn main() -> anyhow::Result<()> {
        .get_one::<String>("auth-validation-public-key-path")
        .map(PathBuf::from);

+    if let Some(log_format) = arg_matches.get_one::<String>("log-format") {
+        conf.log_format = LogFormat::from_config(log_format)?;
+    }
+
    start_safekeeper(conf, given_id, arg_matches.get_flag("init"))
 }

 fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bool) -> Result<()> {
-    let log_file = logging::init("safekeeper.log", conf.daemonize)?;
+    let log_file = logging::init("safekeeper.log", conf.daemonize, conf.log_format)?;

    info!("version: {GIT_VERSION}");

@@ -361,11 +384,6 @@ fn cli() -> Command {
                .short('p')
                .long("pageserver"),
        )
-        .arg(
-            Arg::new("recall")
-                .long("recall")
-                .help("Period for requestion pageserver to call for replication"),
-        )
        .arg(
            Arg::new("daemonize")
                .short('d')
@@ -397,6 +415,11 @@ fn cli() -> Command {
            .long("broker-etcd-prefix")
            .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
        )
+        .arg(
+            Arg::new("heartbeat-timeout")
+                .long("heartbeat-timeout")
+                .help(formatcp!("Peer is considered dead after not receiving heartbeats from it during this period (default {}s), passed as a human readable duration.", DEFAULT_HEARTBEAT_TIMEOUT.as_secs()))
+        )
        .arg(
            Arg::new("wal-backup-threads").long("backup-threads").help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")),
        ).arg(
@@ -404,6 +427,11 @@ fn cli() -> Command {
                .long("remote-storage")
                .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"<BUCKETNAME>\", \"bucket_region\":\"<REGION>\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]<tenant_id>/<timeline_id>/<segment_file>, mirroring structure on the file system.")
        )
+        .arg(
+            Arg::new("max-offloader-lag")
+                .long("max-offloader-lag")
+                .help(formatcp!("Safekeeper won't be elected for WAL offloading if it is lagging for more than this value (default {}MB) in bytes", DEFAULT_MAX_OFFLOADER_LAG_BYTES / (1 << 20)))
+        )
        .arg(
            Arg::new("enable-wal-backup")
                .long("enable-wal-backup")
@@ -416,6 +444,11 @@ fn cli() -> Command {
                .long("auth-validation-public-key-path")
                .help("Path to an RSA .pem public key which is used to check JWT tokens")
        )
+        .arg(
+            Arg::new("log-format")
+                .long("log-format")
+                .help("Format for logging, either 'plain' or 'json'")
+        )
 }

 #[test]
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -1,6 +1,5 @@
 //! Communication with etcd, providing safekeeper peers and pageserver coordination.

-use anyhow::anyhow;
 use anyhow::Context;
 use anyhow::Error;
 use anyhow::Result;
@@ -12,11 +11,9 @@ use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use std::time::Duration;
-use tokio::spawn;
 use tokio::task::JoinHandle;
 use tokio::{runtime, time::sleep};
 use tracing::*;
-use url::Url;

 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
@@ -56,113 +53,6 @@ fn timeline_safekeeper_path(
    )
 }

-pub struct Election {
-    pub election_name: String,
-    pub candidate_name: String,
-    pub broker_endpoints: Vec<Url>,
-}
-
-impl Election {
-    pub fn new(election_name: String, candidate_name: String, broker_endpoints: Vec<Url>) -> Self {
-        Self {
-            election_name,
-            candidate_name,
-            broker_endpoints,
-        }
-    }
-}
-
-pub struct ElectionLeader {
-    client: Client,
-    keep_alive: JoinHandle<Result<()>>,
-}
-
-impl ElectionLeader {
-    pub async fn check_am_i(
-        &mut self,
-        election_name: String,
-        candidate_name: String,
-    ) -> Result<bool> {
-        let resp = self.client.leader(election_name).await?;
-
-        let kv = resp
-            .kv()
-            .ok_or_else(|| anyhow!("failed to get leader response"))?;
-        let leader = kv.value_str()?;
-
-        Ok(leader == candidate_name)
-    }
-
-    pub async fn give_up(self) {
-        self.keep_alive.abort();
-        // TODO: it'll be wise to resign here but it'll happen after lease expiration anyway
-        // should we await for keep alive termination?
-        let _ = self.keep_alive.await;
-    }
-}
-
-pub async fn get_leader(req: &Election, leader: &mut Option<ElectionLeader>) -> Result<()> {
-    let mut client = Client::connect(req.broker_endpoints.clone(), None)
-        .await
-        .context("Could not connect to etcd")?;
-
-    let lease = client
-        .lease_grant(LEASE_TTL_SEC, None)
-        .await
-        .context("Could not acquire a lease");
-
-    let lease_id = lease.map(|l| l.id()).unwrap();
-
-    // kill previous keepalive, if any
-    if let Some(l) = leader.take() {
-        l.give_up().await;
-    }
-
-    let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id));
-    // immediately save handle to kill task if we get canceled below
-    *leader = Some(ElectionLeader {
-        client: client.clone(),
-        keep_alive,
-    });
-
-    client
-        .campaign(
-            req.election_name.clone(),
-            req.candidate_name.clone(),
-            lease_id,
-        )
-        .await?;
-
-    Ok(())
-}
-
-async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> {
-    let (mut keeper, mut ka_stream) = client
-        .lease_keep_alive(lease_id)
-        .await
-        .context("failed to create keepalive stream")?;
-
-    loop {
-        let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
-
-        keeper
-            .keep_alive()
-            .await
-            .context("failed to send LeaseKeepAliveRequest")?;
-
-        ka_stream
-            .message()
-            .await
-            .context("failed to receive LeaseKeepAliveResponse")?;
-
-        sleep(push_interval).await;
-    }
-}
-
-pub fn get_candiate_name(system_id: NodeId) -> String {
-    format!("id_{system_id}")
-}
-
 async fn push_sk_info(
    ttid: TenantTimelineId,
    mut client: Client,
@@ -236,7 +126,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
        let handles = active_tlis
            .iter()
            .map(|tli| {
-                let sk_info = tli.get_public_info(&conf);
+                let sk_info = tli.get_safekeeper_info(&conf);
                let key =
                    timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id);
                let lease = leases.remove(&tli.ttid).unwrap();
@@ -282,6 +172,9 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
            Some(new_info) => {
                // note: there are blocking operations below, but it's considered fine for now
                if let Ok(tli) = GlobalTimelines::get(new_info.key.id) {
+                    // Note that we also receive *our own* info. That's
+                    // important, as it is used as an indication of live
+                    // connection to the broker.
                    tli.record_safekeeper_info(&new_info.value, new_info.key.node_id)
                        .await?
                }
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -1,6 +1,7 @@
 //! Code to deal with safekeeper control file upgrades
 use crate::safekeeper::{
-    AcceptorState, Peers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry,
+    AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory,
+    TermSwitchEntry,
 };
 use anyhow::{bail, Result};
 use serde::{Deserialize, Serialize};
@@ -134,7 +135,7 @@ pub struct SafeKeeperStateV4 {
    // fundamental; but state is saved here only for informational purposes and
    // obviously can be stale. (Currently not saved at all, but let's provision
    // place to have less file version upgrades).
-    pub peers: Peers,
+    pub peers: PersistedPeers,
 }

 pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState> {
@@ -165,7 +166,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
            backup_lsn: Lsn(0),
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
-            peers: Peers(vec![]),
+            peers: PersistedPeers(vec![]),
        });
    // migrate to hexing some ids
    } else if version == 2 {
@@ -188,7 +189,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
            backup_lsn: Lsn(0),
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
-            peers: Peers(vec![]),
+            peers: PersistedPeers(vec![]),
        });
    // migrate to moving tenant_id/timeline_id to the top and adding some lsns
    } else if version == 3 {
@@ -211,7 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
            backup_lsn: Lsn(0),
            peer_horizon_lsn: oldstate.truncate_lsn,
            remote_consistent_lsn: Lsn(0),
-            peers: Peers(vec![]),
+            peers: PersistedPeers(vec![]),
        });
    // migrate to having timeline_start_lsn
    } else if version == 4 {
@@ -234,7 +235,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
            backup_lsn: Lsn::INVALID,
            peer_horizon_lsn: oldstate.peer_horizon_lsn,
            remote_consistent_lsn: Lsn(0),
-            peers: Peers(vec![]),
+            peers: PersistedPeers(vec![]),
        });
    } else if version == 5 {
        info!("reading safekeeper control file version {}", version);
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -1,11 +1,16 @@
-use defaults::DEFAULT_WAL_BACKUP_RUNTIME_THREADS;
+use defaults::{
+    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
+};
 //
 use remote_storage::RemoteStorageConfig;
 use std::path::PathBuf;
 use std::time::Duration;
 use url::Url;

-use utils::id::{NodeId, TenantId, TenantTimelineId};
+use utils::{
+    id::{NodeId, TenantId, TenantTimelineId},
+    logging::LogFormat,
+};

 pub mod broker;
 pub mod control_file;
@@ -34,8 +39,9 @@ pub mod defaults {
        DEFAULT_PG_LISTEN_PORT,
    };

-    pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10);
    pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8;
+    pub const DEFAULT_HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(5);
+    pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
 }

 #[derive(Debug, Clone)]
@@ -52,7 +58,6 @@ pub struct SafeKeeperConf {
    pub no_sync: bool,
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
-    pub recall_period: Duration,
    pub remote_storage: Option<RemoteStorageConfig>,
    pub backup_runtime_threads: usize,
    pub wal_backup_enabled: bool,
@@ -60,6 +65,9 @@ pub struct SafeKeeperConf {
    pub broker_endpoints: Vec<Url>,
    pub broker_etcd_prefix: String,
    pub auth_validation_public_key_path: Option<PathBuf>,
+    pub heartbeat_timeout: Duration,
+    pub max_offloader_lag_bytes: u64,
+    pub log_format: LogFormat,
 }

 impl SafeKeeperConf {
@@ -85,13 +93,15 @@ impl Default for SafeKeeperConf {
            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            remote_storage: None,
-            recall_period: defaults::DEFAULT_RECALL_PERIOD,
            my_id: NodeId(0),
            broker_endpoints: Vec::new(),
            broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
            backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
            wal_backup_enabled: true,
            auth_validation_public_key_path: None,
+            heartbeat_timeout: DEFAULT_HEARTBEAT_TIMEOUT,
+            max_offloader_lag_bytes: DEFAULT_MAX_OFFLOADER_LAG_BYTES,
+            log_format: LogFormat::Plain,
        }
    }
 }
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -11,6 +11,7 @@ use std::cmp::max;
 use std::cmp::min;
 use std::fmt;
 use std::io::Read;
+
 use tracing::*;

 use crate::control_file;
@@ -132,9 +133,8 @@ pub struct ServerInfo {
    pub wal_seg_size: u32,
 }

-/// Data published by safekeeper to the peers
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PeerInfo {
+pub struct PersistedPeerInfo {
    /// LSN up to which safekeeper offloaded WAL to s3.
    backup_lsn: Lsn,
    /// Term of the last entry.
@@ -145,7 +145,7 @@ pub struct PeerInfo {
    commit_lsn: Lsn,
 }

-impl PeerInfo {
+impl PersistedPeerInfo {
    fn new() -> Self {
        Self {
            backup_lsn: Lsn::INVALID,
@@ -156,10 +156,8 @@ impl PeerInfo {
    }
 }

-// vector-based node id -> peer state map with very limited functionality we
-// need/
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Peers(pub Vec<(NodeId, PeerInfo)>);
+pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);

 /// Persistent information stored on safekeeper node
 /// On disk data is prefixed by magic and format version and followed by checksum.
@@ -203,7 +201,7 @@ pub struct SafeKeeperState {
    // fundamental; but state is saved here only for informational purposes and
    // obviously can be stale. (Currently not saved at all, but let's provision
    // place to have less file version upgrades).
-    pub peers: Peers,
+    pub peers: PersistedPeers,
 }

 #[derive(Debug, Clone)]
@@ -240,7 +238,12 @@ impl SafeKeeperState {
            backup_lsn: local_start_lsn,
            peer_horizon_lsn: local_start_lsn,
            remote_consistent_lsn: Lsn(0),
-            peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()),
+            peers: PersistedPeers(
+                peers
+                    .iter()
+                    .map(|p| (*p, PersistedPeerInfo::new()))
+                    .collect(),
+            ),
        }
    }

--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -7,7 +7,7 @@ use etcd_broker::subscription_value::SkTimelineInfo;

 use postgres_ffi::XLogSegNo;

-use tokio::sync::watch;
+use tokio::{sync::watch, time::Instant};

 use std::cmp::{max, min};

@@ -26,7 +26,7 @@ use utils::{

 use crate::safekeeper::{
    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
-    SafekeeperMemState, ServerInfo,
+    SafekeeperMemState, ServerInfo, Term,
 };
 use crate::send_wal::HotStandbyFeedback;
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
@@ -36,6 +36,53 @@ use crate::wal_storage;
 use crate::wal_storage::Storage as wal_storage_iface;
 use crate::SafeKeeperConf;

+/// Things safekeeper should know about timeline state on peers.
+#[derive(Debug, Clone)]
+pub struct PeerInfo {
+    pub sk_id: NodeId,
+    /// Term of the last entry.
+    _last_log_term: Term,
+    /// LSN of the last record.
+    _flush_lsn: Lsn,
+    pub commit_lsn: Lsn,
+    /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
+    /// sk since backup_lsn.
+    pub local_start_lsn: Lsn,
+    /// When info was received.
+    ts: Instant,
+}
+
+impl PeerInfo {
+    fn from_sk_info(sk_id: NodeId, sk_info: &SkTimelineInfo, ts: Instant) -> PeerInfo {
+        PeerInfo {
+            sk_id,
+            _last_log_term: sk_info.last_log_term.unwrap_or(0),
+            _flush_lsn: sk_info.flush_lsn.unwrap_or(Lsn::INVALID),
+            commit_lsn: sk_info.commit_lsn.unwrap_or(Lsn::INVALID),
+            local_start_lsn: sk_info.local_start_lsn.unwrap_or(Lsn::INVALID),
+            ts,
+        }
+    }
+}
+
+// vector-based node id -> peer state map with very limited functionality we
+// need.
+#[derive(Debug, Clone, Default)]
+pub struct PeersInfo(pub Vec<PeerInfo>);
+
+impl PeersInfo {
+    fn get(&mut self, id: NodeId) -> Option<&mut PeerInfo> {
+        self.0.iter_mut().find(|p| p.sk_id == id)
+    }
+
+    fn upsert(&mut self, p: &PeerInfo) {
+        match self.get(p.sk_id) {
+            Some(rp) => *rp = p.clone(),
+            None => self.0.push(p.clone()),
+        }
+    }
+}
+
 /// Replica status update + hot standby feedback
 #[derive(Debug, Clone, Copy)]
 pub struct ReplicaState {
@@ -74,6 +121,8 @@ impl ReplicaState {
 pub struct SharedState {
    /// Safekeeper object
    sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
+    /// In-memory list containing the state of peers as sent in their latest messages.
+    peers_info: PeersInfo,
    /// State of replicas
    replicas: Vec<Option<ReplicaState>>,
    /// True when WAL backup launcher oversees the timeline, making sure WAL is
@@ -123,7 +172,8 @@ impl SharedState {

        Ok(Self {
            sk,
-            replicas: Vec::new(),
+            peers_info: PeersInfo(vec![]),
+            replicas: vec![],
            wal_backup_active: false,
            active: false,
            num_computes: 0,
@@ -142,6 +192,7 @@ impl SharedState {

        Ok(Self {
            sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
+            peers_info: PeersInfo(vec![]),
            replicas: Vec::new(),
            wal_backup_active: false,
            active: false,
@@ -201,12 +252,6 @@ impl SharedState {
        self.wal_backup_active
    }

-    // Can this safekeeper offload to s3? Recently joined safekeepers might not
-    // have necessary WAL.
-    fn can_wal_backup(&self) -> bool {
-        self.sk.state.local_start_lsn <= self.sk.inmem.backup_lsn
-    }
-
    fn get_wal_seg_size(&self) -> usize {
        self.sk.state.server.wal_seg_size as usize
    }
@@ -268,6 +313,24 @@ impl SharedState {
        self.replicas.push(Some(state));
        pos
    }
+
+    fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
+        SkTimelineInfo {
+            last_log_term: Some(self.sk.get_epoch()),
+            flush_lsn: Some(self.sk.wal_store.flush_lsn()),
+            // note: this value is not flushed to control file yet and can be lost
+            commit_lsn: Some(self.sk.inmem.commit_lsn),
+            // TODO: rework feedbacks to avoid max here
+            remote_consistent_lsn: Some(max(
+                self.get_replicas_state().remote_consistent_lsn,
+                self.sk.inmem.remote_consistent_lsn,
+            )),
+            peer_horizon_lsn: Some(self.sk.inmem.peer_horizon_lsn),
+            safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
+            backup_lsn: Some(self.sk.inmem.backup_lsn),
+            local_start_lsn: Some(self.sk.state.local_start_lsn),
+        }
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -517,17 +580,6 @@ impl Timeline {
        self.write_shared_state().wal_backup_attend()
    }

-    /// Can this safekeeper offload to s3? Recently joined safekeepers might not
-    /// have necessary WAL.
-    pub fn can_wal_backup(&self) -> bool {
-        if self.is_cancelled() {
-            return false;
-        }
-
-        let shared_state = self.write_shared_state();
-        shared_state.can_wal_backup()
-    }
-
    /// Returns full timeline info, required for the metrics. If the timeline is
    /// not active, returns None instead.
    pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
@@ -632,36 +684,25 @@ impl Timeline {
        Ok(())
    }

-    /// Return public safekeeper info for broadcasting to broker and other peers.
-    pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
+    /// Get safekeeper info for broadcasting to broker and other peers.
+    pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
        let shared_state = self.write_shared_state();
-        SkTimelineInfo {
-            last_log_term: Some(shared_state.sk.get_epoch()),
-            flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
-            // note: this value is not flushed to control file yet and can be lost
-            commit_lsn: Some(shared_state.sk.inmem.commit_lsn),
-            // TODO: rework feedbacks to avoid max here
-            remote_consistent_lsn: Some(max(
-                shared_state.get_replicas_state().remote_consistent_lsn,
-                shared_state.sk.inmem.remote_consistent_lsn,
-            )),
-            peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
-            safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
-            backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
-        }
+        shared_state.get_safekeeper_info(conf)
    }

    /// Update timeline state with peer safekeeper data.
    pub async fn record_safekeeper_info(
        &self,
        sk_info: &SkTimelineInfo,
-        _sk_id: NodeId,
+        sk_id: NodeId,
    ) -> Result<()> {
        let is_wal_backup_action_pending: bool;
        let commit_lsn: Lsn;
        {
            let mut shared_state = self.write_shared_state();
            shared_state.sk.record_safekeeper_info(sk_info)?;
+            let peer_info = PeerInfo::from_sk_info(sk_id, sk_info, Instant::now());
+            shared_state.peers_info.upsert(&peer_info);
            is_wal_backup_action_pending = shared_state.update_status(self.ttid);
            commit_lsn = shared_state.sk.inmem.commit_lsn;
        }
@@ -673,6 +714,22 @@ impl Timeline {
        Ok(())
    }

+    /// Get our latest view of the status of alive peers on the timeline.
+    /// We pass our own info through the broker as well, so when we don't have a connection
+    /// to the broker, the returned vec is empty.
+    pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
+        let shared_state = self.write_shared_state();
+        let now = Instant::now();
+        shared_state
+            .peers_info
+            .0
+            .iter()
+            // Regard peer as absent if we haven't heard from it within heartbeat_timeout.
+            .filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout)
+            .cloned()
+            .collect()
+    }
+
    /// Add send_wal replica to the in-memory vector of replicas.
    pub fn add_replica(&self, state: ReplicaState) -> usize {
        self.write_shared_state().add_replica(state)
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -1,8 +1,7 @@
 use anyhow::{Context, Result};
-use etcd_broker::subscription_key::{
-    NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind,
-};
+
 use tokio::task::JoinHandle;
+use utils::id::NodeId;

 use std::cmp::min;
 use std::collections::HashMap;
@@ -26,14 +25,11 @@ use tracing::*;

 use utils::{id::TenantTimelineId, lsn::Lsn};

-use crate::broker::{Election, ElectionLeader};
-use crate::timeline::Timeline;
-use crate::{broker, GlobalTimelines, SafeKeeperConf};
+use crate::timeline::{PeerInfo, Timeline};
+use crate::{GlobalTimelines, SafeKeeperConf};

 use once_cell::sync::OnceCell;

-const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000;
-
 const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
 const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;

@@ -70,47 +66,104 @@ struct WalBackupTimelineEntry {
    handle: Option<WalBackupTaskHandle>,
 }

-/// Start per timeline task, if it makes sense for this safekeeper to offload.
-fn consider_start_task(
+async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) {
+    if let Some(wb_handle) = entry.handle.take() {
+        // Tell the task to shutdown. Error means task exited earlier, that's ok.
+        let _ = wb_handle.shutdown_tx.send(()).await;
+        // Await the task itself. TODO: restart panicked tasks earlier.
+        if let Err(e) = wb_handle.handle.await {
+            warn!("WAL backup task for {} panicked: {}", ttid, e);
+        }
+    }
+}
+
+/// The goal is to ensure that normally only one safekeeper offloads. However,
+/// it is fine (and inevitable, as s3 doesn't provide CAS) that for some short
+/// time we have several ones as they PUT the same files. Also,
+/// - frequently changing the offloader would be bad;
+/// - electing seriously lagging safekeeper is undesirable;
+/// So we deterministically choose among the reasonably caught up candidates.
+/// TODO: take into account failed attempts to deal with hypothetical situation
+/// where s3 is unreachable only for some sks.
+fn determine_offloader(
+    alive_peers: &[PeerInfo],
+    wal_backup_lsn: Lsn,
+    ttid: TenantTimelineId,
+    conf: &SafeKeeperConf,
+) -> (Option<NodeId>, String) {
+    // TODO: remove this once we fill newly joined safekeepers since backup_lsn.
+    let capable_peers = alive_peers
+        .iter()
+        .filter(|p| p.local_start_lsn <= wal_backup_lsn);
+    match capable_peers.clone().map(|p| p.commit_lsn).max() {
+        None => (None, "no connected peers to elect from".to_string()),
+        Some(max_commit_lsn) => {
+            let threshold = max_commit_lsn
+                .checked_sub(conf.max_offloader_lag_bytes)
+                .unwrap_or(Lsn(0));
+            let mut caughtup_peers = capable_peers
+                .clone()
+                .filter(|p| p.commit_lsn >= threshold)
+                .collect::<Vec<_>>();
+            caughtup_peers.sort_by(|p1, p2| p1.sk_id.cmp(&p2.sk_id));
+
+            // To distribute the load, shift by timeline_id.
+            let offloader = caughtup_peers
+                [(u128::from(ttid.timeline_id) % caughtup_peers.len() as u128) as usize]
+                .sk_id;
+
+            let mut capable_peers_dbg = capable_peers
+                .map(|p| (p.sk_id, p.commit_lsn))
+                .collect::<Vec<_>>();
+            capable_peers_dbg.sort_by(|p1, p2| p1.0.cmp(&p2.0));
+            (
+                Some(offloader),
+                format!(
+                    "elected {} among {:?} peers, with {} of them being caughtup",
+                    offloader,
+                    capable_peers_dbg,
+                    caughtup_peers.len()
+                ),
+            )
+        }
+    }
+}
+
+/// Based on peer information, determine which safekeeper should offload; if it
+/// is me, start the (per timeline) task unless it is already running. OTOH, if
+/// it is not me and the task is running, shut it down.
+async fn update_task(
    conf: &SafeKeeperConf,
    ttid: TenantTimelineId,
-    task: &mut WalBackupTimelineEntry,
+    entry: &mut WalBackupTimelineEntry,
 ) {
-    if !task.timeline.can_wal_backup() {
-        return;
+    let alive_peers = entry.timeline.get_peers(conf);
+    let wal_backup_lsn = entry.timeline.get_wal_backup_lsn();
+    let (offloader, election_dbg_str) =
+        determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
+    let elected_me = Some(conf.my_id) == offloader;
+
+    if elected_me != (entry.handle.is_some()) {
+        if elected_me {
+            info!("elected for backup {}: {}", ttid, election_dbg_str);
+
+            let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
+            let timeline_dir = conf.timeline_dir(&ttid);
+
+            let handle = tokio::spawn(
+                backup_task_main(ttid, timeline_dir, shutdown_rx)
+                    .instrument(info_span!("WAL backup task", ttid = %ttid)),
+            );
+
+            entry.handle = Some(WalBackupTaskHandle {
+                shutdown_tx,
+                handle,
+            });
+        } else {
+            info!("stepping down from backup {}: {}", ttid, election_dbg_str);
+            shut_down_task(ttid, entry).await;
+        }
    }
-    info!("starting WAL backup task for {}", ttid);
-
-    // TODO: decide who should offload right here by simply checking current
-    // state instead of running elections in offloading task.
-    let election_name = SubscriptionKey {
-        cluster_prefix: conf.broker_etcd_prefix.clone(),
-        kind: SubscriptionKind::Operation(
-            ttid,
-            NodeKind::Safekeeper,
-            OperationKind::Safekeeper(SkOperationKind::WalBackup),
-        ),
-    }
-    .watch_key();
-    let my_candidate_name = broker::get_candiate_name(conf.my_id);
-    let election = broker::Election::new(
-        election_name,
-        my_candidate_name,
-        conf.broker_endpoints.clone(),
-    );
-
-    let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
-    let timeline_dir = conf.timeline_dir(&ttid);
-
-    let handle = tokio::spawn(
-        backup_task_main(ttid, timeline_dir, shutdown_rx, election)
-            .instrument(info_span!("WAL backup task", ttid = %ttid)),
-    );
-
-    task.handle = Some(WalBackupTaskHandle {
-        shutdown_tx,
-        handle,
-    });
 }

 const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
@@ -158,27 +211,20 @@ async fn wal_backup_launcher_main_loop(
                            timeline,
                            handle: None,
                        });
-                        consider_start_task(&conf, ttid, entry);
+                        update_task(&conf, ttid, entry).await;
                    } else {
                        // need to stop the task
                        info!("stopping WAL backup task for {}", ttid);
-
-                        let entry = tasks.remove(&ttid).unwrap();
-                        if let Some(wb_handle) = entry.handle {
-                            // Tell the task to shutdown. Error means task exited earlier, that's ok.
-                            let _ = wb_handle.shutdown_tx.send(()).await;
-                            // Await the task itself. TODO: restart panicked tasks earlier.
-                            if let Err(e) = wb_handle.handle.await {
-                                warn!("WAL backup task for {} panicked: {}", ttid, e);
-                            }
-                        }
+                        let mut entry = tasks.remove(&ttid).unwrap();
+                        shut_down_task(ttid, &mut entry).await;
                    }
                }
            }
-            // Start known tasks, if needed and possible.
+            // For each timeline needing offloading, check if this safekeeper
+            // should do the job and start/stop the task accordingly.
            _ = ticker.tick() => {
-                for (ttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) {
-                    consider_start_task(&conf, *ttid, entry);
+                for (ttid, entry) in tasks.iter_mut() {
+                    update_task(&conf, *ttid, entry).await;
                }
            }
        }
@@ -190,17 +236,13 @@ struct WalBackupTask {
    timeline_dir: PathBuf,
    wal_seg_size: usize,
    commit_lsn_watch_rx: watch::Receiver<Lsn>,
-    leader: Option<ElectionLeader>,
-    election: Election,
 }

-/// Offload single timeline. Called only after we checked that backup
-/// is required (wal_backup_attend) and possible (can_wal_backup).
+/// Offload single timeline.
 async fn backup_task_main(
    ttid: TenantTimelineId,
    timeline_dir: PathBuf,
    mut shutdown_rx: Receiver<()>,
-    election: Election,
 ) {
    info!("started");
    let res = GlobalTimelines::get(ttid);
@@ -215,8 +257,6 @@ async fn backup_task_main(
        commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
        timeline: tli,
        timeline_dir,
-        leader: None,
-        election,
    };

    // task is spinned up only when wal_seg_size already initialized
@@ -229,9 +269,6 @@ async fn backup_task_main(
            canceled = true;
        }
    }
-    if let Some(l) = wb.leader {
-        l.give_up().await;
-    }
    info!("task {}", if canceled { "canceled" } else { "terminated" });
 }

@@ -239,106 +276,71 @@ impl WalBackupTask {
    async fn run(&mut self) {
        let mut backup_lsn = Lsn(0);

-        // election loop
+        let mut retry_attempt = 0u32;
+        // offload loop
        loop {
-            let mut retry_attempt = 0u32;
+            if retry_attempt == 0 {
+                // wait for new WAL to arrive
+                if let Err(e) = self.commit_lsn_watch_rx.changed().await {
+                    // should never happen, as we hold Arc to timeline.
+                    error!("commit_lsn watch shut down: {:?}", e);
+                    return;
+                }
+            } else {
+                // or just sleep if we errored previously
+                let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS;
+                if let Some(backoff_delay) = UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt)
+                {
+                    retry_delay = min(retry_delay, backoff_delay);
+                }
+                sleep(Duration::from_millis(retry_delay)).await;
+            }

-            info!("acquiring leadership");
-            if let Err(e) = broker::get_leader(&self.election, &mut self.leader).await {
-                error!("error during leader election {:?}", e);
-                sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await;
+            let commit_lsn = *self.commit_lsn_watch_rx.borrow();
+
+            // Note that backup_lsn can be higher than commit_lsn if we
+            // don't have much local WAL and others already uploaded
+            // segments we don't even have.
+            if backup_lsn.segment_number(self.wal_seg_size)
+                >= commit_lsn.segment_number(self.wal_seg_size)
+            {
+                retry_attempt = 0;
+                continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
+            }
+            // Perhaps peers advanced the position, check shmem value.
+            backup_lsn = self.timeline.get_wal_backup_lsn();
+            if backup_lsn.segment_number(self.wal_seg_size)
+                >= commit_lsn.segment_number(self.wal_seg_size)
+            {
+                retry_attempt = 0;
                continue;
            }
-            info!("acquired leadership");

-            // offload loop
-            loop {
-                if retry_attempt == 0 {
-                    // wait for new WAL to arrive
-                    if let Err(e) = self.commit_lsn_watch_rx.changed().await {
-                        // should never happen, as we hold Arc to timeline.
-                        error!("commit_lsn watch shut down: {:?}", e);
+            match backup_lsn_range(
+                backup_lsn,
+                commit_lsn,
+                self.wal_seg_size,
+                &self.timeline_dir,
+            )
+            .await
+            {
+                Ok(backup_lsn_result) => {
+                    backup_lsn = backup_lsn_result;
+                    let res = self.timeline.set_wal_backup_lsn(backup_lsn_result);
+                    if let Err(e) = res {
+                        error!("failed to set wal_backup_lsn: {}", e);
                        return;
                    }
-                } else {
-                    // or just sleep if we errored previously
-                    let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS;
-                    if let Some(backoff_delay) =
-                        UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt)
-                    {
-                        retry_delay = min(retry_delay, backoff_delay);
-                    }
-                    sleep(Duration::from_millis(retry_delay)).await;
+                    retry_attempt = 0;
                }
+                Err(e) => {
+                    error!(
+                        "failed while offloading range {}-{}: {:?}",
+                        backup_lsn, commit_lsn, e
+                    );

-                let commit_lsn = *self.commit_lsn_watch_rx.borrow();
-
-                // Note that backup_lsn can be higher than commit_lsn if we
-                // don't have much local WAL and others already uploaded
-                // segments we don't even have.
-                if backup_lsn.segment_number(self.wal_seg_size)
-                    >= commit_lsn.segment_number(self.wal_seg_size)
-                {
-                    continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
-                }
-                // Perhaps peers advanced the position, check shmem value.
-                backup_lsn = self.timeline.get_wal_backup_lsn();
-                if backup_lsn.segment_number(self.wal_seg_size)
-                    >= commit_lsn.segment_number(self.wal_seg_size)
-                {
-                    continue;
-                }
-
-                if let Some(l) = self.leader.as_mut() {
-                    // Optimization idea for later:
-                    //  Avoid checking election leader every time by returning current lease grant expiration time
-                    //  Re-check leadership only after expiration time,
-                    //  such approach would reduce overhead on write-intensive workloads
-
-                    match l
-                        .check_am_i(
-                            self.election.election_name.clone(),
-                            self.election.candidate_name.clone(),
-                        )
-                        .await
-                    {
-                        Ok(leader) => {
-                            if !leader {
-                                info!("lost leadership");
-                                break;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("error validating leader, {:?}", e);
-                            break;
-                        }
-                    }
-                }
-
-                match backup_lsn_range(
-                    backup_lsn,
-                    commit_lsn,
-                    self.wal_seg_size,
-                    &self.timeline_dir,
-                )
-                .await
-                {
-                    Ok(backup_lsn_result) => {
-                        backup_lsn = backup_lsn_result;
-                        let res = self.timeline.set_wal_backup_lsn(backup_lsn_result);
-                        if let Err(e) = res {
-                            error!("backup error: {}", e);
-                            return;
-                        }
-                        retry_attempt = 0;
-                    }
-                    Err(e) => {
-                        error!(
-                            "failed while offloading range {}-{}: {:?}",
-                            backup_lsn, commit_lsn, e
-                        );
-
-                        retry_attempt = min(retry_attempt + 1, u32::MAX);
+                    if retry_attempt < u32::MAX {
+                        retry_attempt += 1;
                    }
                }
            }
--- a/scripts/docker-compose_test.sh
+++ b/scripts/docker-compose_test.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# this is a shortcut script to avoid duplication in CI
+set -eux -o pipefail
+
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" # absolute directory of this script
+COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml
+
+COMPUTE_CONTAINER_NAME=dockercompose_compute_1 # container that `docker exec` runs psql in
+SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" # smoke-test statements
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" # psql arguments, expanded inside `bash -c "psql ..."`
+
+cleanup() { # Print container status and compose logs, then tear the compose stack down.
+	echo "show container information"
+	docker ps
+	docker-compose -f "$COMPOSE_FILE" logs
+	echo "stop containers..."
+	docker-compose -f "$COMPOSE_FILE" down
+}
+
+echo "clean up containers if exists"
+cleanup
+
+# Smoke-test the compose stack once per Postgres version: start it, wait for the compute, run $SQL.
+for pg_version in 14 15; do
+	echo "start containers (pg_version=$pg_version)."
+	PG_VERSION=$pg_version TAG=latest docker-compose -f "$COMPOSE_FILE" up --build -d
+
+	echo "wait until the compute is ready. timeout after 60s. "
+	cnt=0
+	while sleep 1; do
+		# check timeout
+		cnt=$((cnt + 1))
+		if [ "$cnt" -gt 60 ]; then
+			echo "timeout before the compute is ready."
+			cleanup
+			exit 1
+		fi
+
+		# check if the compute is ready; `|| true` because grep -c exits non-zero on zero matches
+		result=$(docker-compose -f "$COMPOSE_FILE" logs "compute_is_ready" | grep -c "accepting connections" || true)
+		if [ "$result" -ge 1 ]; then
+			echo "OK. The compute is ready to connect."
+			echo "execute simple queries."
+			# no -it: `psql -c` needs neither stdin nor a TTY, and `-t` fails when no TTY is attached (CI)
+			docker exec "$COMPUTE_CONTAINER_NAME" /bin/bash -c "psql $PSQL_OPTION"
+			cleanup
+			break
+		fi
+	done
+done
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -17,6 +17,7 @@ import uuid
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from enum import Flag, auto
+from functools import cached_property
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union, cast

@@ -27,7 +28,6 @@ import jwt
 import psycopg2
 import pytest
 import requests
-from cached_property import cached_property
 from fixtures.log_helper import log
 from fixtures.types import Lsn, TenantId, TimelineId

@@ -149,19 +149,6 @@ def pytest_configure(config):
        raise Exception('neon binaries not found at "{}"'.format(neon_binpath))


-def profiling_supported():
-    """Return True if the pageserver was compiled with the 'profiling' feature"""
-    bin_pageserver = os.path.join(str(neon_binpath), "pageserver")
-    res = subprocess.run(
-        [bin_pageserver, "--version"],
-        check=True,
-        universal_newlines=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-    return "profiling:true" in res.stdout
-
-
 def shareable_scope(fixture_name, config) -> Literal["session", "function"]:
    """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar.

@@ -874,6 +861,17 @@ class NeonEnv:
        """Get a timeline directory's path based on the repo directory of the test environment"""
        return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)

+    def get_pageserver_version(self) -> str:
+        bin_pageserver = os.path.join(str(neon_binpath), "pageserver")
+        res = subprocess.run(
+            [bin_pageserver, "--version"],
+            check=True,
+            universal_newlines=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        return res.stdout
+
    @cached_property
    def auth_keys(self) -> AuthKeys:
        pub = (Path(self.repo_dir) / "auth_public_key.pem").read_bytes()
@@ -972,10 +970,11 @@ class NeonPageserverApiException(Exception):


 class NeonPageserverHttpClient(requests.Session):
-    def __init__(self, port: int, auth_token: Optional[str] = None):
+    def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
        super().__init__()
        self.port = port
        self.auth_token = auth_token
+        self.is_testing_enabled_or_skip = is_testing_enabled_or_skip

        if auth_token is not None:
            self.headers["Authorization"] = f"Bearer {auth_token}"
@@ -994,6 +993,8 @@ class NeonPageserverHttpClient(requests.Session):
        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()

    def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]) -> None:
+        self.is_testing_enabled_or_skip()
+
        if isinstance(config_strings, tuple):
            pairs = [config_strings]
        else:
@@ -1111,6 +1112,8 @@ class NeonPageserverHttpClient(requests.Session):
    def timeline_gc(
        self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
    ) -> dict[str, Any]:
+        self.is_testing_enabled_or_skip()
+
        log.info(
            f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
        )
@@ -1126,6 +1129,8 @@ class NeonPageserverHttpClient(requests.Session):
        return res_json

    def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
+        self.is_testing_enabled_or_skip()
+
        log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
@@ -1150,6 +1155,8 @@ class NeonPageserverHttpClient(requests.Session):
        return res_json

    def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
+        self.is_testing_enabled_or_skip()
+
        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
@@ -1469,21 +1476,6 @@ class NeonCli(AbstractNeonCli):
            res.check_returncode()
            return res

-    def pageserver_enabled_features(self) -> Any:
-        bin_pageserver = os.path.join(str(neon_binpath), "pageserver")
-        args = [bin_pageserver, "--enabled-features"]
-        log.info('Running command "{}"'.format(" ".join(args)))
-
-        res = subprocess.run(
-            args,
-            check=True,
-            universal_newlines=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        log.info(f"pageserver_enabled_features success: {res.stdout}")
-        return json.loads(res.stdout)
-
    def pageserver_start(
        self,
        overrides=(),
@@ -1642,6 +1634,7 @@ class NeonPageserver(PgProtocol):
        self.running = False
        self.service_port = port
        self.config_override = config_override
+        self.version = env.get_pageserver_version()

    def start(self, overrides=()) -> "NeonPageserver":
        """
@@ -1671,10 +1664,19 @@ class NeonPageserver(PgProtocol):
    def __exit__(self, exc_type, exc, tb):
        self.stop(immediate=True)

+    def is_testing_enabled_or_skip(self):  # guard: skips the running test via pytest.skip, returns None otherwise
+        if '"testing"' not in self.version:  # self.version is the `pageserver --version` output
+            pytest.skip("pageserver was built without 'testing' feature")
+
+    def is_profiling_enabled_or_skip(self):  # guard: skips the running test via pytest.skip, returns None otherwise
+        if '"profiling"' not in self.version:  # self.version is the `pageserver --version` output
+            pytest.skip("pageserver was built without 'profiling' feature")
+
    def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient:
        return NeonPageserverHttpClient(
            port=self.service_port.http,
            auth_token=auth_token,
+            is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
        )


--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -9,7 +9,6 @@ from typing import Dict, List
 import pytest
 from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult
 from fixtures.compare_fixtures import NeonCompare, PgCompare
-from fixtures.neon_fixtures import profiling_supported
 from fixtures.utils import get_scale_for_db


@@ -187,10 +186,8 @@ def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int,
    neon_env_builder.pageserver_config_override = """
 profiling="page_requests"
 """
-    if not profiling_supported():
-        pytest.skip("pageserver was built without 'profiling' feature")
-
    env = neon_env_builder.init_start()
+    env.pageserver.is_profiling_enabled_or_skip()
    env.neon_cli.create_branch("empty", "main")

    neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench")
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -70,18 +70,14 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
    # But all others are broken

    # First timeline would not get loaded into pageserver due to corrupt metadata file
-    with pytest.raises(
-        Exception, match=f"Timeline {timeline1} was not found for tenant {tenant1}"
-    ) as err:
+    with pytest.raises(Exception, match=f"Timeline {tenant1}/{timeline1} was not found") as err:
        pg1.start()
    log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")

    # Second timeline has no ancestors, only the metadata file and no layer files
    # We don't have the remote storage enabled, which means timeline is in an incorrect state,
    # it's not loaded at all
-    with pytest.raises(
-        Exception, match=f"Timeline {timeline2} was not found for tenant {tenant2}"
-    ) as err:
+    with pytest.raises(Exception, match=f"Timeline {tenant2}/{timeline2} was not found") as err:
        pg2.start()
    log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")

@@ -111,18 +107,20 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
            future.result()


-def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv):
+def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http = env.pageserver.http_client()

    tenant_id, _ = env.neon_cli.create_tenant()

+    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
+    initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

-    # Introduce failpoint when creating a new timeline
+    # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
    with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
-        _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id)
+        _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id)

    # Restart the page server
    env.neon_cli.pageserver_stop(immediate=True)
@@ -133,3 +131,36 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv):
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"
+
+    timeline_dirs = [d for d in timelines_dir.iterdir()]
+    assert (
+        timeline_dirs == initial_timeline_dirs
+    ), "pageserver should clean its temp timeline files on timeline creation failure"
+
+
+def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
+    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
+    initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
+
+    # Introduce failpoint when creating a new timeline uninit mark, before any other files were created
+    pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
+    with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
+        _ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id)
+
+    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
+    # "New" timeline is not present in the list, allowing pageserver to retry the same request
+    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
+    assert (
+        new_tenant_timelines == old_tenant_timelines
+    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"
+
+    timeline_dirs = [d for d in timelines_dir.iterdir()]
+    assert (
+        timeline_dirs == initial_timeline_dirs
+    ), "pageserver should clean its temp timeline files on timeline creation failure"
--- a/test_runner/regress/test_close_fds.py
+++ b/test_runner/regress/test_close_fds.py
@@ -1,10 +1,10 @@
 import os.path
 import shutil
 import subprocess
+import threading
 import time
 from contextlib import closing

-from cached_property import threading
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv

--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -0,0 +1,267 @@
+import os
+import re
+import shutil
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, Union
+
+import pytest
+import toml
+from fixtures.neon_fixtures import (
+    NeonCli,
+    NeonEnvBuilder,
+    NeonPageserverHttpClient,
+    PgBin,
+    PortDistributor,
+    wait_for_last_record_lsn,
+    wait_for_upload,
+)
+from fixtures.types import Lsn
+from pytest import FixtureRequest
+
+
+def dump_differs(first: Path, second: Path, output: Path) -> bool:
+    """
+    Runs diff(1) command on two SQL dumps and write the output to the given output file.
+    Returns True if the dumps differ, False otherwise.
+    """
+
+    with output.open("w") as stdout:
+        rv = subprocess.run(
+            [
+                "diff",
+                "--unified",  # Make diff output more readable
+                "--ignore-matching-lines=^--",  # Ignore changes in comments
+                "--ignore-blank-lines",
+                str(first),
+                str(second),
+            ],
+            stdout=stdout,
+        )
+
+    return rv.returncode != 0
+
+
+class PortReplacer(object):
+    """
+    Class-helper for replacing ports in config files.
+    """
+
+    def __init__(self, port_distributor: PortDistributor):
+        self.port_distributor = port_distributor
+        self.port_map: Dict[int, int] = {}
+
+    def replace_port(self, value: Union[int, str]) -> Union[int, str]:
+        if isinstance(value, int):
+            if (known_port := self.port_map.get(value)) is not None:
+                return known_port
+
+            self.port_map[value] = self.port_distributor.get_port()
+            return self.port_map[value]
+
+        if isinstance(value, str):
+            # Use regex to find port in a string
+            # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432"
+            # See https://bugs.python.org/issue27657
+            ports = re.findall(r":(\d+)(?:/|$)", value)
+            assert len(ports) == 1, f"can't find port in {value}"
+            port_int = int(ports[0])
+
+            if (known_port := self.port_map.get(port_int)) is not None:
+                return value.replace(f":{port_int}", f":{known_port}")
+
+            self.port_map[port_int] = self.port_distributor.get_port()
+            return value.replace(f":{port_int}", f":{self.port_map[port_int]}")
+
+        raise TypeError(f"unsupported type {type(value)} of {value=}")
+
+
+def test_backward_compatibility(
+    pg_bin: PgBin, port_distributor: PortDistributor, test_output_dir: Path, request: FixtureRequest
+):
+    compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR")
+    assert (
+        compatibility_snapshot_dir_env is not None
+    ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_prepare_snapshot"
+    compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve()
+
+    # Make compatibility snapshot artifacts pickupable by Allure
+    # by copying the snapshot directory to the curent test output directory.
+    repo_dir = test_output_dir / "compatibility_snapshot" / "repo"
+
+    shutil.copytree(compatibility_snapshot_dir / "repo", repo_dir)
+
+    # Remove old logs to avoid confusion in test artifacts
+    for logfile in repo_dir.glob("**/*.log"):
+        logfile.unlink()
+
+    # Remove tenants data for computes
+    for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"):
+        shutil.rmtree(tenant)
+
+    # Remove wal-redo temp directory
+    for tenant in (repo_dir / "tenants").glob("*"):
+        shutil.rmtree(tenant / "wal-redo-datadir.___temp")
+
+    # Update paths and ports in config files
+    pr = PortReplacer(port_distributor)
+
+    pageserver_toml = repo_dir / "pageserver.toml"
+    pageserver_config = toml.load(pageserver_toml)
+    new_local_path = pageserver_config["remote_storage"]["local_path"].replace(
+        "/test_prepare_snapshot/",
+        "/test_backward_compatibility/compatibility_snapshot/",
+    )
+
+    pageserver_config["remote_storage"]["local_path"] = new_local_path
+    pageserver_config["listen_http_addr"] = pr.replace_port(pageserver_config["listen_http_addr"])
+    pageserver_config["listen_pg_addr"] = pr.replace_port(pageserver_config["listen_pg_addr"])
+    pageserver_config["broker_endpoints"] = [
+        pr.replace_port(ep) for ep in pageserver_config["broker_endpoints"]
+    ]
+
+    with pageserver_toml.open("w") as f:
+        toml.dump(pageserver_config, f)
+
+    snapshot_config_toml = repo_dir / "config"
+    snapshot_config = toml.load(snapshot_config_toml)
+    snapshot_config["etcd_broker"]["broker_endpoints"] = [
+        pr.replace_port(ep) for ep in snapshot_config["etcd_broker"]["broker_endpoints"]
+    ]
+    snapshot_config["pageserver"]["listen_http_addr"] = pr.replace_port(
+        snapshot_config["pageserver"]["listen_http_addr"]
+    )
+    snapshot_config["pageserver"]["listen_pg_addr"] = pr.replace_port(
+        snapshot_config["pageserver"]["listen_pg_addr"]
+    )
+    for sk in snapshot_config["safekeepers"]:
+        sk["http_port"] = pr.replace_port(sk["http_port"])
+        sk["pg_port"] = pr.replace_port(sk["pg_port"])
+
+    with (snapshot_config_toml).open("w") as f:
+        toml.dump(snapshot_config, f)
+
+    # Ensure that snapshot doesn't contain references to the original path
+    rv = subprocess.run(
+        [
+            "grep",
+            "--recursive",
+            "--binary-file=without-match",
+            "--files-with-matches",
+            "test_prepare_snapshot/repo",
+            str(repo_dir),
+        ],
+        capture_output=True,
+        text=True,
+    )
+    assert (
+        rv.returncode != 0
+    ), f"there're files referencing `test_prepare_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"
+
+    # NeonEnv stub to make NeonCli happy
+    config: Any = type("NeonEnvStub", (object,), {})
+    config.rust_log_override = None
+    config.repo_dir = repo_dir
+    config.pg_version = "14"  # Note: `pg_dumpall` (from pg_bin) version is set by DEFAULT_PG_VERSION_DEFAULT and can be overriden by DEFAULT_PG_VERSION env var
+    config.initial_tenant = snapshot_config["default_tenant_id"]
+
+    # Check that we can start the project
+    cli = NeonCli(config)
+    try:
+        cli.raw_cli(["start"])
+        request.addfinalizer(lambda: cli.raw_cli(["stop"]))
+
+        result = cli.pg_start("main")
+        request.addfinalizer(lambda: cli.pg_stop("main"))
+    except Exception:
+        breaking_changes_allowed = (
+            os.environ.get("ALLOW_BREAKING_CHANGES", "false").lower() == "true"
+        )
+        if breaking_changes_allowed:
+            pytest.xfail("Breaking changes are allowed by ALLOW_BREAKING_CHANGES env var")
+        else:
+            raise
+
+    connstr_all = re.findall(r"Starting postgres node at '([^']+)'", result.stdout)
+    assert len(connstr_all) == 1, f"can't parse connstr from {result.stdout}"
+    connstr = connstr_all[0]
+
+    # Check that the project produces the same dump as the previous version.
+    # The assert itself deferred to the end of the test
+    # to allow us to perform checks that change data before failing
+    pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"])
+    initial_dump_differs = dump_differs(
+        compatibility_snapshot_dir / "dump.sql",
+        test_output_dir / "dump.sql",
+        test_output_dir / "dump.filediff",
+    )
+
+    # Check that project can be recovered from WAL
+    # loosely based on https://github.com/neondatabase/cloud/wiki/Recovery-from-WAL
+    tenant_id = snapshot_config["default_tenant_id"]
+    timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
+    pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1]
+    auth_token = snapshot_config["pageserver"]["auth_token"]
+    pageserver_http = NeonPageserverHttpClient(
+        port=pageserver_port,
+        is_testing_enabled_or_skip=lambda: True,  # TODO: check if testing really enabled
+        auth_token=auth_token,
+    )
+
+    shutil.rmtree(repo_dir / "local_fs_remote_storage")
+    pageserver_http.timeline_delete(tenant_id, timeline_id)
+    pageserver_http.timeline_create(tenant_id, timeline_id)
+    pg_bin.run(
+        ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
+    )
+    # The assert itself deferred to the end of the test
+    # to allow us to perform checks that change data before failing
+    dump_from_wal_differs = dump_differs(
+        test_output_dir / "dump.sql",
+        test_output_dir / "dump-from-wal.sql",
+        test_output_dir / "dump-from-wal.filediff",
+    )
+
+    # Check that we can interract with the data
+    pg_bin.run(["pgbench", "--time=10", "--progress=2", connstr])
+
+    assert not dump_from_wal_differs, "dump from WAL differs"
+    assert not initial_dump_differs, "initial dump differs"
+
+
+@pytest.mark.order(after="test_backward_compatibility")
+# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
+# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
+def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
+    # This isn't a real test: it only creates a new snapshot for releases, after the current
+    # version has been tested against the previous snapshot in `test_backward_compatibility`.
+    #
+    # There's no cleanup here on purpose: it allows adjusting the data in `test_backward_compatibility` itself without re-collecting it.
+    neon_env_builder.pg_version = "14"
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_local_fs_remote_storage()
+
+    env = neon_env_builder.init_start()
+    pg = env.postgres.create_start("main")
+    pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()])
+    pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()])
+    pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"])
+
+    snapshot_config = toml.load(test_output_dir / "repo" / "config")
+    tenant_id = snapshot_config["default_tenant_id"]
+    timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
+
+    pageserver_http = env.pageserver.http_client()
+    lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])  # flush LSN that both waits below use as target
+
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
+
+    env.postgres.stop_all()
+    for sk in env.safekeepers:
+        sk.stop()
+    env.pageserver.stop()
+
+    shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14")
+    # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -1,14 +1,13 @@
-import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
-from performance.test_perf_pgbench import get_scales_matrix


-# Test gc_cuttoff
+# Test gc_cutoff
 #
-# This test set fail point after at the end of GC and checks
-# that pageserver normally restarts after it
-@pytest.mark.parametrize("scale", get_scales_matrix(10))
-def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int):
+# This test sets a fail point at the end of GC, and checks that the pageserver
+# restarts normally after it. Also, there should be GC ERRORs in the log,
+# but the fixture checks the log for any unexpected ERRORs after every
+# test anyway, so it doesn't need any special attention here.
+def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

@@ -18,21 +17,23 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int):
            "gc_period": "10 s",
            "gc_horizon": f"{1024 ** 2}",
            "checkpoint_distance": f"{1024 ** 2}",
-            "compaction_target_size": f"{1024 ** 2}",
+            "compaction_period": "5 s",
            # set PITR interval to be small, so we can do GC
            "pitr_interval": "1 s",
+            "compaction_threshold": "3",
+            "image_creation_threshold": "2",
        }
    )
    pg = env.postgres.create_start("main", tenant_id=tenant_id)
-    connstr = pg.connstr()
-    pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+    connstr = pg.connstr(options="-csynchronous_commit=off")
+    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])

-    pageserver_http.configure_failpoints(("gc-before-save-metadata", "return"))
+    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))

    for i in range(5):
        try:
-            pg_bin.run_capture(["pgbench", "-T100", connstr])
+            pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr])
        except Exception:
            env.pageserver.stop()
            env.pageserver.start()
-            pageserver_http.configure_failpoints(("gc-before-save-metadata", "return"))
+            pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -105,15 +105,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
    with pytest.raises(Exception):
        import_tar(corrupt_base_tar, wal_tar)

-    # Clean up
-    # TODO it should clean itself
-    client = env.pageserver.http_client()
-    client.timeline_delete(tenant, timeline)
-
    # Importing correct backup works
    import_tar(base_tar, wal_tar)

    # Wait for data to land in s3
+    client = env.pageserver.http_client()
    wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn))
    wait_for_upload(client, tenant, timeline, Lsn(end_lsn))

--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -13,13 +13,8 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"

    env = neon_env_builder.init()
+    env.pageserver.is_testing_enabled_or_skip()

-    # Check if failpoints enables. Otherwise the test doesn't make sense
-    f = env.neon_cli.pageserver_enabled_features()
-
-    assert (
-        "testing" in f["features"]
-    ), "Build pageserver with --features=testing option to run this test"
    neon_env_builder.start()

    # Create a branch for us
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -346,7 +346,11 @@ def test_tenant_relocation(
    log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port)
    pageserver_bin = pathlib.Path(neon_binpath) / "pageserver"

-    new_pageserver_http = NeonPageserverHttpClient(port=new_pageserver_http_port, auth_token=None)
+    new_pageserver_http = NeonPageserverHttpClient(
+        port=new_pageserver_http_port,
+        auth_token=None,
+        is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip,
+    )

    with new_pageserver_helper(
        new_pageserver_dir,
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -23,7 +23,7 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
    initial_tenants = sorted(
        map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
    )
-    initial_tenant_dirs = set([d for d in tenants_dir.iterdir()])
+    initial_tenant_dirs = [d for d in tenants_dir.iterdir()]

    pageserver_http = neon_simple_env.pageserver.http_client()
    pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return"))
@@ -35,26 +35,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
    )
    assert initial_tenants == new_tenants, "should not create new tenants"

-    new_tenant_dirs = list(set([d for d in tenants_dir.iterdir()]) - initial_tenant_dirs)
-    assert len(new_tenant_dirs) == 1, "should have new tenant directory created"
-    tmp_tenant_dir = new_tenant_dirs[0]
-    assert str(tmp_tenant_dir).endswith(
-        ".___temp"
-    ), "new tenant directory created should be a temporary one"
-
-    neon_simple_env.pageserver.stop()
-    neon_simple_env.pageserver.start()
-
-    tenants_after_restart = sorted(
-        map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
-    )
-    dirs_after_restart = set([d for d in tenants_dir.iterdir()])
+    new_tenant_dirs = [d for d in tenants_dir.iterdir()]
    assert (
-        tenants_after_restart == initial_tenants
-    ), "should load all non-corrupt tenants after restart"
-    assert (
-        dirs_after_restart == initial_tenant_dirs
-    ), "pageserver should clean its temp tenant dirs on restart"
+        new_tenant_dirs == initial_tenant_dirs
+    ), "pageserver should clean its temp tenant dirs on tenant creation failure"


 def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -65,7 +65,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
    # check 404
    with pytest.raises(
        NeonPageserverApiException,
-        match=f"Timeline {leaf_timeline_id} was not found for tenant {env.initial_tenant}",
+        match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
    ):
        ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)

--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1114,10 +1114,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
                cur.execute("INSERT INTO t (key) VALUES (1)")

    # Remove initial tenant's br1 (active)
-    assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == {
-        "dir_existed": True,
-        "was_active": True,
-    }
+    assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"]
    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
    assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1125,10 +1122,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()

    # Ensure repeated deletion succeeds
-    assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == {
-        "dir_existed": False,
-        "was_active": False,
-    }
+    assert not sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"]
    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
    assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1145,10 +1139,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
        assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()

    # Remove initial tenant's br2 (inactive)
-    assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == {
-        "dir_existed": True,
-        "was_active": False,
-    }
+    assert sk_http.timeline_delete_force(tenant_id, timeline_id_2)["dir_existed"]
    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1156,10 +1147,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()

    # Remove non-existing branch, should succeed
-    assert sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16)) == {
-        "dir_existed": False,
-        "was_active": False,
-    }
+    assert not sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16))["dir_existed"]
    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists()
@@ -1168,10 +1156,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):

    # Remove initial tenant fully (two branches are active)
    response = sk_http.tenant_delete_force(tenant_id)
-    assert response[str(timeline_id_3)] == {
-        "dir_existed": True,
-        "was_active": True,
-    }
+    assert response[str(timeline_id_3)]["dir_existed"]
    assert not (sk_data_dir / str(tenant_id)).exists()
    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()

--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
Author	SHA1	Message	Date
Konstantin Knizhnik	7f67f65d92	Use access counter for file cache pages	2022-11-04 12:55:21 +02:00
Konstantin Knizhnik	81527300ef	Use access counter for file cache pages	2022-11-04 12:22:47 +02:00
Konstantin Knizhnik	ba46de96eb	Cache reconstructed pages on disk	2022-11-03 19:57:27 +02:00
Konstantin Knizhnik	e8dec662e6	Cache reconstructed pages on disk	2022-11-03 19:01:03 +02:00
Konstantin Knizhnik	ebf1972ea4	Make clippy happy	2022-10-31 23:00:54 +03:00
Joonas Koivunen	35890bb293	suggested refactoring to err out awaiting	2022-10-31 22:32:37 +03:00
Konstantin Knizhnik	8769fef1a5	Fix style	2022-10-31 19:59:34 +03:00
Konstantin Knizhnik	7452f91d5a	Replace Arc<Bytes> with Bytes because Bytes maintains its own reference counter	2022-10-31 19:43:38 +03:00
Konstantin Knizhnik	6a50e1f76a	Add image layer cache implementation	2022-10-29 14:29:59 +03:00
Konstantin Knizhnik	0c78aa7589	Implement page image cache with invalidation mechanism	2022-10-28 22:06:54 +03:00
Arseny Sher	b42bf9265a	Enable etcd compaction in neon_local.	2022-10-27 10:47:08 +03:00
Stas Kelvich	1f08ba5790	Avoid debian-testing packages in compute Dockerfiles plv8 can only be built with a fairly new gold linker version. We used to install it via binutils packages from testing, but it also updates libc and that causes troubles in the resulting image as different extensions were built against different libc versions. We could either use libc from debian-testing everywhere or refrain from using testing packages and install necessary programs manually. This patch uses the latter approach: gold for plv8 and cmake for h3 are installed manually. In passing, declare h3_postgis as a safe extension (previous omission).	2022-10-27 09:44:16 +03:00
bojanserafimov	0c54eb65fb	Move pagestream api to libs/pageserver_api (#2698 )	2022-10-26 17:32:31 -04:00
mikecaat	259a5f356e	Add a docker-compose example file (#1943 ) (#2666 ) Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>	2022-10-26 13:59:25 +03:00
Sergey Melnikov	a3cb8c11e0	Do not release to new staging proxies on release (#2685 )	2022-10-25 23:51:23 +00:00
bojanserafimov	9fb2287f87	Add draw_timeline binary (#2688 )	2022-10-25 11:25:22 -04:00
Alexander Bayandin	834ffe1bac	Add data format backward compatibility tests (#2626 )	2022-10-25 16:41:50 +02:00
Stas Kelvich	df18b041c0	Use apt version pinning instead of repo priorities Higher `bullseye` priority doesn't work for packages installed via `bullseye-updates`, e.g.: ``` libc-bin: Installed: 2.31-13+deb11u5 Candidate: 2.35-3 Version table: 2.35-3 500 500 http://ftp.debian.org/debian testing/main amd64 Packages *** 2.31-13+deb11u5 500 500 http://deb.debian.org/debian bullseye-updates/main amd64 Packages 100 /var/lib/dpkg/status 2.31-13+deb11u4 990 990 http://deb.debian.org/debian bullseye/main amd64 Packages ``` Try version pinning instead	2022-10-25 14:29:11 +03:00
Anastasia Lubennikova	39897105b2	Check postgres version and ensure that public schema exists before running GRANT query on it	2022-10-25 09:55:24 +03:00
Stas Kelvich	2f399f08b2	Hotfix to disable grant create on public schema `GRANT CREATE ON SCHEMA public` fails if there is no schema `public`. Disable it in release for now and make a better fix later (it is needed for v15 support).	2022-10-25 09:55:24 +03:00
Arseny Sher	9f49605041	Fix division by zero panic in determine_offloader.	2022-10-22 18:25:12 +03:00
Konstantin Knizhnik	7b6431cbd7	Disable wal_log_hints by default (#2598 ) * Disable wal_log_hints by default * Remove obsolete comment about wal_log_hints	2022-10-22 14:59:18 +03:00
Lassi Pölönen	321aeac3d4	Json logging capability (#2624 ) * Support configuring the log format as json or plain. Separately test json and plain logger. They would be competing on the same global subscriber otherwise. * Implement log_format for pageserver config * Implement configurable log format for safekeeper.	2022-10-21 17:30:20 +00:00
Andrés	71ef7b6663	Remove cached_property package (#2673 ) Co-authored-by: andres <andres.rodriguez@outlook.es>	2022-10-21 20:02:31 +03:00
Kirill Bulatov	5928cb33c5	Introduce timeline state (#2651 ) Similar to https://github.com/neondatabase/neon/pull/2395, introduces a state field in Timeline, that's possible to subscribe to. Adjusts * walreceiver to not to have any connections if timeline is not Active * remote storage sync to not to schedule uploads if timeline is Broken * not to create timelines if a tenant/timeline is broken * automatically switches timelines' states based on tenant state Does not adjust timeline's gc, checkpointing and layer flush behaviour much, since it's not safe to cancel these processes abruptly and there's task_mgr::shutdown_tasks that does similar thing.	2022-10-21 15:51:48 +00:00
Sergey Melnikov	6ff2c61ae0	Refactor safekeeper s3 config and change it for new account (#2672 )	2022-10-21 13:44:08 +00:00
Arseny Sher	7480a0338a	Determine safekeeper for offloading WAL without etcd election API. This API is rather pointless, as sane choice anyway requires knowledge of peers status and leaders lifetime in any case can intersect, which is fine for us -- so manual elections are straightforward. Here, we deterministically choose among the reasonably caught up safekeepers, shifting by timeline id to spread the load. A step towards custom broker https://github.com/neondatabase/neon/issues/2394	2022-10-21 15:33:27 +03:00
Sergey Melnikov	2709878b8b	Deploy scram proxies into new account (#2643 )	2022-10-21 14:21:22 +03:00
Kirill Bulatov	39e4bdb99e	Actualize tenant and timeline API modifiers (#2661 ) * Actualize tenant and timeline API modifiers * Use anyhow::Result explicitly	2022-10-21 10:58:43 +00:00
Anastasia Lubennikova	52e75fead9	Use anyhow::Result explicitly	2022-10-21 12:47:06 +03:00
Anastasia Lubennikova	a347d2b6ac	#2616 handle 'Unsupported pg_version' error properly	2022-10-21 12:47:06 +03:00
Heikki Linnakangas	fc4ea3553e	test_gc_cutoff.py fixes (#2655 ) * Fix bogus early exit from GC. Commit `91411c415a` added this failpoint, but the early exit was not intentional. * Cleanup test_gc_cutoff.py test. - Remove the 'scale' parameter, this isn't a benchmark - Tweak pgbench and pageserver options to create garbage faster than the GC can collect it. The test used to take just under 5 minutes, which was uncomfortably close to the default 5 minute test timeout, and annoyingly even without the hard limit. These changes bring it down to about 1-2 minutes. - Improve comments, fix typos - Rename the failpoint. The old name, 'gc-before-save-metadata' implied that the failpoint was before the metadata update, but it was in fact much later in the function. - Move the call to persist the metadata outside the lock, to avoid holding it for too long. To verify that this test still covers the original bug, https://github.com/neondatabase/neon/issues/2539, I commented out updating the metadata file like this: ``` diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e857a9a..f8a9f34a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1962,7 +1962,7 @@ impl Timeline { } // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); ``` It doesn't fail every time with that, but it did fail after about 5 runs.	2022-10-21 02:39:55 +03:00
Dmitry Rodionov	cca1ace651	make launch_wal_receiver infallible	2022-10-21 00:40:12 +03:00
Sergey Melnikov	30984c163c	Fix race between pushing image to ECR and copying to dockerhub (#2662 )	2022-10-20 23:01:01 +03:00
Konstantin Knizhnik	7404777efc	Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged (#2657 ) * Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged refer ##2587 * Bump postgres versions	2022-10-20 20:06:05 +03:00
Heikki Linnakangas	eb1bdcc6cf	If an FSM or VM page cannot be reconstructed, fill it with zeros. If we cannot reconstruct an FSM or VM page, while creating image layers, fill it with zeros instead. That should always be safe, for the FSM and VM, in the sense that you won't lose actual user data. It will get cleaned up by VACUUM later. We had a bug with FSM/VM truncation, where we truncated the FSM and VM at WAL replay to a smaller size than PostgreSQL originally did. We thought that was harmless, as the FSM and VM are not critical for correctness and can be zeroed out or truncated without affecting user data. However, it led to a situation where PostgreSQL created incremental WAL records for pages that we had already truncated away in the pageserver, and when we tried to replay those WAL records, that failed. That led to a permanent error in image layer creation, and prevented it from ever finishing. See https://github.com/neondatabase/neon/issues/2601. With this patch, those pages will be filled with zeros in the image layer, which allows the image layer creation to finish.	2022-10-20 17:27:01 +03:00
Arthur Petukhovsky	f5ab9f761b	Remove flaky checks in test_delete_force (#2567 )	2022-10-20 17:14:32 +04:00
Kirill Bulatov	306a47c4fa	Use uninit mark files during timeline init for atomic creation (#2489 ) Part of https://github.com/neondatabase/neon/pull/2239 Regular, from scratch, timeline creation involves initdb to be run in a separate directory, data from this directory to be imported into pageserver and, finally, timeline-related background tasks to start. This PR ensures we don't leave behind any directories that are not marked as temporary and that pageserver removes such directories on restart, allowing timeline creation to be retried with the same IDs, if needed. It would be good to later rewrite the logic to use a temporary directory, similar to what tenant creation does. Yet currently it's harder than this change, so not done.	2022-10-20 14:19:17 +03:00
Kirill Bulatov	84c5f681b0	Fix test feature detection (#2659 ) Follow-up of #2636 and #2654 , fixing the test detection feature. Pageserver currently outputs features as ``` /target/debug/pageserver --version Neon page server git:7734929a8202c8cc41596a861ffbe0b51b5f3cb9 failpoints: true, features: ["testing", "profiling"] ```	2022-10-20 13:44:03 +03:00
Kirill Bulatov	50297bef9f	RFC about Tenant / Timeline guard objects (#2660 ) Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2022-10-20 12:49:54 +03:00
Andrés	9211923bef	Pageserver Python tests should not fail if the server is built with no testing feature (#2636 ) Co-authored-by: andres <andres.rodriguez@outlook.es>	2022-10-20 10:46:57 +03:00
bojanserafimov	7734929a82	Remove stale todos (#2630 )	2022-10-19 22:59:22 +00:00