Compare commits

..

45 Commits

Author SHA1 Message Date
Alexey Kondratov
0eca1d19de Add safety notes, benchmark. Optimize checksum calculation 2022-07-07 20:45:50 +02:00
Alexey Kondratov
53b9cb915e Turn off data-checksums for old tenants by default and explicitly enable for new ones 2022-07-07 19:44:47 +02:00
Alexey Kondratov
cc6ffb558d Verify checksum of the page and WAL records before sending to the redo process 2022-07-07 19:44:47 +02:00
Alexey Kondratov
b135dbb85d Bump vendor/postgres 2022-07-07 19:44:47 +02:00
Alexey Kondratov
6059801943 Enable Postgres data checksums (neondatabase/cloud#536)
We need checksums to verify data integrity, when we read it from
untrusted place (e.g. local disk) or via untrusted communication channel
(e.g. network). At the same time, we trust pageserver <-> redo process
communication channel, as it is just a pipe.

Here we enable calculation of data checksums in the wal redo process and
when we extract FPI during WAL ingestion. Compute node (Postgres) will
verify checksum of every page after receiving it back from pageserver.
So it is pretty similar to how vanilla Postgres checks them.

There are two other places where we should verify checksums to
detect data corruption earlier:
- when we receive WAL records from safekeepers (already implemented,
  see: WalStreamDecoder::poll_decode)
- when we write layer files to disk and read back in memory from local
  disk or S3
2022-07-07 19:44:47 +02:00
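The scheme above mirrors how vanilla Postgres protects pages: the checksum lives in the page header and is seeded with the block number, so a page returned for the wrong block also fails verification. Below is a minimal sketch of that verify-on-read shape; the 8 KiB page size and the checksum location at bytes 8..10 (little-endian) follow the diff further down, while `compute_page_checksum` is a stand-in, not the real pg_checksum_page algorithm.

```rust
const BLCKSZ: usize = 8192;

// Stand-in checksum: NOT the Postgres FNV-based algorithm, just enough for the sketch.
// Like Postgres, it is seeded with the block number and skips the checksum field itself.
fn compute_page_checksum(page: &[u8; BLCKSZ], blkno: u32) -> u16 {
    let mut acc: u32 = blkno;
    for (i, chunk) in page.chunks(2).enumerate() {
        if i == 4 {
            continue; // bytes 8..10 hold the checksum; exclude them from the computation
        }
        let v = u16::from_le_bytes([chunk[0], *chunk.get(1).unwrap_or(&0)]) as u32;
        acc = acc.rotate_left(1) ^ v;
    }
    (acc ^ (acc >> 16)) as u16
}

/// Calculate the page checksum and stamp it into the page header (bytes 8..10).
fn page_set_checksum(page: &mut [u8; BLCKSZ], blkno: u32) {
    let checksum = compute_page_checksum(page, blkno);
    page[8..10].copy_from_slice(&checksum.to_le_bytes());
}

/// Recompute the checksum and compare it with the one stored in the header.
fn page_verify_checksum(page: &[u8; BLCKSZ], blkno: u32) -> bool {
    let stored = u16::from_le_bytes([page[8], page[9]]);
    compute_page_checksum(page, blkno) == stored
}

fn main() {
    let mut page = [0u8; BLCKSZ];
    page.iter_mut().enumerate().for_each(|(i, b)| *b = i as u8);

    page_set_checksum(&mut page, 0);
    assert!(page_verify_checksum(&page, 0));
    // The block number is part of the checksum, so the same bytes fail for another block.
    assert!(!page_verify_checksum(&page, 1));
    println!("checksum round-trip ok");
}
```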
Konstantin Knizhnik
2501afba6e Calculate postgres checksum for FPI stored in pageserver (neondatabase/cloud#536) 2022-07-07 19:44:47 +02:00
Andrey Taranik
ae116ff0a9 update timeout for proxy deploy (#2047) 2022-07-07 18:09:57 +03:00
Heikki Linnakangas
e6ea049165 If an error happens during import of base backup or WAL, log it.
We only sent the error to the client, with no trace in the pageserver log.
Log it, similar to how we log errors in GetPage@LSN requests.
2022-07-07 16:05:13 +03:00
Alexey Kondratov
747d009bb4 Fix panic while waiting for Postgres readiness in the compute_ctl (#2021)
We were reading the Postgres pid file and looking for the 'ready' status,
but the file could be empty or unreadable. Add checks for all of these cases.
2022-07-07 11:56:58 +02:00
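A minimal sketch of the hardened readiness check described above; the 'ready' marker and the pid-file idea follow the diff further down, while the helper itself and the paths are illustrative rather than the exact compute_ctl code.

```rust
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::net::{SocketAddr, TcpStream};
use std::path::Path;
use std::time::Duration;

fn postgres_is_ready(pid_path: &Path, addr: SocketAddr, timeout: Duration) -> bool {
    // The pid file may not exist yet, or may be briefly unreadable.
    let file = match File::open(pid_path) {
        Ok(f) => f,
        Err(_) => return false,
    };
    // The file may exist but still be empty, so `lines().last()` can be None.
    match BufReader::new(file).lines().last() {
        Some(Ok(line)) if line.trim() == "ready" => {
            // Postgres says it is ready; also make sure we can actually connect.
            TcpStream::connect_timeout(&addr, timeout).is_ok()
        }
        _ => false,
    }
}

fn main() {
    let addr: SocketAddr = "127.0.0.1:5432".parse().unwrap();
    let pid_path = Path::new("/tmp/pgdata/postmaster.pid"); // illustrative path
    let ready = postgres_is_ready(pid_path, addr, Duration::from_millis(100));
    println!("postgres ready: {ready}");
}
```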
Alexander Bayandin
cb5df3c627 github/actions: set missing VIP_VAP_ACCESS_TOKEN (#2045) 2022-07-07 10:47:03 +01:00
Heikki Linnakangas
0e3456351f Shrink thread pools used for WAL receivers and background tasks.
I noticed that the pageserver has a very large virtual memory size,
several GB, even though it doesn't actually use that much
memory. That's not much of a problem normally, but I hit it because I
wanted to run tests with a limited virtual memory size, by calling
setrlimit(RLIMIT_AS), but the highest limit you can set is 2 GB. I was
not able to start pageserver with a limit of 2 GB.

On Linux, each thread allocates 32 MB of virtual memory. I read this
on some random forum on the Internet, but unfortunately could not find
the source again now. Empirically, reducing the number of threads clearly
helps to bring down the virtual memory size.

Aside from the virtual memory usage, it seems excessive to launch 40
threads in both of those thread pools. The tokio default is to have as
many worker threads as there are CPU cores in the system. That seems
like a fine heuristic for us, too, so remove the explicit setting of
the pool size and rely on the default. Note that the GC and compaction
tasks are actually run with tokio spawn_blocking, so the threads that
are actually doing the work, and possibly waiting on I/O, are not
consuming threads from the thread pool. The WAL receiver work is done
in the tokio worker threads, but the WAL receivers are more CPU bound
so that seems OK.

Also remove the explicit maximum on blocking tasks. I'm not sure what
the right value for that would be, or whether the value we set (100)
would be better than the tokio default (512). Since the value was
arbitrary, let's just rely on the tokio default for that, too.
2022-07-06 22:36:38 +03:00
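A rough sketch of the resulting runtime setup, assuming the tokio crate: with no explicit worker_threads() tokio defaults to one worker per CPU core, and with no explicit max_blocking_threads() the blocking pool caps at tokio's default of 512. Long-running blocking work goes through spawn_blocking so it does not occupy the small worker pool.

```rust
fn main() -> std::io::Result<()> {
    // Before: explicit, oversized pools, e.g. .worker_threads(40).max_blocking_threads(100).
    // After: rely on tokio's defaults.
    let runtime = tokio::runtime::Builder::new_multi_thread()
        .thread_name("pageserver-worker")
        .enable_all()
        .build()?;

    runtime.block_on(async {
        // Async work (e.g. WAL receivers) runs on the per-core worker threads...
        let recv = tokio::spawn(async { "wal received" });
        // ...while long blocking tasks (e.g. GC / compaction) run on the blocking pool,
        // so they do not tie up the workers.
        let gc = tokio::task::spawn_blocking(|| "gc done");
        println!("{} / {}", recv.await.unwrap(), gc.await.unwrap());
    });
    Ok(())
}
```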
Alexander Bayandin
1faf49da0f github/actions: set PERF_TEST_RESULT_CONNSTR from secrets (#2040) 2022-07-06 19:24:06 +01:00
bojanserafimov
4a96259bdd Add export/import test (#2036) 2022-07-06 13:45:26 -04:00
bojanserafimov
242af75653 Fix signal file parsing (#2042) 2022-07-06 13:45:02 -04:00
Arthur Petukhovsky
8fabdc6708 Add tests with concurrent computes.
Remove test_restart_compute, as the newly added test_compute_restarts is stronger.
2022-07-06 18:07:29 +04:00
Alexander Bayandin
07df7c2edd github/actions: fix storing perf data for main (#2038) 2022-07-06 13:15:15 +01:00
Kirill Bulatov
50821c0a3c Return download stream directly from the remote storage API 2022-07-05 21:45:15 +03:00
Andrey Taranik
68adfe0fc8 inventory file fix for neon-stress env 2022-07-05 21:29:03 +04:00
Dmitry Rodionov
cfdf79aceb harden create_empty_timeline
Reorder the checks so that we verify whether the timeline already exists
before writing anything to disk and possibly replacing valid content
2022-07-05 16:44:18 +03:00
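A minimal sketch of the check-before-write ordering; the directory layout and file names here are illustrative, not the pageserver's actual on-disk format.

```rust
use std::fs;
use std::io::{Error, ErrorKind};
use std::path::Path;

fn create_empty_timeline(timeline_dir: &Path) -> std::io::Result<()> {
    // Check for an existing timeline FIRST, so valid content is never overwritten...
    if timeline_dir.exists() {
        return Err(Error::new(
            ErrorKind::AlreadyExists,
            format!("timeline already exists at {}", timeline_dir.display()),
        ));
    }
    // ...and only then touch the disk.
    fs::create_dir_all(timeline_dir)?;
    fs::write(timeline_dir.join("metadata"), b"initial metadata")?;
    Ok(())
}

fn main() -> std::io::Result<()> {
    let dir = std::env::temp_dir().join("timeline-demo");
    let _ = fs::remove_dir_all(&dir); // clean up a previous run of this sketch
    create_empty_timeline(&dir)?;
    // A second call now fails cleanly instead of clobbering the first timeline.
    assert!(create_empty_timeline(&dir).is_err());
    Ok(())
}
```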
bojanserafimov
32560e75d2 Enable relocation test (#1974) 2022-07-05 08:27:57 -04:00
Heikki Linnakangas
bb69e0920c Do not overwrite an existing image layer.
See github issues #1594 and #1690

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2022-07-05 14:45:31 +03:00
Alexander Bayandin
05f6a1394d Add tests for different Postgres client libraries (#2008)
* Add tests for different postgres clients
* test/fixtures: sanitize test name for test_output_dir
* test/fixtures: do not look for etcd before runtime
* Add workflow for testing Postgres client libraries
2022-07-05 12:22:58 +01:00
Heikki Linnakangas
844832ffe4 Bump vendor/postgres
Contains changes from two PRs in vendor/postgres:
- https://github.com/neondatabase/postgres/pull/163
- https://github.com/neondatabase/postgres/pull/176
2022-07-05 10:55:03 +03:00
bojanserafimov
d29c545b5d Gc/compaction thread pool, take 2 (#1933)
Decrease the number of pageserver threads by running gc and compaction in a blocking tokio thread pool
2022-07-05 02:06:40 -04:00
Kirill Bulatov
6abdb12724 Fix 1.62 Clippy errors 2022-07-04 23:46:37 +03:00
Alexander Bayandin
7898e72990 Remove duplicated checks from LocalEnv 2022-07-04 22:35:00 +03:00
Dmitry Rodionov
65704708fa remove unused imports, make more use of pathlib.Path 2022-07-01 18:56:51 +03:00
Arseny Sher
6100a02d0f Prefix WAL files in s3 with environment name.
It wasn't merged to prod yet, so safe to enable.
2022-07-01 19:21:28 +04:00
Arseny Sher
97fed38213 Fix cadaca010c for older ssh clients. 2022-07-01 19:20:59 +04:00
Arseny Sher
cadaca010c Make ansible work with storage nodes through teleport from a local box. 2022-07-01 16:58:34 +03:00
Bojan Serafimov
f09c09438a Fix gc after import 2022-07-01 11:10:49 +03:00
Dmitry Rodionov
00fc696606 replace extra urlencode dependency with already present url library 2022-06-30 14:32:15 +03:00
Kirill Bulatov
1d0706cf25 Fix walreceiver connection selection mechanism
* Avoid reconnecting to a safekeeper immediately after its failure by limiting candidates to those with the fewest connection attempts (see the sketch after this entry). This way we don't have to wait lagging_wal_timeout (10s by default) before a switch happens even if no new changes are generated, and the current test_restarts_under_load expects some commits to happen within 4s.
* Make the default max_lsn_wal_lag larger, otherwise constant reconnections happen during normal work.
* Fix wal_connection_attempts maintenance, preventing a busy loop of reconnections.
2022-06-30 00:40:12 +03:00
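A minimal sketch of the candidate-selection rule from the first bullet; the struct and fields are illustrative, not the walreceiver's actual types.

```rust
#[derive(Debug)]
struct SafekeeperCandidate {
    id: u64,
    commit_lsn: u64,
    connection_attempts: u32,
}

fn pick_candidate(mut candidates: Vec<SafekeeperCandidate>) -> Option<SafekeeperCandidate> {
    // Keep only the candidates with the fewest connection attempts so far, so a
    // safekeeper that just failed is not immediately reconnected to.
    let min_attempts = candidates.iter().map(|c| c.connection_attempts).min()?;
    candidates.retain(|c| c.connection_attempts == min_attempts);
    // Among those, prefer the one with the most WAL available.
    candidates.into_iter().max_by_key(|c| c.commit_lsn)
}

fn main() {
    let picked = pick_candidate(vec![
        SafekeeperCandidate { id: 1, commit_lsn: 500, connection_attempts: 3 }, // just failed
        SafekeeperCandidate { id: 2, commit_lsn: 400, connection_attempts: 0 },
        SafekeeperCandidate { id: 3, commit_lsn: 450, connection_attempts: 0 },
    ]);
    println!("{picked:?}"); // safekeeper 3: fewest attempts, then highest commit_lsn
}
```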
Dmitry Ivanov
5ee19b0758 Fix bloated coverage uploads (#2005)
Move coverage data to a better directory, merge it better and don't publish it from CircleCI pipeline
2022-06-29 17:59:19 +03:00
Kirill Bulatov
cef90d9220 Disable cachepot for GH Actions builds (#2007) 2022-06-29 17:56:02 +03:00
Kirill Bulatov
4a05413a4c More code coverage fixes in GH Actions (#2002) 2022-06-27 22:40:20 +03:00
Kirill Bulatov
dd61f3558f Fix coverage upload credentials retrieval (#2001) 2022-06-27 20:41:09 +03:00
Kirill Bulatov
8a714f1ebf Add coverage to GH actions and rework part of them (#1987) 2022-06-27 19:15:56 +03:00
Arseny Sher
137291dc24 Push to etcd from safekeeper many timelines concurrently.
Mitigates the per-request latency cost, making push throughput 1-1.5 orders of magnitude higher.

Also make leases per timeline, not per whole safekeeper, to avoid storing
garbage in etcd for deleted timelines while the safekeeper is alive.
2022-06-27 16:30:21 +03:00
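A rough sketch of the concurrency change, assuming the tokio crate; push_timeline is an illustrative stand-in for the real etcd PUT with a per-timeline lease.

```rust
use std::time::Duration;

async fn push_timeline(timeline_id: u64) -> Result<(), String> {
    // Stand-in for one broker round-trip (etcd PUT under this timeline's lease).
    tokio::time::sleep(Duration::from_millis(20)).await;
    println!("pushed state for timeline {timeline_id}");
    Ok(())
}

#[tokio::main]
async fn main() {
    let timelines: Vec<u64> = (1..=100).collect();

    // Pushing sequentially would cost ~100 round-trips of latency; spawning all pushes
    // and awaiting them together pays the latency roughly once.
    let handles: Vec<_> = timelines
        .into_iter()
        .map(|id| tokio::spawn(push_timeline(id)))
        .collect();

    for handle in handles {
        handle.await.expect("push task panicked").expect("push failed");
    }
}
```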
Kirill Bulatov
eb8926083e Use the updated base build Docker image (#1972) 2022-06-27 13:12:58 +03:00
Johan Eliasson
26bca6ddba Add openssl to OSX dependencies (#1994) 2022-06-26 21:54:07 +03:00
Arthur Petukhovsky
55192384c3 Fix zero timeline_start_lsn (#1981)
* Fix zero timeline_start_lsn

* Log more info on control file upgrade

* Fix formatting

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2022-06-24 13:59:37 +03:00
KlimentSerafimov
392cd8b1fc Refactored extracting project_name in console.rs. (#1982) 2022-06-24 05:57:33 -04:00
Alexey Kondratov
3cc531d093 Fix CREATE EXTENSION for non-db-owner users (#1408)
Previously, we were granting CREATE only to the db owner, but now we have a
dedicated 'web_access' role to connect via web UI and proxy link auth.

We already grant read / write all data to all roles, so let's grant
CREATE to everyone too. This creates some privilege objects in each db,
which we need to drop before deleting the role. So now we reassign all
owned objects to each db owner before deletion. This also fixes deletion
of roles that previously created data in some db. Will be tested by
https://github.com/neondatabase/cloud/pull/1673

Later we should stop messing with Postgres ACL that much.
2022-06-23 21:36:53 +02:00
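A small sketch of the two SQL shapes this change relies on, matching the diff further down: grant CREATE on each database to every role, and, before dropping a role, reassign its objects to the database owner and drop its remaining privileges. Identifier quoting is omitted here for brevity; the real code quotes every identifier.

```rust
fn grant_create_query(dbname: &str, roles: &[&str]) -> String {
    // Grant CREATE to all roles, not just the db owner.
    format!("GRANT CREATE ON DATABASE {dbname} TO {}", roles.join(", "))
}

fn role_deletion_queries(role: &str, db_owner: &str) -> Vec<String> {
    vec![
        // Run per database first: hand everything the role owns to the db owner.
        format!("REASSIGN OWNED BY {role} TO {db_owner}"),
        // Then drop what is left for the role in this db (its privileges).
        format!("DROP OWNED BY {role}"),
        // Finally the role itself can be dropped.
        format!("DROP ROLE IF EXISTS {role}"),
    ]
}

fn main() {
    println!("{}", grant_create_query("neondb", &["db_owner", "web_access"]));
    for query in role_deletion_queries("stale_role", "db_owner") {
        println!("{query}");
    }
}
```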
bojanserafimov
84b9fcbbd5 Increase a few test timeouts (#1977) 2022-06-23 11:51:56 -04:00
119 changed files with 2823 additions and 1083 deletions

View File

@@ -6,5 +6,7 @@ timeout = 30
[ssh_connection]
ssh_args = -F ./ansible.ssh.cfg
scp_if_ssh = True
# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
# and scp neither worked for me
transfer_method = piped
pipelining = True

View File

@@ -1,3 +1,7 @@
# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
# (use pre 8.5 option name to cope with old ssh in CI)
PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
Host tele.zenith.tech
User admin
Port 3023

View File

@@ -12,6 +12,7 @@ pageservers
safekeepers
[storage:vars]
env_name = neon-stress
console_mgmt_base_url = http://neon-stress-console.local
bucket_name = neon-storage-ireland
bucket_region = eu-west-1

View File

@@ -12,6 +12,7 @@ pageservers
safekeepers
[storage:vars]
env_name = prod-1
console_mgmt_base_url = http://console-release.local
bucket_name = zenith-storage-oregon
bucket_region = us-west-2

View File

@@ -13,6 +13,7 @@ pageservers
safekeepers
[storage:vars]
env_name = us-stage
console_mgmt_base_url = http://console-staging.local
bucket_name = zenith-staging-storage-us-east-1
bucket_region = us-east-1

View File

@@ -6,7 +6,7 @@ After=network.target auditd.service
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

View File

@@ -100,10 +100,8 @@ jobs:
name: Rust build << parameters.build_type >>
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS="--release --features profiling"
fi
@@ -112,7 +110,7 @@ jobs:
export RUSTC_WRAPPER=cachepot
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
cachepot -s
- save_cache:
@@ -128,32 +126,24 @@ jobs:
name: cargo test
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS=--release
fi
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
cargo test $CARGO_FLAGS
# Install the rust binaries, for use by test jobs
- run:
name: Install rust binaries
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
fi
binaries=$(
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
cargo metadata --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
test_exe_paths=$(
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
cargo test --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
@@ -166,34 +156,15 @@ jobs:
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/zenith/bin/$bin
cp $SRC $DST
echo $DST >> /tmp/zenith/etc/binaries.list
done
# Install test executables (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/zenith/test_bin/$(basename $bin)
cp $SRC $DST
echo $DST >> /tmp/zenith/etc/binaries.list
done
fi
# Install the postgres binaries, for use by test jobs
- run:
name: Install postgres binaries
command: |
cp -a tmp_install /tmp/zenith/pg_install
- run:
name: Merge coverage data
command: |
# This will speed up workspace uploads
if [[ $BUILD_TYPE == "debug" ]]; then
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
fi
# Save the rust binaries and coverage data for other jobs in this workflow.
# Save rust binaries for other jobs in the workflow
- persist_to_workspace:
root: /tmp/zenith
paths:
@@ -286,7 +257,7 @@ jobs:
# no_output_timeout, specified here.
no_output_timeout: 10m
environment:
- ZENITH_BIN: /tmp/zenith/bin
- NEON_BIN: /tmp/zenith/bin
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
- TEST_OUTPUT: /tmp/test_output
# this variable will be embedded in perf test report
@@ -314,12 +285,6 @@ jobs:
export GITHUB_SHA=$CIRCLE_SHA1
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
fi
# Run the tests.
#
# The junit.xml file allows CircleCI to display more fine-grained test information
@@ -330,7 +295,7 @@ jobs:
# -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
"${cov_prefix[@]}" ./scripts/pytest \
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
@@ -359,67 +324,12 @@ jobs:
# The store_test_results step tells CircleCI where to find the junit.xml file.
- store_test_results:
path: /tmp/test_output
- run:
name: Merge coverage data
command: |
# This will speed up workspace uploads
if [[ $BUILD_TYPE == "debug" ]]; then
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
fi
# Save coverage data (if any)
# Save data (if any)
- persist_to_workspace:
root: /tmp/zenith
paths:
- "*"
coverage-report:
executor: neon-xlarge-executor
steps:
- attach_workspace:
at: /tmp/zenith
- checkout
- restore_cache:
name: Restore rust cache
keys:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
- run:
name: Build coverage report
command: |
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
scripts/coverage \
--dir=/tmp/zenith/coverage report \
--input-objects=/tmp/zenith/etc/binaries.list \
--commit-url=$COMMIT_URL \
--format=github
- run:
name: Upload coverage report
command: |
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
scripts/git-upload \
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
--message="Add code coverage for $COMMIT_URL" \
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
# Add link to the coverage report to the commit
curl -f -X POST \
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-H "Accept: application/vnd.github.v3+json" \
--user "$CI_ACCESS_TOKEN" \
--data \
"{
\"state\": \"success\",
\"context\": \"zenith-coverage\",
\"description\": \"Coverage report is ready\",
\"target_url\": \"$REPORT_URL\"
}"
# Build neondatabase/neon:latest image and push it to Docker hub
docker-image:
docker:
@@ -585,8 +495,8 @@ jobs:
name: Re-deploy proxy
command: |
DOCKER_TAG=$(git log --oneline|wc -l)
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-neon-stress:
docker:
@@ -688,50 +598,6 @@ jobs:
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
# Trigger a new remote CI job
remote-ci-trigger:
docker:
- image: cimg/base:2021.04
parameters:
remote_repo:
type: string
environment:
REMOTE_REPO: << parameters.remote_repo >>
steps:
- run:
name: Set PR's status to pending
command: |
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
curl -f -X POST \
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-H "Accept: application/vnd.github.v3+json" \
--user "$CI_ACCESS_TOKEN" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
- run:
name: Request a remote CI test
command: |
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "$CI_ACCESS_TOKEN" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$CIRCLE_SHA1\",
\"remote_repo\": \"$LOCAL_REPO\"
}
}"
workflows:
build_and_test:
jobs:
@@ -774,12 +640,6 @@ workflows:
save_perf_report: true
requires:
- build-neon-release
- coverage-report:
# Context passes credentials for gh api
context: CI_ACCESS_TOKEN
requires:
# TODO: consider adding more
- other-tests-debug
- docker-image:
# Context gives an ability to login
context: Docker Hub
@@ -880,14 +740,3 @@ workflows:
- release
requires:
- docker-image-release
- remote-ci-trigger:
# Context passes credentials for gh api
context: CI_ACCESS_TOKEN
remote_repo: "neondatabase/cloud"
requires:
# XXX: Successful build doesn't mean everything is OK, but
# the job to be triggered takes so much time to complete (~22 min)
# that it's better not to wait for the commented-out steps
- build-neon-release
# - pg_regress-tests-release
# - other-tests-release

View File

@@ -2,25 +2,29 @@ name: 'Run python test'
description: 'Runs a Neon python test set, performing all the required preparations before'
inputs:
# Select the type of Rust build. Must be "release" or "debug".
build_type:
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
required: true
rust_toolchain:
description: 'Rust toolchain version to fetch the caches'
required: true
# This parameter is required, to prevent the mistake of running all tests in one job.
test_selection:
description: 'A python test suite to run'
required: true
# Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
extra_params:
description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
required: false
default: ''
needs_postgres_source:
description: 'Set to true if the test suite requires postgres source checked out'
required: false
default: 'false'
run_in_parallel:
description: 'Whether to run tests in parallel'
required: false
default: 'true'
save_perf_report:
description: 'Whether to upload the performance report'
required: false
default: 'false'
@@ -60,7 +64,7 @@ runs:
- name: Run pytest
env:
ZENITH_BIN: /tmp/neon/bin
NEON_BIN: /tmp/neon/bin
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
TEST_OUTPUT: /tmp/test_output
# this variable will be embedded in perf test report
@@ -81,14 +85,14 @@ runs:
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "main" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
mkdir -p "$PERF_REPORT_DIR"
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
fi
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
cov_prefix=()
fi
@@ -111,9 +115,26 @@ runs:
-rA $TEST_SELECTION $EXTRA_PARAMS
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "main" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
export REPORT_FROM="$PERF_REPORT_DIR"
export REPORT_TO=local
scripts/generate_and_push_perf_report.sh
fi
fi
- name: Delete all data but logs
shell: bash -ex {0}
if: always()
run: |
du -sh /tmp/test_output/*
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
du -sh /tmp/test_output/*
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v3
with:
retention-days: 7
if-no-files-found: error
name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
path: /tmp/test_output/

View File

@@ -0,0 +1,17 @@
name: 'Merge and upload coverage data'
description: 'Compresses and uploads the coverage data as an artifact'
runs:
using: "composite"
steps:
- name: Merge coverage data
shell: bash -ex {0}
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
- name: Upload coverage data
uses: actions/upload-artifact@v3
with:
retention-days: 7
if-no-files-found: error
name: coverage-data-artifact
path: /tmp/coverage/

View File

@@ -1,13 +1,28 @@
name: build_and_test
on: [ push ]
name: Test
on:
push:
branches:
- main
pull_request:
defaults:
run:
shell: bash -ex {0}
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
jobs:
build-postgres:
runs-on: [ self-hosted, Linux, k8s-runner ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
@@ -34,7 +49,7 @@ jobs:
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: COPT='-Werror' mold -run make postgres -j$(nproc)
run: mold -run make postgres -j$(nproc)
# actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache
- name: Prepare postgres artifact
@@ -52,6 +67,7 @@ jobs:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-postgres ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
@@ -85,44 +101,39 @@ jobs:
~/.cargo/registry/
~/.cargo/git/
target/
key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
- name: Run cargo build
run: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS="--release --features profiling"
fi
export CACHEPOT_BUCKET=zenith-rust-cachepot
export RUSTC_WRAPPER=cachepot
export AWS_ACCESS_KEY_ID="${{ secrets.AWS_ACCESS_KEY_ID }}"
export AWS_SECRET_ACCESS_KEY="${{ secrets.AWS_SECRET_ACCESS_KEY }}"
export HOME=/home/runner
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
cachepot -s
- name: Run cargo test
run: |
export HOME=/home/runner
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS=--release
fi
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
- name: Install rust binaries
run: |
export HOME=/home/runner
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
fi
@@ -137,39 +148,36 @@ jobs:
jq -r '.executable | select(. != null)'
)
mkdir -p /tmp/neon/bin
mkdir -p /tmp/neon/test_bin
mkdir -p /tmp/neon/etc
mkdir -p /tmp/neon/bin/
mkdir -p /tmp/neon/test_bin/
mkdir -p /tmp/neon/etc/
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
# Install target binaries
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/neon/bin/$bin
cp $SRC $DST
echo $DST >> /tmp/neon/etc/binaries.list
cp "$SRC" "$DST"
done
# Install test executables (for code coverage)
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/neon/test_bin/$(basename $bin)
cp $SRC $DST
echo $DST >> /tmp/neon/etc/binaries.list
cp "$SRC" "$DST"
echo "$DST" >> /tmp/coverage/binaries.list
done
fi
- name: Install postgres binaries
run: cp -a tmp_install /tmp/neon/pg_install
- name: Merge coverage data
run: |
export HOME=/home/runner
# This will speed up workspace uploads
if [[ $BUILD_TYPE == "debug" ]]; then
scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage merge
fi
- name: Prepare neon artifact
run: tar -C /tmp/neon/ -czf ./neon.tgz .
@@ -181,38 +189,17 @@ jobs:
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
path: ./neon.tgz
check-codestyle-python:
runs-on: [ self-hosted, Linux, k8s-runner ]
strategy:
matrix:
rust_toolchain: [ 1.58 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Run yapf to ensure code format
run: poetry run yapf --recursive --diff .
- name: Run mypy to check types
run: poetry run mypy .
pg_regress-tests:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-neon ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
@@ -231,10 +218,15 @@ jobs:
test_selection: batch_pg_regress
needs_postgres_source: true
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
other-tests:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-neon ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
@@ -252,10 +244,15 @@ jobs:
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: batch_others
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
benchmarks:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-neon ]
strategy:
fail-fast: false
matrix:
build_type: [ release ]
rust_toolchain: [ 1.58 ]
@@ -273,4 +270,123 @@ jobs:
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: performance
run_in_parallel: false
# save_perf_report: true
save_perf_report: true
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
coverage-report:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ other-tests, pg_regress-tests ]
strategy:
fail-fast: false
matrix:
build_type: [ debug ]
rust_toolchain: [ 1.58 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Restore cargo deps cache
id: cache_cargo
uses: actions/cache@v3
with:
path: |
~/.cargo/registry/
~/.cargo/git/
target/
key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact for restoration
uses: actions/download-artifact@v3
with:
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
path: ./neon-artifact/
- name: Extract Neon artifact
run: |
mkdir -p /tmp/neon/
tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
rm -rf ./neon-artifact/
- name: Restore coverage data
uses: actions/download-artifact@v3
with:
name: coverage-data-artifact
path: /tmp/coverage/
- name: Merge coverage data
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
- name: Build and upload coverage report
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
scripts/coverage \
--dir=/tmp/coverage report \
--input-objects=/tmp/coverage/binaries.list \
--commit-url=$COMMIT_URL \
--format=github
REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
scripts/git-upload \
--repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
--message="Add code coverage for $COMMIT_URL" \
copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
# Add link to the coverage report to the commit
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"success\",
\"context\": \"neon-coverage\",
\"description\": \"Coverage report is ready\",
\"target_url\": \"$REPORT_URL\"
}"
trigger-e2e-tests:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-neon ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/cloud"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\"
}
}"

View File

@@ -1,4 +1,4 @@
name: Build and Test
name: Check code style and build
on:
push:
@@ -6,15 +6,27 @@ on:
- main
pull_request:
defaults:
run:
shell: bash -ex {0}
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
jobs:
regression-check:
check-codestyle-rust:
strategy:
fail-fast: false
matrix:
# If we want to duplicate this job for different
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
rust_toolchain: [1.58]
os: [ubuntu-latest, macos-latest]
timeout-minutes: 30
timeout-minutes: 50
name: run regression test suite
runs-on: ${{ matrix.os }}
@@ -92,5 +104,30 @@ jobs:
- name: Run cargo clippy
run: ./run_clippy.sh
- name: Run cargo test
run: cargo test --all --all-targets
- name: Ensure all project builds
run: cargo build --all --all-targets
check-codestyle-python:
runs-on: [ self-hosted, Linux, k8s-runner ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Run yapf to ensure code format
run: poetry run yapf --recursive --diff .
- name: Run mypy to check types
run: poetry run mypy .

.github/workflows/pg_clients.yml (vendored, new file, 74 lines)
View File

@@ -0,0 +1,74 @@
name: Test Postgres client libraries
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test-postgres-client-libs:
runs-on: [ ubuntu-latest ]
steps:
- name: Checkout
uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -ex {0}
run: ./scripts/pysync
- name: Run pytest
env:
REMOTE_ENV: 1
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
TEST_OUTPUT: /tmp/test_output
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
# this variable will be embedded in perf test report
# and is needed to distinguish different environments
PLATFORM: github-actions-selfhosted
shell: bash -ex {0}
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it
mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "remote_cluster" \
-rA "test_runner/pg_clients"
- name: Post to a Slack channel
if: failure()
id: slack
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

Cargo.lock (generated, 1 line changed)
View File

@@ -461,6 +461,7 @@ dependencies = [
"tar",
"tokio",
"tokio-postgres",
"url",
"workspace_hack",
]

View File

@@ -1,5 +1,5 @@
# Build Postgres
FROM zimg/rust:1.58 AS pg-build
FROM neondatabase/rust:1.58 AS pg-build
WORKDIR /pg
USER root
@@ -14,7 +14,7 @@ RUN set -e \
&& tar -C tmp_install -czf /postgres_install.tar.gz .
# Build zenith binaries
FROM zimg/rust:1.58 AS build
FROM neondatabase/rust:1.58 AS build
ARG GIT_VERSION=local
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
@@ -46,9 +46,9 @@ RUN set -e \
&& useradd -d /data zenith \
&& chown -R zenith:zenith /data
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin
COPY --from=pg-build /pg/tmp_install/ /usr/local/
COPY --from=pg-build /postgres_install.tar.gz /data/

View File

@@ -1,6 +1,6 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .circle/config.yml
FROM zimg/rust:1.58 AS rust-build
FROM neondatabase/rust:1.58 AS rust-build
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
@@ -15,4 +15,4 @@ RUN set -e \
# Final image that only has one binary
FROM debian:buster-slim
COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl

View File

@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf etcd
brew install protobuf etcd openssl
```
2. [Install Rust](https://www.rust-lang.org/tools/install)

View File

@@ -18,4 +18,5 @@ serde_json = "1"
tar = "0.4"
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
url = "2.2.2"
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -33,7 +33,7 @@ use std::process::exit;
use std::sync::{Arc, RwLock};
use std::{thread, time::Duration};
use anyhow::Result;
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use log::{error, info};
@@ -45,6 +45,7 @@ use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::pg_helpers::*;
use compute_tools::spec::*;
use url::Url;
fn main() -> Result<()> {
// TODO: re-use `utils::logging` later
@@ -131,7 +132,7 @@ fn main() -> Result<()> {
let compute_state = ComputeNode {
start_time: Utc::now(),
connstr: connstr.to_string(),
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
spec,

View File

@@ -1,5 +1,3 @@
use std::sync::Arc;
use anyhow::{anyhow, Result};
use log::error;
use postgres::Client;
@@ -23,9 +21,8 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
Ok(())
}
pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
let connstr = &compute.connstr;
let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
if client.is_closed() {
return Err(anyhow!("connection to postgres closed"));
}

View File

@@ -35,7 +35,8 @@ use crate::spec::*;
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
pub start_time: DateTime<Utc>,
pub connstr: String,
// Url type maintains proper escaping
pub connstr: url::Url,
pub pgdata: String,
pub pgbin: String,
pub spec: ComputeSpec,
@@ -268,27 +269,32 @@ impl ComputeNode {
// In this case we need to connect with old `zenith_admin`name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let mut client = match Client::connect(&self.connstr, NoTls) {
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
Err(e) => {
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1);
let mut zenith_admin_connstr = self.connstr.clone();
let mut client = Client::connect(&zenith_admin_connstr, NoTls)?;
zenith_admin_connstr
.set_username("zenith_admin")
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connsting with expected name
Client::connect(&self.connstr, NoTls)?
Client::connect(self.connstr.as_str(), NoTls)?
}
Ok(client) => client,
};
handle_roles(&self.spec, &mut client)?;
handle_databases(&self.spec, &mut client)?;
handle_role_deletions(self, &mut client)?;
handle_grants(&self.spec, &mut client)?;
create_writablity_check_data(&mut client)?;

View File

@@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
// Spin in a loop and figure out the last activity time in the Postgres.
// Then update it in the shared state. This function never errors out.
// XXX: the only expected panic is at `RwLock` unwrap().
fn watch_compute_activity(compute: &Arc<ComputeNode>) {
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.connstr.clone();
let connstr = compute.connstr.as_str();
// Define `client` outside of the loop to reuse existing connection if it's active.
let mut client = Client::connect(&connstr, NoTls);
let mut client = Client::connect(connstr, NoTls);
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
info!("watching Postgres activity at {}", connstr);
@@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
info!("connection to postgres closed, trying to reconnect");
// Connection is closed, reconnect and try again.
client = Client::connect(&connstr, NoTls);
client = Client::connect(connstr, NoTls);
continue;
}
@@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
debug!("cannot connect to postgres: {}, retrying", e);
// Establish a new connection and try again.
client = Client::connect(&connstr, NoTls);
client = Client::connect(connstr, NoTls);
}
}
}

View File

@@ -1,3 +1,4 @@
use std::fmt::Write;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::net::{SocketAddr, TcpStream};
@@ -138,9 +139,11 @@ impl Role {
// Now we also support SCRAM-SHA-256 and to preserve compatibility
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
if pass.starts_with("SCRAM-SHA-256") {
params.push_str(&format!(" PASSWORD '{}'", pass));
write!(params, " PASSWORD '{pass}'")
.expect("String is documented to not to error during write operations");
} else {
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
write!(params, " PASSWORD 'md5{pass}'")
.expect("String is documented to not to error during write operations");
}
} else {
params.push_str(" PASSWORD NULL");
@@ -158,7 +161,8 @@ impl Database {
/// it may require a proper quoting too.
pub fn to_pg_options(&self) -> String {
let mut params: String = self.options.as_pg_options();
params.push_str(&format!(" OWNER {}", &self.owner.quote()));
write!(params, " OWNER {}", &self.owner.quote())
.expect("String is documented to not to error during write operations");
params
}
@@ -244,18 +248,20 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()
bail!("Postgres exited unexpectedly with code {}", code);
}
if pid_path.exists() {
let file = BufReader::new(File::open(&pid_path)?);
let status = file
.lines()
.last()
.unwrap()
.unwrap_or_else(|_| "unknown".to_string());
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
// Check that we can open pid file first.
if let Ok(file) = File::open(&pid_path) {
let file = BufReader::new(file);
let last_line = file.lines().last();
// Now Postgres is ready to accept connections
if status.trim() == "ready" && can_connect {
break;
// Pid file could be there and we could read it, but it could be empty, for example.
if let Some(Ok(line)) = last_line {
let status = line.trim();
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
// Now Postgres is ready to accept connections
if status == "ready" && can_connect {
break;
}
}
}

View File

@@ -2,9 +2,10 @@ use std::path::Path;
use anyhow::Result;
use log::{info, log_enabled, warn, Level};
use postgres::Client;
use postgres::{Client, NoTls};
use serde::Deserialize;
use crate::compute::ComputeNode;
use crate::config;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
@@ -97,18 +98,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// Process delta operations first
if let Some(ops) = &spec.delta_operations {
info!("processing delta operations on roles");
info!("processing role renames");
for op in ops {
match op.action.as_ref() {
// We do not check either role exists or not,
// Postgres will take care of it for us
"delete_role" => {
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
warn!("deleting role '{}'", &op.name);
xact.execute(query.as_str(), &[])?;
// no-op now, roles will be deleted at the end of configuration
}
// Renaming role drops its password, since tole name is
// Renaming role drops its password, since role name is
// used as a salt there. It is important that this role
// is recorded with a new `name` in the `roles` list.
// Follow up roles update will set the new password.
@@ -182,7 +178,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
xact.execute(query.as_str(), &[])?;
let grant_query = format!(
"grant pg_read_all_data, pg_write_all_data to {}",
"GRANT pg_read_all_data, pg_write_all_data TO {}",
name.quote()
);
xact.execute(grant_query.as_str(), &[])?;
@@ -197,6 +193,70 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
Ok(())
}
/// Reassign all dependent objects and delete requested roles.
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;
// First, reassign all dependent objects to db owners.
if let Some(ops) = &spec.delta_operations {
info!("reassigning dependent objects of to-be-deleted roles");
for op in ops {
if op.action == "delete_role" {
reassign_owned_objects(node, &op.name)?;
}
}
}
// Second, proceed with role deletions.
let mut xact = client.transaction()?;
if let Some(ops) = &spec.delta_operations {
info!("processing role deletions");
for op in ops {
// We do not check either role exists or not,
// Postgres will take care of it for us
if op.action == "delete_role" {
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
warn!("deleting role '{}'", &op.name);
xact.execute(query.as_str(), &[])?;
}
}
}
Ok(())
}
// Reassign all owned objects in all databases to the owner of the database.
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
for db in &node.spec.cluster.databases {
if db.owner != *role_name {
let mut connstr = node.connstr.clone();
// database name is always the last and the only component of the path
connstr.set_path(&db.name);
let mut client = Client::connect(connstr.as_str(), NoTls)?;
// This will reassign all dependent objects to the db owner
let reassign_query = format!(
"REASSIGN OWNED BY {} TO {}",
role_name.quote(),
db.owner.quote()
);
info!(
"reassigning objects owned by '{}' in db '{}' to '{}'",
role_name, &db.name, &db.owner
);
client.simple_query(&reassign_query)?;
// This now will only drop privileges of the role
let drop_query = format!("DROP OWNED BY {}", role_name.quote());
client.simple_query(&drop_query)?;
}
}
Ok(())
}
/// It follows mostly the same logic as `handle_roles()` excepting that we
/// does not use an explicit transactions block, since major database operations
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
@@ -294,13 +354,26 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
info!("cluster spec grants:");
// We now have a separate `web_access` role to connect to the database
// via the web interface and proxy link auth. And also we grant a
// read / write all data privilege to every role. So also grant
// create to everyone.
// XXX: later we should stop messing with Postgres ACL in such horrible
// ways.
let roles = spec
.cluster
.roles
.iter()
.map(|r| r.name.quote())
.collect::<Vec<_>>();
for db in &spec.cluster.databases {
let dbname = &db.name;
let query: String = format!(
"GRANT CREATE ON DATABASE {} TO {}",
dbname.quote(),
db.owner.quote()
roles.join(", ")
);
info!("grant query {}", &query);

View File

@@ -403,16 +403,6 @@ impl LocalEnv {
self.pg_distrib_dir.display()
);
}
for binary in ["pageserver", "safekeeper"] {
if !self.zenith_distrib_dir.join(binary).exists() {
bail!(
"Can't find binary '{}' in zenith distrib dir '{}'",
binary,
self.zenith_distrib_dir.display()
);
}
}
for binary in ["pageserver", "safekeeper"] {
if !self.zenith_distrib_dir.join(binary).exists() {
bail!(
@@ -421,12 +411,6 @@ impl LocalEnv {
);
}
}
if !self.pg_distrib_dir.join("bin/postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_distrib_dir.display()
);
}
fs::create_dir(&base_path)?;

View File

@@ -427,6 +427,7 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
data_checksums_enabled: Some(true),
})
.send()?
.error_from_body()?
@@ -436,7 +437,7 @@ impl PageServerNode {
.map(|id| {
id.parse().with_context(|| {
format!(
"Failed to parse tennat creation response as tenant id: {}",
"Failed to parse tenant creation response as tenant id: {}",
id
)
})

View File

@@ -9,6 +9,7 @@
use serde::{Deserialize, Serialize};
use utils::lsn::Lsn;
use utils::pg_checksum_page::pg_checksum_page;
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
@@ -56,3 +57,55 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
}
/// Calculate page checksum and stamp it onto the page.
/// NB: this will zero out and ignore any existing checksum.
/// # Safety
/// See safety notes for `pg_checksum_page`
pub unsafe fn page_set_checksum(page: &mut [u8], blkno: u32) {
let checksum = pg_checksum_page(page, blkno);
page[8..10].copy_from_slice(&checksum.to_le_bytes());
}
/// Check if page checksum is valid.
/// # Safety
/// See safety notes for `pg_checksum_page`
pub unsafe fn page_verify_checksum(page: &[u8], blkno: u32) -> bool {
let checksum = pg_checksum_page(page, blkno);
checksum == u16::from_le_bytes(page[8..10].try_into().unwrap())
}
#[cfg(test)]
mod tests {
use crate::pg_constants::BLCKSZ;
use crate::{page_set_checksum, page_verify_checksum};
use utils::pg_checksum_page::pg_checksum_page;
#[test]
fn set_and_verify_checksum() {
// Create a page with some content and without a correct checksum.
let mut page: [u8; BLCKSZ as usize] = [0; BLCKSZ as usize];
for (i, byte) in page.iter_mut().enumerate().take(BLCKSZ as usize) {
*byte = i as u8;
}
// Calculate the checksum.
let checksum = unsafe { pg_checksum_page(&page[..], 0) };
// Sanity check: random bytes in the checksum attribute should not be
// a valid checksum.
assert_ne!(
checksum,
u16::from_le_bytes(page[8..10].try_into().unwrap())
);
// Set the actual checksum.
unsafe { page_set_checksum(&mut page, 0) };
// Verify the checksum.
assert!(unsafe { page_verify_checksum(&page[..], 0) });
// Checksum is not valid with another block number.
assert!(!unsafe { page_verify_checksum(&page[..], 1) });
}
}

View File

@@ -14,7 +14,6 @@ use super::XLogLongPageHeaderData;
use super::XLogPageHeaderData;
use super::XLogRecord;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::*;
use log::*;
use std::cmp::min;
use thiserror::Error;
@@ -198,18 +197,12 @@ impl WalStreamDecoder {
}
// We now have a record in the 'recordbuf' local variable.
let xlogrec =
XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
WalDecodeError {
msg: format!("xlog record deserialization failed {}", e),
lsn: self.lsn,
}
})?;
let xlogrec = XLogRecord::from_buf(&recordbuf).map_err(|e| WalDecodeError {
msg: format!("xlog record deserialization failed {}", e),
lsn: self.lsn,
})?;
let mut crc = 0;
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
if crc != xlogrec.xl_crc {
if !wal_record_verify_checksum(&xlogrec, &recordbuf) {
return Err(WalDecodeError {
msg: "WAL record crc mismatch".into(),
lsn: self.lsn,

View File

@@ -477,6 +477,10 @@ impl XLogRecord {
XLogRecord::des(buf)
}
pub fn from_buf(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
XLogRecord::from_slice(&buf[0..XLOG_SIZE_OF_XLOG_RECORD])
}
pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogRecord, DeserializeError> {
use utils::bin_ser::LeSer;
XLogRecord::des_from(&mut buf.reader())
@@ -742,3 +746,11 @@ mod tests {
assert_eq!(checkpoint.nextXid.value, 2048);
}
}
pub fn wal_record_verify_checksum(rec: &XLogRecord, recordbuf: &Bytes) -> bool {
let mut crc = 0;
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
crc == rec.xl_crc
}

View File

@@ -56,6 +56,7 @@ impl Conf {
.new_pg_command("initdb")?
.arg("-D")
.arg(self.datadir.as_os_str())
.arg("--data-checksums")
.args(&["-U", "postgres", "--no-instructions", "--no-sync"])
.output()?;
debug!("initdb output: {:?}", output);

View File

@@ -12,8 +12,10 @@ use std::{
borrow::Cow,
collections::HashMap,
ffi::OsStr,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
path::{Path, PathBuf},
pin::Pin,
};
use anyhow::{bail, Context};
@@ -70,11 +72,7 @@ pub trait RemoteStorage: Send + Sync {
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
/// Returns the metadata, if any was stored with the file previously.
async fn download(
&self,
from: &Self::RemoteObjectId,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>>;
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError>;
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
/// Returns the metadata, if any was stored with the file previously.
@@ -83,12 +81,49 @@ pub trait RemoteStorage: Send + Sync {
from: &Self::RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>>;
) -> Result<Download, DownloadError>;
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
}
pub struct Download {
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send>>,
/// Extra key-value data, associated with the current remote file.
pub metadata: Option<StorageMetadata>,
}
impl Debug for Download {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Download")
.field("metadata", &self.metadata)
.finish()
}
}
#[derive(Debug)]
pub enum DownloadError {
/// Validation or other error happened due to user input.
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
NotFound,
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
impl std::fmt::Display for DownloadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"),
}
}
}
impl std::error::Error for DownloadError {}
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
pub enum GenericRemoteStorage {
@@ -180,7 +215,7 @@ pub struct S3Config {
pub concurrency_limit: NonZeroUsize,
}
impl std::fmt::Debug for S3Config {
impl Debug for S3Config {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("S3Config")
.field("bucket_name", &self.bucket_name)

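With this trait change, download() hands back a `Download` whose `download_stream` the caller drives, instead of filling a caller-provided writer. A minimal consumer sketch (illustration only; assumes a concrete `RemoteStorage` implementation in scope):

use tokio::io::AsyncReadExt;

// Read a whole remote object into memory and return it with its metadata.
async fn fetch_to_vec<S: RemoteStorage>(
    storage: &S,
    remote_id: &S::RemoteObjectId,
) -> Result<(Vec<u8>, Option<StorageMetadata>), DownloadError> {
    let mut download = storage.download(remote_id).await?;
    let mut buf = Vec::new();
    download
        .download_stream
        .read_to_end(&mut buf)
        .await
        .map_err(|e| DownloadError::Other(anyhow::anyhow!("failed to read download stream: {e}")))?;
    Ok((buf, download.metadata))
}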
View File

@@ -17,7 +17,7 @@ use tokio::{
};
use tracing::*;
use crate::path_with_suffix_extension;
use crate::{path_with_suffix_extension, Download, DownloadError};
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
@@ -192,15 +192,12 @@ impl RemoteStorage for LocalFs {
Ok(())
}
async fn download(
&self,
from: &Self::RemoteObjectId,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>> {
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
let mut source = io::BufReader::new(
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
let file_path = self
.resolve_in_storage(from)
.map_err(DownloadError::BadInput)?;
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
let source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&file_path)
@@ -210,22 +207,20 @@ impl RemoteStorage for LocalFs {
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})?,
})
.map_err(DownloadError::Other)?,
);
io::copy(&mut source, to).await.with_context(|| {
format!(
"Failed to download file '{}' from the local storage",
file_path.display()
)
})?;
source.flush().await?;
self.read_storage_metadata(&file_path).await
let metadata = self
.read_storage_metadata(&file_path)
.await
.map_err(DownloadError::Other)?;
Ok(Download {
metadata,
download_stream: Box::pin(source),
})
} else {
bail!(
"File '{}' either does not exist or is not a file",
file_path.display()
)
Err(DownloadError::NotFound)
}
}
@@ -234,22 +229,19 @@ impl RemoteStorage for LocalFs {
from: &Self::RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>> {
) -> Result<Download, DownloadError> {
if let Some(end_exclusive) = end_exclusive {
ensure!(
end_exclusive > start_inclusive,
"Invalid range, start ({}) is bigger then end ({:?})",
start_inclusive,
end_exclusive
);
if end_exclusive <= start_inclusive {
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})")));
};
if start_inclusive == end_exclusive.saturating_sub(1) {
return Ok(None);
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
}
}
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
let file_path = self
.resolve_in_storage(from)
.map_err(DownloadError::BadInput)?;
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
let mut source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
@@ -260,31 +252,31 @@ impl RemoteStorage for LocalFs {
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})?,
})
.map_err(DownloadError::Other)?,
);
source
.seek(io::SeekFrom::Start(start_inclusive))
.await
.context("Failed to seek to the range start in a local storage file")?;
match end_exclusive {
Some(end_exclusive) => {
io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
}
None => io::copy(&mut source, to).await,
}
.with_context(|| {
format!(
"Failed to download file '{}' range from the local storage",
file_path.display()
)
})?;
.context("Failed to seek to the range start in a local storage file")
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&file_path)
.await
.map_err(DownloadError::Other)?;
self.read_storage_metadata(&file_path).await
Ok(match end_exclusive {
Some(end_exclusive) => Download {
metadata,
download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
},
None => Download {
metadata,
download_stream: Box::pin(source),
},
})
} else {
bail!(
"File '{}' either does not exist or is not a file",
file_path.display()
)
Err(DownloadError::NotFound)
}
}
@@ -352,6 +344,19 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
Ok(())
}
fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
if file_path.exists() {
ensure!(
file_path.is_file(),
"file path '{}' is not a file",
file_path.display()
);
Ok(true)
} else {
Ok(false)
}
}
#[cfg(test)]
mod pure_tests {
use tempfile::tempdir;
@@ -518,6 +523,31 @@ mod fs_tests {
use std::{collections::HashMap, io::Write};
use tempfile::tempdir;
async fn read_and_assert_remote_file_contents(
storage: &LocalFs,
#[allow(clippy::ptr_arg)]
// have to use &PathBuf due to `storage.local_path` parameter requirements
remote_storage_path: &PathBuf,
expected_metadata: Option<&StorageMetadata>,
) -> anyhow::Result<String> {
let mut download = storage
.download(remote_storage_path)
.await
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
ensure!(
download.metadata.as_ref() == expected_metadata,
"Unexpected metadata returned for the downloaded file"
);
let mut contents = String::new();
download
.download_stream
.read_to_string(&mut contents)
.await
.context("Failed to read remote file contents into string")?;
Ok(contents)
}
#[tokio::test]
async fn upload_file() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
@@ -568,15 +598,7 @@ mod fs_tests {
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let metadata = storage.download(&upload_target, &mut content_bytes).await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
content_bytes.flush().await?;
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
assert_eq!(
dummy_contents(upload_name),
contents,
@@ -584,13 +606,9 @@ mod fs_tests {
);
let non_existing_path = PathBuf::from("somewhere").join("else");
match storage.download(&non_existing_path, &mut io::sink()).await {
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&non_existing_path.display().to_string()));
}
match storage.download(&non_existing_path).await {
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
}
Ok(())
}
@@ -603,58 +621,31 @@ mod fs_tests {
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let metadata = storage
.download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
.await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
full_range_bytes.flush().await?;
let full_range_download_contents =
read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
assert_eq!(
dummy_contents(upload_name),
String::from_utf8(full_range_bytes.into_inner().into_inner())?,
full_range_download_contents,
"Download full range should return the whole upload"
);
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let same_byte = 1_000_000_000;
let metadata = storage
.download_byte_range(
&upload_target,
same_byte,
Some(same_byte + 1), // exclusive end
&mut zero_range_bytes,
)
.await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
zero_range_bytes.flush().await?;
assert!(
zero_range_bytes.into_inner().into_inner().is_empty(),
"Zero byte range should not download any part of the file"
);
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let metadata = storage
.download_byte_range(
&upload_target,
0,
Some(first_part_local.len() as u64),
&mut first_part_remote,
)
let mut first_part_download = storage
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
assert!(
metadata.is_none(),
first_part_download.metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut first_part_download.download_stream,
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
@@ -663,20 +654,24 @@ mod fs_tests {
"First part bytes should be returned when requested"
);
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let metadata = storage
let mut second_part_download = storage
.download_byte_range(
&upload_target,
first_part_local.len() as u64,
Some((first_part_local.len() + second_part_local.len()) as u64),
&mut second_part_remote,
)
.await?;
assert!(
metadata.is_none(),
second_part_download.metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut second_part_download.download_stream,
&mut second_part_remote,
)
.await?;
second_part_remote.flush().await?;
let second_part_remote = second_part_remote.into_inner().into_inner();
assert_eq!(
@@ -696,11 +691,30 @@ mod fs_tests {
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let start = 1_000_000_000;
let end = start + 1;
match storage
.download_byte_range(
&upload_target,
start,
Some(end), // exclusive end
)
.await
{
Ok(_) => panic!("Should not allow downloading wrong ranges"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("zero bytes"));
assert!(error_string.contains(&start.to_string()));
assert!(error_string.contains(&end.to_string()));
}
}
let start = 10000;
let end = 234;
assert!(start > end, "Should test an incorrect range");
match storage
.download_byte_range(&upload_target, start, Some(end), &mut io::sink())
.download_byte_range(&upload_target, start, Some(end))
.await
{
Ok(_) => panic!("Should not allow downloading wrong ranges"),
@@ -712,18 +726,6 @@ mod fs_tests {
}
}
let non_existing_path = PathBuf::from("somewhere").join("else");
match storage
.download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
.await
{
Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&non_existing_path.display().to_string()));
}
}
Ok(())
}
@@ -762,35 +764,26 @@ mod fs_tests {
let upload_target =
upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
content_bytes.flush().await?;
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
let full_range_download_contents =
read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
assert_eq!(
dummy_contents(upload_name),
contents,
full_range_download_contents,
"We should upload and download the same contents"
);
assert_eq!(
full_download_metadata.as_ref(),
Some(&metadata),
"We should get the same metadata back for full download"
);
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, _) = uploaded_bytes.split_at(3);
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let partial_download_metadata = storage
.download_byte_range(
&upload_target,
0,
Some(first_part_local.len() as u64),
&mut first_part_remote,
)
let mut partial_download_with_metadata = storage
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut partial_download_with_metadata.download_stream,
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
@@ -800,8 +793,8 @@ mod fs_tests {
);
assert_eq!(
partial_download_metadata.as_ref(),
Some(&metadata),
partial_download_with_metadata.metadata,
Some(metadata),
"We should get the same metadata back for partial download"
);
@@ -843,7 +836,7 @@ mod fs_tests {
}
fn dummy_contents(name: &str) -> String {
format!("contents for {}", name)
format!("contents for {name}")
}
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {

View File

@@ -9,17 +9,17 @@ use std::path::{Path, PathBuf};
use anyhow::Context;
use rusoto_core::{
credential::{InstanceMetadataProvider, StaticProvider},
HttpClient, Region,
HttpClient, Region, RusotoError,
};
use rusoto_s3::{
DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client,
StreamingBody, S3,
DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest,
S3Client, StreamingBody, S3,
};
use tokio::{io, sync::Semaphore};
use tokio_util::io::ReaderStream;
use tracing::debug;
use crate::{strip_path_prefix, RemoteStorage, S3Config};
use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config};
use super::StorageMetadata;
@@ -187,6 +187,39 @@ impl S3Bucket {
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
})
}
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 download")
.map_err(DownloadError::Other)?;
metrics::inc_get_object();
match self.client.get_object(request).await {
Ok(object_output) => match object_output.body {
None => {
metrics::inc_get_object_fail();
Err(DownloadError::Other(anyhow::anyhow!(
"Got no body for the S3 object given"
)))
}
Some(body) => Ok(Download {
metadata: object_output.metadata.map(StorageMetadata),
download_stream: Box::pin(io::BufReader::new(body.into_async_read())),
}),
},
Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound),
Err(e) => {
metrics::inc_get_object_fail();
Err(DownloadError::Other(anyhow::anyhow!(
"Failed to download S3 object: {e}"
)))
}
}
}
}
#[async_trait::async_trait]
@@ -283,38 +316,13 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
async fn download(
&self,
from: &Self::RemoteObjectId,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>> {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 download")?;
metrics::inc_get_object();
let object_output = self
.client
.get_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
..GetObjectRequest::default()
})
.await
.map_err(|e| {
metrics::inc_get_object_fail();
e
})?;
if let Some(body) = object_output.body {
let mut from = io::BufReader::new(body.into_async_read());
io::copy(&mut from, to).await?;
}
Ok(object_output.metadata.map(StorageMetadata))
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
..GetObjectRequest::default()
})
.await
}
async fn download_byte_range(
@@ -322,8 +330,7 @@ impl RemoteStorage for S3Bucket {
from: &Self::RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>> {
) -> Result<Download, DownloadError> {
// S3 accepts ranges as described in https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
// and needs both ends to be inclusive
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
@@ -331,34 +338,14 @@ impl RemoteStorage for S3Bucket {
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
None => format!("bytes={}-", start_inclusive),
});
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 range download")?;
metrics::inc_get_object();
let object_output = self
.client
.get_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
range,
..GetObjectRequest::default()
})
.await
.map_err(|e| {
metrics::inc_get_object_fail();
e
})?;
if let Some(body) = object_output.body {
let mut from = io::BufReader::new(body.into_async_read());
io::copy(&mut from, to).await?;
}
Ok(object_output.metadata.map(StorageMetadata))
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
range,
..GetObjectRequest::default()
})
.await
}
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {

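The half-open [start_inclusive, end_exclusive) range accepted by download_byte_range is translated to the inclusive HTTP Range header that S3 expects, as in this standalone sketch (illustration only):

// Map [start, end) onto the inclusive "bytes=a-b" form; an open-ended range
// omits the upper bound, e.g. [0, 10) becomes "bytes=0-9".
fn range_header(start_inclusive: u64, end_exclusive: Option<u64>) -> String {
    match end_exclusive {
        Some(end) => format!("bytes={}-{}", start_inclusive, end.saturating_sub(1)),
        None => format!("bytes={}-", start_inclusive),
    }
}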
View File

@@ -1,6 +1,6 @@
#![allow(unused)]
use criterion::{criterion_group, criterion_main, Criterion};
use utils::pg_checksum_page::pg_checksum_page;
use utils::zid;
pub fn bench_zid_stringify(c: &mut Criterion) {
@@ -18,5 +18,20 @@ pub fn bench_zid_stringify(c: &mut Criterion) {
});
}
criterion_group!(benches, bench_zid_stringify);
// NB: adding `black_box` around arguments doesn't seem to change anything.
pub fn pg_checksum_page_basic(c: &mut Criterion) {
const BLCKSZ: usize = 8192;
let mut page: [u8; BLCKSZ] = [0; BLCKSZ];
for (i, byte) in page.iter_mut().enumerate().take(BLCKSZ) {
*byte = i as u8;
}
c.bench_function("pg_checksum_page_basic", |b| {
b.iter(|| {
unsafe { pg_checksum_page(&page[..], 0) };
})
});
}
criterion_group!(benches, pg_checksum_page_basic, bench_zid_stringify);
criterion_main!(benches);

View File

@@ -5,7 +5,7 @@ DATA_DIR=$3
PORT=$4
SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
rm -fr $DATA_DIR
env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --data-checksums --sysid=$SYSID
echo port=$PORT >> $DATA_DIR/postgresql.conf
REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
declare -i WAL_SIZE=$REDO_POS+114

View File

@@ -54,6 +54,9 @@ pub mod nonblock;
// Default signal handling
pub mod signals;
// Postgres checksum calculation
pub mod pg_checksum_page;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -0,0 +1,136 @@
///
/// Rust implementation of Postgres pg_checksum_page
/// See: https://github.com/postgres/postgres/blob/88210542106de5b26fe6aa088d1811b68502d224/src/include/storage/checksum_impl.h
/// for additional comments.
///
/// This is not a direct port of pg_checksum_page from Postgres, though.
/// For example, in its current state it can only produce a valid result
/// on little-endian platforms and with the standard 8 KB page size.
///
const BLCKSZ: usize = 8192;
const N_SUMS: usize = 32;
// Prime multiplier of FNV-1a hash
const FNV_PRIME: u32 = 16777619;
// Base offsets to initialize each of the parallel FNV hashes into a
// different initial state.
const CHECKSUM_BASE_OFFSETS: [u32; N_SUMS] = [
0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C,
0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB,
0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4,
0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756,
];
// Calculate one round of the checksum.
fn checksum_comp(checksum: u32, value: u32) -> u32 {
let tmp = checksum ^ value;
tmp.wrapping_mul(FNV_PRIME) ^ (tmp >> 17)
}
/// Compute the checksum for a Postgres page.
///
/// The page must be adequately aligned (at least on a 4-byte boundary).
///
/// The checksum includes the block number (to detect the case where a page is
/// somehow moved to a different location), the page header (excluding the
/// checksum itself), and the page data.
///
/// As in the C implementation in Postgres, the checksum attribute on the page is
/// excluded from the calculation and preserved.
///
/// NB: after doing any modifications run `cargo bench`. The baseline on a
/// reasonably recent Intel laptop is around 700 ns. If it's significantly higher,
/// then it's worth looking into.
///
/// # Arguments
/// * `data` - the page to checksum
/// * `blkno` - the block number of the page
///
/// # Safety
/// This function is safe to call only if:
/// * `data` is strictly a standard 8 KB Postgres page
/// * it is called on a little-endian platform
pub unsafe fn pg_checksum_page(data: &[u8], blkno: u32) -> u16 {
let page = std::mem::transmute::<&[u8], &[u32]>(data);
let mut checksum: u32 = 0;
let mut sums = CHECKSUM_BASE_OFFSETS;
// Calculate the checksum of the first 'row' of the page. Do it separately, as
// we do an expensive comparison here which is not required for the rest of the
// page; putting it into the main loop slows it down ~3x.
for (j, sum) in sums.iter_mut().enumerate().take(N_SUMS) {
// Third 32-bit chunk of the page contains the checksum in the lower half
// (assuming we are on little-endian machine), which we need to zero out.
// See also `PageHeaderData` for reference.
let chunk: u32 = if j == 2 {
page[j] & 0xFFFF_0000
} else {
page[j]
};
*sum = checksum_comp(*sum, chunk);
}
// Main checksum calculation loop
for i in 1..(BLCKSZ / (4 * N_SUMS)) {
for (j, sum) in sums.iter_mut().enumerate().take(N_SUMS) {
*sum = checksum_comp(*sum, page[i * N_SUMS + j]);
}
}
// Finally, add in two rounds of zeroes for additional mixing
for _i in 0..2 {
for s in sums.iter_mut().take(N_SUMS) {
*s = checksum_comp(*s, 0);
}
}
// Xor fold partial checksums together
for sum in sums {
checksum ^= sum;
}
// Mix in the block number to detect transposed pages
checksum ^= blkno;
// Reduce to a uint16 (to fit in the pd_checksum field) with an offset of
// one. That avoids checksums of zero, which seems like a good idea.
((checksum % 65535) + 1) as u16
}
#[cfg(test)]
mod tests {
use super::{pg_checksum_page, BLCKSZ};
#[test]
fn page_with_and_without_checksum() {
// Create a page with some content and without a correct checksum.
let mut page: [u8; BLCKSZ] = [0; BLCKSZ];
for (i, byte) in page.iter_mut().enumerate().take(BLCKSZ) {
*byte = i as u8;
}
// Calculate the checksum.
let checksum = unsafe { pg_checksum_page(&page[..], 0) };
// Zero the checksum attribute on the page.
page[8..10].copy_from_slice(&[0u8; 2]);
// Calculate the checksum again, should be the same.
let new_checksum = unsafe { pg_checksum_page(&page[..], 0) };
assert_eq!(checksum, new_checksum);
// Set the correct checksum into the page.
page[8..10].copy_from_slice(&checksum.to_le_bytes());
// Calculate the checksum again, should be the same.
let new_checksum = unsafe { pg_checksum_page(&page[..], 0) };
assert_eq!(checksum, new_checksum);
// Check that we protect from the page transposition, i.e. page is the
// same, but in the wrong place.
let wrong_blockno_checksum = unsafe { pg_checksum_page(&page[..], 1) };
assert_ne!(checksum, wrong_blockno_checksum);
}
}
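
A hedged sketch of how a reader (for example, when a page comes back from the pageserver) could verify a page with this helper; it assumes the pd_checksum field at bytes 8..10 of the little-endian page header, as in the test above, and treats a stored value of zero as "no checksum set":

// Compare the stored pd_checksum with a freshly computed one.
fn page_checksum_ok(page: &[u8], blkno: u32) -> bool {
    let stored = u16::from_le_bytes([page[8], page[9]]);
    if stored == 0 {
        // A zero checksum means the field was never set (pg_checksum_page
        // never returns zero because of the +1 offset above).
        return true;
    }
    let computed = unsafe { pg_checksum_page(page, blkno) };
    stored == computed
}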

View File

@@ -263,6 +263,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
// start profiler (if enabled)
let profiler_guard = profiling::init_profiler(conf);
pageserver::tenant_tasks::init_tenant_task_pool()?;
// initialize authentication for incoming connections
let auth = match &conf.auth_type {
AuthType::Trust | AuthType::MD5 => None,

View File

@@ -38,6 +38,7 @@ pub struct TenantCreateRequest {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub data_checksums_enabled: Option<bool>,
}
#[serde_as]

View File

@@ -494,6 +494,8 @@ components:
type: string
compaction_threshold:
type: string
data_checksums_enabled:
type: boolean
TenantConfigInfo:
type: object
properties:

View File

@@ -412,6 +412,9 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;
// Turn on data checksums for all new tenants
tenant_conf.data_checksums_enabled = Some(request_data.data_checksums_enabled.unwrap_or(true));
if let Some(compaction_period) = request_data.compaction_period {
tenant_conf.compaction_period =
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);

View File

@@ -516,10 +516,23 @@ pub fn import_file<R: Repository, Reader: Read>(
// Parse zenith signal file to set correct previous LSN
let bytes = read_all_bytes(reader)?;
// zenith.signal format is "PREV LSN: prev_lsn"
let zenith_signal = std::str::from_utf8(&bytes)?;
let zenith_signal = zenith_signal.split(':').collect::<Vec<_>>();
let prev_lsn = zenith_signal[1].trim().parse::<Lsn>()?;
// TODO write serialization and deserialization in the same place.
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
let prev_lsn = match zenith_signal {
"PREV LSN: none" => Lsn(0),
"PREV LSN: invalid" => Lsn(0),
other => {
let split = other.split(':').collect::<Vec<_>>();
split[1]
.trim()
.parse::<Lsn>()
.context("can't parse zenith.signal")?
}
};
// zenith.signal is not necessarily the last file, that we handle
// but it is ok to call `finish_write()`, because final `modification.commit()`
// will update lsn once more to the final one.
let writer = modification.tline.tline.writer();
writer.finish_write(prev_lsn);

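The accepted zenith.signal contents are therefore "PREV LSN: none", "PREV LSN: invalid", or "PREV LSN: <lsn>". A standalone sketch of the same parsing (assumption: `Lsn` implements FromStr, as the `.parse::<Lsn>()` call above relies on):

use anyhow::Context;

// 'none' and 'invalid' both map to Lsn(0); anything else must carry an LSN
// after the colon.
fn parse_zenith_signal(contents: &str) -> anyhow::Result<Lsn> {
    Ok(match contents.trim() {
        "PREV LSN: none" | "PREV LSN: invalid" => Lsn(0),
        other => other
            .split(':')
            .nth(1)
            .context("zenith.signal has no ':' separator")?
            .trim()
            .parse::<Lsn>()
            .context("can't parse zenith.signal")?,
    })
}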
View File

@@ -158,6 +158,18 @@ pub struct LayeredRepository {
// Global pageserver config parameters
pub conf: &'static PageServerConf,
// Allows us to gracefully cancel operations that edit the directory
// that backs this layered repository. Usage:
//
// Use `let _guard = file_lock.try_read()` while writing any files.
// Use `let _guard = file_lock.write().unwrap()` to wait for all writes to finish.
//
// TODO try_read this lock during checkpoint as well to prevent race
// between checkpoint and detach/delete.
// TODO try_read this lock for all gc/compaction operations, not just
// ones scheduled by the tenant task manager.
pub file_lock: RwLock<()>,
// Overridden tenant-specific config parameters.
// We keep the TenantConfOpt struct here to preserve the information
// about parameters that are not set.
@@ -220,23 +232,32 @@ impl Repository for LayeredRepository {
fn create_empty_timeline(
&self,
timelineid: ZTimelineId,
timeline_id: ZTimelineId,
initdb_lsn: Lsn,
) -> Result<Arc<LayeredTimeline>> {
let mut timelines = self.timelines.lock().unwrap();
let vacant_timeline_entry = match timelines.entry(timeline_id) {
Entry::Occupied(_) => bail!("Timeline already exists"),
Entry::Vacant(vacant_entry) => vacant_entry,
};
let timeline_path = self.conf.timeline_path(&timeline_id, &self.tenant_id);
if timeline_path.exists() {
bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.")
}
// Create the timeline directory, and write initial metadata to file.
crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenant_id))?;
crashsafe_dir::create_dir_all(timeline_path)?;
let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn);
Self::save_metadata(self.conf, timelineid, self.tenant_id, &metadata, true)?;
Self::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?;
let timeline = LayeredTimeline::new(
self.conf,
Arc::clone(&self.tenant_conf),
metadata,
None,
timelineid,
timeline_id,
self.tenant_id,
Arc::clone(&self.walredo_mgr),
self.upload_layers,
@@ -245,12 +266,7 @@ impl Repository for LayeredRepository {
// Insert if not exists
let timeline = Arc::new(timeline);
match timelines.entry(timelineid) {
Entry::Occupied(_) => bail!("Timeline already exists"),
Entry::Vacant(vacant) => {
vacant.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline)))
}
};
vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline)));
Ok(timeline)
}
@@ -337,16 +353,12 @@ impl Repository for LayeredRepository {
// compactions. We don't want to block everything else while the
// compaction runs.
let timelines = self.timelines.lock().unwrap();
let mut timelines_to_compact = timelines
let timelines_to_compact = timelines
.iter()
.map(|(timelineid, timeline)| (*timelineid, timeline.clone()))
.collect::<Vec<_>>();
drop(timelines);
// Sort to prevent deadlock
timelines_to_compact.sort_by(|a, b| a.0.cmp(&b.0));
// Compact all timelines in order
for (timelineid, timeline) in &timelines_to_compact {
let _entered =
info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered();
@@ -689,6 +701,7 @@ impl LayeredRepository {
) -> LayeredRepository {
LayeredRepository {
tenant_id,
file_lock: RwLock::new(()),
conf,
tenant_conf: Arc::new(RwLock::new(tenant_conf)),
timelines: Mutex::new(HashMap::new()),
@@ -1914,15 +1927,28 @@ impl LayeredTimeline {
} else {
Lsn(0)
};
// Let's consider an example:
//
// delta layer with LSN range 71-81
// delta layer with LSN range 81-91
// delta layer with LSN range 91-101
// image layer at LSN 100
//
// If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer,
// there's no need to create a new one. We check this case explicitly, to avoid passing
// a bogus range to count_deltas below, with start > end. It's even possible that there
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
if img_lsn < lsn {
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
debug!(
"range {}-{}, has {} deltas on this timeline",
img_range.start, img_range.end, num_deltas
);
if num_deltas >= self.get_image_creation_threshold() {
return Ok(true);
debug!(
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
img_range.start, img_range.end, num_deltas, img_lsn, lsn
);
if num_deltas >= self.get_image_creation_threshold() {
return Ok(true);
}
}
}
}
@@ -2214,6 +2240,9 @@ impl LayeredTimeline {
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
}
}
debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
}

View File

@@ -34,7 +34,7 @@ pub trait BlobCursor {
) -> Result<(), std::io::Error>;
}
impl<'a, R> BlobCursor for BlockCursor<R>
impl<R> BlobCursor for BlockCursor<R>
where
R: BlockReader,
{

View File

@@ -445,7 +445,10 @@ impl ImageLayerWriter {
},
);
info!("new image layer {}", path.display());
let mut file = VirtualFile::create(&path)?;
let mut file = VirtualFile::open_with_options(
&path,
std::fs::OpenOptions::new().write(true).create_new(true),
)?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);

View File

@@ -13,7 +13,7 @@ pub mod repository;
pub mod storage_sync;
pub mod tenant_config;
pub mod tenant_mgr;
pub mod tenant_threads;
pub mod tenant_tasks;
pub mod thread_mgr;
pub mod timelines;
pub mod virtual_file;

View File

@@ -554,7 +554,7 @@ impl PageServerHandler {
// Create empty timeline
info!("creating new timeline");
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let timeline = repo.create_empty_timeline(timeline_id, Lsn(0))?;
let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?;
let repartition_distance = repo.get_checkpoint_distance();
let mut datadir_timeline =
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
@@ -951,7 +951,10 @@ impl postgres_backend::Handler for PageServerHandler {
match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) {
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?,
Err(e) => {
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
}
};
} else if query_string.starts_with("import wal ") {
// Import the `pg_wal` section of a basebackup.
@@ -970,7 +973,10 @@ impl postgres_backend::Handler for PageServerHandler {
match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) {
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?,
Err(e) => {
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
}
};
} else if query_string.to_ascii_lowercase().starts_with("set ") {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
@@ -1151,6 +1157,7 @@ impl postgres_backend::Handler for PageServerHandler {
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
};
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;

View File

@@ -51,6 +51,7 @@ pub enum LsnForTimestamp {
Present(Lsn),
Future(Lsn),
Past(Lsn),
NoData(Lsn),
}
impl<R: Repository> DatadirTimeline<R> {
@@ -263,7 +264,7 @@ impl<R: Repository> DatadirTimeline<R> {
(false, false) => {
// This can happen if no commit records have been processed yet, e.g.
// just after importing a cluster.
bail!("no commit timestamps found");
Ok(LsnForTimestamp::NoData(max_lsn))
}
(true, false) => {
// Didn't find any commit timestamps larger than the request

View File

@@ -81,6 +81,12 @@ mod profiling_impl {
pub struct DummyProfilerGuard;
impl Drop for DummyProfilerGuard {
fn drop(&mut self) {
// do nothing, this exists to calm Clippy down
}
}
pub fn profpoint_start(
_conf: &PageServerConf,
_point: ProfilingConfig,

View File

@@ -225,7 +225,7 @@ pub trait Repository: Send + Sync {
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
fn create_empty_timeline(
&self,
timelineid: ZTimelineId,
timeline_id: ZTimelineId,
initdb_lsn: Lsn,
) -> Result<Arc<Self::Timeline>>;
@@ -473,6 +473,7 @@ pub mod repo_harness {
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
data_checksums_enabled: Some(tenant_conf.data_checksums_enabled),
}
}
}
@@ -636,6 +637,19 @@ mod tests {
Ok(())
}
#[test]
fn no_duplicate_timelines() -> Result<()> {
let repo = RepoHarness::create("no_duplicate_timelines")?.load();
let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(e.to_string(), "Timeline already exists"),
}
Ok(())
}
/// Convenience function to create a page image with given string as the only content
pub fn test_value(s: &str) -> Value {
let mut buf = BytesMut::new();

View File

@@ -44,13 +44,23 @@ where
index_part_path.display()
)
})?;
let mut index_part_download =
storage
.download(&part_storage_path)
.await
.with_context(|| {
format!("Failed to open download stream for storage path {part_storage_path:?}")
})?;
let mut index_part_bytes = Vec::new();
storage
.download(&part_storage_path, &mut index_part_bytes)
.await
.with_context(|| {
format!("Failed to download an index part from storage path {part_storage_path:?}")
})?;
io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| {
format!("Failed to download an index part from storage path {part_storage_path:?}")
})?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| {
format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'")
@@ -162,15 +172,19 @@ where
temp_file_path.display()
)
})?;
storage
.download(&layer_storage_path, &mut destination_file)
let mut download = storage
.download(&layer_storage_path)
.await
.with_context(|| {
format!(
"Failed to download a layer from storage path '{layer_storage_path:?}'"
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
)
})?;
io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
format!(
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations

View File

@@ -37,7 +37,11 @@ pub mod defaults {
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10_000;
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
// Turn off data checksums by default to avoid affecting old tenants.
// We turn them on explicitly for all new tenants.
pub const DEFAULT_DATA_CHECKSUMS: bool = false;
}
/// Per-tenant configuration options
@@ -83,6 +87,7 @@ pub struct TenantConf {
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
/// to avoid eager reconnects.
pub max_lsn_wal_lag: NonZeroU64,
pub data_checksums_enabled: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -105,6 +110,7 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
pub lagging_wal_timeout: Option<Duration>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub data_checksums_enabled: Option<bool>,
}
impl TenantConfOpt {
@@ -135,6 +141,9 @@ impl TenantConfOpt {
.lagging_wal_timeout
.unwrap_or(global_conf.lagging_wal_timeout),
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
data_checksums_enabled: self
.data_checksums_enabled
.unwrap_or(global_conf.data_checksums_enabled),
}
}
@@ -172,6 +181,9 @@ impl TenantConfOpt {
if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(data_checksums_enabled) = other.data_checksums_enabled {
self.data_checksums_enabled = Some(data_checksums_enabled);
}
}
}
@@ -199,6 +211,7 @@ impl TenantConf {
.expect("cannot parse default walreceiver lagging wal timeout"),
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.expect("cannot parse default max walreceiver Lsn wal lag"),
data_checksums_enabled: DEFAULT_DATA_CHECKSUMS,
}
}
@@ -229,6 +242,7 @@ impl TenantConf {
.unwrap(),
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.unwrap(),
data_checksums_enabled: defaults::DEFAULT_DATA_CHECKSUMS,
}
}
}

View File

@@ -11,7 +11,7 @@ use crate::tenant_config::TenantConfOpt;
use crate::thread_mgr::ThreadKind;
use crate::timelines::CreateRepo;
use crate::walredo::PostgresRedoManager;
use crate::{thread_mgr, timelines, walreceiver};
use crate::{tenant_config, thread_mgr, timelines, walreceiver};
use crate::{DatadirTimelineImpl, RepositoryImpl};
use anyhow::{bail, Context};
use serde::{Deserialize, Serialize};
@@ -230,8 +230,6 @@ pub fn shutdown_all_tenants() {
drop(m);
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None);
// Ok, no background threads running anymore. Flush any remaining data in
// memory to disk.
@@ -268,7 +266,14 @@ pub fn create_tenant_repository(
Ok(None)
}
Entry::Vacant(v) => {
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let data_checksums_enabled = tenant_conf
.data_checksums_enabled
.unwrap_or(tenant_config::defaults::DEFAULT_DATA_CHECKSUMS);
let wal_redo_manager = Arc::new(PostgresRedoManager::new(
conf,
data_checksums_enabled,
tenant_id,
));
let repo = timelines::create_repo(
conf,
tenant_conf,
@@ -330,44 +335,12 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
}
(TenantState::Idle, TenantState::Active) => {
info!("activating tenant {tenant_id}");
let compactor_spawn_result = thread_mgr::spawn(
ThreadKind::Compactor,
Some(tenant_id),
None,
"Compactor thread",
false,
move || crate::tenant_threads::compact_loop(tenant_id),
);
if compactor_spawn_result.is_err() {
let mut m = tenants_state::write_tenants();
m.get_mut(&tenant_id)
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
.state = old_state;
drop(m);
}
compactor_spawn_result?;
let gc_spawn_result = thread_mgr::spawn(
ThreadKind::GarbageCollector,
Some(tenant_id),
None,
"GC thread",
false,
move || crate::tenant_threads::gc_loop(tenant_id),
)
.map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature
.with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));
if let Err(e) = &gc_spawn_result {
let mut m = tenants_state::write_tenants();
m.get_mut(&tenant_id)
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
.state = old_state;
drop(m);
error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}");
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
return gc_spawn_result;
}
// Spawn gc and compaction loops. The loops will shut themselves
// down when they notice that the tenant is inactive.
// TODO maybe use tokio::sync::watch instead?
crate::tenant_tasks::start_compaction_loop(tenant_id)?;
crate::tenant_tasks::start_gc_loop(tenant_id)?;
}
(TenantState::Idle, TenantState::Stopping) => {
info!("stopping idle tenant {tenant_id}");
@@ -379,8 +352,10 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
Some(tenant_id),
None,
);
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None);
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
// Wait until all gc/compaction tasks finish
let repo = get_repository_for_tenant(tenant_id)?;
let _guard = repo.file_lock.write().unwrap();
}
}
@@ -599,10 +574,16 @@ fn load_local_repo(
tenant_id: ZTenantId,
remote_index: &RemoteIndex,
) -> anyhow::Result<Arc<RepositoryImpl>> {
// Restore tenant config
let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?;
let mut m = tenants_state::write_tenants();
let tenant = m.entry(tenant_id).or_insert_with(|| {
let data_checksums_enabled = tenant_conf
.data_checksums_enabled
.unwrap_or(tenant_config::defaults::DEFAULT_DATA_CHECKSUMS);
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
let walredo_mgr = PostgresRedoManager::new(conf, data_checksums_enabled, tenant_id);
// Set up an object repository, for actual data storage.
let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
@@ -620,8 +601,6 @@ fn load_local_repo(
}
});
// Restore tenant config
let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?;
tenant.repo.update_tenant_config(tenant_conf)?;
Ok(Arc::clone(&tenant.repo))

View File

@@ -0,0 +1,286 @@
//! This module contains functions to serve per-tenant background processes,
//! such as compaction and GC
use std::collections::HashMap;
use std::ops::ControlFlow;
use std::time::Duration;
use crate::repository::Repository;
use crate::tenant_mgr::TenantState;
use crate::thread_mgr::ThreadKind;
use crate::{tenant_mgr, thread_mgr};
use anyhow::{self, Context};
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::{Lazy, OnceCell};
use tokio::sync::mpsc;
use tokio::sync::watch;
use tracing::*;
use utils::zid::ZTenantId;
static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_task_events",
"Number of task start/stop/fail events.",
&["event"],
)
.expect("Failed to register tenant_task_events metric")
});
///
/// Compaction task's main loop
///
async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
loop {
trace!("waking up");
// Run blocking part of the task
let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
// Break if tenant is not active
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
return Ok(ControlFlow::Break(()));
}
// Break if we're not allowed to write to disk
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
// TODO do this inside repo.compaction_iteration instead.
let _guard = match repo.file_lock.try_read() {
Ok(g) => g,
Err(_) => return Ok(ControlFlow::Break(())),
};
// Run compaction
let compaction_period = repo.get_compaction_period();
repo.compaction_iteration()?;
Ok(ControlFlow::Continue(compaction_period))
})
.await;
// Decide whether to sleep or break
let sleep_duration = match period {
Ok(Ok(ControlFlow::Continue(period))) => period,
Ok(Ok(ControlFlow::Break(()))) => break,
Ok(Err(e)) => {
error!("Compaction failed, retrying: {}", e);
Duration::from_secs(2)
}
Err(e) => {
error!("Compaction join error, retrying: {}", e);
Duration::from_secs(2)
}
};
// Sleep
tokio::select! {
_ = cancel.changed() => {
trace!("received cancellation request");
break;
},
_ = tokio::time::sleep(sleep_duration) => {},
}
}
trace!(
"compaction loop stopped. State is {:?}",
tenant_mgr::get_tenant_state(tenantid)
);
}
static START_GC_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
static START_COMPACTION_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
/// Spawn a task that will periodically schedule garbage collection until
/// the tenant becomes inactive. This should be called on tenant
/// activation.
pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
START_GC_LOOP
.get()
.context("Failed to get START_GC_LOOP")?
.blocking_send(tenantid)
.context("Failed to send to START_GC_LOOP channel")?;
Ok(())
}
/// Spawn a task that will periodically schedule compaction until
/// the tenant becomes inactive. This should be called on tenant
/// activation.
pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
START_COMPACTION_LOOP
.get()
.context("failed to get START_COMPACTION_LOOP")?
.blocking_send(tenantid)
.context("failed to send to START_COMPACTION_LOOP")?;
Ok(())
}
/// Spawn the TenantTaskManager
/// This needs to be called before start_gc_loop or start_compaction_loop
pub fn init_tenant_task_pool() -> anyhow::Result<()> {
let runtime = tokio::runtime::Builder::new_multi_thread()
.thread_name("tenant-task-worker")
.enable_all()
.build()?;
let (gc_send, mut gc_recv) = mpsc::channel::<ZTenantId>(100);
START_GC_LOOP
.set(gc_send)
.expect("Failed to set START_GC_LOOP");
let (compaction_send, mut compaction_recv) = mpsc::channel::<ZTenantId>(100);
START_COMPACTION_LOOP
.set(compaction_send)
.expect("Failed to set START_COMPACTION_LOOP");
// TODO this is getting repetitive
let mut gc_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
let mut compaction_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
thread_mgr::spawn(
ThreadKind::TenantTaskManager,
None,
None,
"Tenant task manager main thread",
true,
move || {
runtime.block_on(async move {
let mut futures = FuturesUnordered::new();
loop {
tokio::select! {
_ = thread_mgr::shutdown_watcher() => {
// Send cancellation to all tasks
for (_, cancel) in gc_loops.drain() {
cancel.send(()).ok();
}
for (_, cancel) in compaction_loops.drain() {
cancel.send(()).ok();
}
// Exit after all tasks finish
while let Some(result) = futures.next().await {
match result {
Ok(()) => {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
},
Err(e) => {
TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
error!("loop join error {}", e)
},
}
}
break;
},
tenantid = gc_recv.recv() => {
let tenantid = tenantid.expect("Gc task channel closed unexpectedly");
// Spawn new task, request cancellation of the old one if exists
let (cancel_send, cancel_recv) = watch::channel(());
let handle = tokio::spawn(gc_loop(tenantid, cancel_recv)
.instrument(info_span!("gc loop", tenant = %tenantid)));
if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) {
old_cancel_send.send(()).ok();
}
// Update metrics, remember handle
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
futures.push(handle);
},
tenantid = compaction_recv.recv() => {
let tenantid = tenantid.expect("Compaction task channel closed unexpectedly");
// Spawn new task, request cancellation of the old one if exists
let (cancel_send, cancel_recv) = watch::channel(());
let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv)
.instrument(info_span!("compaction loop", tenant = %tenantid)));
if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) {
old_cancel_send.send(()).ok();
}
// Update metrics, remember handle
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
futures.push(handle);
},
result = futures.next() => {
// Log and count any unhandled panics
match result {
Some(Ok(())) => {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
},
Some(Err(e)) => {
TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
error!("loop join error {}", e)
},
None => {},
};
},
}
}
});
Ok(())
},
)?;
Ok(())
}
///
/// GC task's main loop
///
async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
loop {
trace!("waking up");
// Run blocking part of the task
let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
// Break if tenant is not active
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
return Ok(ControlFlow::Break(()));
}
// Break if we're not allowed to write to disk
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
// TODO do this inside repo.gc_iteration instead.
let _guard = match repo.file_lock.try_read() {
Ok(g) => g,
Err(_) => return Ok(ControlFlow::Break(())),
};
// Run gc
let gc_period = repo.get_gc_period();
let gc_horizon = repo.get_gc_horizon();
if gc_horizon > 0 {
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
}
Ok(ControlFlow::Continue(gc_period))
})
.await;
// Decide whether to sleep or break
let sleep_duration = match period {
Ok(Ok(ControlFlow::Continue(period))) => period,
Ok(Ok(ControlFlow::Break(()))) => break,
Ok(Err(e)) => {
error!("Gc failed, retrying: {}", e);
Duration::from_secs(2)
}
Err(e) => {
error!("Gc join error, retrying: {}", e);
Duration::from_secs(2)
}
};
// Sleep
tokio::select! {
_ = cancel.changed() => {
trace!("received cancellation request");
break;
},
_ = tokio::time::sleep(sleep_duration) => {},
}
}
trace!(
"GC loop stopped. State is {:?}",
tenant_mgr::get_tenant_state(tenantid)
);
}
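
Both loops above share the same shape; a condensed sketch (illustration only, not part of the diff) of that pattern:

use std::ops::ControlFlow;
use std::time::Duration;
use tokio::sync::watch;

// Run one blocking iteration per wake-up, sleep for the period it returns,
// retry after a short pause on errors, and exit on cancellation or when the
// iteration asks to stop.
async fn run_periodic<F>(mut cancel: watch::Receiver<()>, iteration: F)
where
    F: Fn() -> anyhow::Result<ControlFlow<(), Duration>> + Clone + Send + 'static,
{
    loop {
        let iter = iteration.clone();
        // The work is synchronous repository code, so it runs on the blocking
        // pool instead of stalling the tenant-task runtime.
        let sleep_for = match tokio::task::spawn_blocking(move || iter()).await {
            Ok(Ok(ControlFlow::Continue(period))) => period,
            Ok(Ok(ControlFlow::Break(()))) => break,
            Ok(Err(_)) | Err(_) => Duration::from_secs(2),
        };
        tokio::select! {
            _ = cancel.changed() => break,
            _ = tokio::time::sleep(sleep_for) => {}
        }
    }
}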

View File

@@ -1,79 +0,0 @@
//! This module contains functions to serve per-tenant background processes,
//! such as compaction and GC
use crate::repository::Repository;
use crate::tenant_mgr;
use crate::tenant_mgr::TenantState;
use anyhow::Result;
use std::time::Duration;
use tracing::*;
use utils::zid::ZTenantId;
///
/// Compaction thread's main loop
///
pub fn compact_loop(tenantid: ZTenantId) -> Result<()> {
if let Err(err) = compact_loop_ext(tenantid) {
error!("compact loop terminated with error: {:?}", err);
Err(err)
} else {
Ok(())
}
}
fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
break;
}
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
let compaction_period = repo.get_compaction_period();
std::thread::sleep(compaction_period);
trace!("compaction thread for tenant {} waking up", tenantid);
// Compact timelines
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.compaction_iteration()?;
}
trace!(
"compaction thread stopped for tenant {} state is {:?}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}
///
/// GC thread's main loop
///
pub fn gc_loop(tenantid: ZTenantId) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
break;
}
trace!("gc thread for tenant {} waking up", tenantid);
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
let gc_horizon = repo.get_gc_horizon();
// Garbage collect old files that are not needed for PITR anymore
if gc_horizon > 0 {
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
}
// TODO Write it in more adequate way using
// condvar.wait_timeout() or something
let mut sleep_time = repo.get_gc_period().as_secs();
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active)
{
sleep_time -= 1;
std::thread::sleep(Duration::from_secs(1));
}
}
trace!(
"GC thread stopped for tenant {} state is {:?}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}

View File

@@ -94,11 +94,8 @@ pub enum ThreadKind {
// Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL.
WalReceiverManager,
// Thread that handles compaction of all timelines for a tenant.
Compactor,
// Thread that handles GC of a tenant
GarbageCollector,
// Thread that schedules new compaction and gc jobs
TenantTaskManager,
// Thread that flushes frozen in-memory layers to disk
LayerFlushThread,

View File

@@ -253,6 +253,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
.args(&["-D", &initdbpath.to_string_lossy()])
.args(&["-U", &conf.superuser])
.args(&["-E", "utf8"])
.arg("--data-checksums")
.arg("--no-instructions")
// This is only used for a temporary installation that is deleted shortly after,
// so no need to fsync it

View File

@@ -24,7 +24,7 @@
use anyhow::Context;
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{page_is_new, page_set_lsn};
use postgres_ffi::{page_is_new, page_set_checksum, page_set_lsn};
use anyhow::Result;
use bytes::{Buf, Bytes, BytesMut};
@@ -313,6 +313,8 @@ impl<'a, R: Repository> WalIngest<'a, R> {
if !page_is_new(&image) {
page_set_lsn(&mut image, lsn)
}
unsafe { page_set_checksum(&mut image, blk.blkno) };
assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?;
} else {

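page_set_checksum is imported from postgres_ffi above; a hedged sketch of what such a helper plausibly does, reusing pg_checksum_page and the pd_checksum offset exercised by its tests:

use bytes::BytesMut;
use utils::pg_checksum_page::pg_checksum_page;

// Compute the page checksum and store it in the pd_checksum field
// (bytes 8..10 of the page header). Sketch only, not the vendored code.
pub unsafe fn page_set_checksum(page: &mut BytesMut, blkno: u32) {
    let checksum = pg_checksum_page(&page[..], blkno);
    page[8..10].copy_from_slice(&checksum.to_le_bytes());
}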
View File

@@ -91,7 +91,6 @@ pub fn init_wal_receiver_main_thread(
let runtime = tokio::runtime::Builder::new_multi_thread()
.thread_name("wal-receiver-runtime-thread")
.worker_threads(40)
.enable_all()
.on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true)))
.build()
@@ -178,7 +177,7 @@ async fn shutdown_all_wal_connections(
/// That may lead to certain events not being observed by the listener.
#[derive(Debug)]
struct TaskHandle<E> {
handle: JoinHandle<()>,
handle: JoinHandle<Result<(), String>>,
events_receiver: watch::Receiver<TaskEvent<E>>,
cancellation: watch::Sender<()>,
}
@@ -205,8 +204,8 @@ impl<E: Clone> TaskHandle<E> {
let sender = Arc::clone(&events_sender);
let handle = tokio::task::spawn(async move {
let task_result = task(sender, cancellation_receiver).await;
events_sender.send(TaskEvent::End(task_result)).ok();
events_sender.send(TaskEvent::Started).ok();
task(sender, cancellation_receiver).await
});
TaskHandle {
@@ -216,6 +215,16 @@ impl<E: Clone> TaskHandle<E> {
}
}
async fn next_task_event(&mut self) -> TaskEvent<E> {
select! {
next_task_event = self.events_receiver.changed() => match next_task_event {
Ok(()) => self.events_receiver.borrow().clone(),
Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await,
},
task_completion_result = join_on_handle(&mut self.handle) => task_completion_result,
}
}
/// Aborts current task, waiting for it to finish.
async fn shutdown(self) {
self.cancellation.send(()).ok();
@@ -225,6 +234,19 @@ impl<E: Clone> TaskHandle<E> {
}
}
async fn join_on_handle<E>(handle: &mut JoinHandle<Result<(), String>>) -> TaskEvent<E> {
match handle.await {
Ok(task_result) => TaskEvent::End(task_result),
Err(e) => {
if e.is_cancelled() {
TaskEvent::End(Ok(()))
} else {
TaskEvent::End(Err(format!("WAL receiver task panicked: {e}")))
}
}
}
}
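
For context, the producer side of this pattern is the spawned task itself: it announces Started on the watch channel, streams intermediate events, and lets its final Result travel back through the JoinHandle, which join_on_handle above converts into TaskEvent::End (treating cancellation as a clean end and a panic as an error). A simplified, self-contained sketch with a stand-in event type rather than the real TaskEvent<E>:

    use tokio::{sync::watch, task::JoinHandle};

    #[derive(Clone, Debug)]
    enum Event {
        Started,
        NewEvent(u64), // stand-in payload instead of replication feedback
        End(Result<(), String>),
    }

    fn spawn_event_task() -> (watch::Receiver<Event>, JoinHandle<Result<(), String>>) {
        let (tx, rx) = watch::channel(Event::Started);
        let handle = tokio::spawn(async move {
            tx.send(Event::Started).ok();
            for i in 0..3 {
                tx.send(Event::NewEvent(i)).ok();
            }
            // The final result is not sent on the channel; the consumer recovers it
            // from the JoinHandle, exactly as join_on_handle does above.
            Ok::<(), String>(())
        });
        (rx, handle)
    }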
/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery.
/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled.
/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled.

View File

@@ -104,49 +104,29 @@ async fn connection_manager_loop_step(
Some(wal_connection_update) = async {
match walreceiver_state.wal_connection.as_mut() {
Some(wal_connection) => {
let receiver = &mut wal_connection.connection_task.events_receiver;
Some(match receiver.changed().await {
Ok(()) => receiver.borrow().clone(),
Err(_cancellation_error) => TaskEvent::End(Ok(())),
})
}
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
None => None,
}
} => {
let (connection_update, reset_connection_attempts) = match &wal_connection_update {
TaskEvent::Started => (Some(Utc::now().naive_utc()), true),
TaskEvent::NewEvent(replication_feedback) => (Some(DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc()), true),
let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard");
match &wal_connection_update {
TaskEvent::Started => {
wal_connection.latest_connection_update = Utc::now().naive_utc();
*walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1;
},
TaskEvent::NewEvent(replication_feedback) => {
wal_connection.latest_connection_update = DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc();
// reset connection attempts here only, the only place where both nodes
// explicitly confirm with replication feedback that they are connected to each other
walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id);
},
TaskEvent::End(end_result) => {
let should_reset_connection_attempts = match end_result {
Ok(()) => {
debug!("WAL receiving task finished");
true
},
Err(e) => {
warn!("WAL receiving task failed: {e}");
false
},
match end_result {
Ok(()) => debug!("WAL receiving task finished"),
Err(e) => warn!("WAL receiving task failed: {e}"),
};
walreceiver_state.wal_connection = None;
(None, should_reset_connection_attempts)
},
};
if let Some(connection_update) = connection_update {
match &mut walreceiver_state.wal_connection {
Some(wal_connection) => {
wal_connection.latest_connection_update = connection_update;
let attempts_entry = walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0);
if reset_connection_attempts {
*attempts_entry = 0;
} else {
*attempts_entry += 1;
}
},
None => error!("Received connection update for WAL connection that is not active, update: {wal_connection_update:?}"),
}
}
},
@@ -406,10 +386,8 @@ impl WalreceiverState {
Some(existing_wal_connection) => {
let connected_sk_node = existing_wal_connection.sk_id;
let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) = self
.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| sk_id != connected_sk_node)
.max_by_key(|(_, info, _)| info.commit_lsn)?;
let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) =
self.select_connection_candidate(Some(connected_sk_node))?;
let now = Utc::now().naive_utc();
if let Ok(latest_interaciton) =
@@ -462,9 +440,8 @@ impl WalreceiverState {
}
}
None => {
let (new_sk_id, _, new_wal_producer_connstr) = self
.applicable_connection_candidates()
.max_by_key(|(_, info, _)| info.commit_lsn)?;
let (new_sk_id, _, new_wal_producer_connstr) =
self.select_connection_candidate(None)?;
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_producer_connstr: new_wal_producer_connstr,
@@ -476,6 +453,49 @@ impl WalreceiverState {
None
}
/// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers.
/// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
///
/// The candidate that is chosen:
/// * has fewest connection attempts from pageserver to safekeeper node (reset every time the WAL replication feedback is sent)
/// * has greatest data Lsn among the ones that are left
///
/// NOTE:
/// Timeline data received from etcd is evicted based on how long ago it was registered, together with its connection-attempt counter; apart from that,
/// the counter is only reset by a successful connection to that node.
/// Such a connection won't happen until every node with fewer connection attempts has been tried first, which may leave the safekeeper with the most advanced state ignored for a while.
fn select_connection_candidate(
&self,
node_to_omit: Option<NodeId>,
) -> Option<(NodeId, &SkTimelineInfo, String)> {
let all_candidates = self
.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
.collect::<Vec<_>>();
let smallest_attempts_allowed = all_candidates
.iter()
.map(|(sk_id, _, _)| {
self.wal_connection_attempts
.get(sk_id)
.copied()
.unwrap_or(0)
})
.min()?;
all_candidates
.into_iter()
.filter(|(sk_id, _, _)| {
smallest_attempts_allowed
>= self
.wal_connection_attempts
.get(sk_id)
.copied()
.unwrap_or(0)
})
.max_by_key(|(_, info, _)| info.commit_lsn)
}
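
The doc comment above describes a two-stage choice: first narrow the applicable safekeepers to those with the fewest recorded connection attempts, then take the one with the most advanced commit_lsn. A minimal standalone sketch of the same logic, with plain integers standing in for NodeId, Lsn and SkTimelineInfo:

    use std::collections::HashMap;

    // candidates: (node id, commit lsn); attempts: connection attempts per node id.
    fn select_candidate(candidates: &[(u64, u64)], attempts: &HashMap<u64, u32>) -> Option<u64> {
        // Stage 1: the smallest number of recorded connection attempts.
        let fewest = candidates
            .iter()
            .map(|(id, _)| attempts.get(id).copied().unwrap_or(0))
            .min()?;
        // Stage 2: among nodes with that attempt count, the most advanced commit LSN wins.
        candidates
            .iter()
            .filter(|(id, _)| attempts.get(id).copied().unwrap_or(0) == fewest)
            .max_by_key(|(_, lsn)| *lsn)
            .map(|(id, _)| *id)
    }

Because the counter is only reset by a successful round trip of replication feedback, a node with many failed attempts is deprioritized even if its commit_lsn is ahead, which is exactly the trade-off the NOTE above calls out.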
fn applicable_connection_candidates(
&self,
) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, String)> {
@@ -500,15 +520,25 @@ impl WalreceiverState {
}
fn cleanup_old_candidates(&mut self) {
self.wal_stream_candidates.retain(|_, etcd_info| {
let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
self.wal_stream_candidates.retain(|node_id, etcd_info| {
if let Ok(time_since_latest_etcd_update) =
(Utc::now().naive_utc() - etcd_info.latest_update).to_std()
{
time_since_latest_etcd_update < self.lagging_wal_timeout
let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout;
if !should_retain {
node_ids_to_remove.push(*node_id);
}
should_retain
} else {
true
}
});
for node_id in node_ids_to_remove {
self.wal_connection_attempts.remove(&node_id);
}
}
}
@@ -843,6 +873,64 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
let harness = RepoHarness::create("candidate_with_many_connection_failures")?;
let mut state = dummy_state(&harness);
let now = Utc::now().naive_utc();
let current_lsn = Lsn(100_000).align();
let bigger_lsn = Lsn(current_lsn.0 + 100).align();
state.wal_connection = None;
state.wal_stream_candidates = HashMap::from([
(
NodeId(0),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(bigger_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
latest_update: now,
},
),
(
NodeId(1),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
latest_update: now,
},
),
]);
state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]);
let candidate_with_less_errors = state
.next_connection_candidate()
.expect("Expected one candidate selected, but got none");
assert_eq!(
candidate_with_less_errors.safekeeper_id,
NodeId(1),
"Should select the node with less connection errors"
);
Ok(())
}
#[tokio::test]
async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> {
let harness = RepoHarness::create("connection_no_etcd_data_candidate")?;

View File

@@ -48,7 +48,8 @@ use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset;
use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::wal_record_verify_checksum;
use postgres_ffi::{page_verify_checksum, pg_constants, XLogRecord};
///
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
@@ -131,6 +132,7 @@ lazy_static! {
pub struct PostgresRedoManager {
tenantid: ZTenantId,
conf: &'static PageServerConf,
data_checksums_enabled: bool,
process: Mutex<Option<PostgresRedoProcess>>,
}
@@ -229,11 +231,16 @@ impl PostgresRedoManager {
///
/// Create a new PostgresRedoManager.
///
pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
pub fn new(
conf: &'static PageServerConf,
data_checksums_enabled: bool,
tenantid: ZTenantId,
) -> PostgresRedoManager {
// The actual process is launched lazily, on first request.
PostgresRedoManager {
tenantid,
conf,
data_checksums_enabled,
process: Mutex::new(None),
}
}
@@ -268,7 +275,13 @@ impl PostgresRedoManager {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
let result = process
.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout)
.apply_wal_records(
buf_tag,
base_img,
records,
wal_redo_timeout,
self.data_checksums_enabled,
)
.map_err(WalRedoError::IoError);
let end_time = Instant::now();
@@ -619,6 +632,7 @@ impl PostgresRedoProcess {
info!("running initdb in {:?}", datadir.display());
let initdb = Command::new(conf.pg_bin_dir().join("initdb"))
.args(&["-D", &datadir.to_string_lossy()])
.arg("--data-checksums")
.arg("-N")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
@@ -716,6 +730,7 @@ impl PostgresRedoProcess {
base_img: Option<Bytes>,
records: &[(Lsn, ZenithWalRecord)],
wal_redo_timeout: Duration,
data_checksums_enabled: bool,
) -> Result<Bytes, std::io::Error> {
// Serialize all the messages to send the WAL redo process first.
//
@@ -725,6 +740,15 @@ impl PostgresRedoProcess {
let mut writebuf: Vec<u8> = Vec::new();
build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
// Checksums might not be stamped for old tenants, so check them only if they
// are enabled (this is controlled by per-tenant config).
if data_checksums_enabled && !unsafe { page_verify_checksum(&img, tag.blknum) } {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("block {} of relation {} is invalid", tag.blknum, tag.rel),
));
}
build_push_page_msg(tag, &img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
@@ -733,6 +757,27 @@ impl PostgresRedoProcess {
rec: postgres_rec,
} = rec
{
let xlogrec = XLogRecord::from_buf(postgres_rec).map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!(
"could not deserialize WAL record for relation {} at LSN {}: {}",
tag.rel, lsn, e
),
)
})?;
// WAL records always have a checksum; check it before sending them to the redo
// process, which doesn't do these checks itself.
if !wal_record_verify_checksum(&xlogrec, postgres_rec) {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!(
"WAL record for relation {} at LSN {} is invalid",
tag.rel, lsn
),
));
}
build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
return Err(Error::new(

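
Unlike page checksums, WAL record CRCs are always present, which is why the hunk above verifies them unconditionally before forwarding records to the redo process. A sketch of what such a check involves (field offsets per PostgreSQL 14's XLogRecord; the header struct and verify_wal_record are illustrative stand-ins, not the postgres_ffi definitions), using a plain bit-by-bit CRC-32C where PostgreSQL uses table-driven or hardware-accelerated code:

    // Bitwise CRC-32C (Castagnoli polynomial, reflected form), the same CRC family
    // PostgreSQL uses for WAL records.
    fn crc32c(data: &[u8]) -> u32 {
        let mut crc: u32 = 0xFFFF_FFFF;
        for &byte in data {
            crc ^= byte as u32;
            for _ in 0..8 {
                let mask = (crc & 1).wrapping_neg(); // all-ones if lsb set, else zero
                crc = (crc >> 1) ^ (0x82F6_3B78 & mask);
            }
        }
        !crc
    }

    const XLOG_RECORD_HDR_LEN: usize = 24; // SizeOfXLogRecord
    const XL_CRC_OFFSET: usize = 20;       // offset of xl_crc within the header

    struct XLogRecordHdr {
        xl_tot_len: u32,
        xl_crc: u32,
        // remaining header fields omitted
    }

    // xl_crc covers the payload after the fixed header, followed by the header bytes
    // up to (but not including) xl_crc itself.
    fn verify_wal_record(hdr: &XLogRecordHdr, raw_record: &[u8]) -> bool {
        let tot_len = hdr.xl_tot_len as usize;
        if tot_len < XLOG_RECORD_HDR_LEN || raw_record.len() < tot_len {
            return false;
        }
        let mut covered = Vec::new();
        covered.extend_from_slice(&raw_record[XLOG_RECORD_HDR_LEN..tot_len]);
        covered.extend_from_slice(&raw_record[..XL_CRC_OFFSET]);
        crc32c(&covered) == hdr.xl_crc
    }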
View File

@@ -49,6 +49,12 @@ impl UserFacingError for ConsoleAuthError {
}
}
impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError {
fn from(e: &auth::credentials::ClientCredsParseError) -> Self {
ConsoleAuthError::BadProjectName(e.clone())
}
}
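
This From impl over a borrowed error is what lets the call sites below drop their explicit map_err and use a bare ? on self.creds.project_name.as_ref(). A small self-contained sketch of the pattern with hypothetical stand-in types:

    #[derive(Debug, Clone)]
    struct ParseError(String);

    #[derive(Debug)]
    enum AuthError {
        BadProjectName(ParseError),
    }

    impl From<&ParseError> for AuthError {
        fn from(e: &ParseError) -> Self {
            AuthError::BadProjectName(e.clone())
        }
    }

    // as_ref() turns &Result<String, ParseError> into Result<&String, &ParseError>,
    // and `?` then builds the outer error via the From<&ParseError> impl above.
    fn project_name(creds: &Result<String, ParseError>) -> Result<&str, AuthError> {
        Ok(creds.as_ref()?.as_str())
    }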
// TODO: convert into an enum with "error"
#[derive(Serialize, Deserialize, Debug)]
struct GetRoleSecretResponse {
@@ -92,14 +98,9 @@ impl<'a> Api<'a> {
async fn get_auth_info(&self) -> Result<AuthInfo> {
let mut url = self.endpoint.clone();
let project_name = self
.creds
.project_name
.as_ref()
.map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?;
url.path_segments_mut().push("proxy_get_role_secret");
url.query_pairs_mut()
.append_pair("project", project_name)
.append_pair("project", self.creds.project_name.as_ref()?)
.append_pair("role", &self.creds.user);
// TODO: use a proper logger
@@ -121,12 +122,8 @@ impl<'a> Api<'a> {
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(&self) -> Result<DatabaseInfo> {
let mut url = self.endpoint.clone();
let project_name = self
.creds
.project_name
.as_ref()
.map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?;
url.path_segments_mut().push("proxy_wake_compute");
let project_name = self.creds.project_name.as_ref()?;
url.query_pairs_mut().append_pair("project", project_name);
// TODO: use a proper logger

View File

@@ -115,7 +115,7 @@ mod tests {
Ok(())
});
let () = waiter.await?;
waiter.await?;
notifier.await?
}
}

View File

@@ -5,6 +5,11 @@ use anyhow::Context;
use anyhow::Error;
use anyhow::Result;
use etcd_broker::subscription_value::SkTimelineInfo;
use etcd_broker::LeaseKeepAliveStream;
use etcd_broker::LeaseKeeper;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::time::Duration;
use tokio::spawn;
use tokio::task::JoinHandle;
@@ -21,7 +26,7 @@ use utils::zid::{NodeId, ZTenantTimelineId};
const RETRY_INTERVAL_MSEC: u64 = 1000;
const PUSH_INTERVAL_MSEC: u64 = 1000;
const LEASE_TTL_SEC: i64 = 5;
const LEASE_TTL_SEC: i64 = 10;
pub fn thread_main(conf: SafeKeeperConf) {
let runtime = runtime::Builder::new_current_thread()
@@ -154,13 +159,48 @@ pub fn get_candiate_name(system_id: NodeId) -> String {
format!("id_{system_id}")
}
async fn push_sk_info(
zttid: ZTenantTimelineId,
mut client: Client,
key: String,
sk_info: SkTimelineInfo,
mut lease: Lease,
) -> anyhow::Result<(ZTenantTimelineId, Lease)> {
let put_opts = PutOptions::new().with_lease(lease.id);
client
.put(
key.clone(),
serde_json::to_string(&sk_info)?,
Some(put_opts),
)
.await
.with_context(|| format!("failed to push safekeeper info to {}", key))?;
// revive the lease
lease
.keeper
.keep_alive()
.await
.context("failed to send LeaseKeepAliveRequest")?;
lease
.ka_stream
.message()
.await
.context("failed to receive LeaseKeepAliveResponse")?;
Ok((zttid, lease))
}
struct Lease {
id: i64,
keeper: LeaseKeeper,
ka_stream: LeaseKeepAliveStream,
}
/// Push once in a while data about all active timelines to the broker.
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
// Get and maintain lease to automatically delete obsolete data
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?;
let mut leases: HashMap<ZTenantTimelineId, Lease> = HashMap::new();
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
loop {
@@ -168,33 +208,46 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
// is under plain mutex. That's ok, all this code is not performance
// sensitive and there is no risk of deadlock as we don't await while
// lock is held.
for zttid in GlobalTimelines::get_active_timelines() {
if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
let sk_info = tli.get_public_info(&conf)?;
let put_opts = PutOptions::new().with_lease(lease.id());
client
.put(
timeline_safekeeper_path(
conf.broker_etcd_prefix.clone(),
zttid,
conf.my_id,
),
serde_json::to_string(&sk_info)?,
Some(put_opts),
)
.await
.context("failed to push safekeeper info")?;
let active_tlis = GlobalTimelines::get_active_timelines();
// Get and maintain (if not yet granted) a per-timeline lease to automatically delete obsolete data.
for zttid in active_tlis.iter() {
if let Entry::Vacant(v) = leases.entry(*zttid) {
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?;
v.insert(Lease {
id: lease.id(),
keeper,
ka_stream,
});
}
}
// revive the lease
keeper
.keep_alive()
.await
.context("failed to send LeaseKeepAliveRequest")?;
ka_stream
.message()
.await
.context("failed to receive LeaseKeepAliveResponse")?;
leases.retain(|zttid, _| active_tlis.contains(zttid));
// Push data concurrently to not suffer from latency, with many timelines it can be slow.
let handles = active_tlis
.iter()
.filter_map(|zttid| GlobalTimelines::get_loaded(*zttid))
.map(|tli| {
let sk_info = tli.get_public_info(&conf);
let key = timeline_safekeeper_path(
conf.broker_etcd_prefix.clone(),
tli.zttid,
conf.my_id,
);
let lease = leases.remove(&tli.zttid).unwrap();
tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease))
})
.collect::<Vec<_>>();
for h in handles {
let (zttid, lease) = h.await??;
// It is ugly to pull leases out of the hash map and then put them back, but
// otherwise we would have to resort to long-lived per-timeline tasks (which
// would generate a lot of errors when etcd is down), since a spawned task needs
// 'static data and we can't borrow into it.
leases.insert(zttid, lease);
}
sleep(push_interval).await;
}
}
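
A condensed view of the lease lifecycle each timeline now gets, reusing the same client calls that appear in the loop above (the etcd_client import path is an assumption here; in the safekeeper these types come through etcd_broker). Keys written under a lease disappear on their own once keep-alives stop arriving within LEASE_TTL_SEC, so entries of a crashed safekeeper age out without explicit cleanup:

    use etcd_client::{Client, LeaseKeepAliveStream, LeaseKeeper, PutOptions};

    // Grant a TTL-bound lease, attach one key to it, and do a single keep-alive
    // round trip; the push loop repeats the keep-alive within the TTL window.
    async fn publish_with_lease(
        client: &mut Client,
        key: String,
        value: String,
        ttl_sec: i64,
    ) -> anyhow::Result<(LeaseKeeper, LeaseKeepAliveStream)> {
        let lease = client.lease_grant(ttl_sec, None).await?;
        let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?;
        client
            .put(key, value, Some(PutOptions::new().with_lease(lease.id())))
            .await?;
        keeper.keep_alive().await?;
        ka_stream.message().await?;
        Ok((keeper, ka_stream))
    }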

View File

@@ -239,6 +239,19 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
});
} else if version == 5 {
info!("reading safekeeper control file version {}", version);
let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?;
if oldstate.timeline_start_lsn != Lsn(0) {
return Ok(oldstate);
}
// set special timeline_start_lsn because we don't know the real one
info!("setting timeline_start_lsn and local_start_lsn to Lsn(1)");
oldstate.timeline_start_lsn = Lsn(1);
oldstate.local_start_lsn = Lsn(1);
return Ok(oldstate);
}
bail!("unsupported safekeeper control file version {}", version)
}

View File

@@ -28,7 +28,7 @@ use utils::{
};
pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 5;
pub const SK_FORMAT_VERSION: u32 = 6;
const SK_PROTOCOL_VERSION: u32 = 2;
const UNKNOWN_SERVER_VERSION: u32 = 0;

View File

@@ -11,7 +11,7 @@ use serde::Serialize;
use tokio::sync::watch;
use std::cmp::{max, min};
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::fs::{self};
use std::sync::{Arc, Mutex, MutexGuard};
@@ -445,9 +445,9 @@ impl Timeline {
}
/// Prepare public safekeeper info for reporting.
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result<SkTimelineInfo> {
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
let shared_state = self.mutex.lock().unwrap();
Ok(SkTimelineInfo {
SkTimelineInfo {
last_log_term: Some(shared_state.sk.get_epoch()),
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
// note: this value is not flushed to control file yet and can be lost
@@ -460,7 +460,7 @@ impl Timeline {
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
})
}
}
/// Update timeline state with peer safekeeper data.
@@ -625,6 +625,8 @@ impl GlobalTimelines {
zttid: ZTenantTimelineId,
create: bool,
) -> Result<Arc<Timeline>> {
let _enter = info_span!("", timeline = %zttid.tenant_id).entered();
let mut state = TIMELINES_STATE.lock().unwrap();
match state.timelines.get(&zttid) {
@@ -667,7 +669,7 @@ impl GlobalTimelines {
}
/// Get ZTenantTimelineIDs of all active timelines.
pub fn get_active_timelines() -> Vec<ZTenantTimelineId> {
pub fn get_active_timelines() -> HashSet<ZTenantTimelineId> {
let state = TIMELINES_STATE.lock().unwrap();
state
.timelines

View File

@@ -2,18 +2,16 @@ use anyhow::{Context, Result};
use etcd_broker::subscription_key::{
NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind,
};
use tokio::io::AsyncRead;
use tokio::task::JoinHandle;
use std::cmp::min;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
use postgres_ffi::xlog_utils::{
XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, MAX_SEND_SIZE, PG_TLI,
};
use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI};
use remote_storage::{GenericRemoteStorage, RemoteStorage};
use tokio::fs::File;
use tokio::runtime::Builder;
@@ -452,45 +450,41 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
pub async fn read_object(
file_path: PathBuf,
offset: u64,
) -> (impl AsyncRead, JoinHandle<Result<()>>) {
let storage = REMOTE_STORAGE.get().expect("failed to get remote storage");
) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead>>> {
let download = match REMOTE_STORAGE
.get()
.context("Failed to get remote storage")?
.as_ref()
.context("No remote storage configured")?
{
GenericRemoteStorage::Local(local_storage) => {
let source = local_storage.remote_object_id(&file_path)?;
let (mut pipe_writer, pipe_reader) = tokio::io::duplex(MAX_SEND_SIZE);
let copy_result = tokio::spawn(async move {
let res = match storage.as_ref().unwrap() {
GenericRemoteStorage::Local(local_storage) => {
let source = local_storage.remote_object_id(&file_path)?;
info!(
"local download about to start from {} at offset {}",
source.display(),
offset
);
local_storage
.download_byte_range(&source, offset, None, &mut pipe_writer)
.await
}
GenericRemoteStorage::S3(s3_storage) => {
let s3key = s3_storage.remote_object_id(&file_path)?;
info!(
"S3 download about to start from {:?} at offset {}",
s3key, offset
);
s3_storage
.download_byte_range(&s3key, offset, None, &mut pipe_writer)
.await
}
};
if let Err(e) = res {
error!("failed to download WAL segment from remote storage: {}", e);
Err(e)
} else {
Ok(())
info!(
"local download about to start from {} at offset {}",
source.display(),
offset
);
local_storage
.download_byte_range(&source, offset, None)
.await
}
});
GenericRemoteStorage::S3(s3_storage) => {
let s3key = s3_storage.remote_object_id(&file_path)?;
(pipe_reader, copy_result)
info!(
"S3 download about to start from {:?} at offset {}",
s3key, offset
);
s3_storage.download_byte_range(&s3key, offset, None).await
}
}
.with_context(|| {
format!(
"Failed to open WAL segment download stream for local storage path {}",
file_path.display()
)
})?;
Ok(download.download_stream)
}
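
The essence of the change above is the return type: instead of spawning a copy task that feeds one end of a duplex pipe, read_object now hands back the backend's download stream directly as a boxed AsyncRead, and the WalReader caller (next file) awaits it as-is. A generic illustration of that trait-object erasure (the helper is hypothetical):

    use std::pin::Pin;
    use tokio::io::AsyncRead;

    // Any concrete download stream (local file range, S3 body, ...) can be erased
    // into the trait object the new signature exposes, so callers no longer care
    // which backend produced it.
    fn into_boxed_reader<R>(stream: R) -> Pin<Box<dyn AsyncRead>>
    where
        R: AsyncRead + 'static,
    {
        Box::pin(stream)
    }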

View File

@@ -604,8 +604,7 @@ impl WalReader {
// Try to open remote file, if remote reads are enabled
if self.enable_remote_read {
let (reader, _) = read_object(wal_file_path, xlogoff as u64).await;
return Ok(Box::pin(reader));
return read_object(wal_file_path, xlogoff as u64).await;
}
bail!("WAL segment is not found")

View File

@@ -28,6 +28,10 @@ strict = true
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
ignore_missing_imports = true
[mypy-pg8000.*]
# Used only in testing clients
ignore_missing_imports = true
[mypy-cached_property.*]
ignore_missing_imports = true

View File

@@ -37,7 +37,7 @@ You can run all the tests with:
If you want to run all the tests in a particular file:
`./scripts/pytest test_pgbench.py`
`./scripts/pytest test_runner/batch_others/test_restart_compute.py`
If you want to run all tests that have the string "bench" in their names:
@@ -45,7 +45,7 @@ If you want to run all tests that have the string "bench" in their names:
Useful environment variables:
`ZENITH_BIN`: The directory where zenith binaries can be found.
`NEON_BIN`: The directory where neon binaries can be found.
`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
`TEST_OUTPUT`: Set the directory where test state and test output files
should go.

View File

@@ -1,6 +1,3 @@
from contextlib import closing
import psycopg2.extras
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException

View File

@@ -1,8 +1,6 @@
from contextlib import closing
from typing import Iterator
from uuid import UUID, uuid4
from uuid import uuid4
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
from requests.exceptions import HTTPError
import pytest

View File

@@ -1,11 +1,9 @@
from contextlib import closing, contextmanager
import psycopg2.extras
import pytest
from fixtures.neon_fixtures import PgProtocol, NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.log_helper import log
import os
import time
import asyncpg
from fixtures.neon_fixtures import Postgres
import threading

View File

@@ -1,8 +1,6 @@
import pytest
from contextlib import closing
from fixtures.neon_fixtures import NeonEnv
from fixtures.log_helper import log
#

View File

@@ -1,4 +1,3 @@
import subprocess
from contextlib import closing
import psycopg2.extras

View File

@@ -1,16 +1,10 @@
import subprocess
from contextlib import closing
import psycopg2.extras
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres
from fixtures.neon_fixtures import pg_distrib_dir
import os
from fixtures.utils import mkdir_if_needed, subprocess_capture
import shutil
import getpass
import pwd
from fixtures.utils import subprocess_capture
num_rows = 1000
@@ -46,19 +40,20 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder,
psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')}
# Get and unpack fullbackup from pageserver
restored_dir_path = os.path.join(env.repo_dir, "restored_datadir")
restored_dir_path = env.repo_dir / "restored_datadir"
os.mkdir(restored_dir_path, 0o750)
query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}"
cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
result_basepath = pg_bin.run_capture(cmd, env=psql_env)
tar_output_file = result_basepath + ".stdout"
subprocess_capture(str(env.repo_dir), ["tar", "-xf", tar_output_file, "-C", restored_dir_path])
subprocess_capture(str(env.repo_dir),
["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)])
# HACK
# fullbackup returns neon specific pg_control and first WAL segment
# use resetwal to overwrite it
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal')
cmd = [pg_resetwal_path, "-D", restored_dir_path]
cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
pg_bin.run_capture(cmd, env=psql_env)
# Restore from the backup and find the data we inserted

View File

@@ -191,3 +191,8 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu
# Check it's the same as the first fullbackup
# TODO pageserver should be checking checksum
assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
# Check that gc works
psconn = env.pageserver.connect()
pscur = psconn.cursor()
pscur.execute(f"do_gc {tenant.hex} {timeline} 0")

View File

@@ -1,5 +1,5 @@
# It's possible to run any regular test with the local fs remote storage via
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ......
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
import shutil, os
from contextlib import closing

View File

@@ -1,74 +0,0 @@
import pytest
from contextlib import closing
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.log_helper import log
#
# Test restarting and recreating a postgres instance
#
@pytest.mark.parametrize('with_safekeepers', [False, True])
def test_restart_compute(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool):
neon_env_builder.auth_enabled = True
if with_safekeepers:
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
env.neon_cli.create_branch('test_restart_compute')
pg = env.postgres.create_start('test_restart_compute')
log.info("postgres is running on 'test_restart_compute' branch")
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
r = cur.fetchone()
assert r == (5000050000, )
log.info(f"res = {r}")
# Remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# We can still see the row
cur.execute('SELECT sum(key) FROM t')
r = cur.fetchone()
assert r == (5000050000, )
log.info(f"res = {r}")
# Insert another row
cur.execute("INSERT INTO t VALUES (100001, 'payload2')")
cur.execute('SELECT count(*) FROM t')
r = cur.fetchone()
assert r == (100001, )
log.info(f"res = {r}")
# Again remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute')
# That select causes lots of FPI's and increases probability of wakeepers
# lagging behind after query completion
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# We can still see the rows
cur.execute('SELECT count(*) FROM t')
r = cur.fetchone()
assert r == (100001, )
log.info(f"res = {r}")
# And again remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# We can still see the rows
cur.execute('SELECT count(*) FROM t')
r = cur.fetchone()
assert r == (100001, )
log.info(f"res = {r}")

View File

@@ -10,8 +10,8 @@ from typing import Optional
import signal
import pytest
from fixtures.neon_fixtures import PgProtocol, PortDistributor, Postgres, NeonEnvBuilder, Etcd, NeonPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, neon_binpath, pg_distrib_dir
from fixtures.utils import lsn_from_hex
from fixtures.neon_fixtures import PgProtocol, PortDistributor, Postgres, NeonEnvBuilder, Etcd, NeonPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, neon_binpath, pg_distrib_dir, base_dir
from fixtures.utils import lsn_from_hex, subprocess_capture
def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -101,13 +101,23 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve
log.info('load thread stopped')
@pytest.mark.skip(
reason=
"needs to replace callmemaybe call with better idea how to migrate timelines between pageservers"
)
@pytest.mark.parametrize(
'method',
[
# A minor migration involves no storage breaking changes.
# It is done by attaching the tenant to a new pageserver.
'minor',
# A major migration involves exporting a postgres datadir
# basebackup and importing it into the new pageserver.
# This kind of migration can tolerate breaking changes
# to storage format
pytest.param('major', marks=pytest.mark.xfail(reason="Not implemented")),
])
@pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
port_distributor: PortDistributor,
test_output_dir,
method: str,
with_load: str):
neon_env_builder.enable_local_fs_remote_storage()
@@ -157,8 +167,11 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
load_stop_event = threading.Event()
load_ok_event = threading.Event()
load_thread = threading.Thread(target=load,
args=(tenant_pg, load_stop_event, load_ok_event))
load_thread = threading.Thread(
target=load,
args=(tenant_pg, load_stop_event, load_ok_event),
daemon=True, # To make sure the child dies when the parent errors
)
load_thread.start()
# run checkpoint manually to be sure that data landed in remote storage
@@ -188,30 +201,47 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
new_pageserver_http_port,
neon_env_builder.broker):
# call to attach timeline to new pageserver
new_pageserver_http.timeline_attach(tenant, timeline)
# new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint
new_timeline_detail = wait_until(
number_of_iterations=5,
interval=1,
func=lambda: assert_local(new_pageserver_http, tenant, timeline))
# Migrate either by attaching from s3 or import/export basebackup
if method == "major":
cmd = [
"python",
os.path.join(base_dir, "scripts/export_import_between_pageservers.py"),
"--tenant-id",
tenant.hex,
"--from-host",
"localhost",
"--from-http-port",
str(pageserver_http.port),
"--from-pg-port",
str(env.pageserver.service_port.pg),
"--to-host",
"localhost",
"--to-http-port",
str(new_pageserver_http_port),
"--to-pg-port",
str(new_pageserver_pg_port),
"--psql-path",
os.path.join(pg_distrib_dir, "bin", "psql"),
"--work-dir",
os.path.join(test_output_dir),
]
subprocess_capture(str(env.repo_dir), cmd, check=True)
elif method == "minor":
# call to attach timeline to new pageserver
new_pageserver_http.timeline_attach(tenant, timeline)
# when load is active these checks can break because lsns are not static
# so lets check with some margin
assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']),
lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']),
0.03)
# new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint
new_timeline_detail = wait_until(
number_of_iterations=5,
interval=1,
func=lambda: assert_local(new_pageserver_http, tenant, timeline))
# callmemaybe to start replication from safekeeper to the new pageserver
# when there is no load there is a clean checkpoint and no wal delta
# needs to be streamed to the new pageserver
# TODO (rodionov) use attach to start replication
with pg_cur(PgProtocol(host='localhost', port=new_pageserver_pg_port)) as cur:
# "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'"
safekeeper_connstring = f"host=localhost port={env.safekeepers[0].port.pg} options='-c ztimelineid={timeline} ztenantid={tenant} pageserver_connstr=postgresql://no_user:@localhost:{new_pageserver_pg_port}'"
cur.execute("callmemaybe {} {} {}".format(tenant.hex,
timeline.hex,
safekeeper_connstring))
# when load is active these checks can break because lsns are not static
# so lets check with some margin
assert_abs_margin_ratio(
lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']),
lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']),
0.03)
tenant_pg.stop()

View File

@@ -0,0 +1,70 @@
from fixtures.neon_fixtures import NeonEnvBuilder, wait_until
from uuid import UUID
import time
def get_only_element(l):
assert len(l) == 1
return l[0]
# Test that gc and compaction tenant tasks start and stop correctly
def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
# The gc and compaction loops don't bother to watch for tenant state
# changes while sleeping, so we use small periods to make this test
# run faster. With default settings we'd have to wait longer for tasks
# to notice state changes and shut down.
# TODO fix this behavior in the pageserver
tenant_config = "{gc_period = '1 s', compaction_period = '1 s'}"
neon_env_builder.pageserver_config_override = f"tenant_config={tenant_config}"
name = "test_tenant_tasks"
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
def get_state(tenant):
all_states = client.tenant_list()
matching = [t for t in all_states if t["id"] == tenant.hex]
return get_only_element(matching)["state"]
def get_metric_value(name):
metrics = client.get_metrics()
relevant = [line for line in metrics.splitlines() if line.startswith(name)]
if len(relevant) == 0:
return 0
line = get_only_element(relevant)
value = line.lstrip(name).strip()
return int(value)
def detach_all_timelines(tenant):
timelines = [UUID(t["timeline_id"]) for t in client.timeline_list(tenant)]
for t in timelines:
client.timeline_detach(tenant, t)
def assert_idle(tenant):
assert get_state(tenant) == "Idle"
# Create tenant, start compute
tenant, _ = env.neon_cli.create_tenant()
timeline = env.neon_cli.create_timeline(name, tenant_id=tenant)
pg = env.postgres.create_start(name, tenant_id=tenant)
assert (get_state(tenant) == "Active")
# Stop compute
pg.stop()
# Detach all tenants and wait for them to go idle
# TODO they should be already idle since there are no active computes
for tenant_info in client.tenant_list():
tenant_id = UUID(tenant_info["id"])
detach_all_timelines(tenant_id)
wait_until(10, 0.2, lambda: assert_idle(tenant_id))
# Assert that all tasks finish quickly after tenants go idle
def assert_tasks_finish():
tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}')
tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}')
tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}')
assert tasks_started == tasks_ended
assert tasks_panicked == 0
wait_until(10, 0.2, assert_tasks_finish)

View File

@@ -1,3 +1,4 @@
import pathlib
import pytest
import random
import time
@@ -14,7 +15,7 @@ from dataclasses import dataclass, field
from multiprocessing import Process, Value
from pathlib import Path
from fixtures.neon_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol
from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex
from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex
from fixtures.log_helper import log
from typing import List, Optional, Any
from uuid import uuid4
@@ -645,7 +646,7 @@ class ProposerPostgres(PgProtocol):
def create_dir_config(self, safekeepers: str):
""" Create dir and config for running --sync-safekeepers """
mkdir_if_needed(self.pg_data_dir_path())
pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True)
with open(self.config_file_path(), "w") as f:
cfg = [
"synchronous_standby_names = 'walproposer'\n",
@@ -681,7 +682,7 @@ class ProposerPostgres(PgProtocol):
def initdb(self):
""" Run initdb """
args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()]
args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path(), "--data-checksums"]
self.pg_bin.run(args)
def start(self):
@@ -828,7 +829,7 @@ class SafekeeperEnv:
self.timeline_id = uuid.uuid4()
self.tenant_id = uuid.uuid4()
mkdir_if_needed(str(self.repo_dir))
self.repo_dir.mkdir(exist_ok=True)
# Create config and a Safekeeper object for each safekeeper
self.safekeepers = []
@@ -847,8 +848,8 @@ class SafekeeperEnv:
http=self.port_distributor.get_port(),
)
safekeeper_dir = os.path.join(self.repo_dir, f"sk{i}")
mkdir_if_needed(safekeeper_dir)
safekeeper_dir = self.repo_dir / f"sk{i}"
safekeeper_dir.mkdir(exist_ok=True)
args = [
self.bin_safekeeper,
@@ -857,7 +858,7 @@ class SafekeeperEnv:
"--listen-http",
f"127.0.0.1:{port.http}",
"-D",
safekeeper_dir,
str(safekeeper_dir),
"--id",
str(i),
"--broker-endpoints",

View File

@@ -1,5 +1,6 @@
import asyncio
import uuid
import asyncpg
import random
import time
@@ -7,7 +8,7 @@ import time
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper
from fixtures.log_helper import getLogger
from fixtures.utils import lsn_from_hex, lsn_to_hex
from typing import List
from typing import List, Optional
log = getLogger('root.safekeeper_async')
@@ -234,3 +235,156 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
# we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments
# are not removed before broadcasted to all safekeepers, with the help of replication slot
asyncio.run(run_restarts_under_load(env, pg, env.safekeepers, period_time=15, iterations=5))
def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]):
pg = Postgres(
env,
tenant_id=env.initial_tenant,
port=env.port_distributor.get_port(),
# In these tests compute has high probability of terminating on its own
# before our stop() due to lost consensus leadership.
check_stop_result=False)
# embed current time in node name
node_name = pgdir_name or f'pg_node_{time.time()}'
return pg.create_start(branch_name=branch,
node_name=node_name,
config_lines=['log_statement=all'])
async def exec_compute_query(env: NeonEnv,
branch: str,
query: str,
pgdir_name: Optional[str] = None):
with postgres_create_start(env, branch=branch, pgdir_name=pgdir_name) as pg:
before_conn = time.time()
conn = await pg.connect_async()
res = await conn.fetch(query)
await conn.close()
after_conn = time.time()
log.info(f'{query} took {after_conn - before_conn}s')
return res
async def run_compute_restarts(env: NeonEnv,
queries=16,
batch_insert=10000,
branch='test_compute_restarts'):
cnt = 0
sum = 0
await exec_compute_query(env, branch, 'CREATE TABLE t (i int)')
for i in range(queries):
if i % 4 == 0:
await exec_compute_query(
env, branch, f'INSERT INTO t SELECT 1 FROM generate_series(1, {batch_insert})')
sum += batch_insert
cnt += batch_insert
elif (i % 4 == 1) or (i % 4 == 3):
# Note that select causes lots of FPI's and increases probability of safekeepers
# standing at different LSNs after compute termination.
actual_sum = (await exec_compute_query(env, branch, 'SELECT SUM(i) FROM t'))[0][0]
assert actual_sum == sum, f'Expected sum={sum}, actual={actual_sum}'
elif i % 4 == 2:
await exec_compute_query(env, branch, 'UPDATE t SET i = i + 1')
sum += cnt
# Add a test which creates compute for every query, and then destroys it right after.
def test_compute_restarts(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
env.neon_cli.create_branch('test_compute_restarts')
asyncio.run(run_compute_restarts(env))
class BackgroundCompute(object):
def __init__(self, index: int, env: NeonEnv, branch: str):
self.index = index
self.env = env
self.branch = branch
self.running = False
self.stopped = False
self.total_tries = 0
self.successful_queries: List[int] = []
async def run(self):
if self.running:
raise Exception('BackgroundCompute is already running')
self.running = True
i = 0
while not self.stopped:
try:
verify_key = (self.index << 16) + i
i += 1
self.total_tries += 1
res = await exec_compute_query(
self.env,
self.branch,
f'INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key',
pgdir_name=f'bgcompute{self.index}_key{verify_key}',
)
log.info(f'result: {res}')
if len(res) != 1:
raise Exception('No result returned')
if res[0][0] != verify_key:
raise Exception('Wrong result returned')
self.successful_queries.append(verify_key)
except Exception as e:
log.info(f'BackgroundCompute {self.index} query failed: {e}')
# With less sleep, there is a very big chance of not committing
# anything or only 1 xact during test run.
await asyncio.sleep(2 * random.random())
self.running = False
async def run_concurrent_computes(env: NeonEnv,
num_computes=10,
run_seconds=20,
branch='test_concurrent_computes'):
await exec_compute_query(
env,
branch,
'CREATE TABLE query_log (t timestamp default now(), index int, verify_key int)')
computes = [BackgroundCompute(i, env, branch) for i in range(num_computes)]
background_tasks = [asyncio.create_task(compute.run()) for compute in computes]
await asyncio.sleep(run_seconds)
for compute in computes[1:]:
compute.stopped = True
log.info("stopped all tasks but one")
# work for some time with only one compute -- it should be able to make some xacts
await asyncio.sleep(8)
computes[0].stopped = True
await asyncio.gather(*background_tasks)
result = await exec_compute_query(env, branch, 'SELECT * FROM query_log')
# we should have inserted something while single compute was running
assert len(result) >= 4
log.info(f'Executed {len(result)} queries')
for row in result:
log.info(f'{row[0]} {row[1]} {row[2]}')
# ensure everything reported as committed wasn't lost
for compute in computes:
for verify_key in compute.successful_queries:
assert verify_key in [row[2] for row in result]
# Run multiple computes concurrently, creating-destroying them after single
# query. Ensure we don't lose any xacts reported as committed and be able to
# progress once only one compute remains.
def test_concurrent_computes(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
env.neon_cli.create_branch('test_concurrent_computes')
asyncio.run(run_concurrent_computes(env))

View File

@@ -1,19 +1,17 @@
import os
import subprocess
from pathlib import Path
from fixtures.neon_fixtures import (NeonEnvBuilder,
VanillaPostgres,
PortDistributor,
PgBin,
base_dir,
vanilla_pg,
pg_distrib_dir)
from fixtures.log_helper import log
def test_wal_restore(neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
test_output_dir,
test_output_dir: Path,
port_distributor: PortDistributor):
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_wal_restore")
@@ -22,13 +20,13 @@ def test_wal_restore(neon_env_builder: NeonEnvBuilder,
tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
env.neon_cli.pageserver_stop()
port = port_distributor.get_port()
data_dir = os.path.join(test_output_dir, 'pgsql.restored')
data_dir = test_output_dir / 'pgsql.restored'
with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored:
pg_bin.run_capture([
os.path.join(base_dir, 'libs/utils/scripts/restore_from_wal.sh'),
os.path.join(pg_distrib_dir, 'bin'),
os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)),
data_dir,
str(test_output_dir / 'repo' / 'safekeepers' / 'sk1' / str(tenant_id) / '*'),
str(data_dir),
str(port)
])
restored.start()

View File

@@ -1,13 +1,13 @@
import os
from pathlib import Path
import pytest
from fixtures.utils import mkdir_if_needed
from fixtures.neon_fixtures import NeonEnv, base_dir, pg_distrib_dir
# The isolation tests run for a long time, especially in debug mode,
# so use a larger-than-default timeout.
@pytest.mark.timeout(1800)
def test_isolation(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys):
def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys):
env = neon_simple_env
env.neon_cli.create_branch("test_isolation", "empty")
@@ -17,9 +17,8 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys):
pg.safe_psql('CREATE DATABASE isolation_regression')
# Create some local directories for pg_isolation_regress to run in.
runpath = os.path.join(test_output_dir, 'regress')
mkdir_if_needed(runpath)
mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
runpath = test_output_dir / 'regress'
(runpath / 'testtablespace').mkdir(parents=True)
# Compute all the file locations that pg_isolation_regress will need.
build_path = os.path.join(pg_distrib_dir, 'build/src/test/isolation')

View File

@@ -1,6 +1,6 @@
import os
from pathlib import Path
from fixtures.utils import mkdir_if_needed
from fixtures.neon_fixtures import (NeonEnv,
check_restored_datadir_content,
base_dir,
@@ -8,7 +8,7 @@ from fixtures.neon_fixtures import (NeonEnv,
from fixtures.log_helper import log
def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys):
def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys):
env = neon_simple_env
env.neon_cli.create_branch("test_neon_regress", "empty")
@@ -17,9 +17,8 @@ def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys)
pg.safe_psql('CREATE DATABASE regression')
# Create some local directories for pg_regress to run in.
runpath = os.path.join(test_output_dir, 'regress')
mkdir_if_needed(runpath)
mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
runpath = test_output_dir / 'regress'
(runpath / 'testtablespace').mkdir(parents=True)
# Compute all the file locations that pg_regress will need.
# This test runs neon specific tests

View File

@@ -1,13 +1,13 @@
import os
import pathlib
import pytest
from fixtures.utils import mkdir_if_needed
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content, base_dir, pg_distrib_dir
# The pg_regress tests run for a long time, especially in debug mode,
# so use a larger-than-default timeout.
@pytest.mark.timeout(1800)
def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, capsys):
def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_bin, capsys):
env = neon_simple_env
env.neon_cli.create_branch("test_pg_regress", "empty")
@@ -16,9 +16,8 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, caps
pg.safe_psql('CREATE DATABASE regression')
# Create some local directories for pg_regress to run in.
runpath = os.path.join(test_output_dir, 'regress')
mkdir_if_needed(runpath)
mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
runpath = test_output_dir / 'regress'
(runpath / 'testtablespace').mkdir(parents=True)
# Compute all the file locations that pg_regress will need.
build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress')
@@ -51,7 +50,7 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, caps
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(test_output_dir, env, pg)

View File

@@ -35,12 +35,7 @@ from typing_extensions import Literal
import requests
import backoff # type: ignore
from .utils import (etcd_path,
get_self_dir,
mkdir_if_needed,
subprocess_capture,
lsn_from_hex,
lsn_to_hex)
from .utils import (etcd_path, get_self_dir, subprocess_capture, lsn_from_hex, lsn_to_hex)
from fixtures.log_helper import log
"""
This file contains pytest fixtures. A fixture is a test resource that can be
@@ -50,7 +45,7 @@ A fixture is created with the decorator @pytest.fixture decorator.
See docs: https://docs.pytest.org/en/6.2.x/fixture.html
There are several environment variables that can control the running of tests:
ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.
NEON_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.
There's no need to import this file to use it. It should be declared as a plugin
inside conftest.py, and that makes it available to all tests.
@@ -127,7 +122,7 @@ def pytest_configure(config):
top_output_dir = env_test_output
else:
top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
mkdir_if_needed(top_output_dir)
pathlib.Path(top_output_dir).mkdir(exist_ok=True)
# Find the postgres installation.
global pg_distrib_dir
@@ -151,7 +146,7 @@ def pytest_configure(config):
return
# Find the neon binaries.
global neon_binpath
env_neon_bin = os.environ.get('ZENITH_BIN')
env_neon_bin = os.environ.get('NEON_BIN')
if env_neon_bin:
neon_binpath = env_neon_bin
else:
@@ -1165,6 +1160,7 @@ class NeonCli:
node_name: str,
tenant_id: Optional[uuid.UUID] = None,
destroy=False,
check_return_code=True,
) -> 'subprocess.CompletedProcess[str]':
args = [
'pg',
@@ -1177,7 +1173,7 @@ class NeonCli:
if node_name is not None:
args.append(node_name)
return self.raw_cli(args)
return self.raw_cli(args, check_return_code=check_return_code)
def raw_cli(self,
arguments: List[str],
@@ -1193,6 +1189,8 @@ class NeonCli:
>>> result = env.neon_cli.raw_cli(...)
>>> assert result.stderr == ""
>>> log.info(result.stdout)
If `check_return_code`, on non-zero exit code logs failure and raises.
"""
assert type(arguments) == list
@@ -1218,27 +1216,27 @@ class NeonCli:
env_vars[var] = val
# Intercept CalledProcessError and print more info
try:
res = subprocess.run(args,
env=env_vars,
check=True,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
res = subprocess.run(args,
env=env_vars,
check=False,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
if not res.returncode:
log.info(f"Run success: {res.stdout}")
except subprocess.CalledProcessError as exc:
elif check_return_code:
# this way command output will be in recorded and shown in CI in failure message
msg = f"""\
Run failed: {exc}
stdout: {exc.stdout}
stderr: {exc.stderr}
Run {res.args} failed:
stdout: {res.stdout}
stderr: {res.stderr}
"""
log.info(msg)
raise Exception(msg) from subprocess.CalledProcessError(res.returncode,
res.args,
res.stdout,
res.stderr)
raise Exception(msg) from exc
if check_return_code:
res.check_returncode()
return res
@@ -1316,7 +1314,7 @@ def append_pageserver_param_overrides(
class PgBin:
""" A helper class for executing postgres binaries """
def __init__(self, log_dir: str):
def __init__(self, log_dir: Path):
self.log_dir = log_dir
self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin')
self.env = os.environ.copy()
@@ -1367,22 +1365,27 @@ class PgBin:
self._fixpath(command)
log.info('Running command "{}"'.format(' '.join(command)))
env = self._build_env(env)
return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs)
return subprocess_capture(str(self.log_dir),
command,
env=env,
cwd=cwd,
check=True,
**kwargs)
@pytest.fixture(scope='function')
def pg_bin(test_output_dir: str) -> PgBin:
def pg_bin(test_output_dir: Path) -> PgBin:
return PgBin(test_output_dir)
class VanillaPostgres(PgProtocol):
def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int, init=True):
def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
super().__init__(host='localhost', port=port, dbname='postgres')
self.pgdatadir = pgdatadir
self.pg_bin = pg_bin
self.running = False
if init:
self.pg_bin.run_capture(['initdb', '-D', pgdatadir])
self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)])
self.configure([f"port = {port}\n"])
def configure(self, options: List[str]):
@@ -1398,12 +1401,13 @@ class VanillaPostgres(PgProtocol):
if log_path is None:
log_path = os.path.join(self.pgdatadir, "pg.log")
self.pg_bin.run_capture(['pg_ctl', '-w', '-D', self.pgdatadir, '-l', log_path, 'start'])
self.pg_bin.run_capture(
['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start'])
def stop(self):
assert self.running
self.running = False
self.pg_bin.run_capture(['pg_ctl', '-w', '-D', self.pgdatadir, 'stop'])
self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop'])
def get_subdir_size(self, subdir) -> int:
"""Return size of pgdatadir subdirectory in bytes."""
@@ -1418,9 +1422,9 @@ class VanillaPostgres(PgProtocol):
@pytest.fixture(scope='function')
def vanilla_pg(test_output_dir: str,
def vanilla_pg(test_output_dir: Path,
port_distributor: PortDistributor) -> Iterator[VanillaPostgres]:
pgdatadir = os.path.join(test_output_dir, "pgdata-vanilla")
pgdatadir = test_output_dir / "pgdata-vanilla"
pg_bin = PgBin(test_output_dir)
port = port_distributor.get_port()
with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg:
@@ -1457,7 +1461,7 @@ class RemotePostgres(PgProtocol):
@pytest.fixture(scope='function')
def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]:
def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]:
pg_bin = PgBin(test_output_dir)
connstr = os.getenv("BENCHMARK_CONNSTR")
@@ -1525,7 +1529,11 @@ def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]:
class Postgres(PgProtocol):
""" An object representing a running postgres daemon. """
def __init__(self, env: NeonEnv, tenant_id: uuid.UUID, port: int):
def __init__(self,
env: NeonEnv,
tenant_id: uuid.UUID,
port: int,
check_stop_result: bool = True):
super().__init__(host='localhost', port=port, user='cloud_admin', dbname='postgres')
self.env = env
self.running = False
@@ -1533,6 +1541,7 @@ class Postgres(PgProtocol):
self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA
self.tenant_id = tenant_id
self.port = port
self.check_stop_result = check_stop_result
# path to conf is <repo_dir>/pgdatadirs/tenants/<tenant_id>/<node_name>/postgresql.conf
def create(
@@ -1584,8 +1593,6 @@ class Postgres(PgProtocol):
port=self.port)
self.running = True
log.info(f"stdout: {run_result.stdout}")
return self
def pg_data_dir_path(self) -> str:
@@ -1649,7 +1656,9 @@ class Postgres(PgProtocol):
if self.running:
assert self.node_name is not None
self.env.neon_cli.pg_stop(self.node_name, self.tenant_id)
self.env.neon_cli.pg_stop(self.node_name,
self.tenant_id,
check_return_code=self.check_stop_result)
self.running = False
return self
@@ -1661,7 +1670,10 @@ class Postgres(PgProtocol):
"""
assert self.node_name is not None
self.env.neon_cli.pg_stop(self.node_name, self.tenant_id, True)
self.env.neon_cli.pg_stop(self.node_name,
self.tenant_id,
True,
check_return_code=self.check_stop_result)
self.node_name = None
self.running = False
@@ -1680,6 +1692,8 @@ class Postgres(PgProtocol):
Returns self.
"""
started_at = time.time()
self.create(
branch_name=branch_name,
node_name=node_name,
@@ -1687,6 +1701,8 @@ class Postgres(PgProtocol):
lsn=lsn,
).start()
log.info(f"Postgres startup took {time.time() - started_at} seconds")
return self
def __enter__(self):
@@ -1924,9 +1940,12 @@ class Etcd:
datadir: str
port: int
peer_port: int
binary_path: Path = etcd_path()
binary_path: Path = field(init=False)
handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon
def __post_init__(self):
self.binary_path = etcd_path()
def client_url(self):
return f'http://127.0.0.1:{self.port}'
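The binary path is now resolved when an instance is constructed rather than at class-definition time. A simplified sketch of that dataclass pattern, using a generic Daemon class instead of the real Etcd one:
    from dataclasses import dataclass, field
    from pathlib import Path
    import shutil

    @dataclass
    class Daemon:
        datadir: str
        binary_path: Path = field(init=False)   # not a constructor argument

        def __post_init__(self):
            # looked up per instance, not at import/class-definition time
            self.binary_path = Path(shutil.which("etcd") or "etcd")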
@@ -1980,11 +1999,13 @@ class Etcd:
self.handle.wait()
def get_test_output_dir(request: Any) -> str:
def get_test_output_dir(request: Any) -> pathlib.Path:
""" Compute the working directory for an individual test. """
test_name = request.node.name
test_dir = os.path.join(str(top_output_dir), test_name)
test_dir = pathlib.Path(top_output_dir) / test_name.replace("/", "-")
log.info(f'get_test_output_dir is {test_dir}')
# make mypy happy
assert isinstance(test_dir, pathlib.Path)
return test_dir
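A hypothetical illustration of the replace("/", "-") sanitization (the test id below is invented): a parametrized id containing '/' now maps to a single flat directory instead of a nested path.
    import pathlib
    name = "test_remote[branch/main]"
    print(pathlib.Path("/tmp/test_output") / name.replace("/", "-"))
    # /tmp/test_output/test_remote[branch-main]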
@@ -1998,14 +2019,14 @@ def get_test_output_dir(request: Any) -> str:
# this fixture ensures that the directory exists. That works because
# 'autouse' fixtures are run before other fixtures.
@pytest.fixture(scope='function', autouse=True)
def test_output_dir(request: Any) -> str:
def test_output_dir(request: Any) -> pathlib.Path:
""" Create the working directory for an individual test. """
# one directory per test
test_dir = get_test_output_dir(request)
log.info(f'test_output_dir is {test_dir}')
shutil.rmtree(test_dir, ignore_errors=True)
mkdir_if_needed(test_dir)
test_dir.mkdir()
return test_dir
@@ -2051,7 +2072,7 @@ def should_skip_file(filename: str) -> bool:
#
# Test helpers
#
def list_files_to_compare(pgdata_dir: str):
def list_files_to_compare(pgdata_dir: pathlib.Path):
pgdata_files = []
for root, _file, filenames in os.walk(pgdata_dir):
for filename in filenames:
@@ -2068,7 +2089,7 @@ def list_files_to_compare(pgdata_dir: str):
# pg is the existing and running compute node, that we want to compare with a basebackup
def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postgres):
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres):
# Get the timeline ID. We need it for the 'basebackup' command
with closing(pg.connect()) as conn:
@@ -2080,8 +2101,8 @@ def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postg
pg.stop()
# Take a basebackup from pageserver
restored_dir_path = os.path.join(env.repo_dir, f"{pg.node_name}_restored_datadir")
mkdir_if_needed(restored_dir_path)
restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir"
restored_dir_path.mkdir(exist_ok=True)
pg_bin = PgBin(test_output_dir)
psql_path = os.path.join(pg_bin.pg_bin_path, 'psql')
@@ -2108,7 +2129,7 @@ def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postg
# list files we're going to compare
assert pg.pgdata_dir
pgdata_files = list_files_to_compare(pg.pgdata_dir)
pgdata_files = list_files_to_compare(pathlib.Path(pg.pgdata_dir))
restored_files = list_files_to_compare(restored_dir_path)
# check that file sets are equal
@@ -2140,7 +2161,7 @@ def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postg
assert (mismatch, error) == ([], [])
def wait_until(number_of_iterations: int, interval: int, func):
def wait_until(number_of_iterations: int, interval: float, func):
"""
Wait until 'func' returns successfully, without exception. Returns the last return value
from the function.
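A minimal sketch of a loop satisfying that contract, assuming a retry-with-sleep shape rather than the exact implementation in this diff:
    import time

    def wait_until(number_of_iterations: int, interval: float, func):
        last_exc = None
        for _ in range(number_of_iterations):
            try:
                return func()          # last (successful) return value
            except Exception as e:
                last_exc = e
                time.sleep(interval)   # float interval allows sub-second polling
        raise Exception(f"timed out waiting for {func}") from last_exc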

View File

@@ -12,18 +12,6 @@ def get_self_dir() -> str:
return os.path.dirname(os.path.abspath(__file__))
def mkdir_if_needed(path: str) -> None:
""" Create a directory if it doesn't already exist
Note this won't try to create intermediate directories.
"""
try:
os.mkdir(path)
except FileExistsError:
pass
assert os.path.isdir(path)
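The helper goes away because pathlib covers the same behaviour; a hedged one-liner mirroring the "no intermediate directories" note:
    from pathlib import Path
    Path(path).mkdir(parents=False, exist_ok=True)   # no-op if it exists, raises if the parent is missing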
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
""" Run a process and capture its output

View File

@@ -80,6 +80,7 @@ def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_it
thread.join()
@pytest.mark.timeout(1000)
@pytest.mark.parametrize("n_tables", [5])
@pytest.mark.parametrize("scale", get_scales_matrix(5))
@pytest.mark.parametrize("num_iters", [10])
@@ -121,6 +122,7 @@ def start_pgbench_simple_update_workload(env: PgCompare, duration: int):
env.flush()
@pytest.mark.timeout(1000)
@pytest.mark.parametrize("scale", get_scales_matrix(100))
@pytest.mark.parametrize("duration", get_durations_matrix())
def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, duration: int):
@@ -158,6 +160,7 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int):
])
@pytest.mark.timeout(1000)
@pytest.mark.parametrize("scale", get_scales_matrix(1000))
def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int):
env = pg_compare

View File

@@ -0,0 +1,2 @@
bin/
obj/

View File

@@ -0,0 +1,2 @@
bin/
obj/

View File

@@ -0,0 +1,14 @@
FROM mcr.microsoft.com/dotnet/sdk:6.0 AS build
WORKDIR /source
COPY *.csproj .
RUN dotnet restore
COPY . .
RUN dotnet publish -c release -o /app --no-restore
FROM mcr.microsoft.com/dotnet/runtime:6.0
WORKDIR /app
COPY --from=build /app .
ENTRYPOINT ["dotnet", "csharp-npgsql.dll"]

View File

@@ -0,0 +1,19 @@
using Npgsql;
var host = Environment.GetEnvironmentVariable("NEON_HOST");
var database = Environment.GetEnvironmentVariable("NEON_DATABASE");
var user = Environment.GetEnvironmentVariable("NEON_USER");
var password = Environment.GetEnvironmentVariable("NEON_PASSWORD");
var connString = $"Host={host};Username={user};Password={password};Database={database}";
await using var conn = new NpgsqlConnection(connString);
await conn.OpenAsync();
await using (var cmd = new NpgsqlCommand("SELECT 1", conn))
await using (var reader = await cmd.ExecuteReaderAsync())
{
while (await reader.ReadAsync())
Console.WriteLine(reader.GetInt32(0));
}
await conn.CloseAsync();

View File

@@ -0,0 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Npgsql" Version="6.0.5" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,10 @@
FROM openjdk:17
WORKDIR /source
COPY . .
WORKDIR /app
RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.4.0.jar && \
javac -d /app /source/Example.java
CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"]

View File

@@ -0,0 +1,31 @@
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Properties;
public class Example
{
public static void main( String[] args ) throws Exception
{
String host = System.getenv("NEON_HOST");
String database = System.getenv("NEON_DATABASE");
String user = System.getenv("NEON_USER");
String password = System.getenv("NEON_PASSWORD");
String url = "jdbc:postgresql://%s/%s".formatted(host, database);
Properties props = new Properties();
props.setProperty("user", user);
props.setProperty("password", password);
Connection conn = DriverManager.getConnection(url, props);
Statement st = conn.createStatement();
ResultSet rs = st.executeQuery("SELECT 1");
while (rs.next())
{
System.out.println(rs.getString(1));
}
rs.close();
st.close();
}
}

View File

@@ -0,0 +1,8 @@
FROM python:3.10
WORKDIR /source
COPY . .
RUN python3 -m pip install --no-cache-dir -r requirements.txt
CMD ["python3", "asyncpg_example.py"]

View File

@@ -0,0 +1,30 @@
#! /usr/bin/env python3
import asyncio
import os
import asyncpg
async def run(**kwargs) -> asyncpg.Record:
conn = await asyncpg.connect(
**kwargs,
statement_cache_size=0,  # Prepared statements don't work with pgbouncer
)
rv = await conn.fetchrow("SELECT 1")
await conn.close()
return rv
if __name__ == "__main__":
kwargs = {
k.removeprefix("NEON_").lower(): v  # removeprefix, not lstrip: lstrip treats "NEON_" as a character set
for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD")
if (v := os.environ.get(k, None)) is not None
}
loop = asyncio.new_event_loop()
row = loop.run_until_complete(run(**kwargs))
print(row[0])

Some files were not shown because too many files have changed in this diff.