revert fix for #707

update PITR
update test_branch_and_gc test
2026-06-03 21:40:39 +00:00 · 2022-07-05 09:53:01 -04:00 · 2022-07-05 09:51:18 -04:00 · 2022-07-04 19:41:09 -04:00 · 2022-07-04 15:26:43 -04:00 · 2022-07-04 15:25:01 -04:00
49 changed files with 848 additions and 461 deletions
--- a/.circleci/ansible/ansible.cfg
+++ b/.circleci/ansible/ansible.cfg
@@ -6,5 +6,7 @@ timeout = 30

 [ssh_connection]
 ssh_args   = -F ./ansible.ssh.cfg
-scp_if_ssh = True
+# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
+# and scp did not work for me either
+transfer_method = piped
 pipelining = True
--- a/.circleci/ansible/ansible.ssh.cfg
+++ b/.circleci/ansible/ansible.ssh.cfg
@@ -1,3 +1,7 @@
+# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
+# (use pre 8.5 option name to cope with old ssh in CI)
+PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
+
 Host tele.zenith.tech
    User admin
    Port 3023
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -12,6 +12,7 @@ pageservers
 safekeepers

 [storage:vars]
+env_name = prod-1
 console_mgmt_base_url = http://console-release.local
 bucket_name           = zenith-storage-oregon
 bucket_region         = us-west-2
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -13,6 +13,7 @@ pageservers
 safekeepers

 [storage:vars]
+env_name = us-stage
 console_mgmt_base_url = http://console-staging.local
 bucket_name           = zenith-staging-storage-us-east-1
 bucket_region         = us-east-1
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=safekeeper
 Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -100,10 +100,8 @@ jobs:
          name: Rust build << parameters.build_type >>
          command: |
            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
              CARGO_FLAGS=
            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
              CARGO_FLAGS="--release --features profiling"
            fi

@@ -112,7 +110,7 @@ jobs:
            export RUSTC_WRAPPER=cachepot
            export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
            export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
-            "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
+            mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
            cachepot -s

      - save_cache:
@@ -128,32 +126,24 @@ jobs:
          name: cargo test
          command: |
            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
              CARGO_FLAGS=
            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
              CARGO_FLAGS=--release
            fi

-            "${cov_prefix[@]}" cargo test $CARGO_FLAGS
+            cargo test $CARGO_FLAGS

        # Install the rust binaries, for use by test jobs
      - run:
          name: Install rust binaries
          command: |
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-            fi
-
            binaries=$(
-              "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
+              cargo metadata --format-version=1 --no-deps |
              jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
            )

            test_exe_paths=$(
-              "${cov_prefix[@]}" cargo test --message-format=json --no-run |
+              cargo test --message-format=json --no-run |
              jq -r '.executable | select(. != null)'
            )

@@ -166,34 +156,15 @@ jobs:
              SRC=target/$BUILD_TYPE/$bin
              DST=/tmp/zenith/bin/$bin
              cp $SRC $DST
-              echo $DST >> /tmp/zenith/etc/binaries.list
            done

-            # Install test executables (for code coverage)
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              for bin in $test_exe_paths; do
-                SRC=$bin
-                DST=/tmp/zenith/test_bin/$(basename $bin)
-                cp $SRC $DST
-                echo $DST >> /tmp/zenith/etc/binaries.list
-              done
-            fi
-
        # Install the postgres binaries, for use by test jobs
      - run:
          name: Install postgres binaries
          command: |
            cp -a tmp_install /tmp/zenith/pg_install

-      - run:
-          name: Merge coverage data
-          command: |
-            # This will speed up workspace uploads
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
-            fi
-
-        # Save the rust binaries and coverage data for other jobs in this workflow.
+      # Save rust binaries for other jobs in the workflow
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
@@ -286,7 +257,7 @@ jobs:
          # no_output_timeout, specified here.
          no_output_timeout: 10m
          environment:
-            - ZENITH_BIN: /tmp/zenith/bin
+            - NEON_BIN: /tmp/zenith/bin
            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
            - TEST_OUTPUT: /tmp/test_output
            # this variable will be embedded in perf test report
@@ -314,12 +285,6 @@ jobs:

            export GITHUB_SHA=$CIRCLE_SHA1

-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-            fi
-
            # Run the tests.
            #
            # The junit.xml file allows CircleCI to display more fine-grained test information
@@ -330,7 +295,7 @@ jobs:
            # -n4 uses four processes to run tests via pytest-xdist
            # -s is not used to prevent pytest from capturing output, because tests are running
            # in parallel and logs are mixed between different tests
-            "${cov_prefix[@]}" ./scripts/pytest \
+            ./scripts/pytest \
              --junitxml=$TEST_OUTPUT/junit.xml \
              --tb=short \
              --verbose \
@@ -359,67 +324,12 @@ jobs:
      # The store_test_results step tells CircleCI where to find the junit.xml file.
      - store_test_results:
          path: /tmp/test_output
-      - run:
-          name: Merge coverage data
-          command: |
-            # This will speed up workspace uploads
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
-            fi
-      # Save coverage data (if any)
+      # Save data (if any)
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
            - "*"

-  coverage-report:
-    executor: neon-xlarge-executor
-    steps:
-      - attach_workspace:
-          at: /tmp/zenith
-      - checkout
-      - restore_cache:
-          name: Restore rust cache
-          keys:
-            # Require an exact match. While an out of date cache might speed up the build,
-            # there's no way to clean out old packages, so the cache grows every time something
-            # changes.
-            - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
-      - run:
-          name: Build coverage report
-          command: |
-            COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
-
-            scripts/coverage \
-              --dir=/tmp/zenith/coverage report \
-              --input-objects=/tmp/zenith/etc/binaries.list \
-              --commit-url=$COMMIT_URL \
-              --format=github
-      - run:
-          name: Upload coverage report
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-            REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
-            COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
-
-            scripts/git-upload \
-              --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
-              --message="Add code coverage for $COMMIT_URL" \
-              copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
-
-            # Add link to the coverage report to the commit
-            curl -f -X POST \
-            https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"state\": \"success\",
-                \"context\": \"zenith-coverage\",
-                \"description\": \"Coverage report is ready\",
-                \"target_url\": \"$REPORT_URL\"
-              }"
-
  # Build neondatabase/neon:latest image and push it to Docker hub
  docker-image:
    docker:
@@ -688,50 +598,6 @@ jobs:
            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait

-  # Trigger a new remote CI job
-  remote-ci-trigger:
-    docker:
-      - image: cimg/base:2021.04
-    parameters:
-      remote_repo:
-        type: string
-    environment:
-      REMOTE_REPO: << parameters.remote_repo >>
-    steps:
-      - run:
-          name: Set PR's status to pending
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-
-            curl -f -X POST \
-            https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"state\": \"pending\",
-                \"context\": \"neon-cloud-e2e\",
-                \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-              }"
-      - run:
-          name: Request a remote CI test
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-
-            curl -f -X POST \
-            https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"ref\": \"main\",
-                \"inputs\": {
-                  \"ci_job_name\": \"neon-cloud-e2e\",
-                  \"commit_hash\": \"$CIRCLE_SHA1\",
-                  \"remote_repo\": \"$LOCAL_REPO\"
-                }
-              }"
-
 workflows:
  build_and_test:
    jobs:
@@ -774,12 +640,6 @@ workflows:
          save_perf_report: true
          requires:
            - build-neon-release
-      - coverage-report:
-          # Context passes credentials for gh api
-          context: CI_ACCESS_TOKEN
-          requires:
-            # TODO: consider adding more
-            - other-tests-debug
      - docker-image:
          # Context gives an ability to login
          context: Docker Hub
@@ -880,14 +740,3 @@ workflows:
                - release
          requires:
            - docker-image-release
-      - remote-ci-trigger:
-          # Context passes credentials for gh api
-          context: CI_ACCESS_TOKEN
-          remote_repo: "neondatabase/cloud"
-          requires:
-            # XXX: Successful build doesn't mean everything is OK, but
-            # the job to be triggered takes so much time to complete (~22 min)
-            # that it's better not to wait for the commented-out steps
-            - build-neon-release
-            # - pg_regress-tests-release
-            # - other-tests-release
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -2,25 +2,29 @@ name: 'Run python test'
 description: 'Runs a Neon python test set, performing all the required preparations before'

 inputs:
-  # Select the type of Rust build. Must be "release" or "debug".
  build_type:
+    description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
    required: true
  rust_toolchain:
+    description: 'Rust toolchain version to fetch the caches'
    required: true
-  # This parameter is required, to prevent the mistake of running all tests in one job.
  test_selection:
+    description: 'A python test suite to run'
    required: true
-  # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
  extra_params:
+    description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
    required: false
    default: ''
  needs_postgres_source:
+    description: 'Set to true if the test suite requires postgres source checked out'
    required: false
    default: 'false'
  run_in_parallel:
+    description: 'Whether to run tests in parallel'
    required: false
    default: 'true'
  save_perf_report:
+    description: 'Whether to upload the performance report'
    required: false
    default: 'false'

@@ -60,7 +64,7 @@ runs:

    - name: Run pytest
      env:
-        ZENITH_BIN: /tmp/neon/bin
+        NEON_BIN: /tmp/neon/bin
        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
        TEST_OUTPUT: /tmp/test_output
        # this variable will be embedded in perf test report
@@ -88,7 +92,7 @@ runs:
        fi

        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
-          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
+          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
          cov_prefix=()
        fi
@@ -117,3 +121,20 @@ runs:
            scripts/generate_and_push_perf_report.sh
          fi
        fi
+
+    - name: Delete all data but logs
+      shell: bash -ex {0}
+      if: always()
+      run: |
+        du -sh /tmp/test_output/*
+        find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
+        du -sh /tmp/test_output/*
+
+    - name: Upload python test logs
+      if: always()
+      uses: actions/upload-artifact@v3
+      with:
+        retention-days: 7
+        if-no-files-found: error
+        name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
+        path: /tmp/test_output/
--- a/.github/actions/save-coverage-data/action.yml
+++ b/.github/actions/save-coverage-data/action.yml
@@ -0,0 +1,17 @@
+name: 'Merge and upload coverage data'
+description: 'Compresses and uploads the coverage data as an artifact'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Merge coverage data
+      shell: bash -ex {0}
+      run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+
+    - name: Upload coverage data
+      uses: actions/upload-artifact@v3
+      with:
+        retention-days: 7
+        if-no-files-found: error
+        name: coverage-data-artifact
+        path: /tmp/coverage/
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1,13 +1,28 @@
-name: build_and_test
-on: [ push ]
+name: Test
+
+on:
+  push:
+    branches:
+    - main
+  pull_request:
+
 defaults:
  run:
    shell: bash -ex {0}

+concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
 jobs:
  build-postgres:
    runs-on: [ self-hosted, Linux, k8s-runner ]
    strategy:
+      fail-fast: false
      matrix:
        build_type: [ debug, release ]
        rust_toolchain: [ 1.58 ]
@@ -34,7 +49,7 @@ jobs:

      - name: Build postgres
        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: COPT='-Werror' mold -run make postgres -j$(nproc)
+        run: mold -run make postgres -j$(nproc)

      # actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache
      - name: Prepare postgres artifact
@@ -52,6 +67,7 @@ jobs:
    runs-on: [ self-hosted, Linux, k8s-runner ]
    needs: [ build-postgres ]
    strategy:
+      fail-fast: false
      matrix:
        build_type: [ debug, release ]
        rust_toolchain: [ 1.58 ]
@@ -85,44 +101,39 @@ jobs:
            ~/.cargo/registry/
            ~/.cargo/git/
            target/
-          key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+          key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+          # Fall back to the newest cache for this OS/build type/toolchain when there is no exact match for the current Cargo.lock
+          restore-keys: |
+            v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-

      - name: Run cargo build
        run: |
          if [[ $BUILD_TYPE == "debug" ]]; then
-            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
            CARGO_FLAGS=
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=()
            CARGO_FLAGS="--release --features profiling"
          fi

-          export CACHEPOT_BUCKET=zenith-rust-cachepot
-          export RUSTC_WRAPPER=cachepot
-          export AWS_ACCESS_KEY_ID="${{ secrets.AWS_ACCESS_KEY_ID }}"
-          export AWS_SECRET_ACCESS_KEY="${{ secrets.AWS_SECRET_ACCESS_KEY }}"
-          export HOME=/home/runner
          "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
-          cachepot -s

      - name: Run cargo test
        run: |
-          export HOME=/home/runner
          if [[ $BUILD_TYPE == "debug" ]]; then
-            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
            CARGO_FLAGS=
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=()
            CARGO_FLAGS=--release
          fi
-          
+
          "${cov_prefix[@]}" cargo test $CARGO_FLAGS

      - name: Install rust binaries
        run: |
-          export HOME=/home/runner
          if [[ $BUILD_TYPE == "debug" ]]; then
-            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=()
          fi
@@ -137,39 +148,36 @@ jobs:
            jq -r '.executable | select(. != null)'
          )

-          mkdir -p /tmp/neon/bin
-          mkdir -p /tmp/neon/test_bin
-          mkdir -p /tmp/neon/etc
+          mkdir -p /tmp/neon/bin/
+          mkdir -p /tmp/neon/test_bin/
+          mkdir -p /tmp/neon/etc/
+
+          # Keep bloated coverage data files away from the rest of the artifact
+          mkdir -p /tmp/coverage/

          # Install target binaries
          for bin in $binaries; do
            SRC=target/$BUILD_TYPE/$bin
            DST=/tmp/neon/bin/$bin
-            cp $SRC $DST
-            echo $DST >> /tmp/neon/etc/binaries.list
+            cp "$SRC" "$DST"
          done

-          # Install test executables (for code coverage)
+          # Install test executables and write list of all binaries (for code coverage)
          if [[ $BUILD_TYPE == "debug" ]]; then
+            for bin in $binaries; do
+              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
+            done
            for bin in $test_exe_paths; do
              SRC=$bin
              DST=/tmp/neon/test_bin/$(basename $bin)
-              cp $SRC $DST
-              echo $DST >> /tmp/neon/etc/binaries.list
+              cp "$SRC" "$DST"
+              echo "$DST" >> /tmp/coverage/binaries.list
            done
          fi

      - name: Install postgres binaries
        run: cp -a tmp_install /tmp/neon/pg_install

-      - name: Merge coverage data
-        run: |
-          export HOME=/home/runner
-          # This will speed up workspace uploads
-          if [[ $BUILD_TYPE == "debug" ]]; then
-            scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage merge
-          fi
-
      - name: Prepare neon artifact
        run: tar -C /tmp/neon/ -czf ./neon.tgz .

@@ -181,38 +189,17 @@ jobs:
          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
          path: ./neon.tgz

-  check-codestyle-python:
-    runs-on: [ self-hosted, Linux, k8s-runner ]
-    strategy:
-      matrix:
-        rust_toolchain: [ 1.58 ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 1
+      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data

-      - name: Cache poetry deps
-        id: cache_poetry
-        uses: actions/cache@v3
-        with:
-          path: ~/.cache/pypoetry/virtualenvs
-          key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
-
-      - name: Install Python deps
-        run: ./scripts/pysync
-
-      - name: Run yapf to ensure code format
-        run: poetry run yapf --recursive --diff .
-
-      - name: Run mypy to check types
-        run: poetry run mypy .

  pg_regress-tests:
    runs-on: [ self-hosted, Linux, k8s-runner ]
    needs: [ build-neon ]
    strategy:
+      fail-fast: false
      matrix:
        build_type: [ debug, release ]
        rust_toolchain: [ 1.58 ]
@@ -231,10 +218,15 @@ jobs:
          test_selection: batch_pg_regress
          needs_postgres_source: true

+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
  other-tests:
    runs-on: [ self-hosted, Linux, k8s-runner ]
    needs: [ build-neon ]
    strategy:
+      fail-fast: false
      matrix:
        build_type: [ debug, release ]
        rust_toolchain: [ 1.58 ]
@@ -252,10 +244,15 @@ jobs:
          rust_toolchain: ${{ matrix.rust_toolchain }}
          test_selection: batch_others

+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
  benchmarks:
    runs-on: [ self-hosted, Linux, k8s-runner ]
    needs: [ build-neon ]
    strategy:
+      fail-fast: false
      matrix:
        build_type: [ release ]
        rust_toolchain: [ 1.58 ]
@@ -273,4 +270,120 @@ jobs:
          rust_toolchain: ${{ matrix.rust_toolchain }}
          test_selection: performance
          run_in_parallel: false
-          # save_perf_report: true
+          save_perf_report: true
+      # XXX: no coverage data handling here, since benchmarks are run on release builds,
+      # while coverage is currently collected for the debug ones
+
+  coverage-report:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ other-tests, pg_regress-tests ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Restore cargo deps cache
+        id: cache_cargo
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry/
+            ~/.cargo/git/
+            target/
+          key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+
+      - name: Get Neon artifact for restoration
+        uses: actions/download-artifact@v3
+        with:
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          path: ./neon-artifact/
+
+      - name: Extract Neon artifact
+        run: |
+          mkdir -p /tmp/neon/
+          tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
+          rm -rf ./neon-artifact/
+
+      - name: Restore coverage data
+        uses: actions/download-artifact@v3
+        with:
+          name: coverage-data-artifact
+          path: /tmp/coverage/
+
+      - name: Merge coverage data
+        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+
+      - name: Build and upload coverage report
+        run: |
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
+
+          scripts/coverage \
+            --dir=/tmp/coverage report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --commit-url=$COMMIT_URL \
+            --format=github
+
+          REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
+
+          scripts/git-upload \
+            --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
+            --message="Add code coverage for $COMMIT_URL" \
+            copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
+
+          # Add link to the coverage report to the commit
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"success\",
+              \"context\": \"neon-coverage\",
+              \"description\": \"Coverage report is ready\",
+              \"target_url\": \"$REPORT_URL\"
+            }"
+
+  trigger-e2e-tests:
+   runs-on: [ self-hosted, Linux, k8s-runner ]
+   needs: [ build-neon ]
+   steps:
+     - name: Set PR's status to pending and request a remote CI test
+       run: |
+         COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+         COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+
+         REMOTE_REPO="${{ github.repository_owner }}/cloud"
+
+         curl -f -X POST \
+         https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+         -H "Accept: application/vnd.github.v3+json" \
+         --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+         --data \
+           "{
+             \"state\": \"pending\",
+             \"context\": \"neon-cloud-e2e\",
+             \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+           }"
+
+         curl -f -X POST \
+         https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
+         -H "Accept: application/vnd.github.v3+json" \
+         --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+         --data \
+           "{
+             \"ref\": \"main\",
+             \"inputs\": {
+               \"ci_job_name\": \"neon-cloud-e2e\",
+               \"commit_hash\": \"$COMMIT_SHA\",
+               \"remote_repo\": \"${{ github.repository }}\"
+             }
+           }"
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -1,4 +1,4 @@
-name: Build and Test
+name: Check code style and build

 on:
  push:
@@ -6,15 +6,27 @@ on:
    - main
  pull_request:

+defaults:
+  run:
+    shell: bash -ex {0}
+
+concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+
 jobs:
-  regression-check:
+  check-codestyle-rust:
    strategy:
+      fail-fast: false
      matrix:
        # If we want to duplicate this job for different
        # Rust toolchains (e.g. nightly or 1.37.0), add them here.
        rust_toolchain: [1.58]
        os: [ubuntu-latest, macos-latest]
-    timeout-minutes: 30
+    timeout-minutes: 50
    name: run regression test suite
    runs-on: ${{ matrix.os }}

@@ -92,5 +104,30 @@ jobs:
      - name: Run cargo clippy
        run: ./run_clippy.sh

-      - name: Run cargo test
-        run: cargo test --all --all-targets
+      - name: Ensure all project builds
+        run: cargo build --all --all-targets
+
+  check-codestyle-python:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: false
+          fetch-depth: 1
+
+      - name: Cache poetry deps
+        id: cache_poetry
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pypoetry/virtualenvs
+          key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
+
+      - name: Install Python deps
+        run: ./scripts/pysync
+
+      - name: Run yapf to ensure code format
+        run: poetry run yapf --recursive --diff .
+
+      - name: Run mypy to check types
+        run: poetry run mypy .
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -461,6 +461,7 @@ dependencies = [
 "tar",
 "tokio",
 "tokio-postgres",
+ "url",
 "workspace_hack",
 ]

--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 # Build Postgres
-FROM zimg/rust:1.58 AS pg-build
+FROM neondatabase/rust:1.58 AS pg-build
 WORKDIR /pg

 USER root
@@ -14,7 +14,7 @@ RUN set -e \
    && tar -C tmp_install -czf /postgres_install.tar.gz .

 # Build zenith binaries
-FROM zimg/rust:1.58 AS build
+FROM neondatabase/rust:1.58 AS build
 ARG GIT_VERSION=local

 ARG CACHEPOT_BUCKET=zenith-rust-cachepot
@@ -46,9 +46,9 @@ RUN set -e \
    && useradd -d /data zenith \
    && chown -R zenith:zenith /data

-COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
-COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
-COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy      /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy      /usr/local/bin

 COPY --from=pg-build /pg/tmp_install/         /usr/local/
 COPY --from=pg-build /postgres_install.tar.gz /data/
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,6 +1,6 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .circle/config.yml
-FROM zimg/rust:1.58 AS rust-build
+FROM neondatabase/rust:1.58 AS rust-build

 ARG CACHEPOT_BUCKET=zenith-rust-cachepot
 ARG AWS_ACCESS_KEY_ID
@@ -15,4 +15,4 @@ RUN set -e \
 # Final image that only has one binary
 FROM debian:buster-slim

-COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
+COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf etcd
+brew install protobuf etcd openssl
 ```

 2. [Install Rust](https://www.rust-lang.org/tools/install)
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -18,4 +18,5 @@ serde_json = "1"
 tar = "0.4"
 tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
 tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -33,7 +33,7 @@ use std::process::exit;
 use std::sync::{Arc, RwLock};
 use std::{thread, time::Duration};

-use anyhow::Result;
+use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
 use log::{error, info};
@@ -45,6 +45,7 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::pg_helpers::*;
 use compute_tools::spec::*;
+use url::Url;

 fn main() -> Result<()> {
    // TODO: re-use `utils::logging` later
@@ -131,7 +132,7 @@ fn main() -> Result<()> {

    let compute_state = ComputeNode {
        start_time: Utc::now(),
-        connstr: connstr.to_string(),
+        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
        spec,
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,5 +1,3 @@
-use std::sync::Arc;
-
 use anyhow::{anyhow, Result};
 use log::error;
 use postgres::Client;
@@ -23,9 +21,8 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
    Ok(())
 }

-pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
-    let connstr = &compute.connstr;
-    let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
+pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
+    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
    if client.is_closed() {
        return Err(anyhow!("connection to postgres closed"));
    }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -35,7 +35,8 @@ use crate::spec::*;
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
    pub start_time: DateTime<Utc>,
-    pub connstr: String,
+    // Url type maintains proper escaping
+    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
    pub spec: ComputeSpec,
@@ -268,27 +269,32 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin`name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
-        let mut client = match Client::connect(&self.connstr, NoTls) {
+        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
            Err(e) => {
                info!(
                    "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                    e
                );
-                let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1);
+                let mut zenith_admin_connstr = self.connstr.clone();

-                let mut client = Client::connect(&zenith_admin_connstr, NoTls)?;
+                zenith_admin_connstr
+                    .set_username("zenith_admin")
+                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
+
+                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);

                // reconnect with connsting with expected name
-                Client::connect(&self.connstr, NoTls)?
+                Client::connect(self.connstr.as_str(), NoTls)?
            }
            Ok(client) => client,
        };

        handle_roles(&self.spec, &mut client)?;
        handle_databases(&self.spec, &mut client)?;
+        handle_role_deletions(self, &mut client)?;
        handle_grants(&self.spec, &mut client)?;
        create_writablity_check_data(&mut client)?;

--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
 // XXX: the only expected panic is at `RwLock` unwrap().
-fn watch_compute_activity(compute: &Arc<ComputeNode>) {
+fn watch_compute_activity(compute: &ComputeNode) {
    // Suppose that `connstr` doesn't change
-    let connstr = compute.connstr.clone();
+    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
-    let mut client = Client::connect(&connstr, NoTls);
+    let mut client = Client::connect(connstr, NoTls);
    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);

    info!("watching Postgres activity at {}", connstr);
@@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
                    info!("connection to postgres closed, trying to reconnect");

                    // Connection is closed, reconnect and try again.
-                    client = Client::connect(&connstr, NoTls);
+                    client = Client::connect(connstr, NoTls);
                    continue;
                }

@@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
                debug!("cannot connect to postgres: {}, retrying", e);

                // Establish a new connection and try again.
-                client = Client::connect(&connstr, NoTls);
+                client = Client::connect(connstr, NoTls);
            }
        }
    }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -2,9 +2,10 @@ use std::path::Path;

 use anyhow::Result;
 use log::{info, log_enabled, warn, Level};
-use postgres::Client;
+use postgres::{Client, NoTls};
 use serde::Deserialize;

+use crate::compute::ComputeNode;
 use crate::config;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;
@@ -97,18 +98,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

    // Process delta operations first
    if let Some(ops) = &spec.delta_operations {
-        info!("processing delta operations on roles");
+        info!("processing role renames");
        for op in ops {
            match op.action.as_ref() {
-                // We do not check either role exists or not,
-                // Postgres will take care of it for us
                "delete_role" => {
-                    let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
-
-                    warn!("deleting role '{}'", &op.name);
-                    xact.execute(query.as_str(), &[])?;
+                    // no-op now, roles will be deleted at the end of configuration
                }
-                // Renaming role drops its password, since tole name is
+                // Renaming role drops its password, since role name is
                // used as a salt there.  It is important that this role
                // is recorded with a new `name` in the `roles` list.
                // Follow up roles update will set the new password.
@@ -182,7 +178,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            xact.execute(query.as_str(), &[])?;

            let grant_query = format!(
-                "grant pg_read_all_data, pg_write_all_data to {}",
+                "GRANT pg_read_all_data, pg_write_all_data TO {}",
                name.quote()
            );
            xact.execute(grant_query.as_str(), &[])?;
@@ -197,6 +193,70 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    Ok(())
 }

+/// Reassign all dependent objects and delete requested roles.
+///
+/// Every `delete_role` entry in `spec.delta_operations` is handled in two
+/// passes: objects are reassigned first, then roles are dropped in one
+/// transaction that is committed at the end.
+pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
+    let spec = &node.spec;
+
+    // First, reassign all dependent objects to db owners.
+    if let Some(ops) = &spec.delta_operations {
+        info!("reassigning dependent objects of to-be-deleted roles");
+        for op in ops.iter().filter(|op| op.action == "delete_role") {
+            reassign_owned_objects(node, &op.name)?;
+        }
+    }
+
+    // Second, proceed with role deletions.
+    let mut xact = client.transaction()?;
+    if let Some(ops) = &spec.delta_operations {
+        info!("processing role deletions");
+        // No existence check: `IF EXISTS` lets Postgres handle missing roles.
+        for op in ops.iter().filter(|op| op.action == "delete_role") {
+            let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
+
+            warn!("deleting role '{}'", &op.name);
+            xact.execute(query.as_str(), &[])?;
+        }
+    }
+    // Dropping a `postgres::Transaction` without committing rolls it back.
+    xact.commit()?;
+    Ok(())
+}
+
+// For each database that `role_name` does not own: reassign the role's objects
+// to the database owner, then drop the privileges the role still holds there.
+fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
+    for db in &node.spec.cluster.databases {
+        if db.owner == *role_name {
+            continue;
+        }
+        // The database name is the sole component of the connection URL path.
+        let mut db_url = node.connstr.clone();
+        db_url.set_path(&db.name);
+        let mut db_client = Client::connect(db_url.as_str(), NoTls)?;
+        // Transfer ownership of all dependent objects to the db owner.
+        let reassign_sql = format!(
+            "REASSIGN OWNED BY {} TO {}",
+            role_name.quote(),
+            db.owner.quote()
+        );
+        info!(
+            "reassigning objects owned by '{}' in db '{}' to '{}'",
+            role_name, &db.name, &db.owner
+        );
+        db_client.simple_query(&reassign_sql)?;
+
+        // After the reassignment only the role's privileges are left to drop.
+        let drop_sql = format!("DROP OWNED BY {}", role_name.quote());
+        db_client.simple_query(&drop_sql)?;
+    }
+
+    Ok(())
+}
+
 /// It follows mostly the same logic as `handle_roles()` excepting that we
 /// does not use an explicit transactions block, since major database operations
 /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
@@ -294,13 +354,26 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    info!("cluster spec grants:");

+    // We now have a separate `web_access` role to connect to the database
+    // via the web interface and proxy link auth. And also we grant a
+    // read / write all data privilege to every role. So also grant
+    // create to everyone.
+    // XXX: later we should stop messing with Postgres ACL in such horrible
+    // ways.
+    let roles = spec
+        .cluster
+        .roles
+        .iter()
+        .map(|r| r.name.quote())
+        .collect::<Vec<_>>();
+
    for db in &spec.cluster.databases {
        let dbname = &db.name;

        let query: String = format!(
            "GRANT CREATE ON DATABASE {} TO {}",
            dbname.quote(),
-            db.owner.quote()
+            roles.join(", ")
        );
        info!("grant query {}", &query);

--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -2190,7 +2190,7 @@ impl LayeredTimeline {

        // Calculate pitr cutoff point.
        // If we cannot determine a cutoff LSN, be conservative and don't GC anything.
-        let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn();
+        let mut pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn();

        if let Ok(timeline) =
            tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
@@ -2210,6 +2210,9 @@ impl LayeredTimeline {
                    LsnForTimestamp::Past(lsn) => {
                        debug!("past({})", lsn);
                    }
+                    LsnForTimestamp::NoData(lsn) => {
+                        debug!("nodata({})", lsn);
+                    }
                }
                debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
            }
@@ -2325,9 +2328,10 @@ impl LayeredTimeline {
            // If GC horizon is at 2500, we can remove layers A and B, but
            // we cannot remove C, even though it's older than 2500, because
            // the delta layer 2000-3000 depends on it.
-            if !layers
-                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
-            {
+            if !layers.image_layer_exists(
+                &l.get_key_range(),
+                &(l.get_lsn_range().end..disk_consistent_lsn + 1),
+            )? {
                debug!(
                    "keeping {} because it is the latest layer",
                    l.filename().display()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -554,7 +554,7 @@ impl PageServerHandler {
        // Create empty timeline
        info!("creating new timeline");
        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
-        let timeline = repo.create_empty_timeline(timeline_id, Lsn(0))?;
+        let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?;
        let repartition_distance = repo.get_checkpoint_distance();
        let mut datadir_timeline =
            DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
@@ -1151,6 +1151,7 @@ impl postgres_backend::Handler for PageServerHandler {
                LsnForTimestamp::Present(lsn) => format!("{}", lsn),
                LsnForTimestamp::Future(_lsn) => "future".into(),
                LsnForTimestamp::Past(_lsn) => "past".into(),
+                LsnForTimestamp::NoData(_lsn) => "nodata".into(),
            };
            pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -51,6 +51,7 @@ pub enum LsnForTimestamp {
    Present(Lsn),
    Future(Lsn),
    Past(Lsn),
+    NoData(Lsn),
 }

 impl<R: Repository> DatadirTimeline<R> {
@@ -263,7 +264,7 @@ impl<R: Repository> DatadirTimeline<R> {
            (false, false) => {
                // This can happen if no commit records have been processed yet, e.g.
                // just after importing a cluster.
-                bail!("no commit timestamps found");
+                Ok(LsnForTimestamp::NoData(max_lsn))
            }
            (true, false) => {
                // Didn't find any commit timestamps larger than the request
--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -37,7 +37,7 @@ pub mod defaults {
    pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10_000;
+    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
 }

 /// Per-tenant configuration options
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -178,7 +178,7 @@ async fn shutdown_all_wal_connections(
 /// That may lead to certain events not being observed by the listener.
 #[derive(Debug)]
 struct TaskHandle<E> {
-    handle: JoinHandle<()>,
+    handle: JoinHandle<Result<(), String>>,
    events_receiver: watch::Receiver<TaskEvent<E>>,
    cancellation: watch::Sender<()>,
 }
@@ -205,8 +205,8 @@ impl<E: Clone> TaskHandle<E> {

        let sender = Arc::clone(&events_sender);
        let handle = tokio::task::spawn(async move {
-            let task_result = task(sender, cancellation_receiver).await;
-            events_sender.send(TaskEvent::End(task_result)).ok();
+            events_sender.send(TaskEvent::Started).ok();
+            task(sender, cancellation_receiver).await
        });

        TaskHandle {
@@ -216,6 +216,16 @@ impl<E: Clone> TaskHandle<E> {
        }
    }

+    async fn next_task_event(&mut self) -> TaskEvent<E> {
+        select! {
+            next_task_event = self.events_receiver.changed() => match next_task_event {
+                Ok(()) => self.events_receiver.borrow().clone(),
+                Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await,
+            },
+            task_completion_result = join_on_handle(&mut self.handle) => task_completion_result,
+        }
+    }
+
    /// Aborts current task, waiting for it to finish.
    async fn shutdown(self) {
        self.cancellation.send(()).ok();
@@ -225,6 +235,19 @@ impl<E: Clone> TaskHandle<E> {
    }
 }

+/// Awaits the task behind `handle` and wraps its outcome into `TaskEvent::End`.
+/// A cancelled task is reported as a clean finish; any other join error becomes an `Err`.
+async fn join_on_handle<E>(handle: &mut JoinHandle<Result<(), String>>) -> TaskEvent<E> {
+    let end_result = match handle.await {
+        Ok(task_result) => task_result,
+        Err(join_error) if join_error.is_cancelled() => Ok(()),
+        // Not a cancellation, so the join error is reported as a panic.
+        Err(join_error) => Err(format!("WAL receiver task panicked: {join_error}")),
+    };
+
+    TaskEvent::End(end_result)
+}
+
 /// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery.
 /// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled.
 /// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled.
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -104,49 +104,29 @@ async fn connection_manager_loop_step(

            Some(wal_connection_update) = async {
                match walreceiver_state.wal_connection.as_mut() {
-                    Some(wal_connection) => {
-                        let receiver = &mut wal_connection.connection_task.events_receiver;
-                        Some(match receiver.changed().await {
-                            Ok(()) => receiver.borrow().clone(),
-                            Err(_cancellation_error) => TaskEvent::End(Ok(())),
-                        })
-                    }
+                    Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
                    None => None,
                }
            } => {
-                let (connection_update, reset_connection_attempts) = match &wal_connection_update {
-                    TaskEvent::Started => (Some(Utc::now().naive_utc()), true),
-                    TaskEvent::NewEvent(replication_feedback) => (Some(DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc()), true),
+                let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard");
+                match &wal_connection_update {
+                    TaskEvent::Started => {
+                        wal_connection.latest_connection_update = Utc::now().naive_utc();
+                        *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1;
+                    },
+                    TaskEvent::NewEvent(replication_feedback) => {
+                        wal_connection.latest_connection_update = DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc();
+                        // reset connection attempts here only, the only place where both nodes
+                        // explicitly confirm with replication feedback that they are connected to each other
+                        walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id);
+                    },
                    TaskEvent::End(end_result) => {
-                        let should_reset_connection_attempts = match end_result {
-                            Ok(()) => {
-                                debug!("WAL receiving task finished");
-                                true
-                            },
-                            Err(e) => {
-                                warn!("WAL receiving task failed: {e}");
-                                false
-                            },
+                        match end_result {
+                            Ok(()) => debug!("WAL receiving task finished"),
+                            Err(e) => warn!("WAL receiving task failed: {e}"),
                        };
                        walreceiver_state.wal_connection = None;
-                        (None, should_reset_connection_attempts)
                    },
-                };
-
-                if let Some(connection_update) = connection_update {
-                    match &mut walreceiver_state.wal_connection {
-                        Some(wal_connection) => {
-                            wal_connection.latest_connection_update = connection_update;
-
-                            let attempts_entry = walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0);
-                            if reset_connection_attempts {
-                                *attempts_entry = 0;
-                            } else {
-                                *attempts_entry += 1;
-                            }
-                        },
-                        None => error!("Received connection update for WAL connection that is not active, update: {wal_connection_update:?}"),
-                    }
                }
            },

@@ -406,10 +386,8 @@ impl WalreceiverState {
            Some(existing_wal_connection) => {
                let connected_sk_node = existing_wal_connection.sk_id;

-                let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) = self
-                    .applicable_connection_candidates()
-                    .filter(|&(sk_id, _, _)| sk_id != connected_sk_node)
-                    .max_by_key(|(_, info, _)| info.commit_lsn)?;
+                let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) =
+                    self.select_connection_candidate(Some(connected_sk_node))?;

                let now = Utc::now().naive_utc();
                if let Ok(latest_interaciton) =
@@ -462,9 +440,8 @@ impl WalreceiverState {
                }
            }
            None => {
-                let (new_sk_id, _, new_wal_producer_connstr) = self
-                    .applicable_connection_candidates()
-                    .max_by_key(|(_, info, _)| info.commit_lsn)?;
+                let (new_sk_id, _, new_wal_producer_connstr) =
+                    self.select_connection_candidate(None)?;
                return Some(NewWalConnectionCandidate {
                    safekeeper_id: new_sk_id,
                    wal_producer_connstr: new_wal_producer_connstr,
@@ -476,6 +453,49 @@ impl WalreceiverState {
        None
    }

+    /// Chooses the safekeeper to connect to next, based on the timeline data collected from etcd updates.
+    /// If `node_to_omit` is given, that node is never returned, which supports switching gracefully
+    /// from a healthy safekeeper to another one.
+    ///
+    /// Selection order:
+    /// * keep only the candidates with the fewest recorded connection attempts
+    ///   (a node with no recorded attempts counts as zero)
+    /// * among those, take the one with the greatest `commit_lsn`
+    ///
+    /// Returns `None` when no applicable candidate remains.
+    ///
+    /// NOTE:
+    /// Attempt counters are dropped only when the node sends replication feedback or when its etcd data
+    /// is evicted as stale. Until then, a node with more attempts is not selected while any applicable
+    /// node has fewer, so a safekeeper with a more advanced state may be ignored.
+    fn select_connection_candidate(
+        &self,
+        node_to_omit: Option<NodeId>,
+    ) -> Option<(NodeId, &SkTimelineInfo, String)> {
+        let attempts_of = |sk_id: &NodeId| {
+            self.wal_connection_attempts
+                .get(sk_id)
+                .copied()
+                .unwrap_or(0)
+        };
+
+        // Every applicable candidate except the omitted node.
+        let candidates: Vec<_> = self
+            .applicable_connection_candidates()
+            .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
+            .collect();
+
+        // `?` returns `None` here when the candidate list is empty.
+        let min_attempts = candidates
+            .iter()
+            .map(|(sk_id, _, _)| attempts_of(sk_id))
+            .min()?;
+
+        candidates
+            .into_iter()
+            .filter(|(sk_id, _, _)| attempts_of(sk_id) <= min_attempts)
+            .max_by_key(|(_, info, _)| info.commit_lsn)
+    }
+
    fn applicable_connection_candidates(
        &self,
    ) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, String)> {
@@ -500,15 +520,25 @@ impl WalreceiverState {
    }

    fn cleanup_old_candidates(&mut self) {
-        self.wal_stream_candidates.retain(|_, etcd_info| {
+        let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
+
+        self.wal_stream_candidates.retain(|node_id, etcd_info| {
            if let Ok(time_since_latest_etcd_update) =
                (Utc::now().naive_utc() - etcd_info.latest_update).to_std()
            {
-                time_since_latest_etcd_update < self.lagging_wal_timeout
+                let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout;
+                if !should_retain {
+                    node_ids_to_remove.push(*node_id);
+                }
+                should_retain
            } else {
                true
            }
        });
+
+        for node_id in node_ids_to_remove {
+            self.wal_connection_attempts.remove(&node_id);
+        }
    }
 }

@@ -843,6 +873,64 @@ mod tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("candidate_with_many_connection_failures")?;
+        let mut state = dummy_state(&harness);
+        let now = Utc::now().naive_utc();
+
+        let current_lsn = Lsn(100_000).align();
+        let bigger_lsn = Lsn(current_lsn.0 + 100).align();
+
+        state.wal_connection = None;
+        state.wal_stream_candidates = HashMap::from([
+            (
+                NodeId(0),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(bigger_lsn),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(1),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(current_lsn),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+        ]);
+        state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]);
+
+        let candidate_with_less_errors = state
+            .next_connection_candidate()
+            .expect("Expected one candidate selected, but got none");
+        assert_eq!(
+            candidate_with_less_errors.safekeeper_id,
+            NodeId(1),
+            "Should select the node with less connection errors"
+        );
+
+        Ok(())
+    }
+
    #[tokio::test]
    async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> {
        let harness = RepoHarness::create("connection_no_etcd_data_candidate")?;
--- a/proxy/src/auth/backend/console.rs
+++ b/proxy/src/auth/backend/console.rs
@@ -49,6 +49,12 @@ impl UserFacingError for ConsoleAuthError {
    }
 }

+impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError {
+    fn from(parse_error: &auth::credentials::ClientCredsParseError) -> Self {
+        Self::BadProjectName(parse_error.clone())
+    }
+}
+
 // TODO: convert into an enum with "error"
 #[derive(Serialize, Deserialize, Debug)]
 struct GetRoleSecretResponse {
@@ -92,14 +98,9 @@ impl<'a> Api<'a> {

    async fn get_auth_info(&self) -> Result<AuthInfo> {
        let mut url = self.endpoint.clone();
-        let project_name = self
-            .creds
-            .project_name
-            .as_ref()
-            .map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?;
        url.path_segments_mut().push("proxy_get_role_secret");
        url.query_pairs_mut()
-            .append_pair("project", project_name)
+            .append_pair("project", self.creds.project_name.as_ref()?)
            .append_pair("role", &self.creds.user);

        // TODO: use a proper logger
@@ -121,12 +122,8 @@ impl<'a> Api<'a> {
    /// Wake up the compute node and return the corresponding connection info.
    async fn wake_compute(&self) -> Result<DatabaseInfo> {
        let mut url = self.endpoint.clone();
-        let project_name = self
-            .creds
-            .project_name
-            .as_ref()
-            .map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?;
        url.path_segments_mut().push("proxy_wake_compute");
+        let project_name = self.creds.project_name.as_ref()?;
        url.query_pairs_mut().append_pair("project", project_name);

        // TODO: use a proper logger
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -5,6 +5,11 @@ use anyhow::Context;
 use anyhow::Error;
 use anyhow::Result;
 use etcd_broker::subscription_value::SkTimelineInfo;
+use etcd_broker::LeaseKeepAliveStream;
+use etcd_broker::LeaseKeeper;
+
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
 use std::time::Duration;
 use tokio::spawn;
 use tokio::task::JoinHandle;
@@ -21,7 +26,7 @@ use utils::zid::{NodeId, ZTenantTimelineId};

 const RETRY_INTERVAL_MSEC: u64 = 1000;
 const PUSH_INTERVAL_MSEC: u64 = 1000;
-const LEASE_TTL_SEC: i64 = 5;
+const LEASE_TTL_SEC: i64 = 10;

 pub fn thread_main(conf: SafeKeeperConf) {
    let runtime = runtime::Builder::new_current_thread()
@@ -154,13 +159,48 @@ pub fn get_candiate_name(system_id: NodeId) -> String {
    format!("id_{system_id}")
 }

+async fn push_sk_info(
+    zttid: ZTenantTimelineId,
+    mut client: Client,
+    key: String,
+    sk_info: SkTimelineInfo,
+    mut lease: Lease,
+) -> anyhow::Result<(ZTenantTimelineId, Lease)> {
+    let put_opts = PutOptions::new().with_lease(lease.id);
+    client
+        .put(
+            key.clone(),
+            serde_json::to_string(&sk_info)?,
+            Some(put_opts),
+        )
+        .await
+        .with_context(|| format!("failed to push safekeeper info to {}", key))?;
+
+    // revive the lease
+    lease
+        .keeper
+        .keep_alive()
+        .await
+        .context("failed to send LeaseKeepAliveRequest")?;
+    lease
+        .ka_stream
+        .message()
+        .await
+        .context("failed to receive LeaseKeepAliveResponse")?;
+
+    Ok((zttid, lease))
+}
+
+struct Lease {
+    id: i64,
+    keeper: LeaseKeeper,
+    ka_stream: LeaseKeepAliveStream,
+}
+
 /// Push once in a while data about all active timelines to the broker.
 async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
    let mut client = Client::connect(&conf.broker_endpoints, None).await?;
-
-    // Get and maintain lease to automatically delete obsolete data
-    let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
-    let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?;
+    let mut leases: HashMap<ZTenantTimelineId, Lease> = HashMap::new();

    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
    loop {
@@ -168,33 +208,46 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
        // is under plain mutex. That's ok, all this code is not performance
        // sensitive and there is no risk of deadlock as we don't await while
        // lock is held.
-        for zttid in GlobalTimelines::get_active_timelines() {
-            if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
-                let sk_info = tli.get_public_info(&conf)?;
-                let put_opts = PutOptions::new().with_lease(lease.id());
-                client
-                    .put(
-                        timeline_safekeeper_path(
-                            conf.broker_etcd_prefix.clone(),
-                            zttid,
-                            conf.my_id,
-                        ),
-                        serde_json::to_string(&sk_info)?,
-                        Some(put_opts),
-                    )
-                    .await
-                    .context("failed to push safekeeper info")?;
+        let active_tlis = GlobalTimelines::get_active_timelines();
+
+        // Get and maintain (if not yet present) a per-timeline lease to automatically delete obsolete data.
+        for zttid in active_tlis.iter() {
+            if let Entry::Vacant(v) = leases.entry(*zttid) {
+                let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
+                let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?;
+                v.insert(Lease {
+                    id: lease.id(),
+                    keeper,
+                    ka_stream,
+                });
            }
        }
-        // revive the lease
-        keeper
-            .keep_alive()
-            .await
-            .context("failed to send LeaseKeepAliveRequest")?;
-        ka_stream
-            .message()
-            .await
-            .context("failed to receive LeaseKeepAliveResponse")?;
+        leases.retain(|zttid, _| active_tlis.contains(zttid));
+
+        // Push data concurrently so as not to suffer from latency; with many timelines it can be slow.
+        let handles = active_tlis
+            .iter()
+            .filter_map(|zttid| GlobalTimelines::get_loaded(*zttid))
+            .map(|tli| {
+                let sk_info = tli.get_public_info(&conf);
+                let key = timeline_safekeeper_path(
+                    conf.broker_etcd_prefix.clone(),
+                    tli.zttid,
+                    conf.my_id,
+                );
+                let lease = leases.remove(&tli.zttid).unwrap();
+                tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease))
+            })
+            .collect::<Vec<_>>();
+        for h in handles {
+            let (zttid, lease) = h.await??;
+            // It is ugly to pull leases out of the hash map and then put them
+            // back, but otherwise we would have to resort to long-lived per-tli
+            // tasks (which would generate a lot of errors when etcd is down): a
+            // spawned task requires 'static objects, so we can't lend it a borrow.
+            leases.insert(zttid, lease);
+        }
+
        sleep(push_interval).await;
    }
 }
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -239,6 +239,19 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
            remote_consistent_lsn: Lsn(0),
            peers: Peers(vec![]),
        });
+    } else if version == 5 {
+        info!("reading safekeeper control file version {}", version);
+        let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?;
+        if oldstate.timeline_start_lsn != Lsn(0) {
+            return Ok(oldstate);
+        }
+
+        // set special timeline_start_lsn because we don't know the real one
+        info!("setting timeline_start_lsn and local_start_lsn to Lsn(1)");
+        oldstate.timeline_start_lsn = Lsn(1);
+        oldstate.local_start_lsn = Lsn(1);
+
+        return Ok(oldstate);
    }
    bail!("unsupported safekeeper control file version {}", version)
 }
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -28,7 +28,7 @@ use utils::{
 };

 pub const SK_MAGIC: u32 = 0xcafeceefu32;
-pub const SK_FORMAT_VERSION: u32 = 5;
+pub const SK_FORMAT_VERSION: u32 = 6;
 const SK_PROTOCOL_VERSION: u32 = 2;
 const UNKNOWN_SERVER_VERSION: u32 = 0;

--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -11,7 +11,7 @@ use serde::Serialize;
 use tokio::sync::watch;

 use std::cmp::{max, min};
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::fs::{self};

 use std::sync::{Arc, Mutex, MutexGuard};
@@ -445,9 +445,9 @@ impl Timeline {
    }

    /// Prepare public safekeeper info for reporting.
-    pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result<SkTimelineInfo> {
+    pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
        let shared_state = self.mutex.lock().unwrap();
-        Ok(SkTimelineInfo {
+        SkTimelineInfo {
            last_log_term: Some(shared_state.sk.get_epoch()),
            flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
            // note: this value is not flushed to control file yet and can be lost
@@ -460,7 +460,7 @@ impl Timeline {
            peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
            safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
            backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
-        })
+        }
    }

    /// Update timeline state with peer safekeeper data.
@@ -625,6 +625,8 @@ impl GlobalTimelines {
        zttid: ZTenantTimelineId,
        create: bool,
    ) -> Result<Arc<Timeline>> {
+        let _enter = info_span!("", timeline = %zttid.tenant_id).entered();
+
        let mut state = TIMELINES_STATE.lock().unwrap();

        match state.timelines.get(&zttid) {
@@ -667,7 +669,7 @@ impl GlobalTimelines {
    }

    /// Get ZTenantTimelineIDs of all active timelines.
-    pub fn get_active_timelines() -> Vec<ZTenantTimelineId> {
+    pub fn get_active_timelines() -> HashSet<ZTenantTimelineId> {
        let state = TIMELINES_STATE.lock().unwrap();
        state
            .timelines
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -45,7 +45,7 @@ If you want to run all tests that have the string "bench" in their names:

 Useful environment variables:

-`ZENITH_BIN`: The directory where zenith binaries can be found.
+`NEON_BIN`: The directory where neon binaries can be found.
 `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
 `TEST_OUTPUT`: Set the directory where test state and test output files
 should go.
--- a/test_runner/batch_others/test_ancestor_branch.py
+++ b/test_runner/batch_others/test_ancestor_branch.py
@@ -1,6 +1,3 @@
-from contextlib import closing
-
-import psycopg2.extras
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
--- a/test_runner/batch_others/test_auth.py
+++ b/test_runner/batch_others/test_auth.py
@@ -1,8 +1,6 @@
 from contextlib import closing
-from typing import Iterator
-from uuid import UUID, uuid4
+from uuid import uuid4
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
-from requests.exceptions import HTTPError
 import pytest


--- a/test_runner/batch_others/test_backpressure.py
+++ b/test_runner/batch_others/test_backpressure.py
@@ -1,11 +1,9 @@
 from contextlib import closing, contextmanager
 import psycopg2.extras
 import pytest
-from fixtures.neon_fixtures import PgProtocol, NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.log_helper import log
-import os
 import time
-import asyncpg
 from fixtures.neon_fixtures import Postgres
 import threading

--- a/test_runner/batch_others/test_basebackup_error.py
+++ b/test_runner/batch_others/test_basebackup_error.py
@@ -1,8 +1,6 @@
 import pytest
-from contextlib import closing

 from fixtures.neon_fixtures import NeonEnv
-from fixtures.log_helper import log


 #
--- a/test_runner/batch_others/test_branch_and_gc.py
+++ b/test_runner/batch_others/test_branch_and_gc.py
@@ -0,0 +1,101 @@
+import time
+from asyncpg.connection import os
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import lsn_from_hex
+
+
+# Test the GC implementation when running with branching
+# This test reproduces the issue https://github.com/neondatabase/neon/issues/707.
+#
+# Consider two LSNs `lsn1` and `lsn2` with some delta files as follows:
+# ...
+# ... -> has an image layer xx_p with p < lsn1
+# ...
+# lsn1
+# ...
+# ... -> has an image layer xx_q with lsn1 < q < lsn2
+# ...
+# lsn2
+#
+# Consider running a GC iteration such that the GC horizon is between p and lsn1
+# ...
+# ... -> has an image layer xx_p with p < lsn1
+# ...
+# ||| -------> a delta layer D's start
+# ... -> gc horizon h such that p < h < lsn1
+# lsn1
+# ||| -------> a delta layer D's end
+# ...
+# ... -> has an image layer xx_q with lsn1 < q < lsn2
+# ...
+# lsn2
+#
+# As described in the issue #707, the image layer xx_p will be deleted as
+# there exists a newer image layer xx_q. However, removing xx_p will corrupt
+# any delta layers that depend on xx_p that are not deleted by GC.
+# For example, the delta layer D is corrupted in the above example.
+#
+# Because the delta layer D covering lsn1 is corrupted, creating a branch
+# starting from lsn1 should return an error as follows:
+#     could not find data for key ... at LSN ..., for request at LSN ...
+def test_branch_and_gc(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable background GC
+            'gc_period': '10 m',
+            'gc_horizon': f'{10 * 1024 ** 3}',
+
+            # small checkpoint distance to create more delta layer files
+            'checkpoint_distance': f'{1024 ** 2}',
+
+            # set the target size to be large to allow the image layer to cover the whole key space
+            'compaction_target_size': f'{1024 ** 3}',
+
+            # tweak the default settings to allow quick creation of image layers and L1 layers
+            'compaction_period': '1 s',
+            'compaction_threshold': '2',
+            'image_creation_threshold': '1',
+
+            # set PITR interval to be small, so we can do GC
+            'pitr_interval': '1 s'
+        })
+
+    timeline_main = env.neon_cli.create_timeline(f'test_main', tenant_id=tenant)
+    pg_main = env.postgres.create_start('test_main', tenant_id=tenant)
+
+    main_cur = pg_main.connect().cursor()
+
+    main_cur.execute(
+        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')"
+    )
+    main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
+    lsn1 = main_cur.fetchone()[0]
+    log.info(f'LSN1: {lsn1}')
+
+    main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
+    lsn2 = main_cur.fetchone()[0]
+    log.info(f'LSN2: {lsn2}')
+
+    # set the GC horizon such that it doesn't cover lsn1 so that
+    # we can create a new branch starting from lsn1
+    env.pageserver.safe_psql(
+        f'''do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}'''
+    )
+
+    env.neon_cli.create_branch('test_branch',
+                               'test_main',
+                               tenant_id=tenant,
+                               ancestor_start_lsn=lsn1)
+    pg_branch = env.postgres.create_start('test_branch', tenant_id=tenant)
+
+    branch_cur = pg_branch.connect().cursor()
+    branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
+
+    branch_cur.execute('SELECT count(*) FROM foo')
+    assert branch_cur.fetchone() == (200000, )
--- a/test_runner/batch_others/test_branch_behind.py
+++ b/test_runner/batch_others/test_branch_behind.py
@@ -1,4 +1,3 @@
-import subprocess
 from contextlib import closing

 import psycopg2.extras
--- a/test_runner/batch_others/test_fullbackup.py
+++ b/test_runner/batch_others/test_fullbackup.py
@@ -1,16 +1,10 @@
-import subprocess
 from contextlib import closing

-import psycopg2.extras
-import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres
 from fixtures.neon_fixtures import pg_distrib_dir
 import os
-from fixtures.utils import mkdir_if_needed, subprocess_capture
-import shutil
-import getpass
-import pwd
+from fixtures.utils import subprocess_capture

 num_rows = 1000

@@ -46,19 +40,20 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder,
    psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')}

    # Get and unpack fullbackup from pageserver
-    restored_dir_path = os.path.join(env.repo_dir, "restored_datadir")
+    restored_dir_path = env.repo_dir / "restored_datadir"
    os.mkdir(restored_dir_path, 0o750)
    query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}"
    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
    tar_output_file = result_basepath + ".stdout"
-    subprocess_capture(str(env.repo_dir), ["tar", "-xf", tar_output_file, "-C", restored_dir_path])
+    subprocess_capture(str(env.repo_dir),
+                       ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)])

    # HACK
    # fullbackup returns neon specific pg_control and first WAL segment
    # use resetwal to overwrite it
    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal')
-    cmd = [pg_resetwal_path, "-D", restored_dir_path]
+    cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
    pg_bin.run_capture(cmd, env=psql_env)

    # Restore from the backup and find the data we inserted
--- a/test_runner/batch_others/test_import.py
+++ b/test_runner/batch_others/test_import.py
@@ -191,3 +191,8 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu
    # Check it's the same as the first fullbackup
    # TODO pageserver should be checking checksum
    assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
+
+    # Check that gc works
+    psconn = env.pageserver.connect()
+    pscur = psconn.cursor()
+    pscur.execute(f"do_gc {tenant.hex} {timeline} 0")
--- a/test_runner/batch_others/test_remote_storage.py
+++ b/test_runner/batch_others/test_remote_storage.py
@@ -1,5 +1,5 @@
 # It's possible to run any regular test with the local fs remote storage via
-# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ......
+# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......

 import shutil, os
 from contextlib import closing
--- a/test_runner/batch_others/test_wal_acceptor.py
+++ b/test_runner/batch_others/test_wal_acceptor.py
@@ -1,3 +1,4 @@
+import pathlib
 import pytest
 import random
 import time
@@ -14,7 +15,7 @@ from dataclasses import dataclass, field
 from multiprocessing import Process, Value
 from pathlib import Path
 from fixtures.neon_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol
-from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex
+from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex
 from fixtures.log_helper import log
 from typing import List, Optional, Any
 from uuid import uuid4
@@ -645,7 +646,7 @@ class ProposerPostgres(PgProtocol):
    def create_dir_config(self, safekeepers: str):
        """ Create dir and config for running --sync-safekeepers """

-        mkdir_if_needed(self.pg_data_dir_path())
+        pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True)
        with open(self.config_file_path(), "w") as f:
            cfg = [
                "synchronous_standby_names = 'walproposer'\n",
@@ -828,7 +829,7 @@ class SafekeeperEnv:

        self.timeline_id = uuid.uuid4()
        self.tenant_id = uuid.uuid4()
-        mkdir_if_needed(str(self.repo_dir))
+        self.repo_dir.mkdir(exist_ok=True)

        # Create config and a Safekeeper object for each safekeeper
        self.safekeepers = []
@@ -847,8 +848,8 @@ class SafekeeperEnv:
            http=self.port_distributor.get_port(),
        )

-        safekeeper_dir = os.path.join(self.repo_dir, f"sk{i}")
-        mkdir_if_needed(safekeeper_dir)
+        safekeeper_dir = self.repo_dir / f"sk{i}"
+        safekeeper_dir.mkdir(exist_ok=True)

        args = [
            self.bin_safekeeper,
@@ -857,7 +858,7 @@ class SafekeeperEnv:
            "--listen-http",
            f"127.0.0.1:{port.http}",
            "-D",
-            safekeeper_dir,
+            str(safekeeper_dir),
            "--id",
            str(i),
            "--broker-endpoints",
--- a/test_runner/batch_others/test_wal_restore.py
+++ b/test_runner/batch_others/test_wal_restore.py
@@ -1,19 +1,17 @@
 import os
-import subprocess
+from pathlib import Path

 from fixtures.neon_fixtures import (NeonEnvBuilder,
                                    VanillaPostgres,
                                    PortDistributor,
                                    PgBin,
                                    base_dir,
-                                    vanilla_pg,
                                    pg_distrib_dir)
-from fixtures.log_helper import log


 def test_wal_restore(neon_env_builder: NeonEnvBuilder,
                     pg_bin: PgBin,
-                     test_output_dir,
+                     test_output_dir: Path,
                     port_distributor: PortDistributor):
    env = neon_env_builder.init_start()
    env.neon_cli.create_branch("test_wal_restore")
@@ -22,13 +20,13 @@ def test_wal_restore(neon_env_builder: NeonEnvBuilder,
    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
    env.neon_cli.pageserver_stop()
    port = port_distributor.get_port()
-    data_dir = os.path.join(test_output_dir, 'pgsql.restored')
+    data_dir = test_output_dir / 'pgsql.restored'
    with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored:
        pg_bin.run_capture([
            os.path.join(base_dir, 'libs/utils/scripts/restore_from_wal.sh'),
            os.path.join(pg_distrib_dir, 'bin'),
-            os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)),
-            data_dir,
+            str(test_output_dir / 'repo' / 'safekeepers' / 'sk1' / str(tenant_id) / '*'),
+            str(data_dir),
            str(port)
        ])
        restored.start()
--- a/test_runner/batch_pg_regress/test_isolation.py
+++ b/test_runner/batch_pg_regress/test_isolation.py
@@ -1,13 +1,13 @@
 import os
+from pathlib import Path
 import pytest
-from fixtures.utils import mkdir_if_needed
 from fixtures.neon_fixtures import NeonEnv, base_dir, pg_distrib_dir


 # The isolation tests run for a long time, especially in debug mode,
 # so use a larger-than-default timeout.
@pytest.mark.timeout(1800)
-def test_isolation(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys):
+def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys):
    env = neon_simple_env

    env.neon_cli.create_branch("test_isolation", "empty")
@@ -17,9 +17,8 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys):
    pg.safe_psql('CREATE DATABASE isolation_regression')

    # Create some local directories for pg_isolation_regress to run in.
-    runpath = os.path.join(test_output_dir, 'regress')
-    mkdir_if_needed(runpath)
-    mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
+    runpath = test_output_dir / 'regress'
+    (runpath / 'testtablespace').mkdir(parents=True)

    # Compute all the file locations that pg_isolation_regress will need.
    build_path = os.path.join(pg_distrib_dir, 'build/src/test/isolation')
--- a/test_runner/batch_pg_regress/test_neon_regress.py
+++ b/test_runner/batch_pg_regress/test_neon_regress.py
@@ -1,6 +1,6 @@
 import os
+from pathlib import Path

-from fixtures.utils import mkdir_if_needed
 from fixtures.neon_fixtures import (NeonEnv,
                                    check_restored_datadir_content,
                                    base_dir,
@@ -8,7 +8,7 @@ from fixtures.neon_fixtures import (NeonEnv,
 from fixtures.log_helper import log


-def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys):
+def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys):
    env = neon_simple_env

    env.neon_cli.create_branch("test_neon_regress", "empty")
@@ -17,9 +17,8 @@ def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir, pg_bin, capsys)
    pg.safe_psql('CREATE DATABASE regression')

    # Create some local directories for pg_regress to run in.
-    runpath = os.path.join(test_output_dir, 'regress')
-    mkdir_if_needed(runpath)
-    mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
+    runpath = test_output_dir / 'regress'
+    (runpath / 'testtablespace').mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
    # This test runs neon specific tests
--- a/test_runner/batch_pg_regress/test_pg_regress.py
+++ b/test_runner/batch_pg_regress/test_pg_regress.py
@@ -1,13 +1,13 @@
 import os
+import pathlib
 import pytest
-from fixtures.utils import mkdir_if_needed
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content, base_dir, pg_distrib_dir


 # The pg_regress tests run for a long time, especially in debug mode,
 # so use a larger-than-default timeout.
@pytest.mark.timeout(1800)
-def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, capsys):
+def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_bin, capsys):
    env = neon_simple_env

    env.neon_cli.create_branch("test_pg_regress", "empty")
@@ -16,9 +16,8 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, caps
    pg.safe_psql('CREATE DATABASE regression')

    # Create some local directories for pg_regress to run in.
-    runpath = os.path.join(test_output_dir, 'regress')
-    mkdir_if_needed(runpath)
-    mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
+    runpath = test_output_dir / 'regress'
+    (runpath / 'testtablespace').mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
    build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress')
@@ -51,7 +50,7 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: str, pg_bin, caps

        # checkpoint one more time to ensure that the lsn we get is the latest one
        pg.safe_psql('CHECKPOINT')
-        lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
+        pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]

        # Check that we restore the content of the datadir correctly
        check_restored_datadir_content(test_output_dir, env, pg)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -35,12 +35,7 @@ from typing_extensions import Literal
 import requests
 import backoff  # type: ignore

-from .utils import (etcd_path,
-                    get_self_dir,
-                    mkdir_if_needed,
-                    subprocess_capture,
-                    lsn_from_hex,
-                    lsn_to_hex)
+from .utils import (etcd_path, get_self_dir, subprocess_capture, lsn_from_hex, lsn_to_hex)
 from fixtures.log_helper import log
 """
 This file contains pytest fixtures. A fixture is a test resource that can be
@@ -50,7 +45,7 @@ A fixture is created with the decorator @pytest.fixture decorator.
 See docs: https://docs.pytest.org/en/6.2.x/fixture.html

 There are several environment variables that can control the running of tests:
-ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.
+NEON_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.

 There's no need to import this file to use it. It should be declared as a plugin
 inside conftest.py, and that makes it available to all tests.
@@ -127,7 +122,7 @@ def pytest_configure(config):
        top_output_dir = env_test_output
    else:
        top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
-    mkdir_if_needed(top_output_dir)
+    pathlib.Path(top_output_dir).mkdir(exist_ok=True)

    # Find the postgres installation.
    global pg_distrib_dir
@@ -151,7 +146,7 @@ def pytest_configure(config):
        return
    # Find the neon binaries.
    global neon_binpath
-    env_neon_bin = os.environ.get('ZENITH_BIN')
+    env_neon_bin = os.environ.get('NEON_BIN')
    if env_neon_bin:
        neon_binpath = env_neon_bin
    else:
@@ -1316,7 +1311,7 @@ def append_pageserver_param_overrides(

 class PgBin:
    """ A helper class for executing postgres binaries """
-    def __init__(self, log_dir: str):
+    def __init__(self, log_dir: Path):
        self.log_dir = log_dir
        self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin')
        self.env = os.environ.copy()
@@ -1367,22 +1362,27 @@ class PgBin:
        self._fixpath(command)
        log.info('Running command "{}"'.format(' '.join(command)))
        env = self._build_env(env)
-        return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs)
+        return subprocess_capture(str(self.log_dir),
+                                  command,
+                                  env=env,
+                                  cwd=cwd,
+                                  check=True,
+                                  **kwargs)


@pytest.fixture(scope='function')
-def pg_bin(test_output_dir: str) -> PgBin:
+def pg_bin(test_output_dir: Path) -> PgBin:
    return PgBin(test_output_dir)


 class VanillaPostgres(PgProtocol):
-    def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int, init=True):
+    def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
        super().__init__(host='localhost', port=port, dbname='postgres')
        self.pgdatadir = pgdatadir
        self.pg_bin = pg_bin
        self.running = False
        if init:
-            self.pg_bin.run_capture(['initdb', '-D', pgdatadir])
+            self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)])
        self.configure([f"port = {port}\n"])

    def configure(self, options: List[str]):
@@ -1398,12 +1398,13 @@ class VanillaPostgres(PgProtocol):
        if log_path is None:
            log_path = os.path.join(self.pgdatadir, "pg.log")

-        self.pg_bin.run_capture(['pg_ctl', '-w', '-D', self.pgdatadir, '-l', log_path, 'start'])
+        self.pg_bin.run_capture(
+            ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start'])

    def stop(self):
        assert self.running
        self.running = False
-        self.pg_bin.run_capture(['pg_ctl', '-w', '-D', self.pgdatadir, 'stop'])
+        self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop'])

    def get_subdir_size(self, subdir) -> int:
        """Return size of pgdatadir subdirectory in bytes."""
@@ -1418,9 +1419,9 @@ class VanillaPostgres(PgProtocol):


@pytest.fixture(scope='function')
-def vanilla_pg(test_output_dir: str,
+def vanilla_pg(test_output_dir: Path,
               port_distributor: PortDistributor) -> Iterator[VanillaPostgres]:
-    pgdatadir = os.path.join(test_output_dir, "pgdata-vanilla")
+    pgdatadir = test_output_dir / "pgdata-vanilla"
    pg_bin = PgBin(test_output_dir)
    port = port_distributor.get_port()
    with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg:
@@ -1457,7 +1458,7 @@ class RemotePostgres(PgProtocol):


@pytest.fixture(scope='function')
-def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]:
+def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]:
    pg_bin = PgBin(test_output_dir)

    connstr = os.getenv("BENCHMARK_CONNSTR")
@@ -1980,11 +1981,13 @@ class Etcd:
            self.handle.wait()


-def get_test_output_dir(request: Any) -> str:
+def get_test_output_dir(request: Any) -> pathlib.Path:
    """ Compute the working directory for an individual test. """
    test_name = request.node.name
-    test_dir = os.path.join(str(top_output_dir), test_name)
+    test_dir = pathlib.Path(top_output_dir) / test_name
    log.info(f'get_test_output_dir is {test_dir}')
+    # make mypy happy
+    assert isinstance(test_dir, pathlib.Path)
    return test_dir


@@ -1998,14 +2001,14 @@ def get_test_output_dir(request: Any) -> str:
 # this fixture ensures that the directory exists.  That works because
 # 'autouse' fixtures are run before other fixtures.
@pytest.fixture(scope='function', autouse=True)
-def test_output_dir(request: Any) -> str:
+def test_output_dir(request: Any) -> pathlib.Path:
    """ Create the working directory for an individual test. """

    # one directory per test
    test_dir = get_test_output_dir(request)
    log.info(f'test_output_dir is {test_dir}')
    shutil.rmtree(test_dir, ignore_errors=True)
-    mkdir_if_needed(test_dir)
+    test_dir.mkdir()
    return test_dir


@@ -2051,7 +2054,7 @@ def should_skip_file(filename: str) -> bool:
 #
 # Test helpers
 #
-def list_files_to_compare(pgdata_dir: str):
+def list_files_to_compare(pgdata_dir: pathlib.Path):
    pgdata_files = []
    for root, _file, filenames in os.walk(pgdata_dir):
        for filename in filenames:
@@ -2068,7 +2071,7 @@ def list_files_to_compare(pgdata_dir: str):


 # pg is the existing and running compute node, that we want to compare with a basebackup
-def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postgres):
+def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres):

    # Get the timeline ID. We need it for the 'basebackup' command
    with closing(pg.connect()) as conn:
@@ -2080,8 +2083,8 @@ def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postg
    pg.stop()

    # Take a basebackup from pageserver
-    restored_dir_path = os.path.join(env.repo_dir, f"{pg.node_name}_restored_datadir")
-    mkdir_if_needed(restored_dir_path)
+    restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir"
+    restored_dir_path.mkdir(exist_ok=True)

    pg_bin = PgBin(test_output_dir)
    psql_path = os.path.join(pg_bin.pg_bin_path, 'psql')
@@ -2108,7 +2111,7 @@ def check_restored_datadir_content(test_output_dir: str, env: NeonEnv, pg: Postg

    # list files we're going to compare
    assert pg.pgdata_dir
-    pgdata_files = list_files_to_compare(pg.pgdata_dir)
+    pgdata_files = list_files_to_compare(pathlib.Path(pg.pgdata_dir))
    restored_files = list_files_to_compare(restored_dir_path)

    # check that file sets are equal
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -12,18 +12,6 @@ def get_self_dir() -> str:
    return os.path.dirname(os.path.abspath(__file__))


-def mkdir_if_needed(path: str) -> None:
-    """ Create a directory if it doesn't already exist
-
-    Note this won't try to create intermediate directories.
-    """
-    try:
-        os.mkdir(path)
-    except FileExistsError:
-        pass
-    assert os.path.isdir(path)
-
-
 def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
    """ Run a process and capture its output

--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -80,6 +80,7 @@ def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_it
            thread.join()


+@pytest.mark.timeout(1000)
@pytest.mark.parametrize("n_tables", [5])
@pytest.mark.parametrize("scale", get_scales_matrix(5))
@pytest.mark.parametrize("num_iters", [10])
@@ -121,6 +122,7 @@ def start_pgbench_simple_update_workload(env: PgCompare, duration: int):
        env.flush()


+@pytest.mark.timeout(1000)
@pytest.mark.parametrize("scale", get_scales_matrix(100))
@pytest.mark.parametrize("duration", get_durations_matrix())
 def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, duration: int):
@@ -158,6 +160,7 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int):
        ])


+@pytest.mark.timeout(1000)
@pytest.mark.parametrize("scale", get_scales_matrix(1000))
 def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int):
    env = pg_compare
Author	SHA1	Message	Date
Thang Pham	5f6f5f517a	revert fix for #707	2022-07-05 09:53:01 -04:00
Thang Pham	93e136a03c	update PITR	2022-07-05 09:51:18 -04:00
Thang Pham	0205e29185	update `test_branch_and_gc` test	2022-07-04 19:41:09 -04:00
Dhammika Pathirana	605ec2b4aa	Fix add ps tenant config Signed-off-by: Dhammika Pathirana <dhammika@gmail.com>	2022-07-04 15:26:43 -04:00
Dhammika Pathirana	ff0ad4213c	Add test config with ps compaction_threshold Signed-off-by: Dhammika Pathirana <dhammika@gmail.com>	2022-07-04 15:25:01 -04:00
Dhammika Pathirana	0664100755	Add a test for gc dropping active layers (#707 ) Signed-off-by: Dhammika Pathirana <dhammika@gmail.com>	2022-07-04 15:25:01 -04:00
Dmitry Rodionov	65704708fa	remove unused imports, make more use of pathlib.Path	2022-07-01 18:56:51 +03:00
Arseny Sher	6100a02d0f	Prefix WAL files in s3 with environment name. It wasn't merged to prod yet, so safe to enable.	2022-07-01 19:21:28 +04:00
Arseny Sher	97fed38213	Fix `cadaca010c` for older ssh clients.	2022-07-01 19:20:59 +04:00
Arseny Sher	cadaca010c	Make ansible to work with storage nodes through teleport from local box.	2022-07-01 16:58:34 +03:00
Bojan Serafimov	f09c09438a	Fix gc after import	2022-07-01 11:10:49 +03:00
Dmitry Rodionov	00fc696606	replace extra urlencode dependency with already present url library	2022-06-30 14:32:15 +03:00
Kirill Bulatov	1d0706cf25	Fix walreceiver connection selection mechanism * Avoid reconnecting to safekeeper immediately after its failure by limiting candidates to those with fewest connection attempts. Thus we don't have to wait lagging_wal_timeout (10s by default) before switch happens even if no new changes are generated, and current test_restarts_under_load expects some commits to happen within 4s. * Make default max_lsn_wal_lag larger, otherwise constant reconnections happen during normal work. * Fix wal_connection_attempts maintenance, preventing busy loop of reconnections.	2022-06-30 00:40:12 +03:00
Dmitry Ivanov	5ee19b0758	Fix bloated coverage uploads (#2005 ) Move coverage data to a better directory, merge it better and don't publish it from CircleCI pipeline	2022-06-29 17:59:19 +03:00
Kirill Bulatov	cef90d9220	Disable cachepot for GH Actions builds (#2007 )	2022-06-29 17:56:02 +03:00
Kirill Bulatov	4a05413a4c	More code coverage fixes in GH Actions (#2002 )	2022-06-27 22:40:20 +03:00
Kirill Bulatov	dd61f3558f	Fix coverage upload credentials retrieval (#2001 )	2022-06-27 20:41:09 +03:00
Kirill Bulatov	8a714f1ebf	Add coverage to GH actions and rework part of them (#1987 )	2022-06-27 19:15:56 +03:00
Arseny Sher	137291dc24	Push to etcd from safekeeper many timelines concurrently. Mitigates latency fee, making push throughput 1-1.5 order of magnitude bigger. Also make leases per timeline, not per whole safekeeper, avoiding storing garbage in etcd for deleted timelines while safekeeper is alive.	2022-06-27 16:30:21 +03:00
Kirill Bulatov	eb8926083e	Use the updated base build Docker image (#1972 )	2022-06-27 13:12:58 +03:00
Johan Eliasson	26bca6ddba	Add `openssl` to OSX dependencies (#1994 )	2022-06-26 21:54:07 +03:00
Arthur Petukhovsky	55192384c3	Fix zero timeline_start_lsn (#1981 ) * Fix zero timeline_start_lsn * Log more info on control file upgrade * Fix formatting Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>	2022-06-24 13:59:37 +03:00
KlimentSerafimov	392cd8b1fc	Refactored extracting project_name in console.rs. (#1982 )	2022-06-24 05:57:33 -04:00
Alexey Kondratov	3cc531d093	Fix CREATE EXTENSION for non-db-owner users (#1408 ) Previously, we were granting create only to db owner, but now we have a dedicated 'web_access' role to connect via web UI and proxy link auth. We anyway grant read / write all data to all roles, so let's grant create to everyone too. This creates some privilege objects in each db, which we need to drop before deleting the role. So now we reassign all owned objects to each db owner before deletion. This also fixes deletion of roles that created some data in any db previously. Will be tested by https://github.com/neondatabase/cloud/pull/1673 Later we should stop messing with Postgres ACL that much.	2022-06-23 21:36:53 +02:00
bojanserafimov	84b9fcbbd5	Increase a few test timeouts (#1977 )	2022-06-23 11:51:56 -04:00