Merge remote-tracking branch 'origin/main' into exp-07-18

2026-06-02 04:50:38 +00:00 · 2022-07-18 11:44:27 -04:00
parent 4ea10c7096 0b14fdb078
commit f85e28ac9e
65 changed files with 1707 additions and 1058 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -0,0 +1,13 @@
+# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
+# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
+# optimizations enabled by "opt-level=1" don't affect debuggability too much.
+#
+# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
+#
+[profile.dev.package."*"]
+# Set the default for dependencies in Development mode.
+opt-level = 3
+
+[profile.dev]
+# Turn on a small amount of optimization in Development mode.
+opt-level = 1
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -5,10 +5,10 @@ executors:
    resource_class: xlarge
    docker:
      # NB: when changed, do not forget to update rust image tag in all Dockerfiles
-      - image: zimg/rust:1.58
+      - image: neondatabase/rust:1.58
  neon-executor:
    docker:
-      - image: zimg/rust:1.58
+      - image: neondatabase/rust:1.58

 jobs:
  # A job to build postgres
@@ -37,7 +37,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+            - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}

        # Build postgres if the restore_cache didn't find a build.
        # `make` can't figure out whether the cache is valid, since
@@ -54,7 +54,7 @@ jobs:

      - save_cache:
          name: Save postgres cache
-          key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+          key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
          paths:
            - tmp_install

@@ -85,7 +85,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+            - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}

      - restore_cache:
          name: Restore rust cache
@@ -93,7 +93,7 @@ jobs:
            # Require an exact match. While an out of date cache might speed up the build,
            # there's no way to clean out old packages, so the cache grows every time something
            # changes.
-            - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+            - v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}

        # Build the rust code, including test binaries
      - run:
@@ -107,7 +107,7 @@ jobs:

            export CARGO_INCREMENTAL=0
            export CACHEPOT_BUCKET=zenith-rust-cachepot
-            export RUSTC_WRAPPER=cachepot
+            export RUSTC_WRAPPER=""
            export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
            export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
            mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
@@ -115,7 +115,7 @@ jobs:

      - save_cache:
          name: Save rust cache
-          key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+          key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
          paths:
            - ~/.cargo/registry
            - ~/.cargo/git
@@ -142,11 +142,6 @@ jobs:
              jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
            )

-            test_exe_paths=$(
-              cargo test --message-format=json --no-run |
-              jq -r '.executable | select(. != null)'
-            )
-
            mkdir -p /tmp/zenith/bin
            mkdir -p /tmp/zenith/test_bin
            mkdir -p /tmp/zenith/etc
@@ -330,274 +325,6 @@ jobs:
          paths:
            - "*"

-  # Build neondatabase/neon:latest image and push it to Docker hub
-  docker-image:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker build \
-              --pull \
-              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
-            docker push neondatabase/neon:${DOCKER_TAG}
-            docker push neondatabase/neon:latest
-
-  # Build neondatabase/compute-node:latest image and push it to Docker hub
-  docker-image-compute:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Build and push compute-tools Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            docker build \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/compute-tools:local \
-              --tag neondatabase/compute-tools:latest \
-              -f Dockerfile.compute-tools .
-            # Only push :latest image
-            docker push neondatabase/compute-tools:latest
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push compute-node Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
-              --tag neondatabase/compute-node:latest vendor/postgres \
-              --build-arg COMPUTE_TOOLS_TAG=local
-            docker push neondatabase/compute-node:${DOCKER_TAG}
-            docker push neondatabase/compute-node:latest
-
-  # Build production neondatabase/neon:release image and push it to Docker hub
-  docker-image-release:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            docker build \
-              --pull \
-              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
-            docker push neondatabase/neon:${DOCKER_TAG}
-            docker push neondatabase/neon:release
-
-  # Build production neondatabase/compute-node:release image and push it to Docker hub
-  docker-image-compute-release:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Build and push compute-tools Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            docker build \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/compute-tools:release \
-              --tag neondatabase/compute-tools:local \
-              -f Dockerfile.compute-tools .
-            # Only push :release image
-            docker push neondatabase/compute-tools:release
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push compute-node Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
-              --tag neondatabase/compute-node:release vendor/postgres \
-              --build-arg COMPUTE_TOOLS_TAG=local
-            docker push neondatabase/compute-node:${DOCKER_TAG}
-            docker push neondatabase/compute-node:release
-
-  deploy-staging:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i staging.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-staging-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
-            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
-
-  deploy-neon-stress:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i neon-stress.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-neon-stress-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade neon-stress-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-            helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
-
-  deploy-release:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            RELEASE=true ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i production.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-release-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
-
 workflows:
  build_and_test:
    jobs:
@@ -640,103 +367,3 @@ workflows:
          save_perf_report: true
          requires:
            - build-neon-release
-      - docker-image:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - docker-image-compute:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - deploy-staging:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-      - deploy-staging-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-
-      - deploy-neon-stress:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-      - deploy-neon-stress-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-
-      - docker-image-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - docker-image-compute-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - deploy-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - docker-image-release
-      - deploy-release-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - docker-image-release
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -37,6 +37,12 @@ runs:
        name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
        path: ./neon-artifact/

+    - name: Get Postgres artifact for restoration
+      uses: actions/download-artifact@v3
+      with:
+        name: postgres-${{ runner.os }}-${{ inputs.build_type }}-artifact
+        path: ./pg-artifact/
+
    - name: Extract Neon artifact
      shell: bash -ex {0}
      run: |
@@ -44,6 +50,13 @@ runs:
        tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
        rm -rf ./neon-artifact/

+    - name: Extract Postgres artifact
+      shell: bash -ex {0}
+      run: |
+        mkdir -p /tmp/neon/tmp_install
+        tar -xf ./pg-artifact/pg.tgz -C /tmp/neon/tmp_install
+        rm -rf ./pg-artifact/
+
    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
      uses: actions/checkout@v3
@@ -65,7 +78,7 @@ runs:
    - name: Run pytest
      env:
        NEON_BIN: /tmp/neon/bin
-        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+        POSTGRES_DISTRIB_DIR: /tmp/neon/tmp_install
        TEST_OUTPUT: /tmp/test_output
        # this variable will be embedded in perf test report
        # and is needed to distinguish different environments
--- a/.circleci/ansible/.gitignore
+++ b/.circleci/ansible/.gitignore
--- a/.circleci/ansible/ansible.cfg
+++ b/.circleci/ansible/ansible.cfg
--- a/.circleci/ansible/ansible.ssh.cfg
+++ b/.circleci/ansible/ansible.ssh.cfg
--- a/.circleci/ansible/deploy.yaml
+++ b/.circleci/ansible/deploy.yaml
--- a/.circleci/ansible/get_binaries.sh
+++ b/.circleci/ansible/get_binaries.sh
--- a/.circleci/ansible/neon-stress.hosts
+++ b/.circleci/ansible/neon-stress.hosts
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -1,6 +1,7 @@
 [pageservers]
 #zenith-1-ps-1 console_region_id=1
 zenith-1-ps-2 console_region_id=1
+zenith-1-ps-3 console_region_id=1

 [safekeepers]
 zenith-1-sk-1 console_region_id=1
--- a/.circleci/ansible/scripts/init_pageserver.sh
+++ b/.circleci/ansible/scripts/init_pageserver.sh
--- a/.circleci/ansible/scripts/init_safekeeper.sh
+++ b/.circleci/ansible/scripts/init_safekeeper.sh
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
--- a/.circleci/ansible/systemd/pageserver.service
+++ b/.circleci/ansible/systemd/pageserver.service
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
--- a/.circleci/helm-values/neon-stress.proxy-scram.yaml
+++ b/.circleci/helm-values/neon-stress.proxy-scram.yaml
--- a/.circleci/helm-values/neon-stress.proxy.yaml
+++ b/.circleci/helm-values/neon-stress.proxy.yaml
--- a/.circleci/helm-values/production.proxy-scram.yaml
+++ b/.circleci/helm-values/production.proxy-scram.yaml
--- a/.circleci/helm-values/production.proxy.yaml
+++ b/.circleci/helm-values/production.proxy.yaml
--- a/.circleci/helm-values/staging.proxy-scram.yaml
+++ b/.circleci/helm-values/staging.proxy-scram.yaml
--- a/.circleci/helm-values/staging.proxy.yaml
+++ b/.circleci/helm-values/staging.proxy.yaml
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -104,3 +104,12 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
      run: |
        REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1,9 +1,10 @@
-name: Test
+name: Test and Deploy

 on:
  push:
    branches:
    - main
+    - release
  pull_request:

 defaults:
@@ -11,8 +12,9 @@ defaults:
    shell: bash -ex {0}

 concurrency:
-   group: ${{ github.workflow }}-${{ github.ref }}
-   cancel-in-progress: true
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  cancel-in-progress: true

 env:
  RUST_BACKTRACE: 1
@@ -93,12 +95,17 @@ jobs:
          tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/
          rm -rf ./postgres-artifact/

+      # Don't include the ~/.cargo/registry/src directory. It contains just
+      # uncompressed versions of the crates in ~/.cargo/registry/cache
+      # directory, and it's faster to let 'cargo' to rebuild it from the
+      # compressed crates.
      - name: Cache cargo deps
        id: cache_cargo
        uses: actions/cache@v3
        with:
          path: |
            ~/.cargo/registry/
+            !~/.cargo/registry/src
            ~/.cargo/git/
            target/
          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
@@ -170,14 +177,14 @@ jobs:
            for bin in $test_exe_paths; do
              SRC=$bin
              DST=/tmp/neon/test_bin/$(basename $bin)
-              cp "$SRC" "$DST"
+
+              # We don't need debug symbols for code coverage, so strip them out to make
+              # the artifact smaller.
+              strip "$SRC" -o "$DST"
              echo "$DST" >> /tmp/coverage/binaries.list
            done
          fi

-      - name: Install postgres binaries
-        run: cp -a tmp_install /tmp/neon/pg_install
-
      - name: Prepare neon artifact
        run: tar -C /tmp/neon/ -czf ./neon.tgz .

@@ -298,6 +305,7 @@ jobs:
        with:
          path: |
            ~/.cargo/registry/
+            !~/.cargo/registry/src
            ~/.cargo/git/
            target/
          key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
@@ -390,3 +398,253 @@ jobs:
               \"remote_repo\": \"${{ github.repository }}\"
             }
           }"
+
+  docker-image:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ pg_regress-tests, other-tests ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+        with:
+          driver: docker
+
+      - name: Get build tag
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::$(git rev-list --count HEAD)"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+        id: build-tag
+
+      - name: Get legacy build tag
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::latest
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+        id: legacy-build-tag
+
+      - name: Build neon Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION="${{github.sha}}"
+            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
+            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
+          pull: true
+          push: true
+          tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}}
+
+  docker-image-compute:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ pg_regress-tests, other-tests ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+        with:
+          driver: docker
+
+      - name: Get build tag
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::$(git rev-list --count HEAD)"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+        id: build-tag
+
+      - name: Get legacy build tag
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::latest
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+        id: legacy-build-tag
+
+      - name: Build compute-tools Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION="${{github.sha}}"
+            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
+            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
+          push: false
+          file: Dockerfile.compute-tools
+          tags: neondatabase/compute-tools:local
+
+      - name: Push compute-tools Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION="${{github.sha}}"
+            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
+            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
+          push: true
+          file: Dockerfile.compute-tools
+          tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}}
+
+      - name: Build compute-node Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: ./vendor/postgres/
+          build-args:
+            COMPUTE_TOOLS_TAG=local
+          push: true
+          tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}}
+
+  calculate-deploy-targets:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    outputs:
+      matrix-include: ${{ steps.set-matrix.outputs.include }}
+    steps:
+      - id: set-matrix
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
+            NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
+            echo "::set-output name=include::[$STAGING, $NEON_STRESS]"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
+            echo "::set-output name=include::[$PRODUCTION]"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+
+  deploy:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    # We need both storage **and** compute images for deploy, because control plane
+    # picks the compute version based on the storage version. If it notices a fresh
+    # storage it may bump the compute version. And if compute image failed to build
+    # it may break things badly.
+    needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    strategy:
+      matrix:
+        include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Setup ansible
+        run: |
+          pip install --progress-bar off --user ansible boto3
+
+      - name: Redeploy
+        run: |
+          cd "$(pwd)/.github/ansible"
+
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            ./get_binaries.sh
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            RELEASE=true ./get_binaries.sh
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+
+          eval $(ssh-agent)
+          echo "${{ secrets.TELEPORT_SSH_KEY }}"  | tr -d '\n'| base64 --decode >ssh-key
+          echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+          chmod 0600 ssh-key
+          ssh-add ssh-key
+          rm -f ssh-key ssh-key-cert.pub
+
+          ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
+          rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-proxy:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    # Compute image isn't strictly required for proxy deploy, but let's still wait for it
+    # to run all deploy jobs consistently.
+    needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    strategy:
+      matrix:
+        include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    env:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Store kubeconfig file
+        run: |
+          echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
+          chmod 0600 ${KUBECONFIG}
+
+      - name: Setup helm v3
+        run: |
+          curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+
+      - name: Re-deploy proxy
+        run: |
+          DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
+          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -11,8 +11,9 @@ defaults:
    shell: bash -ex {0}

 concurrency:
-   group: ${{ github.workflow }}-${{ github.ref }}
-   cancel-in-progress: true
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  cancel-in-progress: true

 env:
  RUST_BACKTRACE: 1
@@ -97,6 +98,7 @@ jobs:
        with:
          path: |
            ~/.cargo/registry
+            !~/.cargo/registry/src
            ~/.cargo/git
            target
          key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -13,8 +13,9 @@ on:
  workflow_dispatch:

 concurrency:
-   group: ${{ github.workflow }}-${{ github.ref }}
-   cancel-in-progress: true
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  cancel-in-progress: true

 jobs:
  test-postgres-client-libs:
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/39
+++ b/39
@@ -1,3 +1,8 @@
+ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+
+# Where to install Postgres, default is ./tmp_install, maybe useful for package managers
+POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install
+
 # Seccomp BPF is only available for Linux
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
@@ -55,55 +60,55 @@ zenith: postgres-headers
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

 ### PostgreSQL parts
-tmp_install/build/config.status:
+$(POSTGRES_INSTALL_DIR)/build/config.status:
 	+@echo "Configuring postgres build"
-	mkdir -p tmp_install/build
-	(cd tmp_install/build && \
-	../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build
+	(cd $(POSTGRES_INSTALL_DIR)/build && \
+	$(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
 		$(SECCOMP) \
-		--prefix=$(abspath tmp_install) > configure.log)
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log)

 # nicer alias for running 'configure'
 .PHONY: postgres-configure
-postgres-configure: tmp_install/build/config.status
+postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status

-# Install the PostgreSQL header files into tmp_install/include
+# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include
 .PHONY: postgres-headers
 postgres-headers: postgres-configure
 	+@echo "Installing PostgreSQL headers"
-	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install

 # Compile and install PostgreSQL and contrib/neon
 .PHONY: postgres
 postgres: postgres-configure \
 		  postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
 	+@echo "Compiling PostgreSQL"
-	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
 	+@echo "Compiling contrib/neon"
-	$(MAKE) -C tmp_install/build/contrib/neon install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
 	+@echo "Compiling contrib/neon_test_utils"
-	$(MAKE) -C tmp_install/build/contrib/neon_test_utils install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
 	+@echo "Compiling pg_buffercache"
-	$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect"
-	$(MAKE) -C tmp_install/build/contrib/pageinspect install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install


 .PHONY: postgres-clean
 postgres-clean:
-	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
 clean:
-	cd tmp_install/build && $(MAKE) clean
+	cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean
 	$(CARGO_CMD_PREFIX) cargo clean

 # This removes everything
 .PHONY: distclean
 distclean:
-	rm -rf tmp_install
+	rm -rf $(POSTGRES_INSTALL_DIR)
 	$(CARGO_CMD_PREFIX) cargo clean

 .PHONY: fmt
@@ -112,4 +117,4 @@ fmt:

 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
-	ln -s -f ../../pre-commit.py .git/hooks/pre-commit
+	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -295,7 +295,7 @@ impl ComputeNode {
        handle_roles(&self.spec, &mut client)?;
        handle_databases(&self.spec, &mut client)?;
        handle_role_deletions(self, &mut client)?;
-        handle_grants(&self.spec, &mut client)?;
+        handle_grants(self, &mut client)?;
        create_writablity_check_data(&mut client)?;

        // 'Close' connection
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -349,9 +349,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    Ok(())
 }

-// Grant CREATE ON DATABASE to the database owner
-// to allow clients create trusted extensions.
-pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
+/// to allow users creating trusted extensions and re-creating `public` schema, for example.
+pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
+    let spec = &node.spec;
+
    info!("cluster spec grants:");

    // We now have a separate `web_access` role to connect to the database
@@ -380,5 +382,47 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        client.execute(query.as_str(), &[])?;
    }

+    // Do some per-database access adjustments. We'd better do this at db creation time,
+    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
+    // atomically.
+    let mut db_connstr = node.connstr.clone();
+    for db in &node.spec.cluster.databases {
+        // database name is always the last and the only component of the path
+        db_connstr.set_path(&db.name);
+
+        let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
+
+        // This will only change ownership on the schema itself, not the objects
+        // inside it. Without it owner of the `public` schema will be `cloud_admin`
+        // and database owner cannot do anything with it. SQL procedure ensures
+        // that it won't error out if schema `public` doesn't exist.
+        let alter_query = format!(
+            "DO $$\n\
+                DECLARE\n\
+                    schema_owner TEXT;\n\
+                BEGIN\n\
+                    IF EXISTS(\n\
+                        SELECT nspname\n\
+                        FROM pg_catalog.pg_namespace\n\
+                        WHERE nspname = 'public'\n\
+                    )\n\
+                    THEN\n\
+                        SELECT nspowner::regrole::text\n\
+                            FROM pg_catalog.pg_namespace\n\
+                            WHERE nspname = 'public'\n\
+                            INTO schema_owner;\n\
+                \n\
+                        IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\
+                        THEN\n\
+                            ALTER SCHEMA public OWNER TO {};\n\
+                        END IF;\n\
+                    END IF;\n\
+                END\n\
+            $$;",
+            db.owner.quote()
+        );
+        db_client.simple_query(&alter_query)?;
+    }
+
    Ok(())
 }
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+book
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,14 +0,0 @@
-# Zenith documentation
-
-## Table of contents
-
- [authentication.md](authentication.md) — pageserver JWT authentication.
- [docker.md](docker.md) — Docker images and building pipeline.
- [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [sourcetree.md](sourcetree.md) — Overview of the source tree layout.
- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -0,0 +1,84 @@
+# Summary
+
+[Introduction]()
+- [Separation of Compute and Storage](./separation-compute-storage.md)
+
+# Architecture
+
+- [Compute]()
+  - [WAL proposer]()
+  - [WAL Backpressure]()
+  - [Postgres changes](./core_changes.md)
+
+- [Pageserver](./pageserver.md)
+    - [Services](./pageserver-services.md)
+    - [Thread management](./pageserver-thread-mgmt.md)
+    - [WAL Redo](./pageserver-walredo.md)
+    - [Page cache](./pageserver-pagecache.md)
+    - [Storage](./pageserver-storage.md)
+        - [Datadir mapping]()
+        - [Layer files]()
+        - [Branching]()
+        - [Garbage collection]()
+    - [Cloud Storage]()
+    - [Processing a GetPage request](./pageserver-processing-getpage.md)
+    - [Processing WAL](./pageserver-processing-wal.md)
+	- [Management API]()
+	- [Tenant Rebalancing]()
+
+- [WAL Service](walservice.md)
+  - [Consensus protocol](safekeeper-protocol.md)
+  - [Management API]()
+  - [Rebalancing]()
+
+- [Control Plane]()
+
+- [Proxy]()
+
+- [Source view](./sourcetree.md)
+  - [docker.md](./docker.md) — Docker images and building pipeline.
+  - [Error handling and logging]()
+  - [Testing]()
+    - [Unit testing]()
+    - [Integration testing]()
+    - [Benchmarks]()
+
+
+- [Glossary](./glossary.md)
+
+# Uncategorized
+
+- [authentication.md](./authentication.md)
+- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
+- [settings.md](./settings.md)
+#FIXME: move these under sourcetree.md
+#- [pageserver/README.md](/pageserver/README.md)
+#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md)
+#- [test_runner/README.md](/test_runner/README.md)
+#- [safekeeper/README.md](/safekeeper/README.md)
+
+
+# RFCs
+
+- [RFCs](./rfcs/README.md)
+
+- [002-storage](rfcs/002-storage.md)
+- [003-laptop-cli](rfcs/003-laptop-cli.md)
+- [004-durability](rfcs/004-durability.md)
+- [005-zenith_local](rfcs/005-zenith_local.md)
+- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
+- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
+- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
+- [008-push-pull](rfcs/008-push-pull.md)
+- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
+- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
+- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
+- [010-storage_details](rfcs/010-storage_details.md)
+- [011-retention-policy](rfcs/011-retention-policy.md)
+- [012-background-tasks](rfcs/012-background-tasks.md)
+- [013-term-history](rfcs/013-term-history.md)
+- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
+- [014-storage-lsm](rfcs/014-storage-lsm.md)
+- [015-storage-messaging](rfcs/015-storage-messaging.md)
+- [016-connection-routing](rfcs/016-connection-routing.md)
+- [cluster-size-limits](rfcs/cluster-size-limits.md)
--- a/docs/book.toml
+++ b/docs/book.toml
@@ -0,0 +1,5 @@
+[book]
+language = "en"
+multilingual = false
+src = "."
+title = "Neon architecture"
--- a/docs/core_changes.md
+++ b/docs/core_changes.md
@@ -1,3 +1,12 @@
+# Postgres core changes
+
+This lists all the changes that have been made to the PostgreSQL
+source tree, as a somewhat logical set of patches. The long-term goal
+is to eliminate all these changes, by submitting patches to upstream
+and refactoring code into extensions, so that you can run unmodified
+PostgreSQL against Neon storage.
+
+
 1. Add t_cid to XLOG record
 - Why?
  The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax.
--- a/docs/pageserver-page-service.md
+++ b/docs/pageserver-page-service.md
@@ -0,0 +1,9 @@
+# Page Service
+
+The Page Service listens for GetPage@LSN requests from the Compute Nodes,
+and responds with pages from the repository. On each GetPage@LSN request,
+it calls into the Repository function
+
+A separate thread is spawned for each incoming connection to the page
+service. The page service uses the libpq protocol to communicate with
+the client. The client is a Compute Postgres instance.
--- a/docs/pageserver-pagecache.md
+++ b/docs/pageserver-pagecache.md
@@ -0,0 +1,8 @@
+# Page cache
+
+TODO:
+
+- shared across tenants
+- store pages from layer files
+- store pages from "in-memory layer"
+- store materialized pages
--- a/docs/pageserver-processing-getpage.md
+++ b/docs/pageserver-processing-getpage.md
@@ -0,0 +1,4 @@
+# Processing a GetPage request
+
+TODO:
+- sequence diagram that shows how a GetPage@LSN request is processed
--- a/docs/pageserver-processing-wal.md
+++ b/docs/pageserver-processing-wal.md
@@ -0,0 +1,5 @@
+# Processing WAL
+
+TODO:
+- diagram that shows how incoming WAL is processed
+- explain durability, what is fsync'd when, disk_consistent_lsn
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -1,15 +1,4 @@
-## Page server architecture
-
-The Page Server has a few different duties:
-
- Respond to GetPage@LSN requests from the Compute Nodes
- Receive WAL from WAL safekeeper
- Replay WAL that's applicable to the chunks that the Page Server maintains
- Backup to S3
-
-S3 is the main fault-tolerant storage of all data, as there are no Page Server
-replicas. We use a separate fault-tolerant WAL service to reduce latency. It
-keeps track of WAL records which are not synced to S3 yet.
+# Services

 The Page Server consists of multiple threads that operate on a shared
 repository of page versions:
@@ -21,18 +10,22 @@ repository of page versions:
                                   | WAL receiver |
                                   |              |
                                   +--------------+
-                                                                                 +----+
-                  +---------+                              ..........            |    |
-                  |         |                              .        .            |    |
- GetPage@LSN      |         |                              . backup .  ------->  | S3 |
------------->    |  Page   |         repository           .        .            |    |
-                  | Service |                              ..........            |    |
-   page           |         |                                                    +----+
+                                                                                 ......
+                  +---------+                              +--------+            .    .
+                  |         |                              |        |            .    .
+ GetPage@LSN      |         |                              | backup |  ------->  . S3 .
+------------->    |  Page   |         repository           |        |            .    .
+                  | Service |                              +--------+            .    .
+   page           |         |                                                    ......
 <-------------    |         |
-                  +---------+      +--------------------+
-		                   |   Checkpointing /  |
-				   | Garbage collection |
-                                   +--------------------+
+                  +---------+     +-----------+     +--------------------+
+                                  | WAL redo  |     | Checkpointing,     |
+                  +----------+    | processes |     | Garbage collection |
+                  |          |    +-----------+     +--------------------+
+                  |   HTTP   |
+                  | mgmt API |
+                  |          |
+                  +----------+

 Legend:

@@ -40,28 +33,77 @@ Legend:
 |  |   A thread or multi-threaded service
 +--+

-....
-.  .   Component at its early development phase.
-....
-
 --->   Data flow
 <---
 ```

-Page Service
------------
+## Page Service

 The Page Service listens for GetPage@LSN requests from the Compute Nodes,
-and responds with pages from the repository.
+and responds with pages from the repository. On each GetPage@LSN request,
+it calls into the Repository function
+
+A separate thread is spawned for each incoming connection to the page
+service. The page service uses the libpq protocol to communicate with
+the client. The client is a Compute Postgres instance.
+
+## WAL Receiver
+
+The WAL receiver connects to the external WAL safekeeping service
+using PostgreSQL physical streaming replication, and continuously
+receives WAL. It decodes the WAL records, and stores them to the
+repository.


-WAL Receiver
------------
+## Backup service

-The WAL receiver connects to the external WAL safekeeping service (or
-directly to the primary) using PostgreSQL physical streaming
-replication, and continuously receives WAL. It decodes the WAL records,
-and stores them to the repository.
+The backup service, responsible for storing pageserver recovery data externally.
+
+Currently, pageserver stores its files in a filesystem directory it's pointed to.
+That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
+Therefore, the server interacts with external, more reliable storage to back up and restore its state.
+
+The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait.
+There are the following implementations present:
+* local filesystem — to use in tests mainly
+* AWS S3           - to use in production
+
+Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md).
+
+The backup service is disabled by default and can be enabled to interact with a single remote storage.
+
+CLI examples:
+* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
+* AWS S3  : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
+
+For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
+For local S3 installations, refer to the their documentation for name format and credentials.
+
+Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
+Required sections are:
+
+```toml
+[remote_storage]
+local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
+```
+
+or
+
+```toml
+[remote_storage]
+bucket_name = 'some-sample-bucket'
+bucket_region = 'eu-north-1'
+prefix_in_bucket = '/test_prefix/'
+```
+
+`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.
+
+
+## Repository background tasks
+
+The Repository also has a few different background threads and tokio tasks that perform
+background duties like dumping accumulated WAL data from memory to disk, reorganizing
+files for performance (compaction), and garbage collecting old files.


 Repository
@@ -116,48 +158,6 @@ Remove old on-disk layer files that are no longer needed according to the
 PITR retention policy


-### Backup service
-
-The backup service, responsible for storing pageserver recovery data externally.
-
-Currently, pageserver stores its files in a filesystem directory it's pointed to.
-That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
-Therefore, the server interacts with external, more reliable storage to back up and restore its state.
-
-The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait.
-There are the following implementations present:
-* local filesystem — to use in tests mainly
-* AWS S3           - to use in production
-
-Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md).
-
-The backup service is disabled by default and can be enabled to interact with a single remote storage.
-
-CLI examples:
-* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
-* AWS S3  : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
-
-For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
-For local S3 installations, refer to the their documentation for name format and credentials.
-
-Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
-Required sections are:
-
-```toml
-[remote_storage]
-local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
-```
-
-or
-
-```toml
-[remote_storage]
-bucket_name = 'some-sample-bucket'
-bucket_region = 'eu-north-1'
-prefix_in_bucket = '/test_prefix/'
-```
-
-`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

 TODO: Sharding
 --------------------
--- a/pageserver/src/layered_repository/README.md
+++ b/pageserver/src/layered_repository/README.md
@@ -1,4 +1,4 @@
-# Overview
+# Pageserver storage

 The main responsibility of the Page Server is to process the incoming WAL, and
 reprocess it into a format that allows reasonably quick access to any page
--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -0,0 +1,26 @@
+## Thread management
+
+Each thread in the system is tracked by the `thread_mgr` module. It
+maintains a registry of threads, and which tenant or timeline they are
+operating on. This is used for safe shutdown of a tenant, or the whole
+system.
+
+### Handling shutdown
+
+When a tenant or timeline is deleted, we need to shut down all threads
+operating on it, before deleting the data on disk. A thread registered
+in the thread registry can check if it has been requested to shut down,
+by calling `is_shutdown_requested()`. For async operations, there's also
+a `shudown_watcher()` async task that can be used to wake up on shutdown.
+
+### Sync vs async
+
+The primary programming model in the page server is synchronous,
+blocking code. However, there are some places where async code is
+used. Be very careful when mixing sync and async code.
+
+Async is primarily used to wait for incoming data on network
+connections. For example, all WAL receivers have a shared thread pool,
+with one async Task for each connection. Once a piece of WAL has been
+received from the network, the thread calls the blocking functions in
+the Repository to process the WAL.
--- a/docs/pageserver-walredo.md
+++ b/docs/pageserver-walredo.md
@@ -0,0 +1,77 @@
+# WAL Redo
+
+To reconstruct a particular page version from an image of the page and
+some WAL records, the pageserver needs to replay the WAL records. This
+happens on-demand, when a GetPage@LSN request comes in, or as part of
+background jobs that reorganize data for faster access.
+
+It's important that data cannot leak from one tenant to another, and
+that a corrupt WAL record on one timeline doesn't affect other tenants
+or timelines.
+
+## Multi-tenant security
+
+If you have direct access to the WAL directory, or if you have
+superuser access to a running PostgreSQL server, it's easy to
+construct a malicious or corrupt WAL record that causes the WAL redo
+functions to crash, or to execute arbitrary code. That is not a
+security problem for PostgreSQL; if you have superuser access, you
+have full access to the system anyway.
+
+The Neon pageserver, however, is multi-tenant. It needs to execute WAL
+belonging to different tenants in the same system, and malicious WAL
+in one tenant must not affect other tenants.
+
+A separate WAL redo process is launched for each tenant, and the
+process uses the seccomp(2) system call to restrict its access to the
+bare minimum needed to replay WAL records. The process does not have
+access to the filesystem or network. It can only communicate with the
+parent pageserver process through a pipe.
+
+If an attacker creates a malicious WAL record and injects it into the
+WAL stream of a timeline, he can take control of the WAL redo process
+in the pageserver. However, the WAL redo process cannot access the
+rest of the system. And because there is a separate WAL redo process
+for each tenant, the hijacked WAL redo process can only see WAL and
+data belonging to the same tenant, which the attacker would have
+access to anyway.
+
+## WAL-redo process communication
+
+The WAL redo process runs the 'postgres' executable, launched with a
+Neon-specific command-line option to put it into WAL-redo process
+mode.  The pageserver controls the lifetime of the WAL redo processes,
+launching them as needed. If a tenant is detached from the pageserver,
+any WAL redo processes for that tenant are killed.
+
+The pageserver communicates with each WAL redo process over its
+stdin/stdout/stderr. It works in request-response model with a simple
+custom protocol, described in walredo.rs. To replay a set of WAL
+records for a page, the pageserver sends the "before" image of the
+page and the WAL records over 'stdin', followed by a command to
+perform the replay. The WAL redo process responds with an "after"
+image of the page.
+
+## Special handling of some records
+
+Some WAL record types are handled directly in the pageserver, by
+bespoken Rust code, and are not sent over to the WAL redo process.
+This includes SLRU-related WAL records, like commit records. SLRUs
+don't use the standard Postgres buffer manager, so dealing with them
+in the Neon WAL redo mode would require quite a few changes to
+Postgres code and special handling in the protocol anyway.
+
+Some record types that include a full-page-image (e.g. XLOG_FPI) are
+also handled specially when incoming WAL is processed already, and are
+stored as page images rather than WAL records.
+
+
+## Records that modify multiple pages
+
+Some Postgres WAL records modify multiple pages. Such WAL records are
+duplicated, so that a copy is stored for each affected page. This is
+somewhat wasteful, but because most WAL records only affect one page,
+the overhead is acceptable.
+
+The WAL redo always happens for one particular page. If the WAL record
+coantains changes to other pages, they are ignored.
--- a/docs/pageserver.md
+++ b/docs/pageserver.md
@@ -0,0 +1,11 @@
+# Page server architecture
+
+The Page Server has a few different duties:
+
+- Respond to GetPage@LSN requests from the Compute Nodes
+- Receive WAL from WAL safekeeper, and store it
+- Upload data to S3 to make it durable, download files from S3 as needed
+
+S3 is the main fault-tolerant storage of all data, as there are no Page Server
+replicas. We use a separate fault-tolerant WAL service to reduce latency. It
+keeps track of WAL records which are not synced to S3 yet.
--- a/docs/safekeeper-protocol.md
+++ b/docs/safekeeper-protocol.md
--- a/docs/separation-compute-storage.md
+++ b/docs/separation-compute-storage.md
@@ -0,0 +1,8 @@
+# Separation of Compute and Storage
+
+TODO:
+
+- Read path
+- Write path
+- Durability model
+- API auth
--- a/safekeeper/README.md
+++ b/safekeeper/README.md
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -2,6 +2,7 @@ extern crate bindgen;

 use std::env;
 use std::path::PathBuf;
+use std::process::Command;

 use bindgen::callbacks::ParseCallbacks;

@@ -45,6 +46,43 @@ fn main() {
    // Tell cargo to invalidate the built crate whenever the wrapper changes
    println!("cargo:rerun-if-changed=pg_control_ffi.h");

+    // Finding the location of C headers for the Postgres server:
+    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/tmp_install`
+    // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/tmp_install/include/postgresql/server`
+    let mut pg_install_dir: PathBuf;
+    if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
+        pg_install_dir = postgres_install_dir.into();
+    } else {
+        pg_install_dir = PathBuf::from("tmp_install")
+    }
+
+    if pg_install_dir.is_relative() {
+        let cwd = env::current_dir().unwrap();
+        pg_install_dir = cwd.join("..").join("..").join(pg_install_dir);
+    }
+
+    let pg_config_bin = pg_install_dir.join("bin").join("pg_config");
+    let inc_server_path: String = if pg_config_bin.exists() {
+        let output = Command::new(pg_config_bin)
+            .arg("--includedir-server")
+            .output()
+            .expect("failed to execute `pg_config --includedir-server`");
+
+        if !output.status.success() {
+            panic!("`pg_config --includedir-server` failed")
+        }
+
+        String::from_utf8(output.stdout).unwrap().trim_end().into()
+    } else {
+        pg_install_dir
+            .join("include")
+            .join("postgresql")
+            .join("server")
+            .into_os_string()
+            .into_string()
+            .unwrap()
+    };
+
    // The bindgen::Builder is the main entry point
    // to bindgen, and lets you build up options for
    // the resulting bindings.
@@ -81,15 +119,7 @@ fn main() {
        // explicit padding fields.
        .explicit_padding(true)
        //
-        // Path the server include dir. It is in tmp_install/include/server, if you did
-        // "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
-        // and used DESTDIR to move it into tmp_install, then it's in
-        // tmp_install/include/postgres/server
-        // 'pg_config --includedir-server' would perhaps be the more proper way to find it,
-        // but this will do for now.
-        //
-        .clang_arg("-I../../tmp_install/include/server")
-        .clang_arg("-I../../tmp_install/include/postgresql/server")
+        .clang_arg(format!("-I{inc_server_path}"))
        //
        // Finish the builder and generate the bindings.
        //
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -15,6 +15,7 @@ use crate::XLogPageHeaderData;
 use crate::XLogRecord;
 use crate::XLOG_PAGE_MAGIC;

+use crate::pg_constants::WAL_SEGMENT_SIZE;
 use anyhow::{bail, ensure};
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::BytesMut;
@@ -461,8 +462,7 @@ pub fn find_end_of_wal(
 pub fn main() {
    let mut data_dir = PathBuf::new();
    data_dir.push(".");
-    let wal_seg_size = 16 * 1024 * 1024;
-    let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true, Lsn(0)).unwrap();
+    let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap();
    println!(
        "wal_end={:>08X}{:>08X}, tli={}",
        (wal_end >> 32) as u32,
@@ -606,10 +606,9 @@ mod tests {
    fn test_end_of_wal<C: wal_craft::Crafter>(
        test_name: &str,
        expected_end_of_wal_non_partial: Lsn,
-        last_segment: &str,
    ) {
        use wal_craft::*;
-        // 1. Generate some WAL
+        // Craft some WAL
        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("..")
            .join("..");
@@ -622,24 +621,71 @@ mod tests {
        }
        cfg.initdb().unwrap();
        let srv = cfg.start_server().unwrap();
-        let expected_wal_end: Lsn =
-            u64::from(C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
+        let (intermediate_lsns, expected_end_of_wal_partial) =
+            C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
+        let intermediate_lsns: Vec<Lsn> = intermediate_lsns
+            .iter()
+            .map(|&lsn| u64::from(lsn).into())
+            .collect();
+        let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into();
        srv.kill();

-        // 2. Pick WAL generated by initdb
-        let wal_dir = cfg.datadir.join("pg_wal");
-        let wal_seg_size = 16 * 1024 * 1024;
+        // Check find_end_of_wal on the initial WAL
+        let last_segment = cfg
+            .wal_dir()
+            .read_dir()
+            .unwrap()
+            .map(|f| f.unwrap().file_name().into_string().unwrap())
+            .filter(|fname| IsXLogFileName(fname))
+            .max()
+            .unwrap();
+        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial);
+        for start_lsn in std::iter::once(Lsn(0))
+            .chain(intermediate_lsns)
+            .chain(std::iter::once(expected_end_of_wal_partial))
+        {
+            // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
+            // We assume that `start_lsn` is non-decreasing.
+            info!(
+                "Checking with start_lsn={}, erasing WAL before it",
+                start_lsn
+            );
+            for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
+                let fname = file.file_name().into_string().unwrap();
+                if !IsXLogFileName(&fname) {
+                    continue;
+                }
+                let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
+                let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+                if seg_start_lsn > u64::from(start_lsn) {
+                    continue;
+                }
+                let mut f = File::options().write(true).open(file.path()).unwrap();
+                const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
+                f.write_all(
+                    &ZEROS[0..min(
+                        WAL_SEGMENT_SIZE,
+                        (u64::from(start_lsn) - seg_start_lsn) as usize,
+                    )],
+                )
+                .unwrap();
+            }
+            check_end_of_wal(
+                &cfg,
+                &last_segment,
+                start_lsn,
+                expected_end_of_wal_non_partial,
+                expected_end_of_wal_partial,
+            );
+        }
+    }

-        // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
-        let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
-        let wal_end = Lsn(wal_end);
-        info!(
-            "find_end_of_wal returned (wal_end={}, tli={})",
-            wal_end, tli
-        );
-        assert_eq!(wal_end, expected_end_of_wal_non_partial);
-
-        // 4. Get the actual end of WAL by pg_waldump
+    fn check_pg_waldump_end_of_wal(
+        cfg: &wal_craft::Conf,
+        last_segment: &str,
+        expected_end_of_wal: Lsn,
+    ) {
+        // Get the actual end of WAL by pg_waldump
        let waldump_output = cfg
            .pg_waldump("000000010000000000000001", last_segment)
            .unwrap()
@@ -658,32 +704,57 @@ mod tests {
        let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
        info!(
            "waldump erred on {}, expected wal end at {}",
-            waldump_wal_end, expected_wal_end
+            waldump_wal_end, expected_end_of_wal
        );
-        assert_eq!(waldump_wal_end, expected_wal_end);
+        assert_eq!(waldump_wal_end, expected_end_of_wal);
+    }

-        // 5. Rename file to partial to actually find last valid lsn
-        fs::rename(
-            wal_dir.join(last_segment),
-            wal_dir.join(format!("{}.partial", last_segment)),
-        )
-        .unwrap();
-        let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
+    fn check_end_of_wal(
+        cfg: &wal_craft::Conf,
+        last_segment: &str,
+        start_lsn: Lsn,
+        expected_end_of_wal_non_partial: Lsn,
+        expected_end_of_wal_partial: Lsn,
+    ) {
+        // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
+        let (wal_end, tli) =
+            find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
        let wal_end = Lsn(wal_end);
        info!(
-            "find_end_of_wal returned (wal_end={}, tli={})",
+            "find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment",
            wal_end, tli
        );
-        assert_eq!(wal_end, waldump_wal_end);
+        assert_eq!(wal_end, expected_end_of_wal_non_partial);
+
+        // Rename file to partial to actually find last valid lsn, then rename it back.
+        fs::rename(
+            cfg.wal_dir().join(&last_segment),
+            cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        )
+        .unwrap();
+        let (wal_end, tli) =
+            find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
+        let wal_end = Lsn(wal_end);
+        info!(
+            "find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment",
+            wal_end, tli
+        );
+        assert_eq!(wal_end, expected_end_of_wal_partial);
+        fs::rename(
+            cfg.wal_dir().join(format!("{}.partial", last_segment)),
+            cfg.wal_dir().join(last_segment),
+        )
+        .unwrap();
    }

+    const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
+
    #[test]
    pub fn test_find_end_of_wal_simple() {
        init_logging();
        test_end_of_wal::<wal_craft::Simple>(
            "test_find_end_of_wal_simple",
            "0/2000000".parse::<Lsn>().unwrap(),
-            "000000010000000000000001",
        );
    }

@@ -693,7 +764,6 @@ mod tests {
        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
            "0/3000000".parse::<Lsn>().unwrap(),
-            "000000010000000000000002",
        );
    }

@@ -704,7 +774,6 @@ mod tests {
        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
            "test_find_end_of_wal_last_crossing_segment",
            "0/3000000".parse::<Lsn>().unwrap(),
-            "000000010000000000000002",
        );
    }

--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -55,7 +55,7 @@ fn main() -> Result<()> {
        .get_matches();

    let wal_craft = |arg_matches: &ArgMatches, client| {
-        let lsn = match arg_matches.value_of("type").unwrap() {
+        let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() {
            Simple::NAME => Simple::craft(client)?,
            LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
            LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
@@ -67,7 +67,10 @@ fn main() -> Result<()> {
            LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
            a => panic!("Unknown --type argument: {}", a),
        };
-        println!("end_of_wal = {}", lsn);
+        for lsn in intermediate_lsns {
+            println!("intermediate_lsn = {}", lsn);
+        }
+        println!("end_of_wal = {}", end_of_wal_lsn);
        Ok(())
    };

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -4,6 +4,7 @@ use log::*;
 use once_cell::sync::Lazy;
 use postgres::types::PgLsn;
 use postgres::Client;
+use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE;
 use postgres_ffi::xlog_utils::{
    XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
 };
@@ -45,6 +46,10 @@ impl Conf {
        self.pg_distrib_dir.join("lib")
    }

+    pub fn wal_dir(&self) -> PathBuf {
+        self.datadir.join("pg_wal")
+    }
+
    fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
        let path = self.pg_bin_dir().join(command);
        ensure!(path.exists(), "Command {:?} does not exist", path);
@@ -211,7 +216,7 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result
        "Unexpected wal_segment_size unit"
    );
    ensure!(
-        wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024,
+        wal_segment_size.get::<_, i64>("setting") == WAL_SEGMENT_SIZE as i64,
        "Unexpected wal_segment_size in bytes"
    );

@@ -221,20 +226,24 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result
 pub trait Crafter {
    const NAME: &'static str;

-    /// Generates WAL using the client `client`. Returns the expected end-of-wal LSN.
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn>;
+    /// Generates WAL using the client `client`. Returns a pair of:
+    /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
+    ///   May include or exclude Lsn(0) and the end-of-wal.
+    /// * The expected end-of-wal LSN.
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)>;
 }

 fn craft_internal<C: postgres::GenericClient>(
    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
-) -> Result<PgLsn> {
+    f: impl Fn(&mut C, PgLsn) -> Result<(Vec<PgLsn>, Option<PgLsn>)>,
+) -> Result<(Vec<PgLsn>, PgLsn)> {
    ensure_server_config(client)?;

    let initial_lsn = client.pg_current_wal_insert_lsn()?;
    info!("LSN initial = {}", initial_lsn);

-    let last_lsn = match f(client, initial_lsn)? {
+    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
+    let last_lsn = match last_lsn {
        None => client.pg_current_wal_insert_lsn()?,
        Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
            Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
@@ -242,6 +251,9 @@ fn craft_internal<C: postgres::GenericClient>(
            Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
        },
    };
+    if !intermediate_lsns.starts_with(&[initial_lsn]) {
+        intermediate_lsns.insert(0, initial_lsn);
+    }

    // Some records may be not flushed, e.g. non-transactional logical messages.
    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
@@ -250,16 +262,16 @@ fn craft_internal<C: postgres::GenericClient>(
        Ordering::Equal => {}
        Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
    }
-    Ok(last_lsn)
+    Ok((intermediate_lsns, last_lsn))
 }

 pub struct Simple;
 impl Crafter for Simple {
    const NAME: &'static str = "simple";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
        craft_internal(client, |client, _| {
            client.execute("CREATE table t(x int)", &[])?;
-            Ok(None)
+            Ok((Vec::new(), None))
        })
    }
 }
@@ -267,12 +279,13 @@ impl Crafter for Simple {
 pub struct LastWalRecordXlogSwitch;
 impl Crafter for LastWalRecordXlogSwitch {
    const NAME: &'static str = "last_wal_record_xlog_switch";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;

        client.execute("CREATE table t(x int)", &[])?;
+        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
        let next_segment = PgLsn::from(0x0200_0000);
        ensure!(
@@ -281,14 +294,14 @@ impl Crafter for LastWalRecordXlogSwitch {
            after_xlog_switch,
            next_segment
        );
-        Ok(next_segment)
+        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
 }

 pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
 impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -334,6 +347,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
        );

        // Emit the XLOG_SWITCH
+        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
        let next_segment = PgLsn::from(0x0200_0000);
        ensure!(
@@ -347,14 +361,14 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
            "XLOG_SWITCH message ended not on page boundary: {}",
            after_xlog_switch
        );
-        Ok(next_segment)
+        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
 }

 fn craft_single_logical_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
-) -> Result<PgLsn> {
+) -> Result<(Vec<PgLsn>, PgLsn)> {
    craft_internal(client, |client, initial_lsn| {
        ensure!(
            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -386,9 +400,9 @@ fn craft_single_logical_message(
                message_lsn < after_message_lsn,
                "No record found after the emitted message"
            );
-            Ok(Some(after_message_lsn))
+            Ok((vec![message_lsn], Some(after_message_lsn)))
        } else {
-            Ok(Some(message_lsn))
+            Ok((Vec::new(), Some(message_lsn)))
        }
    })
 }
@@ -396,7 +410,7 @@ fn craft_single_logical_message(
 pub struct WalRecordCrossingSegmentFollowedBySmallOne;
 impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
        craft_single_logical_message(client, true)
    }
 }
@@ -404,7 +418,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
 pub struct LastWalRecordCrossingSegment;
 impl Crafter for LastWalRecordCrossingSegment {
    const NAME: &'static str = "last_wal_record_crossing_segment";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
        craft_single_logical_message(client, false)
    }
 }
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -1768,24 +1768,23 @@ impl LayeredTimeline {

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
    fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
-        let layer_paths_to_upload;
-
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
        // repository have the same LSN.
        let lsn_range = frozen_layer.get_lsn_range();
-        if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
+        let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn
+            && lsn_range.end == Lsn(self.initdb_lsn.0 + 1)
+        {
            let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?;
            let (partitioning, _lsn) =
                pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?;
-            layer_paths_to_upload =
-                self.create_image_layers(&partitioning, self.initdb_lsn, true)?;
+            self.create_image_layers(&partitioning, self.initdb_lsn, true)?
        } else {
            // normal case, write out a L0 delta layer file.
            let delta_path = self.create_delta_layer(&frozen_layer)?;
-            layer_paths_to_upload = HashSet::from([delta_path]);
-        }
+            HashSet::from([delta_path])
+        };

        fail_point!("flush-frozen-before-sync");

--- a/pageserver/src/storage_sync.rs
+++ b/pageserver/src/storage_sync.rs
@@ -928,7 +928,7 @@ fn storage_sync_loop<P, S>(
                    );
                    let mut sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>> =
                        HashMap::new();
-                    let index_accessor = runtime.block_on(index.write());
+                    let index_accessor = runtime.block_on(index.read());
                    for tenant_id in updated_tenants {
                        let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
                            Some(tenant_entry) => tenant_entry,
@@ -1557,6 +1557,7 @@ fn schedule_first_sync_tasks(
    local_timeline_init_statuses
 }

+/// bool in return value stands for awaits_download
 fn compare_local_and_remote_timeline(
    new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>,
    sync_id: ZTenantTimelineId,
@@ -1566,14 +1567,6 @@ fn compare_local_and_remote_timeline(
 ) -> (LocalTimelineInitStatus, bool) {
    let remote_files = remote_entry.stored_files();

-    // TODO probably here we need more sophisticated logic,
-    //   if more data is available remotely can we just download what's there?
-    //   without trying to upload something. It may be tricky, needs further investigation.
-    //   For now looks strange that we can request upload
-    //   and download for the same timeline simultaneously.
-    //   (upload needs to be only for previously unsynced files, not whole timeline dir).
-    //   If one of the tasks fails they will be reordered in the queue which can lead
-    //   to timeline being stuck in evicted state
    let number_of_layers_to_download = remote_files.difference(&local_files).count();
    let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 {
        new_sync_tasks.push_back((
--- a/pageserver/src/storage_sync/download.rs
+++ b/pageserver/src/storage_sync/download.rs
@@ -3,12 +3,13 @@
 use std::{
    collections::{HashMap, HashSet},
    fmt::Debug,
+    mem,
    path::Path,
 };

 use anyhow::Context;
 use futures::stream::{FuturesUnordered, StreamExt};
-use remote_storage::{path_with_suffix_extension, RemoteObjectName, RemoteStorage};
+use remote_storage::{path_with_suffix_extension, DownloadError, RemoteObjectName, RemoteStorage};
 use tokio::{
    fs,
    io::{self, AsyncWriteExt},
@@ -27,28 +28,50 @@ use super::{

 pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";

-/// FIXME: Needs cleanup. Currently it swallows errors. Here we need to ensure that
-/// we successfully downloaded all metadata parts for one tenant.
-/// And successful includes absence of index_part in the remote. Because it is valid situation
-/// when timeline was just created and pageserver restarted before upload of index part was completed.
-/// But currently RemoteStorage interface does not provide this knowledge because it uses
-/// anyhow::Error as an error type. So this needs a refactoring.
-///
-/// In other words we need to yield only complete sets of tenant timelines.
-/// Failure for one timeline of a tenant should exclude whole tenant from returned hashmap.
-/// So there are two requirements: keep everything in one futures unordered
-/// to allow higher concurrency. Mark tenants as failed independently.
-/// That requires some bookeeping.
+// We collect timelines remotely available for each tenant
+// in case we failed to gather all index parts (due to an error)
+// Poisoned variant is returned.
+// When data is received succesfully without errors Present variant is used.
+pub enum TenantIndexParts {
+    Poisoned {
+        present: HashMap<ZTimelineId, IndexPart>,
+        missing: HashSet<ZTimelineId>,
+    },
+    Present(HashMap<ZTimelineId, IndexPart>),
+}
+
+impl TenantIndexParts {
+    fn add_poisoned(&mut self, timeline_id: ZTimelineId) {
+        match self {
+            TenantIndexParts::Poisoned { missing, .. } => {
+                missing.insert(timeline_id);
+            }
+            TenantIndexParts::Present(present) => {
+                *self = TenantIndexParts::Poisoned {
+                    present: mem::take(present),
+                    missing: HashSet::from([timeline_id]),
+                }
+            }
+        }
+    }
+}
+
+impl Default for TenantIndexParts {
+    fn default() -> Self {
+        TenantIndexParts::Present(HashMap::default())
+    }
+}
+
 pub async fn download_index_parts<P, S>(
    conf: &'static PageServerConf,
    storage: &S,
    keys: HashSet<ZTenantTimelineId>,
-) -> HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>>
+) -> HashMap<ZTenantId, TenantIndexParts>
 where
    P: Debug + Send + Sync + 'static,
    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
 {
-    let mut index_parts: HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>> = HashMap::new();
+    let mut index_parts: HashMap<ZTenantId, TenantIndexParts> = HashMap::new();

    let mut part_downloads = keys
        .into_iter()
@@ -59,12 +82,29 @@ where
        match part_upload_result {
            Ok(index_part) => {
                debug!("Successfully fetched index part for {id}");
-                index_parts
-                    .entry(id.tenant_id)
-                    .or_default()
-                    .insert(id.timeline_id, index_part);
+                match index_parts.entry(id.tenant_id).or_default() {
+                    TenantIndexParts::Poisoned { present, .. } => {
+                        present.insert(id.timeline_id, index_part);
+                    }
+                    TenantIndexParts::Present(parts) => {
+                        parts.insert(id.timeline_id, index_part);
+                    }
+                }
+            }
+            Err(download_error) => {
+                match download_error {
+                    DownloadError::NotFound => {
+                        // thats ok because it means that we didnt upload something we have locally for example
+                    }
+                    e => {
+                        let tenant_parts = index_parts.entry(id.tenant_id).or_default();
+                        tenant_parts.add_poisoned(id.timeline_id);
+                        error!(
+                            "Failed to fetch index part for {id}: {e} poisoning tenant index parts"
+                        );
+                    }
+                }
            }
-            Err(e) => error!("Failed to fetch index part for {id}: {e}"),
        }
    }

@@ -119,12 +159,16 @@ where
        });
    }

-    download_index_parts(conf, storage, sync_ids)
+    match download_index_parts(conf, storage, sync_ids)
        .await
        .remove(&tenant_id)
-        .ok_or(anyhow::anyhow!(
-            "Missing tenant index parts. This is a bug."
-        ))
+        .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))?
+    {
+        TenantIndexParts::Poisoned { missing, .. } => {
+            anyhow::bail!("Failed to download index parts for all timelines. Missing {missing:?}")
+        }
+        TenantIndexParts::Present(parts) => Ok(parts),
+    }
 }

 /// Retrieves index data from the remote storage for a given timeline.
@@ -132,7 +176,7 @@ async fn download_index_part<P, S>(
    conf: &'static PageServerConf,
    storage: &S,
    sync_id: ZTenantTimelineId,
-) -> anyhow::Result<IndexPart>
+) -> Result<IndexPart, DownloadError>
 where
    P: Debug + Send + Sync + 'static,
    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -147,15 +191,11 @@ where
                "Failed to get the index part storage path for local path '{}'",
                index_part_path.display()
            )
-        })?;
+        })
+        .map_err(DownloadError::BadInput)?;
+
+    let mut index_part_download = storage.download(&part_storage_path).await?;

-    let mut index_part_download =
-        storage
-            .download(&part_storage_path)
-            .await
-            .with_context(|| {
-                format!("Failed to open download stream for for storage path {part_storage_path:?}")
-            })?;
    let mut index_part_bytes = Vec::new();
    io::copy(
        &mut index_part_download.download_stream,
@@ -164,11 +204,16 @@ where
    .await
    .with_context(|| {
        format!("Failed to download an index part from storage path {part_storage_path:?}")
-    })?;
+    })
+    .map_err(DownloadError::Other)?;

-    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| {
-        format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'")
-    })?;
+    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
+        .with_context(|| {
+            format!(
+                "Failed to deserialize index part file from storage path '{part_storage_path:?}'"
+            )
+        })
+        .map_err(DownloadError::Other)?;

    let missing_files = index_part.missing_files();
    if !missing_files.is_empty() {
--- a/pageserver/src/storage_sync/index.rs
+++ b/pageserver/src/storage_sync/index.rs
@@ -13,6 +13,7 @@ use anyhow::{anyhow, Context, Ok};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use tokio::sync::RwLock;
+use tracing::log::warn;

 use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata};
 use utils::{
@@ -20,6 +21,8 @@ use utils::{
    zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
 };

+use super::download::TenantIndexParts;
+
 /// A part of the filesystem path, that needs a root to become a path again.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
 #[serde(transparent)]
@@ -88,21 +91,27 @@ pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);
 impl RemoteIndex {
    pub fn from_parts(
        conf: &'static PageServerConf,
-        index_parts: HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>>,
+        index_parts: HashMap<ZTenantId, TenantIndexParts>,
    ) -> anyhow::Result<Self> {
        let mut entries: HashMap<ZTenantId, TenantEntry> = HashMap::new();

-        for (tenant_id, timelines) in index_parts {
-            for (timeline_id, index_part) in timelines {
-                let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
-                let remote_timeline =
-                    RemoteTimeline::from_index_part(&timeline_path, index_part)
-                        .context("Failed to restore remote timeline data from index part")?;
+        for (tenant_id, index_parts) in index_parts {
+            match index_parts {
+                // TODO: should we schedule a retry so it can be recovered? otherwise we can revive it only through detach/attach or pageserver restart
+                TenantIndexParts::Poisoned { missing, ..} => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline(s): {missing:?}"),
+                TenantIndexParts::Present(timelines) => {
+                    for (timeline_id, index_part) in timelines {
+                        let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
+                        let remote_timeline =
+                            RemoteTimeline::from_index_part(&timeline_path, index_part)
+                                .context("Failed to restore remote timeline data from index part")?;

-                entries
-                    .entry(tenant_id)
-                    .or_default()
-                    .insert(timeline_id, remote_timeline);
+                        entries
+                            .entry(tenant_id)
+                            .or_default()
+                            .insert(timeline_id, remote_timeline);
+                    }
+                },
            }
        }

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -623,6 +623,7 @@ impl PostgresRedoProcess {
            .env_clear()
            .env("LD_LIBRARY_PATH", conf.pg_lib_dir())
            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
+            .close_fds()
            .output()
            .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?;

--- a/poetry.lock
+++ b/poetry.lock
@@ -544,20 +544,21 @@ test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pr

 [[package]]
 name = "docker"
-version = "5.0.3"
+version = "4.2.2"
 description = "A Python library for the Docker Engine API."
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"

 [package.dependencies]
-pywin32 = {version = "227", markers = "sys_platform == \"win32\""}
+pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""}
 requests = ">=2.14.2,<2.18.0 || >2.18.0"
+six = ">=1.4.0"
 websocket-client = ">=0.32.0"

 [package.extras]
 ssh = ["paramiko (>=2.4.2)"]
-tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=3.4.7)", "idna (>=2.0.0)"]
+tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"]

 [[package]]
 name = "ecdsa"
@@ -1003,6 +1004,17 @@ python-versions = ">=3.6"
 [package.extras]
 diagrams = ["jinja2", "railroad-diagrams"]

+[[package]]
+name = "pypiwin32"
+version = "223"
+description = ""
+category = "main"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+pywin32 = ">=223"
+
 [[package]]
 name = "pyrsistent"
 version = "0.18.1"
@@ -1124,7 +1136,7 @@ python-versions = "*"

 [[package]]
 name = "pywin32"
-version = "227"
+version = "301"
 description = "Python for Window Extensions"
 category = "main"
 optional = false
@@ -1501,8 +1513,8 @@ cryptography = [
    {file = "cryptography-36.0.1.tar.gz", hash = "sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638"},
 ]
 docker = [
-    {file = "docker-5.0.3-py2.py3-none-any.whl", hash = "sha256:7a79bb439e3df59d0a72621775d600bc8bc8b422d285824cb37103eab91d1ce0"},
-    {file = "docker-5.0.3.tar.gz", hash = "sha256:d916a26b62970e7c2f554110ed6af04c7ccff8e9f81ad17d0d40c75637e227fb"},
+    {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"},
+    {file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"},
 ]
 ecdsa = [
    {file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"},
@@ -1802,6 +1814,10 @@ pyparsing = [
    {file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"},
    {file = "pyparsing-3.0.6.tar.gz", hash = "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"},
 ]
+pypiwin32 = [
+    {file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"},
+    {file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"},
+]
 pyrsistent = [
    {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"},
    {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"},
@@ -1858,18 +1874,16 @@ pytz = [
    {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"},
 ]
 pywin32 = [
-    {file = "pywin32-227-cp27-cp27m-win32.whl", hash = "sha256:371fcc39416d736401f0274dd64c2302728c9e034808e37381b5e1b22be4a6b0"},
-    {file = "pywin32-227-cp27-cp27m-win_amd64.whl", hash = "sha256:4cdad3e84191194ea6d0dd1b1b9bdda574ff563177d2adf2b4efec2a244fa116"},
-    {file = "pywin32-227-cp35-cp35m-win32.whl", hash = "sha256:f4c5be1a293bae0076d93c88f37ee8da68136744588bc5e2be2f299a34ceb7aa"},
-    {file = "pywin32-227-cp35-cp35m-win_amd64.whl", hash = "sha256:a929a4af626e530383a579431b70e512e736e9588106715215bf685a3ea508d4"},
-    {file = "pywin32-227-cp36-cp36m-win32.whl", hash = "sha256:300a2db938e98c3e7e2093e4491439e62287d0d493fe07cce110db070b54c0be"},
-    {file = "pywin32-227-cp36-cp36m-win_amd64.whl", hash = "sha256:9b31e009564fb95db160f154e2aa195ed66bcc4c058ed72850d047141b36f3a2"},
-    {file = "pywin32-227-cp37-cp37m-win32.whl", hash = "sha256:47a3c7551376a865dd8d095a98deba954a98f326c6fe3c72d8726ca6e6b15507"},
-    {file = "pywin32-227-cp37-cp37m-win_amd64.whl", hash = "sha256:31f88a89139cb2adc40f8f0e65ee56a8c585f629974f9e07622ba80199057511"},
-    {file = "pywin32-227-cp38-cp38-win32.whl", hash = "sha256:7f18199fbf29ca99dff10e1f09451582ae9e372a892ff03a28528a24d55875bc"},
-    {file = "pywin32-227-cp38-cp38-win_amd64.whl", hash = "sha256:7c1ae32c489dc012930787f06244426f8356e129184a02c25aef163917ce158e"},
-    {file = "pywin32-227-cp39-cp39-win32.whl", hash = "sha256:c054c52ba46e7eb6b7d7dfae4dbd987a1bb48ee86debe3f245a2884ece46e295"},
-    {file = "pywin32-227-cp39-cp39-win_amd64.whl", hash = "sha256:f27cec5e7f588c3d1051651830ecc00294f90728d19c3bf6916e6dba93ea357c"},
+    {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"},
+    {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"},
+    {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"},
+    {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"},
+    {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"},
+    {file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"},
+    {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"},
+    {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"},
+    {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"},
+    {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"},
 ]
 pyyaml = [
    {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -83,7 +83,9 @@ impl ElectionLeader {
    ) -> Result<bool> {
        let resp = self.client.leader(election_name).await?;

-        let kv = resp.kv().ok_or(anyhow!("failed to get leader response"))?;
+        let kv = resp
+            .kv()
+            .ok_or_else(|| anyhow!("failed to get leader response"))?;
        let leader = kv.value_str()?;

        Ok(leader == candidate_name)
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -637,6 +637,17 @@ where
        &mut self,
        msg: &VoteRequest,
    ) -> Result<Option<AcceptorProposerMessage>> {
+        // Once voted, we won't accept data from older proposers; flush
+        // everything we've already received so that new proposer starts
+        // streaming at end of our WAL, without overlap. Currently we truncate
+        // WAL at streaming point, so this avoids truncating already committed
+        // WAL.
+        //
+        // TODO: it would be smoother to not truncate committed piece at
+        // handle_elected instead. Currently not a big deal, as proposer is the
+        // only source of WAL; with peer2peer recovery it would be more
+        // important.
+        self.wal_store.flush_wal()?;
        // initialize with refusal
        let mut resp = VoteResponse {
            term: self.state.acceptor_state.term,
--- a/test_runner/batch_others/test_branch_and_gc.py
+++ b/test_runner/batch_others/test_branch_and_gc.py
@@ -0,0 +1,101 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import lsn_from_hex
+
+
+# Test the GC implementation when running with branching.
+# This test reproduces the issue https://github.com/neondatabase/neon/issues/707.
+#
+# Consider two LSNs `lsn1` and `lsn2` with some delta files as follows:
+# ...
+# p   -> has an image layer xx_p with p < lsn1
+# ...
+# lsn1
+# ...
+# q   -> has an image layer yy_q with lsn1 < q < lsn2
+# ...
+# lsn2
+#
+# Consider running a GC iteration such that the GC horizon is between p and lsn1
+# ...
+# p       -> has an image layer xx_p with p < lsn1
+# D_start -> is a delta layer D's start (e.g D = '...-...-D_start-D_end')
+# ...
+# GC_h    -> is a gc horizon such that p < GC_h < lsn1
+# ...
+# lsn1
+# ...
+# D_end   -> is a delta layer D's end
+# ...
+# q       -> has an image layer yy_q with lsn1 < q < lsn2
+# ...
+# lsn2
+#
+# As described in the issue #707, the image layer xx_p will be deleted as
+# its range is below the GC horizon and there exists a newer image layer yy_q (q > p).
+# However, removing xx_p will corrupt any delta layers that depend on xx_p that
+# are not deleted by GC. For example, the delta layer D is corrupted in the
+# above example because D depends on the image layer xx_p for value reconstruction.
+#
+# Because the delta layer D covering lsn1 is corrupted, creating a branch
+# starting from lsn1 should return an error as follows:
+#     could not find data for key ... at LSN ..., for request at LSN ...
+def test_branch_and_gc(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable background GC
+            'gc_period': '10 m',
+            'gc_horizon': f'{10 * 1024 ** 3}',
+
+            # small checkpoint distance to create more delta layer files
+            'checkpoint_distance': f'{1024 ** 2}',
+
+            # set the target size to be large to allow the image layer to cover the whole key space
+            'compaction_target_size': f'{1024 ** 3}',
+
+            # tweak the default settings to allow quickly create image layers and L1 layers
+            'compaction_period': '1 s',
+            'compaction_threshold': '2',
+            'image_creation_threshold': '1',
+
+            # set PITR interval to be small, so we can do GC
+            'pitr_interval': '1 s'
+        })
+
+    timeline_main = env.neon_cli.create_timeline(f'test_main', tenant_id=tenant)
+    pg_main = env.postgres.create_start('test_main', tenant_id=tenant)
+
+    main_cur = pg_main.connect().cursor()
+
+    main_cur.execute(
+        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')"
+    )
+    main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
+    lsn1 = main_cur.fetchone()[0]
+    log.info(f'LSN1: {lsn1}')
+
+    main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
+    lsn2 = main_cur.fetchone()[0]
+    log.info(f'LSN2: {lsn2}')
+
+    # Set the GC horizon so that lsn1 is inside the horizon, which means
+    # we can create a new branch starting from lsn1.
+    env.pageserver.safe_psql(
+        f'''do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}'''
+    )
+
+    env.neon_cli.create_branch('test_branch',
+                               'test_main',
+                               tenant_id=tenant,
+                               ancestor_start_lsn=lsn1)
+    pg_branch = env.postgres.create_start('test_branch', tenant_id=tenant)
+
+    branch_cur = pg_branch.connect().cursor()
+    branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
+
+    branch_cur.execute('SELECT count(*) FROM foo')
+    assert branch_cur.fetchone() == (200000, )
--- a/test_runner/batch_others/test_branching.py
+++ b/test_runner/batch_others/test_branching.py
@@ -0,0 +1,89 @@
+from typing import List
+import threading
+import pytest
+from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres
+import time
+import random
+from fixtures.log_helper import log
+from performance.test_perf_pgbench import get_scales_matrix
+
+
+# Test branch creation
+#
+# This test spawns pgbench in a thread in the background, and creates a branch while
+# pgbench is running. Then it launches pgbench on the new branch, and creates another branch.
+# Repeat `n_branches` times.
+#
+# If 'ty' == 'cascade', each branch is created from the previous branch, so that you end
+# up with a branch of a branch of a branch ... of a branch. With 'ty' == 'flat',
+# each branch is created from the root.
+@pytest.mark.parametrize("n_branches", [10])
+@pytest.mark.parametrize("scale", get_scales_matrix(1))
+@pytest.mark.parametrize("ty", ["cascade", "flat"])
+def test_branching_with_pgbench(neon_simple_env: NeonEnv,
+                                pg_bin: PgBin,
+                                n_branches: int,
+                                scale: int,
+                                ty: str):
+    env = neon_simple_env
+
+    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
+    tenant, _ = env.neon_cli.create_tenant(
+         conf={
+             'gc_period': '5 s',
+             'gc_horizon': f'{1024 ** 2}',
+             'checkpoint_distance': f'{1024 ** 2}',
+             'compaction_target_size': f'{1024 ** 2}',
+             # set PITR interval to be small, so we can do GC
+             'pitr_interval': '5 s'
+         })
+
+    def run_pgbench(pg: Postgres):
+        connstr = pg.connstr()
+
+        log.info(f"Start a pgbench workload on pg {connstr}")
+
+        pg_bin.run_capture(['pgbench', '-i', f'-s{scale}', connstr])
+        pg_bin.run_capture(['pgbench', '-T15', connstr])
+
+    env.neon_cli.create_branch('b0', tenant_id=tenant)
+    pgs: List[Postgres] = []
+    pgs.append(env.postgres.create_start('b0', tenant_id=tenant))
+
+    threads: List[threading.Thread] = []
+    threads.append(threading.Thread(target=run_pgbench, args=(pgs[0], ), daemon=True))
+    threads[-1].start()
+
+    thread_limit = 4
+
+    for i in range(n_branches):
+        # random a delay between [0, 5]
+        delay = random.random() * 5
+        time.sleep(delay)
+        log.info(f"Sleep {delay}s")
+
+        # If the number of concurrent threads exceeds a threshold,
+        # wait for all the threads to finish before spawning a new one.
+        # Because tests defined in `batch_others` are run concurrently in CI,
+        # we want to avoid the situation that one test exhausts resources for other tests.
+        if len(threads) >= thread_limit:
+            for thread in threads:
+                thread.join()
+            threads = []
+
+        if ty == "cascade":
+            env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(i), tenant_id=tenant)
+        else:
+            env.neon_cli.create_branch('b{}'.format(i + 1), 'b0', tenant_id=tenant)
+
+        pgs.append(env.postgres.create_start('b{}'.format(i + 1), tenant_id=tenant))
+
+        threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1], ), daemon=True))
+        threads[-1].start()
+
+    for thread in threads:
+        thread.join()
+
+    for pg in pgs:
+        res = pg.safe_psql('SELECT count(*) from pgbench_accounts')
+        assert res[0] == (100000 * scale, )
--- a/test_runner/batch_others/test_close_fds.py
+++ b/test_runner/batch_others/test_close_fds.py
@@ -0,0 +1,51 @@
+from contextlib import closing
+import shutil
+import time
+import subprocess
+import os.path
+
+from cached_property import threading
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.log_helper import log
+
+
+def lsof_path() -> str:
+    path_output = shutil.which("lsof")
+    if path_output is None:
+        raise RuntimeError('lsof not found in PATH')
+    else:
+        return path_output
+
+
+# Makes sure that `pageserver.pid` is only held by `pageserve` command, not other commands.
+# This is to test the changes in https://github.com/neondatabase/neon/pull/1834.
+def test_lsof_pageserver_pid(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    def start_workload():
+        env.neon_cli.create_branch("test_lsof_pageserver_pid")
+        pg = env.postgres.create_start("test_lsof_pageserver_pid")
+        with closing(pg.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute("CREATE TABLE foo as SELECT x FROM generate_series(1,100000) x")
+                cur.execute("update foo set x=x+1")
+
+    workload_thread = threading.Thread(target=start_workload, args=(), daemon=True)
+    workload_thread.start()
+
+    path = os.path.join(env.repo_dir, "pageserver.pid")
+    lsof = lsof_path()
+    while workload_thread.is_alive():
+        res = subprocess.run([lsof, path],
+                             check=False,
+                             universal_newlines=True,
+                             stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE)
+
+        # parse the `lsof` command's output to get only the list of commands
+        commands = [line.split(' ')[0] for line in res.stdout.strip().split('\n')[1:]]
+        if len(commands) > 0:
+            log.info(f"lsof commands: {commands}")
+            assert commands == ['pageserve']
+
+        time.sleep(1.0)
--- a/test_runner/batch_others/test_wal_acceptor_async.py
+++ b/test_runner/batch_others/test_wal_acceptor_async.py
@@ -302,6 +302,8 @@ def test_compute_restarts(neon_env_builder: NeonEnvBuilder):


 class BackgroundCompute(object):
+    MAX_QUERY_GAP_SECONDS = 2
+
    def __init__(self, index: int, env: NeonEnv, branch: str):
        self.index = index
        self.env = env
@@ -339,7 +341,7 @@ class BackgroundCompute(object):

            # With less sleep, there is a very big chance of not committing
            # anything or only 1 xact during test run.
-            await asyncio.sleep(2 * random.random())
+            await asyncio.sleep(random.uniform(0, self.MAX_QUERY_GAP_SECONDS))
        self.running = False


@@ -356,20 +358,34 @@ async def run_concurrent_computes(env: NeonEnv,
    background_tasks = [asyncio.create_task(compute.run()) for compute in computes]

    await asyncio.sleep(run_seconds)
+    log.info("stopping all tasks but one")
    for compute in computes[1:]:
        compute.stopped = True
+    await asyncio.gather(*background_tasks[1:])
    log.info("stopped all tasks but one")

    # work for some time with only one compute -- it should be able to make some xacts
-    await asyncio.sleep(8)
+    TIMEOUT_SECONDS = computes[0].MAX_QUERY_GAP_SECONDS + 3
+    initial_queries_by_0 = len(computes[0].successful_queries)
+    log.info(f'Waiting for another query by computes[0], '
+             f'it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s')
+    for _ in range(10 * TIMEOUT_SECONDS):
+        current_queries_by_0 = len(computes[0].successful_queries) - initial_queries_by_0
+        if current_queries_by_0 >= 1:
+            log.info(f'Found {current_queries_by_0} successful queries '
+                     f'by computes[0], completing the test')
+            break
+        await asyncio.sleep(0.1)
+    else:
+        assert False, "Timed out while waiting for another query by computes[0]"
    computes[0].stopped = True

-    await asyncio.gather(*background_tasks)
+    await asyncio.gather(background_tasks[0])

    result = await exec_compute_query(env, branch, 'SELECT * FROM query_log')
    # we should have inserted something while single compute was running
-    assert len(result) >= 4
-    log.info(f'Executed {len(result)} queries')
+    log.info(f'Executed {len(result)} queries, {current_queries_by_0} of them '
+             f'by computes[0] after we started stopping the others')
    for row in result:
        log.info(f'{row[0]} {row[1]} {row[2]}')

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1276,12 +1276,9 @@ class WalCraft(AbstractNeonCli):
        res.check_returncode()
        return res.stdout.split('\n')

-    def in_existing(self, type: str, connection: str) -> int:
+    def in_existing(self, type: str, connection: str) -> None:
        res = self.raw_cli(["in-existing", type, connection])
        res.check_returncode()
-        m = re.fullmatch(r'end_of_wal = (.*)\n', res.stdout)
-        assert m
-        return lsn_from_hex(m.group(1))


 class NeonPageserver(PgProtocol):
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -83,6 +83,9 @@ def get_dir_size(path: str) -> int:
    totalbytes = 0
    for root, dirs, files in os.walk(path):
        for name in files:
-            totalbytes += os.path.getsize(os.path.join(root, name))
+            try:
+                totalbytes += os.path.getsize(os.path.join(root, name))
+            except FileNotFoundError as e:
+                pass  # file could be concurrently removed

    return totalbytes
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -33,7 +33,9 @@ itoa = { version = "0.4", features = ["i128", "std"] }
 libc = { version = "0.2", features = ["extra_traits", "std"] }
 log = { version = "0.4", default-features = false, features = ["serde", "std"] }
 memchr = { version = "2", features = ["std", "use_std"] }
-num-integer = { version = "0.1", default-features = false, features = ["i128"] }
+nom = { version = "7", features = ["alloc", "std"] }
+num-bigint = { version = "0.4", features = ["std"] }
+num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] }
 num-traits = { version = "0.2", features = ["i128", "std"] }
 prost = { version = "0.10", features = ["prost-derive", "std"] }
 rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
@@ -41,10 +43,11 @@ regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cac
 regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
 scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
-tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
+time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "quickcheck", "quickcheck-dep", "std", "time-macros"] }
+tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] }
 tokio-util = { version = "0.7", features = ["codec", "io"] }
 tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] }
-tracing-core = { version = "0.1", features = ["lazy_static", "std"] }
+tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] }

 [build-dependencies]
 ahash = { version = "0.7", features = ["std"] }
@@ -57,6 +60,7 @@ indexmap = { version = "1", default-features = false, features = ["std"] }
 libc = { version = "0.2", features = ["extra_traits", "std"] }
 log = { version = "0.4", default-features = false, features = ["serde", "std"] }
 memchr = { version = "2", features = ["std", "use_std"] }
+nom = { version = "7", features = ["alloc", "std"] }
 prost = { version = "0.10", features = ["prost-derive", "std"] }
 regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
 regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }