WIP: Process received GetPage requests in parallel

Have a pool of WAL redo processes per tenant
To allow more concurrency, have a pool of WAL redo processes that can grow up to 4 processes per tenant. There is currently no way to shrink the pool, so it is capped at 4 processes to keep the total number of processes reasonable.
2026-05-18 05:30:37 +00:00 · 2022-11-21 20:43:16 +02:00 · 2022-11-21 20:42:44 +02:00 · 2022-11-21 15:25:09 +00:00 · 2022-11-21 16:24:19 +01:00 · 2022-11-19 23:39:42 +00:00
69 changed files with 1862 additions and 558 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -190,7 +190,7 @@ runs:
        prefix: latest

    - name: Create Allure report
-      if: always()
+      if: success() || failure()
      uses: ./.github/actions/allure-report
      with:
        action: store
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -22,6 +22,10 @@ storage:
          console_region_id: aws-us-west-2
        zenith-1-ps-3:
          console_region_id: aws-us-west-2
+        zenith-1-ps-4:
+          console_region_id: aws-us-west-2
+        zenith-1-ps-5:
+          console_region_id: aws-us-west-2

    safekeepers:
      hosts:
--- a/.github/ansible/staging.hosts.yaml
+++ b/.github/ansible/staging.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: zenith-staging-storage-us-east-1
    bucket_region: us-east-1
    console_mgmt_base_url: http://console-staging.local
-    etcd_endpoints: zenith-us-stage-etcd.local:2379
+    etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -144,7 +144,9 @@ jobs:
        # neon-captest-new: Run pgbench in a freshly created project
        # neon-captest-reuse: Same, but reusing existing project
        # neon-captest-prefetch: Same, with prefetching enabled (new project)
-        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
+        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
+        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
+        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
        db_size: [ 10gb ]
        include:
          - platform: neon-captest-new
@@ -164,7 +166,7 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
      PLATFORM: ${{ matrix.platform }}

-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
      options: --init
@@ -207,8 +209,11 @@ jobs:
          rds-aurora)
            CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
            ;;
+          rds-postgres)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
+            ;;
          *)
-            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'"
+            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -265,7 +270,7 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

    - name: Create Allure report
-      if: always()
+      if: success() || failure()
      uses: ./.github/actions/allure-report
      with:
        action: generate
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -18,8 +18,8 @@ env:

 jobs:
  tag:
-    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    runs-on: [ self-hosted, dev, x64 ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}

@@ -46,7 +46,7 @@ jobs:
        id: build-tag

  build-neon:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -236,7 +236,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -269,7 +269,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  benchmarks:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -300,12 +300,12 @@ jobs:
      # while coverage is currently collected for the debug ones

  merge-allure-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    needs: [ regress-tests, benchmarks ]
-    if: always()
+    if: success() || failure()
    strategy:
      fail-fast: false
      matrix:
@@ -338,7 +338,7 @@ jobs:
          DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json

  coverage-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -415,7 +415,7 @@ jobs:
        shell: bash -euxo pipefail {0}

  trigger-e2e-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
@@ -460,7 +460,7 @@ jobs:
            }"

  neon-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -478,7 +478,7 @@ jobs:
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}

  compute-tools-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -492,28 +492,8 @@ jobs:
      - name: Kaniko build compute tools
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}

-  compute-node-image:
-    runs-on: dev
-    container: gcr.io/kaniko-project/executor:v1.9.0-debug
-    needs: [ tag ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-        # compute-node uses postgres 14, which is default now
-        # cloud repo depends on this image name, thus duplicating it
-        # remove compute-node when cloud repo is updated
-      - name: Kaniko build compute node with extensions v14 (compatibility)
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
-
  compute-node-image-v14:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -529,9 +509,8 @@ jobs:
      - name: Kaniko build compute node with extensions v14
        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}

-
  compute-node-image-v15:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -547,18 +526,58 @@ jobs:
      - name: Kaniko build compute node with extensions v15
        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}

+  test-images:
+    needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    runs-on: [ self-hosted, dev, x64 ]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
+      # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
+      # Regular pageserver version string looks like
+      #   Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
+      # Bad versions might look like:
+      #   Neon page server git-env:local failpoints: true, features: ["testing"]
+      # Ensure that we don't have bad versions.
+      - name: Verify image versions
+        shell: bash # NOTE: GitHub Actions runs 'shell: bash' with -eo pipefail, so errexit is still on
+        run: |
+          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+
+          echo "Pageserver version string: $pageserver_version"
+
+          if [[ "$pageserver_version" == *'git-env:local'* ]]; then
+            echo "Pageserver version should not be the default Dockerfile one"
+            exit 1
+          fi
+
+          if [[ "$pageserver_version" == *'"testing"'* ]]; then
+            echo "Pageserver version should have no testing feature enabled"
+            exit 1
+          fi
+
+      - name: Verify docker-compose example
+        run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
+
+      - name: Print logs and clean up
+        if: always()
+        run: |
+          docker compose -f ./docker-compose/docker-compose.yml logs || true
+          docker compose -f ./docker-compose/docker-compose.yml down
+
+
  promote-images:
-    runs-on: dev
-    needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    runs-on: [ self-hosted, dev, x64 ]
+    needs: [ tag, test-images ]
    if: github.event_name != 'workflow_dispatch'
    container: amazon/aws-cli
    strategy:
      fail-fast: false
      matrix:
-        # compute-node uses postgres 14, which is default now
-        # cloud repo depends on this image name, thus duplicating it
-        # remove compute-node when cloud repo is updated
-        name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
+        name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]

    steps:
      - name: Promote image to latest
@@ -567,7 +586,7 @@ jobs:
          aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"

  push-docker-hub:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ promote-images, tag ]
    container: golang:1.19-bullseye

@@ -588,9 +607,6 @@ jobs:
      - name: Pull compute tools image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools

-      - name: Pull compute node image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
-
      - name: Pull compute node v14 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14

@@ -607,7 +623,6 @@ jobs:
        run: |
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest

@@ -623,9 +638,6 @@ jobs:
      - name: Push compute tools image to Docker Hub
        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

-      - name: Push compute node image to Docker Hub
-        run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
-
      - name: Push compute node v14 image to Docker Hub
        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}

@@ -642,7 +654,6 @@ jobs:
        run: |
          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest

@@ -725,7 +736,7 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -805,7 +816,7 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -847,7 +858,7 @@ jobs:
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

  deploy-proxy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -917,7 +928,7 @@ jobs:
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

  promote-compatibility-data:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -115,7 +115,7 @@ jobs:
        run: cargo build --locked --all --all-targets

  check-rust-dependencies:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
--- a/1
+++ b/1
@@ -8,3 +8,4 @@
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/control-plane 
 /safekeeper/ @neondatabase/safekeepers
+/vendor/ @neondatabase/compute
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,6 +25,10 @@ members = [
 # Besides, debug info should not affect the performance.
 debug = true

+# Disable debug symbols for all dependencies (every package that is not a workspace member) to decrease binary size
+[profile.release.package."*"]
+debug = false
+
 [profile.release-line-debug]
 inherits = "release"
 debug = 1 # true = 2 = all symbols, 1 = line only
--- a/Dockerfile.compute-node.legacy
+++ b/Dockerfile.compute-node.legacy
@@ -1,88 +0,0 @@
-#
-# Legacy version of the Dockerfile for the compute node.
-# Used by e2e CI. Building Dockerfile.compute-node will take
-# unreasonable ammount of time without v2 runners.
-#
-# TODO: remove once cloud repo CI is moved to v2 runners.
-#
-
-
-# Allow specifiyng different compute-tools tag and image repo, so we are
-# able to use different images
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-ARG IMAGE=compute-tools
-ARG TAG=latest
-
-#
-# Image with pre-built tools
-#
-FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
-# Only to get ready compute_ctl binary as deppendency
-
-#
-# Image with Postgres build deps
-#
-FROM debian:bullseye-slim AS build-deps
-
-RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-                                          libcurl4-openssl-dev libossp-uuid-dev
-
-#
-# Image with built Postgres
-#
-FROM build-deps AS pg-build
-
-# Add user postgres
-RUN adduser postgres
-RUN mkdir /pg && chown postgres:postgres /pg
-
-# Copy source files
-# version 14 is default for now
-COPY ./vendor/postgres-v14 /pg/
-COPY ./pgxn /pg/
-
-# Build and install Postgres locally
-RUN mkdir /pg/compute_build && cd /pg/compute_build && \
-    ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
-    # Install main binaries and contribs
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
-    # Install headers
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
-
-# Install neon contrib
-RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
-
-USER postgres
-WORKDIR /pg
-
-#
-# Final compute node image to be exported
-#
-FROM debian:bullseye-slim
-
-# libreadline-dev is required to run psql
-RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
-
-# Add user postgres
-RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
-    echo "postgres:test_console_pass" | chpasswd && \
-    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
-    chown -R postgres:postgres /var/db/postgres && \
-    chmod 0750 /var/db/postgres/compute
-
-# Copy ready Postgres binaries
-COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
-
-# Copy binaries from compute-tools
-COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
-
-# XXX: temporary symlink for compatibility with old control-plane
-RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
-
-# Add postgres shared objects to the search path
-RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
-
-USER postgres
-
-ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/28
+++ b/28
@@ -20,18 +20,18 @@ else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif

-# Seccomp BPF is only available for Linux
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
+	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
-endif
-
-# macOS with brew-installed openssl requires explicit paths
-# It can be configured with OPENSSL_PREFIX variable
-UNAME_S := $(shell uname -s)
-ifeq ($(UNAME_S),Darwin)
-    OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-    PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+else ifeq ($(UNAME_S),Darwin)
+	# macOS with brew-installed openssl requires explicit paths
+	# It can be configured with OPENSSL_PREFIX variable
+	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
+	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
+	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
 endif

 # Use -C option so that when PostgreSQL "make install" installs the
@@ -73,7 +73,8 @@ $(POSTGRES_INSTALL_DIR)/build/v14/config.status:
 	+@echo "Configuring Postgres v14 build"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
-	$(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \
+	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
+		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
 		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)

@@ -81,7 +82,8 @@ $(POSTGRES_INSTALL_DIR)/build/v15/config.status:
 	+@echo "Configuring Postgres v15 build"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
-	$(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \
+	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
+		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
 		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)

@@ -111,6 +113,8 @@ postgres-v14: postgres-v14-configure \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
 	+@echo "Compiling libpq v14"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm v14"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache v14"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect v14"
@@ -123,6 +127,8 @@ postgres-v15: postgres-v15-configure \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
 	+@echo "Compiling libpq v15"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm v15"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache v15"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect v15"
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf etcd openssl
+brew install protobuf etcd openssl flex bison
 ```

 2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -125,24 +125,23 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > ./target/debug/neon_local init
-Starting pageserver at '127.0.0.1:64000' in '.neon'
-
-Pageserver started
-Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7
-Stopping pageserver gracefully...done!
+Starting pageserver at '127.0.0.1:64000' in '.neon'.
+pageserver started, pid: 2545906
+Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
+Stopped pageserver 1 process with pid 2545906

 # start pageserver and safekeeper
 > ./target/debug/neon_local start
-Starting etcd broker using /usr/bin/etcd
-Starting pageserver at '127.0.0.1:64000' in '.neon'
-
-Pageserver started
-Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
-Safekeeper started
+Starting etcd broker using "/usr/bin/etcd"
+etcd started, pid: 2545996
+Starting pageserver at '127.0.0.1:64000' in '.neon'.
+pageserver started, pid: 2546005
+Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
+safekeeper 1 started, pid: 2546041

 # start postgres compute node
 > ./target/debug/neon_local pg start main
-Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
+Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
 Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -26,8 +26,18 @@ use nix::unistd::Pid;

 use utils::lock_file;

-const RETRIES: u32 = 15;
-const RETRY_TIMEOUT_MILLIS: u64 = 500;
+// These constants control the loop used to poll for process start / stop.
+//
+// The loop waits for at most 10 seconds, polling every 100 ms.
+// Once a second, it prints a dot ("."), to give the user an indication that
+// it's waiting. If the process hasn't started/stopped after 5 seconds,
+// it prints a notice that it's taking long, but keeps waiting.
+//
+const RETRY_UNTIL_SECS: u64 = 10;
+const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
+const RETRY_INTERVAL_MILLIS: u64 = 100;
+const DOT_EVERY_RETRIES: u64 = 10;
+const NOTICE_AFTER_RETRIES: u64 = 50;

 /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
 /// it itself.
@@ -107,16 +117,16 @@ where
                return Ok(spawned_process);
            }
            Ok(false) => {
-                if retries < 5 {
+                if retries == NOTICE_AFTER_RETRIES {
+                    // The process is taking a long time to start up. Keep waiting, but
+                    // print a message
+                    print!("\n{process_name} has not started yet, continuing to wait");
+                }
+                if retries % DOT_EVERY_RETRIES == 0 {
                    print!(".");
                    io::stdout().flush().unwrap();
-                } else {
-                    if retries == 5 {
-                        println!() // put a line break after dots for second message
-                    }
-                    println!("{process_name} has not started yet, retrying ({retries})...");
                }
-                thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
                println!("{process_name} failed to start: {e:#}");
@@ -127,7 +137,8 @@ where
            }
        }
    }
-    anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
+    println!();
+    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -158,7 +169,7 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
    }

    // Wait until process is gone
-    for _ in 0..RETRIES {
+    for retries in 0..RETRIES {
        match process_has_stopped(pid) {
            Ok(true) => {
                println!("\n{process_name} stopped");
@@ -170,9 +181,16 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
                return Ok(());
            }
            Ok(false) => {
-                print!(".");
-                io::stdout().flush().unwrap();
-                thread::sleep(Duration::from_secs(1))
+                if retries == NOTICE_AFTER_RETRIES {
+                    // The process is taking a long time to shut down. Keep waiting, but
+                    // print a message
+                    print!("\n{process_name} has not stopped yet, continuing to wait");
+                }
+                if retries % DOT_EVERY_RETRIES == 0 {
+                    print!(".");
+                    io::stdout().flush().unwrap();
+                }
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
                println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -180,24 +198,21 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
            }
        }
    }
-
-    anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
+    println!();
+    anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
 }

 fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");

-    let var = "LLVM_PROFILE_FILE";
-    if let Some(val) = std::env::var_os(var) {
-        filled_cmd = filled_cmd.env(var, val);
+    // Pass through these environment variables to the command
+    for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
+        if let Some(val) = std::env::var_os(var) {
+            filled_cmd = filled_cmd.env(var, val);
+        }
    }

-    const RUST_LOG_KEY: &str = "RUST_LOG";
-    if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
-        filled_cmd.env(RUST_LOG_KEY, rust_log_value)
-    } else {
-        filled_cmd
-    }
+    filled_cmd
 }

 fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -6,7 +6,7 @@ use crate::{background_process, local_env};

 pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    let etcd_broker = &env.etcd_broker;
-    println!(
+    print!(
        "Starting etcd broker using {:?}",
        etcd_broker.etcd_binary_path
    );
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -237,7 +237,7 @@ impl PageServerNode {
        datadir: &Path,
        update_config: bool,
    ) -> anyhow::Result<Child> {
-        println!(
+        print!(
            "Starting pageserver at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
            datadir.display()
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -0,0 +1,13 @@
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG COMPUTE_IMAGE=compute-node-v14
+ARG TAG=latest
+
+FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
+
+USER root
+RUN apt-get update &&       \
+    apt-get install -y curl \
+                       jq   \
+                       netcat
+
+USER postgres
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -2,6 +2,7 @@ version: '3'

 services:
  etcd:
+    restart: always
    image: quay.io/coreos/etcd:v3.5.4
    ports:
      - 2379:2379
@@ -9,7 +10,7 @@ services:
    environment:
      # This signifficantly speeds up etcd and we anyway don't data persistency there.
      ETCD_UNSAFE_NO_FSYNC: "1"
-    command: 
+    command:
      - "etcd"
      - "--auto-compaction-mode=revision"
      - "--auto-compaction-retention=1"
@@ -24,6 +25,7 @@ services:
      - "--quota-backend-bytes=134217728" # 128 MB

  minio:
+    restart: always
    image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
    ports:
      - 9000:9000
@@ -41,7 +43,7 @@ services:
    entrypoint:
      - "/bin/sh"
      - "-c"
-    command: 
+    command:
      - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
             echo 'Waiting to start minio...' && sleep 1;
         done;
@@ -51,7 +53,8 @@ services:
      - minio

  pageserver:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - BROKER_ENDPOINT='http://etcd:2379'
      - AWS_ACCESS_KEY_ID=minio
@@ -77,7 +80,8 @@ services:
      - minio_create_buckets

  safekeeper1:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
      - SAFEKEEPER_ID=1
@@ -106,7 +110,8 @@ services:
      - minio_create_buckets

  safekeeper2:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
      - SAFEKEEPER_ID=2
@@ -135,7 +140,8 @@ services:
      - minio_create_buckets

  safekeeper3:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
      - SAFEKEEPER_ID=3
@@ -164,18 +170,21 @@ services:
      - minio_create_buckets

  compute:
+    restart: always
    build:
-      context: ./image/compute
+      context: ./compute_wrapper/
      args:
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
+        - TAG=${TAG:-latest}
        - http_proxy=$http_proxy
        - https_proxy=$https_proxy
    environment:
      - PG_VERSION=${PG_VERSION:-14}
      #- RUST_BACKTRACE=1
+    # Mount the test files directly, for faster editing cycle.
    volumes:
-      - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
-      - ./compute/shell/:/shell/
+      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/shell/:/shell/
    ports:
      - 55433:55433 # pg protocol handler
      - 3080:3080 # http endpoints
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# A basic test to ensure Docker images are built correctly.
+# Builds a wrapper around the compute, starts all services and runs a simple SQL query.
+# Repeats the process for all currently supported Postgres versions.
+
+# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file,
+# to verify custom image builds (e.g. pre-published ones).
+# Their defaults point at the DockerHub `neondatabase/neon:latest` image.
+
+# XXX: Currently does not work on M1 Macs because only x86_64 Docker images are built, and the M1 Docker emulation layer has no seccomp support.
+
+set -eux -o pipefail
+
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml"
+
+COMPUTE_CONTAINER_NAME=docker-compose-compute-1
+SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
+
+cleanup() {  # dump container state and logs for debugging, then tear the stack down
+    echo "show container information"
+    docker ps
+    docker compose -f "$COMPOSE_FILE" logs
+    echo "stop containers..."
+    docker compose -f "$COMPOSE_FILE" down
+}
+
+echo "clean up containers if exists"
+cleanup
+
+for pg_version in 14 15; do
+    echo "start containers (pg_version=$pg_version)."
+    PG_VERSION=$pg_version docker compose -f "$COMPOSE_FILE" up --build -d
+
+    echo "wait until the compute is ready. timeout after 60s. "
+    cnt=0
+    while sleep 1; do
+        # check timeout: give up after 60 one-second polls
+        cnt=$((cnt + 1))
+        if [ "$cnt" -gt 60 ]; then
+            echo "timeout before the compute is ready."
+            cleanup
+            exit 1
+        fi
+
+        # check if the compute is ready; pipefail is off because grep exits non-zero until the line appears
+        set +o pipefail
+        result=$(docker compose -f "$COMPOSE_FILE" logs "compute_is_ready" | grep "accepting connections" | wc -l)
+        set -o pipefail
+        if [ "$result" -eq 1 ]; then
+            echo "OK. The compute is ready to connect."
+            echo "execute simple queries."
+            docker exec "$COMPUTE_CONTAINER_NAME" /bin/bash -c "psql $PSQL_OPTION"
+            cleanup
+            break
+        fi
+    done
+done
--- a/docker-compose/image/compute/Dockerfile
+++ b/docker-compose/image/compute/Dockerfile
@@ -1,10 +0,0 @@
-ARG COMPUTE_IMAGE=compute-node-v14:latest
-FROM neondatabase/${COMPUTE_IMAGE}
-
-USER root
-RUN apt-get update &&       \
-    apt-get install -y curl \
-                       jq   \
-                       netcat
-
-USER postgres
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -37,7 +37,7 @@

 - [Source view](./sourcetree.md)
  - [docker.md](./docker.md) — Docker images and building pipeline.
-  - [Error handling and logging]()
+  - [Error handling and logging](./error-handling.md)
  - [Testing]()
    - [Unit testing]()
    - [Integration testing]()
--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -0,0 +1,198 @@
+# Error handling and logging
+
+## Logging errors
+
+The principle is that errors are logged when they are handled. If you
+just propagate an error to the caller in a function, you don't need to
+log it; the caller will. But if you consume an error in a function,
+you *must* log it (if it needs to be logged at all).
+
+For example:
+
+```rust
+fn read_motd_file() -> std::io::Result<String> {
+    let mut f = File::open("/etc/motd")?;
+    let mut result = String::new();
+    f.read_to_string(&mut result)?;
+    Ok(result)
+}
+```
+
+Opening or reading the file could fail, but there is no need to log
+the error here. The function merely propagates the error to the
+caller, and it is up to the caller to log the error or propagate it
+further, if the failure is not expected. But if, for example, it is
+normal that the "/etc/motd" file doesn't exist, the caller can choose
+to silently ignore the error, or log it as an INFO or DEBUG level
+message:
+
+```rust
+fn get_message_of_the_day() -> String {
+    // Get the motd from /etc/motd, or return the default proverb
+    match read_motd_file() {
+        Ok(motd) => motd,
+        Err(err)  => {
+            // It's normal that /etc/motd doesn't exist, but if we fail to
+            // read it for some other reason, that's unexpected. The message
+            // of the day isn't very important though, so we just WARN and
+            // continue with the default in any case.
+            if err.kind() != std::io::ErrorKind::NotFound {
+                 tracing::warn!("could not read \"/etc/motd\": {err:?}");
+            }
+            "An old error is always more popular than a new truth. - German proverb".to_string()
+        }
+    }
+}
+```
+
+## Error types
+
+We use the `anyhow` crate widely. It contains many convenient macros
+like `bail!` and `ensure!` to construct and return errors, and to
+propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
+
+A downside of `anyhow::Error` is that the caller cannot distinguish
+between different error cases. Most errors are propagated all the way
+to the mgmt API handler function, or the main loop that handles a
+connection with the compute node, and they are all handled the same
+way: the error is logged and returned to the client as an HTTP or
+libpq error.
+
+But in some cases, we need to distinguish between errors and handle
+them differently. For example, attaching a tenant to the pageserver
+could fail either because the tenant has already been attached, or
+because we could not load its metadata from cloud storage. The first
+case is more or less expected. The console sends the Attach request to
+the pageserver, and the pageserver completes the operation, but the
+network connection might be lost before the console receives the
+response. The console will retry the operation in that case, but the
+tenant has already been attached. It is important that the pageserver
+responds with the HTTP 409 Conflict error in that case, rather
+than a generic HTTP 500 Internal Server Error.
+
+If you need to distinguish between different kinds of errors, create a
+new `Error` type. The `thiserror` crate is useful for that. But in
+most cases `anyhow::Error` is good enough.
+
+## Panics
+
+Depending on where a panic happens, it can cause the whole pageserver
+or safekeeper to restart, or just a single tenant. In either case,
+that is pretty bad and causes an outage. Avoid panics. Never use
+`unwrap()` or other calls that might panic, to verify inputs from the
+network or from disk.
+
+It is acceptable to use functions that might panic, like `unwrap()`, if
+it is obvious that it cannot panic. For example, if you have just
+checked that a variable is not None, it is OK to call `unwrap()` on it,
+but it is still preferable to use `expect("reason")` instead to explain
+why the function cannot fail.
+
+`assert!` and `panic!` are reserved for checking clear invariants and
+very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
+or `bail!` instead.
+
+## Error levels
+
+`tracing::Level` doesn't provide very clear guidelines on what the
+different levels mean, or when to use which level. Here is how we use
+them:
+
+### Error
+
+Examples:
+- could not open file "foobar"
+- invalid tenant id
+
+Errors are not expected to happen during normal operation. Incorrect
+inputs from client can cause ERRORs. For example, if a client tries to
+call a mgmt API that doesn't exist, or if a compute node passes
+an LSN that has already been garbage collected away.
+
+These should *not* happen during normal operations. "Normal
+operations" is not a very precise concept. But for example, disk
+errors are not expected to happen when the system is working, so those
+count as Errors. However, if a TCP connection to a compute node is
+lost, that is not considered an Error, because it doesn't affect the
+pageserver's or safekeeper's operation in any way, and happens fairly
+frequently when compute nodes are shut down, or are killed abruptly
+because of errors in the compute.
+
+**Errors are monitored, and always need human investigation to determine
+the cause.**
+
+Whether something should be logged at ERROR, WARNING or INFO level can
+depend on the callers and clients. For example, it might be unexpected
+and a sign of a serious issue if the console calls the
+"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
+would be appropriate in that case. But if the console routinely calls
+the API after deleting a timeline, to check if the deletion has
+completed, then it would be totally normal and an INFO or DEBUG level
+message would be more appropriate. If a message is logged as an ERROR,
+but it in fact happens frequently in production and never requires any
+action, it should probably be demoted to an INFO level message.
+
+### Warn
+
+Examples:
+- could not remove temporary file "foobar.temp"
+- unrecognized file "foobar" in timeline directory
+
+Warnings are similar to Errors, in that they should not happen
+when the system is operating normally. The difference between Error and
+Warning is that an Error means that the operation failed, whereas Warning
+means that something unexpected happened, but the operation continued anyway.
+For example, if deleting a file fails because the file already didn't exist,
+it should be logged as Warning.
+
+> **Note:** The python regression tests, under `test_regress`, check the
+> pageserver log after each test for any ERROR and WARN lines. If there are
+> any ERRORs or WARNs that have not been explicitly listed in the test as
+> allowed, the test is marked as failed. This is to catch unexpected errors
+> e.g. in background operations, that don't cause immediate misbehaviour in
+> the tested functionality.
+
+### Info
+
+Info level is used to log useful information when the system is
+operating normally. Info level is appropriate e.g. for logging state
+changes, background operations, and network connections.
+
+Examples:
+- "system is shutting down"
+- "tenant was created"
+- "retrying S3 upload"
+
+### Debug & Trace
+
+Debug and Trace level messages are not printed to the log in our normal
+production configuration, but could be enabled for a specific server or
+tenant, to aid debugging. (Although we don't actually have that
+capability as of this writing).
+
+## Context
+
+We use logging "spans" to hold context information about the current
+operation. Almost every operation happens on a particular tenant and
+timeline, so we enter a span with the "tenant_id" and "timeline_id"
+very early when processing an incoming API request, for example. All
+background operations should also run in a span containing at least
+those two fields, and any other parameters or information that might
+be useful when debugging an error that might happen when performing
+the operation.
+
+TODO: Spans are not captured in the Error when it is created, but when
+the error is logged. It would be more useful to capture them at Error
+creation. We should consider using `tracing_error::SpanTrace` to do
+that.
+
+## Error message style
+
+PostgreSQL has a style guide for writing error messages:
+
+https://www.postgresql.org/docs/current/error-style-guide.html
+
+Follow that guide when writing error messages in the PostgreSQL
+extension. We don't follow it strictly in the pageserver and
+safekeeper, but the advice in the PostgreSQL style guide is generally
+good, and you can't go wrong by following it.
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -83,6 +83,16 @@ A subject for future modularization.
 `/libs/metrics`:
 Helpers for exposing Prometheus metrics from the server.

+### Adding dependencies
+When you add a Cargo dependency, you should update the hakari manifest by running the commands below and committing the updated `Cargo.lock` and `workspace_hack/`. There may be no changes; that's fine.
+
+```bash
+cargo hakari generate
+cargo hakari manage-deps
+```
+
+If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
+
 ## Using Python
 Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
 so manual installation of dependencies is not recommended.
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -48,6 +48,25 @@ pub mod nonblock;
 // Default signal handling
 pub mod signals;

+/// Sleeps asynchronously for N ms when failpoint `$name` is set; use with fail::cfg("$name", "return(2000)")
+#[macro_export]
+macro_rules! failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        let should_sleep: Option<std::time::Duration> = (|| { // closure: the failpoint's value is returned from here
+            fail::fail_point!($name, |v: Option<_>| {
+                let millis = v.unwrap().parse::<u64>().unwrap(); // panics if the argument is missing or not a u64
+                Some(std::time::Duration::from_millis(millis)) // fully qualified: callers need not import Duration
+            });
+            None
+        })();
+        if let Some(d) = should_sleep {
+            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
+            tokio::time::sleep(d).await;
+            tracing::info!("failpoint {:?}: sleep done", $name);
+        }
+    }};
+}
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -138,7 +138,7 @@ impl FromStr for Lsn {
    ///
    /// If the input string is missing the '/' character, then use `Lsn::from_hex`
    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut splitter = s.split('/');
+        let mut splitter = s.trim().split('/');
        if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
        {
            let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
@@ -270,6 +270,11 @@ mod tests {
        );
        assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
        assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
+
+        let expected_lsn = Lsn(0x3C490F8);
+        assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
+        assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
+        assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
    }

    #[test]
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -76,3 +76,7 @@ tempfile = "3.2"
 [[bench]]
 name = "bench_layer_map"
 harness = false
+
+[[bench]]
+name = "bench_walredo"
+harness = false
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -199,6 +199,20 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
    logging::init(conf.log_format)?;
    info!("version: {}", version());

+    // If any failpoints were set from FAILPOINTS environment variable,
+    // print them to the log for debugging purposes
+    let failpoints = fail::list();
+    if !failpoints.is_empty() {
+        info!(
+            "started with failpoints: {}",
+            failpoints
+                .iter()
+                .map(|(name, actions)| format!("{name}={actions}"))
+                .collect::<Vec<String>>()
+                .join(";")
+        )
+    }
+
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
    let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
        lock_file::LockCreationResult::Created {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -614,8 +614,9 @@ impl PageServerConf {
        PathBuf::from(format!("../tmp_check/test_{test_name}"))
    }

-    #[cfg(test)]
    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
+        let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
+
        PageServerConf {
            id: NodeId(0),
            wait_lsn_timeout: Duration::from_secs(60),
@@ -626,7 +627,7 @@ impl PageServerConf {
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            superuser: "cloud_admin".to_string(),
            workdir: repo_dir,
-            pg_distrib_dir: PathBuf::new(),
+            pg_distrib_dir,
            auth_type: AuthType::Trust,
            auth_validation_public_key_path: None,
            remote_storage_config: None,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -13,6 +13,7 @@ use anyhow::{bail, ensure, Context, Result};
 use bytes::Buf;
 use bytes::Bytes;
 use futures::{Stream, StreamExt};
+use futures::stream::FuturesOrdered;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -25,6 +26,7 @@ use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
+use std::time::Instant;
 use tokio::pin;
 use tokio_util::io::StreamReader;
 use tokio_util::io::SyncIoBridge;
@@ -54,6 +56,9 @@ use crate::CheckpointConfig;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

+/// Maximum number of requests to process in parallel, from a single connection
+const MAX_INFLIGHT_REQUESTS: usize = 4;
+
 fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
    async_stream::try_stream! {
        loop {
@@ -76,6 +81,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                        FeMessage::CopyData(bytes) => bytes,
                        FeMessage::CopyDone => { break },
                        FeMessage::Sync => continue,
+                        FeMessage::Terminate => {
+                            let msg = format!("client terminated connection with Terminate message during COPY");
+                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                            break;
+                        }
                        m => {
                            let msg = format!("unexpected message {:?}", m);
                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
@@ -87,10 +98,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                    yield copy_data_bytes;
                }
                Ok(None) => {
-                    let msg = "client closed connection";
+                    let msg = "client closed connection during COPY";
                    pgb.write_message(&BeMessage::ErrorResponse(msg))?;
                    pgb.flush().await?;
-                    Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                }
                Err(e) => {
                    Err(io::Error::new(io::ErrorKind::Other, e))?;
@@ -218,6 +229,13 @@ struct PageRequestMetrics {
    get_db_size: metrics::Histogram,
 }

+pub enum RequestType { // kind of pagestream request a task handled; selects the histogram to observe
+    Exists,  // PagestreamFeMessage::Exists, timed in metrics.get_rel_exists
+    Nblocks, // PagestreamFeMessage::Nblocks, timed in metrics.get_rel_size
+    GetPage, // PagestreamFeMessage::GetPage, timed in metrics.get_page_at_lsn
+    DbSize,  // PagestreamFeMessage::DbSize, timed in metrics.get_db_size
+}
+
 impl PageRequestMetrics {
    fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
        let tenant_id = tenant_id.to_string();
@@ -292,66 +310,101 @@ impl PageServerHandler {

        let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);

+        //
+        // Main loop to handle the stream of requests
+        //
+        // We process multiple requests in parallel, by spawning a new Task for each
+        // incoming request.
+        let mut inprogress_requests = FuturesOrdered::new();
        loop {
-            let msg = tokio::select! {
+            tokio::select! {
                biased;

+                // If we were requested to shut down, stop
                _ = task_mgr::shutdown_watcher() => {
-                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
                    break;
                }

-                msg = pgb.read_message() => { msg }
-            };
+                // When a task completes, send the response to the client
+                completed_task = inprogress_requests.next(), if !inprogress_requests.is_empty() => {
+                    let response: Bytes;
+                    let request_type: RequestType;
+                    let elapsed_sec: f64;
+                    (response, request_type, elapsed_sec) = completed_task.unwrap()?;

-            let copy_data_bytes = match msg? {
-                Some(FeMessage::CopyData(bytes)) => bytes,
-                Some(m) => {
-                    bail!("unexpected message: {m:?} during COPY");
+                    pgb.write_message(&BeMessage::CopyData(&response))?;
+                    pgb.flush().await?;
+
+                    match request_type {
+                        RequestType::Exists => metrics.get_rel_exists.observe(elapsed_sec),
+                        RequestType::Nblocks => metrics.get_rel_size.observe(elapsed_sec),
+                        RequestType::GetPage => metrics.get_page_at_lsn.observe(elapsed_sec),
+                        RequestType::DbSize => metrics.get_db_size.observe(elapsed_sec),
+                    }
+
+                    continue;
                }
-                None => break, // client disconnected
-            };

-            trace!("query: {copy_data_bytes:?}");
+                // When a new request arrives, spawn a task to process it.
+                // If we already have MAX_INFLIGHT_REQUESTS requests in-progress, however,
+                // don't start new ones.
+                msg = pgb.read_message(), if inprogress_requests.len() < MAX_INFLIGHT_REQUESTS => {
+                    let copy_data_bytes = match msg? {
+                        Some(FeMessage::CopyData(bytes)) => bytes,
+                        Some(m) => {
+                            bail!("unexpected message: {m:?} during COPY");
+                        }
+                        None => break, // client disconnected
+                    };

-            // Trace request if needed
-            if let Some(t) = tracer.as_mut() {
-                t.trace(&copy_data_bytes)
-            }
+                    trace!("query: {copy_data_bytes:?}");

-            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+                    // Trace request if needed
+                    if let Some(t) = tracer.as_mut() {
+                        t.trace(&copy_data_bytes)
+                    }

-            let response = match neon_fe_msg {
-                PagestreamFeMessage::Exists(req) => {
-                    let _timer = metrics.get_rel_exists.start_timer();
-                    self.handle_get_rel_exists_request(&timeline, &req).await
-                }
-                PagestreamFeMessage::Nblocks(req) => {
-                    let _timer = metrics.get_rel_size.start_timer();
-                    self.handle_get_nblocks_request(&timeline, &req).await
-                }
-                PagestreamFeMessage::GetPage(req) => {
-                    let _timer = metrics.get_page_at_lsn.start_timer();
-                    self.handle_get_page_at_lsn_request(&timeline, &req).await
-                }
-                PagestreamFeMessage::DbSize(req) => {
-                    let _timer = metrics.get_db_size.start_timer();
-                    self.handle_db_size_request(&timeline, &req).await
+                    let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+
+                    let timeline = Arc::clone(&timeline);
+                    let conf = self.conf;
+                    let task = async move {
+                        let start_time = Instant::now();
+                        let (response, request_type) = match neon_fe_msg {
+                            PagestreamFeMessage::Exists(req) => {
+                                (Self::handle_get_rel_exists_request(&timeline, &req).await,
+                                 RequestType::Exists)
+                            }
+                            PagestreamFeMessage::Nblocks(req) => {
+                                (Self::handle_get_nblocks_request(&timeline, &req).await,
+                                 RequestType::Nblocks)
+                            }
+                            PagestreamFeMessage::GetPage(req) => {
+                                (Self::handle_get_page_at_lsn_request(conf, &timeline, &req).await,
+                                 RequestType::GetPage)
+                            }
+                            PagestreamFeMessage::DbSize(req) => {
+                                (Self::handle_db_size_request(&timeline, &req).await,
+                                 RequestType::DbSize)
+                            }
+                        };
+
+                        let response = response.unwrap_or_else(|e| {
+                            // print all the details to the log with {:?}, but for the client the
+                            // error message is enough
+                            error!("error reading relation or page version: {:?}", e);
+                            PagestreamBeMessage::Error(PagestreamErrorResponse {
+                                message: e.to_string(),
+                            })
+                        });
+                        let response: Bytes = response.serialize();
+                        (response, request_type, start_time.elapsed().as_secs_f64())
+                    };
+                    inprogress_requests.push_back(tokio::spawn(task));
+                    continue;
                }
            };
-
-            let response = response.unwrap_or_else(|e| {
-                // print the all details to the log with {:#}, but for the client the
-                // error message is enough
-                error!("error reading relation or page version: {:?}", e);
-                PagestreamBeMessage::Error(PagestreamErrorResponse {
-                    message: e.to_string(),
-                })
-            });
-
-            pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
-            pgb.flush().await?;
        }
        Ok(())
    }
@@ -523,9 +576,8 @@ impl PageServerHandler {
        Ok(lsn)
    }

-    #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
+    #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
    async fn handle_get_rel_exists_request(
-        &self,
        timeline: &Timeline,
        req: &PagestreamExistsRequest,
    ) -> Result<PagestreamBeMessage> {
@@ -540,9 +592,8 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
+    #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
    async fn handle_get_nblocks_request(
-        &self,
        timeline: &Timeline,
        req: &PagestreamNblocksRequest,
    ) -> Result<PagestreamBeMessage> {
@@ -557,9 +608,8 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
+    #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
    async fn handle_db_size_request(
-        &self,
        timeline: &Timeline,
        req: &PagestreamDbSizeRequest,
    ) -> Result<PagestreamBeMessage> {
@@ -577,11 +627,11 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
-    async fn handle_get_page_at_lsn_request(
-        &self,
-        timeline: &Timeline,
-        req: &PagestreamGetPageRequest,
+    #[instrument(skip(conf, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
+    async fn handle_get_page_at_lsn_request<'a>(
+        conf: &'static PageServerConf,
+        timeline: &'a Timeline,
+        req: &'a PagestreamGetPageRequest,
    ) -> Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
@@ -598,7 +648,7 @@ impl PageServerHandler {
        // FIXME: this profiling now happens at different place than it used to. The
        // current profiling is based on a thread-local variable, so it doesn't work
        // across awaits
-        let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
+        let _profiling_guard = profpoint_start(conf, ProfilingConfig::PageRequests);
        let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
@@ -606,9 +656,8 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, pgb))]
+    #[instrument(skip(pgb))]
    async fn handle_basebackup_request(
-        &self,
        pgb: &mut PostgresBackend,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -741,7 +790,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
            };

            // Check that the timeline exists
-            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false)
+            Self::handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false)
                .await?;
            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
@@ -801,7 +850,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
            self.check_permission(Some(tenant_id))?;

            // Check that the timeline exists
-            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true)
+            Self::handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true)
                .await?;
            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("import basebackup ") {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1010,6 +1010,10 @@ impl Tenant {

        let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;

+        utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+
+        info!("starting on {} timelines", gc_timelines.len());
+
        // Perform GC for each timeline.
        //
        // Note that we don't hold the GC lock here because we don't want
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -74,6 +74,7 @@ where
        };

        dstbuf.clear();
+        dstbuf.reserve(len);

        // Read the payload
        let mut remain = len;
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -260,8 +260,9 @@ impl Layer for DeltaLayer {

            // Ok, 'offsets' now contains the offsets of all the entries we need to read
            let mut cursor = file.block_cursor();
+            let mut buf = Vec::new();
            for (entry_lsn, pos) in offsets {
-                let buf = cursor.read_blob(pos).with_context(|| {
+                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
                    format!(
                        "Failed to read blob from virtual file {}",
                        file.file.path.display()
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -61,6 +61,13 @@ use crate::{
    storage_sync::{self, index::LayerFileMetadata},
 };

+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum FlushLoopState {
+    NotStarted, // initial state: no flush task has been spawned for this timeline yet
+    Running,    // set by maybe_spawn_flush_loop after it spawns the "layer flush task"
+    Exited,     // set by the flush task once flush_loop returns; restarts are refused
+}
+
 pub struct Timeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -122,7 +129,7 @@ pub struct Timeline {
    write_lock: Mutex<()>,

    /// Used to avoid multiple `flush_loop` tasks running
-    flush_loop_started: Mutex<bool>,
+    flush_loop_state: Mutex<FlushLoopState>,

    /// layer_flush_start_tx can be used to wake up the layer-flushing task.
    /// The value is a counter, incremented every time a new flush cycle is requested.
@@ -755,7 +762,7 @@ impl Timeline {

            upload_layers: AtomicBool::new(upload_layers),

-            flush_loop_started: Mutex::new(false),
+            flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

            layer_flush_start_tx,
            layer_flush_done_tx,
@@ -794,13 +801,23 @@ impl Timeline {
    }

    pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
-        let mut flush_loop_started = self.flush_loop_started.lock().unwrap();
-        if *flush_loop_started {
-            info!(
-                "skipping attempt to start flush_loop twice {}/{}",
-                self.tenant_id, self.timeline_id
-            );
-            return;
+        let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
+        match *flush_loop_state {
+            FlushLoopState::NotStarted => (),
+            FlushLoopState::Running => {
+                info!(
+                    "skipping attempt to start flush_loop twice {}/{}",
+                    self.tenant_id, self.timeline_id
+                );
+                return;
+            }
+            FlushLoopState::Exited => {
+                warn!(
+                    "ignoring attempt to restart exited flush_loop {}/{}",
+                    self.tenant_id, self.timeline_id
+                );
+                return;
+            }
        }

        let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
@@ -813,11 +830,16 @@ impl Timeline {
                    Some(self.timeline_id),
                    "layer flush task",
                    false,
-                    async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) }
+                    async move {
+                         self_clone.flush_loop(layer_flush_start_rx).await;
+                         let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
+                         assert_eq!(*flush_loop_state, FlushLoopState::Running);
+                         *flush_loop_state  = FlushLoopState::Exited;
+                         Ok(()) }
                    .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
                );

-        *flush_loop_started = true;
+        *flush_loop_state = FlushLoopState::Running;
    }

    pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
@@ -1365,8 +1387,9 @@ impl Timeline {
        // finished, instead of some other flush that was started earlier.
        let mut my_flush_request = 0;

-        if !&*self.flush_loop_started.lock().unwrap() {
-            anyhow::bail!("cannot flush frozen layers when flush_loop is not running")
+        let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
+        if flush_loop_state != FlushLoopState::Running {
+            anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
        }

        self.layer_flush_start_tx.send_modify(|counter| {
--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -216,7 +216,6 @@ impl TenantConf {
        }
    }

-    #[cfg(test)]
    pub fn dummy_conf() -> Self {
        TenantConf {
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant_tasks.rs
@@ -71,9 +71,7 @@ async fn compaction_loop(tenant_id: TenantId) {
            let mut sleep_duration = tenant.get_compaction_period();
            if let Err(e) = tenant.compaction_iteration() {
                sleep_duration = wait_duration;
-                error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
-                #[cfg(feature = "testing")]
-                std::process::abort();
+                error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
            }

            // Sleep
@@ -122,9 +120,7 @@ async fn gc_loop(tenant_id: TenantId) {
                if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
                {
                    sleep_duration = wait_duration;
-                    error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
-                    #[cfg(feature = "testing")]
-                    std::process::abort();
+                    error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
                }
            }

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -21,6 +21,7 @@
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
+use once_cell::sync::Lazy;
 use serde::Serialize;
 use std::fs::OpenOptions;
 use std::io::prelude::*;
@@ -31,7 +32,8 @@ use std::os::unix::prelude::CommandExt;
 use std::path::PathBuf;
 use std::process::Stdio;
 use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
-use std::sync::Mutex;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Condvar, Mutex};
 use std::time::Duration;
 use std::time::Instant;
 use std::{fs, io};
@@ -57,6 +59,9 @@ use postgres_ffi::v14::nonrelfile_utils::{
 };
 use postgres_ffi::BLCKSZ;

+/// Maximum number of WAL redo processes to launch for a single tenant.
+const MAX_PROCESSES: usize = 4;
+
 ///
 /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
 ///
@@ -90,18 +95,32 @@ pub trait WalRedoManager: Send + Sync {
    ) -> Result<Bytes, WalRedoError>;
 }

+static WAL_REDO_PROCESS_COUNTER: Lazy<AtomicU64> = Lazy::new(|| { AtomicU64::new(0) });
+
 ///
-/// This is the real implementation that uses a Postgres process to
-/// perform WAL replay. Only one thread can use the process at a time,
-/// that is controlled by the Mutex. In the future, we might want to
-/// launch a pool of processes to allow concurrent replay of multiple
-/// records.
+/// This is the real implementation that uses a special Postgres
+/// process to perform WAL replay. There is a pool of these processes.
 ///
 pub struct PostgresRedoManager {
    tenant_id: TenantId,
    conf: &'static PageServerConf,

-    process: Mutex<Option<PostgresRedoProcess>>,
+    /// Pool of processes.
+    process_list: Mutex<ProcessList>,
+    /// Condition variable that can be used to sleep until a process
+    /// becomes available in the pool.
+    condvar: Condvar,
+}
+
+/// Bookkeeping for one manager's pool of WAL redo processes (at most MAX_PROCESSES).
+#[derive(Default)]
+struct ProcessList {
+    /// Idle processes that are available for reuse.
+    free_processes: Vec<PostgresRedoProcess>,
+
+    /// Total number of processes, including all the processes in the
+    /// 'free_processes' list and any processes that are currently in use.
+    num_processes: usize,
 }

 /// Can this request be served by neon redo functions
@@ -206,10 +225,42 @@ impl PostgresRedoManager {
        PostgresRedoManager {
            tenant_id,
            conf,
-            process: Mutex::new(None),
+            process_list: Mutex::new(ProcessList::default()),
+            condvar: Condvar::new(),
        }
    }

+    /// Check a redo process out of the pool, launching one if the pool may still grow.
+    fn get_process(&self, pg_version: u32) -> Result<PostgresRedoProcess, WalRedoError> {
+        let mut pool = self.process_list.lock().unwrap();
+
+        loop {
+            // Reuse an idle process whenever one is parked in the pool.
+            if let Some(idle) = pool.free_processes.pop() {
+                return Ok(idle);
+            }
+
+            // Nothing idle. While the pool is below its cap, grow it by one
+            // process; the counter is only bumped once the launch succeeded.
+            if pool.num_processes < MAX_PROCESSES {
+                let launched = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
+                pool.num_processes += 1;
+                return Ok(launched);
+            }
+
+            // The pool is full: sleep on the condvar until it is signalled,
+            // then re-check from the top of the loop.
+            pool = self.condvar.wait(pool).unwrap();
+        }
+    }
+
+    /// Pre-launch a process and return it to the pool for reuse. Only needed for benchmarking.
+    pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> {
+        let process = self.get_process(pg_version)?;
+        self.process_list.lock().unwrap().free_processes.push(process); // keep it pooled
+        Ok(())
+    }
+
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -226,15 +277,9 @@ impl PostgresRedoManager {

        let start_time = Instant::now();

-        let mut process_guard = self.process.lock().unwrap();
-        let lock_time = Instant::now();
+        let mut process = self.get_process(pg_version)?;

-        // launch the WAL redo process on first use
-        if process_guard.is_none() {
-            let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
-            *process_guard = Some(p);
-        }
-        let process = process_guard.as_mut().unwrap();
+        let lock_time = Instant::now();

        WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());

@@ -268,8 +313,9 @@ impl PostgresRedoManager {
            lsn
        );

-        // If something went wrong, don't try to reuse the process. Kill it, and
-        // next request will launch a new one.
+        // If something went wrong, don't try to reuse the
+        // process. Kill it, and next request will launch a new one.
+        // Otherwise return the process to the pool.
        if result.is_err() {
            error!(
                "error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
@@ -277,8 +323,14 @@ impl PostgresRedoManager {
                nbytes,
                lsn
            );
-            let process = process_guard.take().unwrap();
            process.kill();
+            let mut process_list = self.process_list.lock().unwrap();
+            process_list.num_processes -= 1;
+            self.condvar.notify_one();
+        } else {
+            let mut process_list = self.process_list.lock().unwrap();
+            process_list.free_processes.push(process);
+            self.condvar.notify_one();
        }
        result
    }
@@ -598,11 +650,10 @@ impl PostgresRedoProcess {
        tenant_id: TenantId,
        pg_version: u32,
    ) -> Result<PostgresRedoProcess, Error> {
-        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
-        // just create one with constant name. That fails if you try to launch more than
-        // one WAL redo manager concurrently.
+        // We need a dummy Postgres cluster to run the process in.
+        let processno = WAL_REDO_PROCESS_COUNTER.fetch_add(1, Ordering::Relaxed);
        let datadir = path_with_suffix_extension(
-            conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
+            conf.tenant_path(&tenant_id).join(format!("wal-redo-datadir-{}", processno)),
            TEMP_FILE_SUFFIX,
        );

@@ -740,7 +791,11 @@ impl PostgresRedoProcess {
        // This could be problematic if there are millions of records to replay,
        // but in practice the number of records is usually so small that it doesn't
        // matter, and it's better to keep this code simple.
-        let mut writebuf: Vec<u8> = Vec::new();
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
        build_begin_redo_for_block_msg(tag, &mut writebuf);
        if let Some(img) = base_img {
            build_push_page_msg(tag, &img, &mut writebuf);
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -32,11 +32,6 @@

 #define PageStoreTrace DEBUG5

-#define NEON_TAG "[NEON_SMGR] "
-#define neon_log(tag, fmt, ...) ereport(tag,                                  \
-										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true)))
-
 bool		connected = false;
 PGconn	   *pageserver_conn = NULL;

@@ -52,6 +47,7 @@ char	   *page_server_connstring_raw;

 int			n_unflushed_requests = 0;
 int			flush_every_n_requests = 8;
+int			readahead_buffer_size = 128;

 static void pageserver_flush(void);

@@ -96,11 +92,10 @@ pageserver_connect()

 	while (PQisBusy(pageserver_conn))
 	{
-		int			wc;
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -140,11 +135,10 @@ retry:

 	if (ret == 0)
 	{
-		int			wc;
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -238,6 +232,9 @@ pageserver_receive(void)
 	StringInfoData resp_buff;
 	NeonResponse *resp;

+	if (!connected)
+		return NULL;
+
 	PG_TRY();
 	{
 		/* read response */
@@ -247,7 +244,10 @@ pageserver_receive(void)
 		if (resp_buff.len < 0)
 		{
 			if (resp_buff.len == -1)
-				neon_log(ERROR, "end of COPY");
+			{
+				pageserver_disconnect();
+				return NULL;
+			}
 			else if (resp_buff.len == -2)
 				neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
 		}
@@ -449,9 +449,22 @@ pg_init_libpagestore(void)
 							NULL,
 							&flush_every_n_requests,
 							8, -1, INT_MAX,
-							PGC_SIGHUP,
+							PGC_USERSET,
 							0,	/* no flags required */
 							NULL, NULL, NULL);
+	DefineCustomIntVariable("neon.readahead_buffer_size",
+							"number of prefetches to buffer",
+							"This buffer is used to store prefetched data; so "
+							"it is important that this buffer is at least as "
+							"large as the configured value of all tablespaces' "
+							"effective_io_concurrency and maintenance_io_concurrency, "
+							"your sessions' values of these, and the value for "
+							"seqscan_prefetch_buffers.",
+							&readahead_buffer_size,
+							128, 16, 1024,
+							PGC_USERSET,
+							0,	/* no flags required */
+							NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);

 	relsize_hash_init();

--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -49,6 +49,11 @@ typedef struct

 #define messageTag(m) (((const NeonMessage *)(m))->tag)

+#define NEON_TAG "[NEON_SMGR] "
+#define neon_log(tag, fmt, ...) ereport(tag,                                  \
+										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
+										 errhidestmt(true), errhidecontext(true)))
+
 /*
 * supertype of all the Neon*Request structs below
 *
@@ -150,6 +155,8 @@ extern void prefetch_on_ps_disconnect(void);
 extern page_server_api * page_server;

 extern char *page_server_connstring;
+extern int flush_every_n_requests;
+extern int readahead_buffer_size;
 extern bool seqscan_prefetch_enabled;
 extern int seqscan_prefetch_distance;
 extern char *neon_timeline;
@@ -159,6 +166,7 @@ extern int32 max_cluster_size;

 extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
 extern void smgr_init_neon(void);
+extern void readahead_buffer_resize(int newsize, void *extra);

 /* Neon storage manager functionality */

--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -116,10 +116,10 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 * 
 * Prefetch is performed locally by each backend.
 *
- * There can be up to READ_BUFFER_SIZE active IO requests registered at any
- * time. Requests using smgr_prefetch are sent to the pageserver, but we don't
- * wait on the response. Requests using smgr_read are either read from the
- * buffer, or (if that's not possible) we wait on the response to arrive -
+ * There can be up to readahead_buffer_size active IO requests registered at
+ * any time. Requests using smgr_prefetch are sent to the pageserver, but we
+ * don't wait on the response. Requests using smgr_read are either read from
+ * the buffer, or (if that's not possible) we wait on the response to arrive -
 * this also will allow us to receive other prefetched pages. 
 * Each request is immediately written to the output buffer of the pageserver
 * connection, but may not be flushed if smgr_prefetch is used: pageserver
@@ -136,15 +136,25 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 * the connection; the responses are stored for later use.
 *
 * NOTE: The current implementation of the prefetch system implements a ring
- * buffer of up to READ_BUFFER_SIZE requests. If there are more _read and
+ * buffer of up to readahead_buffer_size requests. If there are more _read and
 * _prefetch requests between the initial _prefetch and the _read of a buffer,
 * the prefetch request will have been dropped from this prefetch buffer, and
 * your prefetch was wasted.
 */

-/* Max amount of tracked buffer reads */
-#define READ_BUFFER_SIZE 128
-
+/*
+ * State machine:
+ *        
+ * not in hash : in hash
+ *             :
+ * UNUSED ------> REQUESTED --> RECEIVED
+ *   ^         :      |            |
+ *   |         :      v            |
+ *   |         : TAG_UNUSED        |
+ *   |         :      |            |
+ *   +----------------+------------+
+ *             :
+ */
 typedef enum PrefetchStatus {
 	PRFS_UNUSED = 0,	/* unused slot */
 	PRFS_REQUESTED,		/* request was written to the sendbuffer to PS, but not
@@ -192,7 +202,7 @@ typedef struct PrfHashEntry {
 * It maintains a (ring) buffer of in-flight requests and responses.
 * 
 * We maintain several indexes into the ring buffer:
- * ring_unused >= ring_receive >= ring_last >= 0
+ * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
 * 
 * ring_unused points to the first unused slot of the buffer
 * ring_receive is the next request that is to be received
@@ -208,6 +218,7 @@ typedef struct PrefetchState {

 	/* buffer indexes */
 	uint64	ring_unused;		/* first unused slot */
+	uint64	ring_flush;			/* next request to flush */
 	uint64	ring_receive;		/* next slot that is to receive a response */
 	uint64	ring_last;			/* min slot with a response value */

@@ -218,11 +229,19 @@ typedef struct PrefetchState {

 	/* the buffers */
 	prfh_hash *prf_hash;
-	PrefetchRequest prf_buffer[READ_BUFFER_SIZE]; /* prefetch buffers */
+	PrefetchRequest prf_buffer[]; /* prefetch buffers */
 } PrefetchState;

 PrefetchState *MyPState;

+#define GetPrfSlot(ring_index) ( \
+	( \
+		AssertMacro((ring_index) < MyPState->ring_unused && \
+					(ring_index) >= MyPState->ring_last), \
+		&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+	) \
+)
+
 int			n_prefetch_hits = 0;
 int			n_prefetch_misses = 0;
 int			n_prefetch_missed_caches = 0;
@@ -232,18 +251,116 @@ XLogRecPtr	prefetch_lsn = 0;

 static void consume_prefetch_responses(void);
 static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
-static void prefetch_read(PrefetchRequest *slot);
+static bool prefetch_read(PrefetchRequest *slot);
 static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
-static void prefetch_wait_for(uint64 ring_index);
+static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup(void);
-static inline void prefetch_set_unused(uint64 ring_index, bool hash_cleanup);
+static inline void prefetch_set_unused(uint64 ring_index);

 static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
 									   ForkNumber forknum, BlockNumber blkno);

+/*
+ * GUC assign hook for neon.readahead_buffer_size: swap MyPState for a ring of
+ * 'newsize' slots, keeping the most recent prefetch entries. The hook runs
+ * before the GUC variable changes, so readahead_buffer_size (and GetPrfSlot)
+ * still describe the OLD ring; the new one must be sized from 'newsize'.
+ */
+void
+readahead_buffer_resize(int newsize, void *extra)
+{
+	uint64		end,
+				nfree = newsize;
+	PrefetchState *newPState;
+	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
+		sizeof(PrefetchRequest) * newsize;
+
+	/* don't try to re-initialize if we haven't initialized yet */
+	if (MyPState == NULL)
+		return;
+
+	/*
+	 * Make sure that we don't lose track of active prefetch requests by
+	 * ensuring we have received all but the last n requests (n = newsize).
+	 */
+	if (MyPState->n_requests_inflight > newsize)
+		prefetch_wait_for(MyPState->ring_unused - newsize);
+
+	/* construct the new PrefetchState, and copy over the memory contexts */
+	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
+
+	newPState->bufctx = MyPState->bufctx;
+	newPState->errctx = MyPState->errctx;
+	newPState->hashctx = MyPState->hashctx;
+	newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
+	newPState->n_unused = newsize;
+	newPState->n_requests_inflight = 0;
+	newPState->n_responses_buffered = 0;
+	newPState->ring_last = newsize;
+	newPState->ring_unused = newsize;
+	newPState->ring_receive = newsize;
+	newPState->ring_flush = newsize;
+
+	/*
+	 * Copy over the prefetches, walking the old ring from newest to oldest
+	 * and filling the new buffer from its end. One pass suffices, the most
+	 * recent entries are the ones retained, and the result is compacted.
+	 */
+	for (end = MyPState->ring_unused - 1;
+		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
+		 end -= 1)
+	{
+		PrefetchRequest *slot = GetPrfSlot(end);
+		PrefetchRequest *newslot;
+		bool	found;
+
+		if (slot->status == PRFS_UNUSED)
+			continue;
+
+		nfree -= 1;
+
+		newslot = &newPState->prf_buffer[nfree];
+		*newslot = *slot;
+		newslot->my_ring_index = nfree;
+
+		prfh_insert(newPState->prf_hash, newslot, &found);
+		Assert(!found);
+
+		switch (newslot->status)
+		{
+			case PRFS_UNUSED:
+				pg_unreachable();
+			case PRFS_REQUESTED:
+				newPState->n_requests_inflight += 1;
+				newPState->ring_receive -= 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_RECEIVED:
+				newPState->n_responses_buffered += 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_TAG_REMAINS:
+				newPState->ring_last -= 1;
+				break;
+		}
+		newPState->n_unused -= 1;
+	}
+
+	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
+		prefetch_set_unused(end);
+
+	prfh_destroy(MyPState->prf_hash);
+	pfree(MyPState);
+	MyPState = newPState;
+}
+
+

 /*
 * Make sure that there are no responses still in the buffer.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash, which
+ * invalidates any active pointers into the hash table.
 */
 static void
 consume_prefetch_responses(void)
@@ -255,14 +372,12 @@ consume_prefetch_responses(void)
 static void
 prefetch_cleanup(void)
 {
-	int		index;
 	uint64	ring_index;
 	PrefetchRequest *slot;

 	while (MyPState->ring_last < MyPState->ring_receive) {
 		ring_index = MyPState->ring_last;
-		index = (ring_index % READ_BUFFER_SIZE);
-		slot = &MyPState->prf_buffer[index];
+		slot = GetPrfSlot(ring_index);

 		if (slot->status == PRFS_UNUSED)
 			MyPState->ring_last += 1;
@@ -274,23 +389,33 @@ prefetch_cleanup(void)
 /*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
+ * 
+ * NOTE: this function may indirectly update MyPState->prf_hash, which
+ * invalidates any active pointers into the hash table.
 */
-static void
+static bool
 prefetch_wait_for(uint64 ring_index)
 {
-	int index;
 	PrefetchRequest *entry;

+	if (MyPState->ring_flush <= ring_index &&
+		MyPState->ring_unused > MyPState->ring_flush)
+	{
+		page_server->flush();
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
 	Assert(MyPState->ring_unused > ring_index);

 	while (MyPState->ring_receive <= ring_index)
 	{
-		index = (MyPState->ring_receive % READ_BUFFER_SIZE);
-		entry = &MyPState->prf_buffer[index];
+		entry = GetPrfSlot(MyPState->ring_receive);

 		Assert(entry->status == PRFS_REQUESTED);
-		prefetch_read(entry);
+		if (!prefetch_read(entry))
+			return false;
 	}
+	return true;
 }

 /*
@@ -298,8 +423,11 @@ prefetch_wait_for(uint64 ring_index)
 * 
 * The caller is responsible for making sure that the request for this buffer
 * was flushed to the PageServer.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash, which
+ * invalidates any active pointers into the hash table.
 */
-static void
+static bool
 prefetch_read(PrefetchRequest *slot)
 {
 	NeonResponse *response;
@@ -312,15 +440,22 @@ prefetch_read(PrefetchRequest *slot)
 	old = MemoryContextSwitchTo(MyPState->errctx);
 	response = (NeonResponse *) page_server->receive();
 	MemoryContextSwitchTo(old);
+	if (response)
+	{
+		/* update prefetch state */
+		MyPState->n_responses_buffered += 1;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;

-	/* update prefetch state */
-	MyPState->n_responses_buffered += 1;
-	MyPState->n_requests_inflight -= 1;
-	MyPState->ring_receive += 1;
-
-	/* update slot state */
-	slot->status = PRFS_RECEIVED;
-	slot->response = response;
+		/* update slot state */
+		slot->status = PRFS_RECEIVED;
+		slot->response = response;
+		return true;
+	}
+	else
+	{
+		return false;
+	}
 }

 /*
@@ -332,19 +467,22 @@ prefetch_read(PrefetchRequest *slot)
 void
 prefetch_on_ps_disconnect(void)
 {
-	for (; MyPState->ring_receive < MyPState->ring_unused; MyPState->ring_receive++)
+	MyPState->ring_flush = MyPState->ring_unused;
+	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
-		int		index = MyPState->ring_receive % READ_BUFFER_SIZE;
+		uint64 ring_index = MyPState->ring_receive;
+
+		slot = GetPrfSlot(ring_index);

-		slot = &MyPState->prf_buffer[index];
 		Assert(slot->status == PRFS_REQUESTED);
-		Assert(slot->my_ring_index == MyPState->ring_receive);
+		Assert(slot->my_ring_index == ring_index);

 		/* clean up the request */
 		slot->status = PRFS_TAG_REMAINS;
-		MyPState->n_requests_inflight--;
-		prefetch_set_unused(MyPState->ring_receive, true);
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+		prefetch_set_unused(ring_index);
 	}
 }

@@ -353,21 +491,24 @@ prefetch_on_ps_disconnect(void)
 *
 * The slot at ring_index must be a current member of the ring buffer,
 * and may not be in the PRFS_REQUESTED state.
+ *
+ * NOTE: this function will update MyPState->prf_hash, which invalidates any
+ * active pointers into the hash table.
 */
 static inline void
-prefetch_set_unused(uint64 ring_index, bool hash_cleanup)
+prefetch_set_unused(uint64 ring_index)
 {
-	PrefetchRequest *slot = &MyPState->prf_buffer[ring_index % READ_BUFFER_SIZE];
+	PrefetchRequest *slot = GetPrfSlot(ring_index);

-	Assert(MyPState->ring_last <= ring_index &&
-		   MyPState->ring_unused > ring_index);
+	if (ring_index < MyPState->ring_last)
+		return; /* Should already be unused */
+
+	Assert(MyPState->ring_unused > ring_index);

 	if (slot->status == PRFS_UNUSED)
 		return;

 	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
-	Assert(ring_index >= MyPState->ring_last &&
-		   ring_index < MyPState->ring_unused);

 	if (slot->status == PRFS_RECEIVED)
 	{
@@ -382,8 +523,7 @@ prefetch_set_unused(uint64 ring_index, bool hash_cleanup)
 		Assert(slot->response == NULL);
 	}

-	if (hash_cleanup)
-		prfh_delete(MyPState->prf_hash, slot);
+	prfh_delete(MyPState->prf_hash, slot);

 	/* clear all fields */
 	MemSet(slot, 0, sizeof(PrefetchRequest));
@@ -397,6 +537,7 @@ prefetch_set_unused(uint64 ring_index, bool hash_cleanup)
 static void
 prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
 {
+	bool found;
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		.req.latest = false,
@@ -454,6 +595,9 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
+
+	prfh_insert(MyPState->prf_hash, slot, &found);
+	Assert(!found);
 }

 /*
@@ -464,13 +608,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 * If force_latest and force_lsn are not NULL, those values are sent to the
 * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
 * to fill in these values manually.
+ *
+ * NOTE: this function may indirectly update MyPState->pfs_hash; which
+ * invalidates any active pointers into the hash table.
 */

 static uint64
 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
 {
-	int		index;
-	bool	found;
 	uint64	ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
@@ -485,28 +630,49 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	{
 		slot = entry->slot;
 		ring_index = slot->my_ring_index;
-		index = (ring_index % READ_BUFFER_SIZE);
-		Assert(slot == &MyPState->prf_buffer[index]);
+		Assert(slot == GetPrfSlot(ring_index));

 		Assert(slot->status != PRFS_UNUSED);
+		Assert(MyPState->ring_last <= ring_index &&
+			   ring_index < MyPState->ring_unused);
 		Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
-		
+
 		/*
 		 * If we want a specific lsn, we do not accept requests that were made
 		 * with a potentially different LSN.
 		 */
-		if (force_lsn && slot->effective_request_lsn != *force_lsn)
+		if (force_latest && force_lsn)
 		{
-			prefetch_wait_for(ring_index);
-			prefetch_set_unused(ring_index, true);
+			/* if we want the latest version, any effective_request_lsn >= request lsn is OK */
+			if (*force_latest)
+			{
+				if (*force_lsn > slot->effective_request_lsn)
+				{
+					prefetch_wait_for(ring_index);
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+				}
+			}
+			/* if we don't want the latest version, only accept requests with the exact same LSN */
+			else
+			{
+				if (*force_lsn != slot->effective_request_lsn)
+				{
+					prefetch_wait_for(ring_index);
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+				}
+			}
 		}
+
 		/*
 		 * We received a prefetch for a page that was recently read and
 		 * removed from the buffers. Remove that request from the buffers.
 		 */
 		else if (slot->status == PRFS_TAG_REMAINS)
 		{
-			prefetch_set_unused(ring_index, true);
+			prefetch_set_unused(ring_index);
+			entry = NULL;
 		}
 		else
 		{
@@ -529,9 +695,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	 * output buffer, and 'not sending' a prefetch request kind of goes
 	 * against the principles of prefetching)
 	 */
-	if (MyPState->ring_last + READ_BUFFER_SIZE - 1 == MyPState->ring_unused)
+	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
 	{
-		slot = &MyPState->prf_buffer[(MyPState->ring_last % READ_BUFFER_SIZE)];
+		uint64 cleanup_index = MyPState->ring_last;
+		slot = GetPrfSlot(cleanup_index);

 		Assert(slot->status != PRFS_UNUSED);

@@ -539,13 +706,13 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 		switch (slot->status)
 		{
 			case PRFS_REQUESTED:
-				Assert(MyPState->ring_receive == MyPState->ring_last);
-				prefetch_wait_for(MyPState->ring_last);
-				prefetch_set_unused(MyPState->ring_last, true);
+				Assert(MyPState->ring_receive == cleanup_index);
+				prefetch_wait_for(cleanup_index);
+				prefetch_set_unused(cleanup_index);
 				break;
 			case PRFS_RECEIVED:
 			case PRFS_TAG_REMAINS:
-				prefetch_set_unused(MyPState->ring_last, true);
+				prefetch_set_unused(cleanup_index);
 				break;
 			default:
 				pg_unreachable();
@@ -553,12 +720,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	}

 	/*
-	 * The next buffer pointed to by `ring_unused` is now unused, so we can insert
-	 * the new request to it.
+	 * The next buffer pointed to by `ring_unused` is now definitely empty,
+	 * so we can insert the new request to it.
 	 */
 	ring_index = MyPState->ring_unused;
-	index = (ring_index % READ_BUFFER_SIZE);
-	slot = &MyPState->prf_buffer[index];
+	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];

 	Assert(MyPState->ring_last <= ring_index);

@@ -571,22 +737,34 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	slot->buftag = tag;
 	slot->my_ring_index = ring_index;

-	prfh_insert(MyPState->prf_hash, slot, &found);
-	Assert(!found);
-
 	prefetch_do_request(slot, force_latest, force_lsn);
 	Assert(slot->status == PRFS_REQUESTED);
-	Assert(ring_index < MyPState->ring_unused);
+	Assert(MyPState->ring_last <= ring_index &&
+		   ring_index < MyPState->ring_unused);
+
+	if (flush_every_n_requests > 0 &&
+		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
+	{
+		page_server->flush();
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
 	return ring_index;
 }

 static NeonResponse *
 page_server_request(void const *req)
 {
-	page_server->send((NeonRequest *) req);
-	page_server->flush();
-	consume_prefetch_responses();
-	return page_server->receive();
+	NeonResponse* resp;
+	do {
+		page_server->send((NeonRequest *) req);
+		page_server->flush();
+		MyPState->ring_flush = MyPState->ring_unused;
+		consume_prefetch_responses();
+		resp = page_server->receive();
+	} while (resp == NULL);
+	return resp;
+
 }


@@ -1052,14 +1230,18 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 void
 neon_init(void)
 {
-	HASHCTL info;
+	Size prfs_size;

 	if (MyPState != NULL)
 		return;

-	MyPState = MemoryContextAllocZero(TopMemoryContext, sizeof(PrefetchState));
+	prfs_size = offsetof(PrefetchState, prf_buffer) + (
+		sizeof(PrefetchRequest) * readahead_buffer_size
+	);
+
+	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
 	
-	MyPState->n_unused = READ_BUFFER_SIZE;
+	MyPState->n_unused = readahead_buffer_size;

 	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
 										 "NeonSMGR/prefetch",
@@ -1072,11 +1254,8 @@ neon_init(void)
 											  "NeonSMGR/prefetch",
 											  ALLOCSET_DEFAULT_SIZES);

-	info.keysize = sizeof(BufferTag);
-	info.entrysize = sizeof(uint64);
-
 	MyPState->prf_hash = prfh_create(MyPState->hashctx,
-									 READ_BUFFER_SIZE, NULL);
+									 readahead_buffer_size, NULL);

 #ifdef DEBUG_COMPARE_LOCAL
 	mdinit();
@@ -1470,7 +1649,8 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
 bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-	uint64 ring_index;
+	BufferTag	tag;
+	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;

 	switch (reln->smgr_relpersistence)
 	{
@@ -1486,7 +1666,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	BufferTag tag = (BufferTag) {
+	tag = (BufferTag) {
 		.rnode = reln->smgr_rnode.node,
 		.forkNum = forknum,
 		.blockNum = blocknum
@@ -1565,9 +1745,9 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,

 	if (entry != NULL)
 	{
-		if (entry->slot->effective_request_lsn >= prefetch_lsn)
+		slot = entry->slot;
+		if (slot->effective_request_lsn >= request_lsn)
 		{
-			slot = entry->slot;
 			ring_index = slot->my_ring_index;
 			n_prefetch_hits += 1;
 		}
@@ -1578,36 +1758,36 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 			 * unlikely this happens, but it can happen if prefetch distance is
 			 * large enough and a backend didn't consume all prefetch requests.
 			 */
-			if (entry->slot->status == PRFS_REQUESTED)
+			if (slot->status == PRFS_REQUESTED)
 			{
-				page_server->flush();
-				prefetch_wait_for(entry->slot->my_ring_index);
+				prefetch_wait_for(slot->my_ring_index);
 			}
 			/* drop caches */
-			prefetch_set_unused(entry->slot->my_ring_index, true);
+			prefetch_set_unused(slot->my_ring_index);
 			n_prefetch_missed_caches += 1;
 			/* make it look like a prefetch cache miss */
 			entry = NULL;
 		}
 	}

-	if (entry == NULL)
+	do
 	{
-		n_prefetch_misses += 1;
+		if (entry == NULL)
+		{
+			n_prefetch_misses += 1;

-		ring_index = prefetch_register_buffer(buftag, &request_latest,
-											  &request_lsn);
-		slot = &MyPState->prf_buffer[(ring_index % READ_BUFFER_SIZE)];
-	}
+			ring_index = prefetch_register_buffer(buftag, &request_latest,
+												  &request_lsn);
+			slot = GetPrfSlot(ring_index);
+		}

-	Assert(MyPState->ring_last <= ring_index &&
-		   MyPState->ring_unused > ring_index);
-	Assert(slot->my_ring_index == ring_index);
-	Assert(slot->status != PRFS_UNUSED);
-	Assert(&MyPState->prf_buffer[(ring_index % READ_BUFFER_SIZE)] == slot);
+		Assert(slot->my_ring_index == ring_index);
+		Assert(MyPState->ring_last <= ring_index &&
+			   MyPState->ring_unused > ring_index);
+		Assert(slot->status != PRFS_UNUSED);
+		Assert(GetPrfSlot(ring_index) == slot);

-	page_server->flush();
-	prefetch_wait_for(ring_index);
+	} while (!prefetch_wait_for(ring_index));

 	Assert(slot->status == PRFS_RECEIVED);

@@ -1637,7 +1817,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	}

 	/* buffer was used, clean up for later reuse */
-	prefetch_set_unused(ring_index, true);
+	prefetch_set_unused(ring_index);
 	prefetch_cleanup();
 }

--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -119,6 +119,7 @@ static TimestampTz last_reconnect_attempt;
 static WalproposerShmemState * walprop_shared;

 /* Prototypes for private functions */
+static void WalProposerRegister(void);
 static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
 static void WalProposerStart(void);
 static void WalProposerLoop(void);
@@ -455,7 +456,7 @@ WalProposerPoll(void)
 /*
 * Register a background worker proposing WAL to wal acceptors.
 */
-void
+static void
 WalProposerRegister(void)
 {
 	BackgroundWorker bgw;
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -377,18 +377,18 @@ typedef struct Safekeeper
 	AppendResponse appendResponse;	/* feedback for master */
 } Safekeeper;

-extern PGDLLIMPORT void WalProposerMain(Datum main_arg);
-void		WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
-void		WalProposerPoll(void);
-void		WalProposerRegister(void);
-void		ParseReplicationFeedbackMessage(StringInfo reply_message,
-											ReplicationFeedback * rf);
+extern void WalProposerSync(int argc, char *argv[]);
+extern void WalProposerMain(Datum main_arg);
+extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
+extern void WalProposerPoll(void);
+extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
+											ReplicationFeedback *rf);
 extern void StartProposerReplication(StartReplicationCmd *cmd);

-Size		WalproposerShmemSize(void);
-bool		WalproposerShmemInit(void);
-void		replication_feedback_set(ReplicationFeedback * rf);
-void		replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
+extern Size WalproposerShmemSize(void);
+extern bool WalproposerShmemInit(void);
+extern void replication_feedback_set(ReplicationFeedback *rf);
+extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

 /* libpqwalproposer hooks & helper type */

--- a/scripts/docker-compose_test.sh
+++ b/scripts/docker-compose_test.sh
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-# this is a shortcut script to avoid duplication in CI
-set -eux -o pipefail
-
-SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml
-
-COMPUTE_CONTAINER_NAME=dockercompose_compute_1
-SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
-PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
-
-cleanup() {
-	echo "show container information"
-	docker ps
-	docker-compose -f $COMPOSE_FILE logs
-	echo "stop containers..."
-	docker-compose -f $COMPOSE_FILE down
-}
-
-echo "clean up containers if exists"
-cleanup
-
-for pg_version in 14 15; do
-	echo "start containers (pg_version=$pg_version)."
-	PG_VERSION=$pg_version TAG=latest docker-compose -f $COMPOSE_FILE up --build -d
-
-	echo "wait until the compute is ready. timeout after 60s. "
-	cnt=0
-	while sleep 1; do
-		# check timeout
-		cnt=`expr $cnt + 1`
-		if [ $cnt -gt 60 ]; then
-			echo "timeout before the compute is ready."
-			cleanup
-			exit 1
-		fi
-
-		# check if the compute is ready
-		set +o pipefail
-		result=`docker-compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
-		set -o pipefail
-		if [ $result -eq 1 ]; then
-			echo "OK. The compute is ready to connect."
-			echo "execute simple queries."
-			docker exec -it $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
-			cleanup
-			break
-		fi
-	done
-done
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -784,6 +784,8 @@ class NeonEnvBuilder:

            self.cleanup_remote_storage()

+            self.env.pageserver.assert_no_errors()
+

 class NeonEnv:
    """
@@ -1566,6 +1568,7 @@ class NeonCli(AbstractNeonCli):
    def pageserver_start(
        self,
        overrides: Tuple[str, ...] = (),
+        extra_env_vars: Optional[Dict[str, str]] = None,
    ) -> "subprocess.CompletedProcess[str]":
        start_args = ["pageserver", "start", *overrides]
        append_pageserver_param_overrides(
@@ -1575,11 +1578,11 @@ class NeonCli(AbstractNeonCli):
            pageserver_config_override=self.env.pageserver.config_override,
        )

-        s3_env_vars = None
        if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
            s3_env_vars = self.env.remote_storage.access_env_vars()
+            extra_env_vars = (extra_env_vars or {}) | s3_env_vars

-        return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
+        return self.raw_cli(start_args, extra_env_vars=extra_env_vars)

    def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]":
        cmd = ["pageserver", "stop"]
@@ -1723,7 +1726,48 @@ class NeonPageserver(PgProtocol):
        self.config_override = config_override
        self.version = env.get_pageserver_version()

-    def start(self, overrides: Tuple[str, ...] = ()) -> "NeonPageserver":
+        # After a test finishes, we will scrape the log to see if there are any
+        # unexpected error messages. If your test expects an error, add it to
+        # 'allowed_errors' in the test with something like:
+        #
+        # env.pageserver.allowed_errors.append(".*could not open garage door.*")
+        #
+        # The entries in the list are regular expressions.
+        self.allowed_errors = [
+            # All tests print these, when starting up or shutting down
+            ".*wal receiver task finished with an error: walreceiver connection handling failure.*",
+            ".*Shutdown task error: walreceiver connection handling failure.*",
+            ".*Etcd client error: grpc request error: status: Unavailable.*",
+            ".*query handler for .* failed: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Broken pipe.*",
+            ".*Connection aborted: error communicating with the server: Broken pipe.*",
+            ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
+            ".*Connection aborted: error communicating with the server: Connection reset by peer.*",
+            ".*kill_and_wait_impl.*: wait successful.*",
+            ".*end streaming to Some.*",
+            # safekeeper connection can fail with this, in the window between timeline creation
+            # and streaming start
+            ".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
+            # Tests related to authentication and authorization print these
+            ".*Error processing HTTP request: Forbidden",
+            # intentional failpoints
+            ".*failpoint ",
+            # FIXME: there is a race condition between GC and detach, see
+            # https://github.com/neondatabase/neon/issues/2442
+            ".*could not remove ephemeral file.*No such file or directory.*",
+            # FIXME: These need investigation
+            ".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
+            ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
+            ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
+            ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
+            ".*Removing intermediate uninit mark file.*",
+        ]
+
+    def start(
+        self,
+        overrides: Tuple[str, ...] = (),
+        extra_env_vars: Optional[Dict[str, str]] = None,
+    ) -> "NeonPageserver":
        """
        Start the page server.
        `overrides` allows to add some config to this pageserver start.
@@ -1731,7 +1775,7 @@ class NeonPageserver(PgProtocol):
        """
        assert self.running is False

-        self.env.neon_cli.pageserver_start(overrides=overrides)
+        self.env.neon_cli.pageserver_start(overrides=overrides, extra_env_vars=extra_env_vars)
        self.running = True
        return self

@@ -1771,6 +1815,26 @@ class NeonPageserver(PgProtocol):
            is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
        )

+    def assert_no_errors(self):
+        logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r")
+
+        error_or_warn = re.compile("ERROR|WARN")
+        errors = []
+        while True:
+            line = logfile.readline()
+            if not line:
+                break
+
+            if error_or_warn.search(line):
+                # It's an ERROR or WARN. Is it in the allow-list?
+                for a in self.allowed_errors:
+                    if re.match(a, line):
+                        break
+                else:
+                    errors.append(line)
+
+        assert not errors
+

 def append_pageserver_param_overrides(
    params_to_update: List[str],
@@ -2014,9 +2078,9 @@ class NeonProxy(PgProtocol):
        self,
        proxy_port: int,
        http_port: int,
+        mgmt_port: int,
        neon_binpath: Path,
        auth_endpoint=None,
-        mgmt_port=None,
    ):
        super().__init__(dsn=auth_endpoint, port=proxy_port)
        self.host = "127.0.0.1"
@@ -2040,6 +2104,7 @@ class NeonProxy(PgProtocol):
            str(self.neon_binpath / "proxy"),
            *["--http", f"{self.host}:{self.http_port}"],
            *["--proxy", f"{self.host}:{self.proxy_port}"],
+            *["--mgmt", f"{self.host}:{self.mgmt_port}"],
            *["--auth-backend", "postgres"],
            *["--auth-endpoint", self.auth_endpoint],
        ]
@@ -2116,11 +2181,13 @@ def static_proxy(
    auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"

    proxy_port = port_distributor.get_port()
+    mgmt_port = port_distributor.get_port()
    http_port = port_distributor.get_port()

    with NeonProxy(
        proxy_port=proxy_port,
        http_port=http_port,
+        mgmt_port=mgmt_port,
        neon_binpath=neon_binpath,
        auth_endpoint=auth_endpoint,
    ) as proxy:
--- a/test_runner/performance/test_branching.py
+++ b/test_runner/performance/test_branching.py
@@ -4,6 +4,7 @@ from typing import List

 from fixtures.benchmark_fixture import PgBenchRunResult
 from fixtures.compare_fixtures import NeonCompare
+from fixtures.neon_fixtures import fork_at_current_lsn
 from performance.test_perf_pgbench import utc_now_timestamp

 # -----------------------------------------------------------------------
@@ -43,7 +44,8 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare):
    pg_root = env.postgres.create_start("root")
    pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"])

-    env.neon_cli.create_branch("child", "root")
+    fork_at_current_lsn(env, pg_root, "child", "root")
+
    pg_child = env.postgres.create_start("child")

    run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()])
--- a/test_runner/performance/test_seqscans.py
+++ b/test_runner/performance/test_seqscans.py
@@ -6,6 +6,7 @@ import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.compare_fixtures import PgCompare
 from fixtures.log_helper import log
+from pytest_lazyfixture import lazy_fixture  # type: ignore


@pytest.mark.parametrize(
@@ -20,11 +21,19 @@ from fixtures.log_helper import log
        pytest.param(10000000, 1, 4),
    ],
 )
-def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int):
-    env = neon_with_baseline
-
+@pytest.mark.parametrize(
+    "env",
+    [
+        # Run on all envs
+        pytest.param(lazy_fixture("neon_compare"), id="neon"),
+        pytest.param(lazy_fixture("vanilla_compare"), id="vanilla"),
+        pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster),
+    ],
+)
+def test_seqscans(env: PgCompare, rows: int, iters: int, workers: int):
    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
+            cur.execute("drop table if exists t;")
            cur.execute("create table t (i integer);")
            cur.execute(f"insert into t values (generate_series(1,{rows}));")

--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -116,6 +116,13 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http_client = env.pageserver.http_client()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*invalid branch start lsn: less than latest GC cutoff.*",
+            ".*invalid branch start lsn: less than planned GC cutoff.*",
+        ]
+    )
+
    # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
    tenant, _ = env.neon_cli.create_tenant(
        conf={
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -13,6 +13,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
    env = neon_env_builder.init_start()

+    env.pageserver.allowed_errors.append(".*invalid branch start lsn.*")
+    env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
+
    # Branch at the point where only 100 rows were inserted
    env.neon_cli.create_branch("test_branch_behind")
    pgmain = env.postgres.create_start("test_branch_behind")
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -11,10 +11,17 @@ from fixtures.types import TenantId, TimelineId
 # Test restarting page server, while safekeeper and compute node keep
 # running.
 def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
-    # One safekeeper is enough for this test.
-    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*No timelines to attach received.*",
+            ".*Failed to process timeline dir contents.*",
+            ".*Failed to load delta layer.*",
+            ".*Timeline .* was not found.*",
+        ]
+    )
+
    tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = []

    for n in range(4):
@@ -72,23 +79,24 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
    # First timeline would not get loaded into pageserver due to corrupt metadata file
    with pytest.raises(Exception, match=f"Timeline {tenant1}/{timeline1} was not found") as err:
        pg1.start()
-    log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
+    log.info(
+        f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
+    )

    # Second timeline has no ancestors, only the metadata file and no layer files
    # We don't have the remote storage enabled, which means timeline is in an incorrect state,
    # it's not loaded at all
    with pytest.raises(Exception, match=f"Timeline {tenant2}/{timeline2} was not found") as err:
        pg2.start()
-    log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
+    log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")

-    # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline
-    for n in range(3, 4):
-        (bad_tenant, bad_timeline, pg) = tenant_timelines[n]
-        with pytest.raises(Exception, match="extracting base backup failed") as err:
-            pg.start()
-        log.info(
-            f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}"
-        )
+    # Third timeline will also fail during basebackup, because the layer file is corrupt.
+    # (We don't check layer file contents on startup, when loading the timeline)
+    with pytest.raises(Exception, match="Failed to load delta layer") as err:
+        pg3.start()
+    log.info(
+        f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
+    )


 def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
@@ -111,6 +119,13 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http = env.pageserver.http_client()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
+            ".*Timeline got dropped without initializing, cleaning its files.*",
+        ]
+    )
+
    tenant_id, _ = env.neon_cli.create_tenant()

    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -2,7 +2,7 @@ import os
 import shutil
 import subprocess
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional

 import pytest
 import toml  # TODO: replace with tomllib for Python >= 3.11
@@ -50,6 +50,12 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o

    env = neon_env_builder.init_start()
    pg = env.postgres.create_start("main")
+
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()])
    pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()])
    pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"])
@@ -154,6 +160,7 @@ def test_forward_compatibility(
        from_dir=compatibility_snapshot_dir,
        to_dir=test_output_dir / "compatibility_snapshot",
        port_distributor=port_distributor,
+        pg_distrib_dir=compatibility_postgres_distrib_dir,
    )

    breaking_changes_allowed = (
@@ -183,7 +190,12 @@ def test_forward_compatibility(
    ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"


-def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistributor):
+def prepare_snapshot(
+    from_dir: Path,
+    to_dir: Path,
+    port_distributor: PortDistributor,
+    pg_distrib_dir: Optional[Path] = None,
+):
    assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist"
    assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory"
    assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql"
@@ -208,7 +220,7 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
    # Update paths and ports in config files
    pageserver_toml = repo_dir / "pageserver.toml"
    pageserver_config = toml.load(pageserver_toml)
-    pageserver_config["remote_storage"]["local_path"] = repo_dir / "local_fs_remote_storage"
+    pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
    pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
        pageserver_config["listen_http_addr"]
    )
@@ -219,6 +231,9 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
        port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"]
    ]

+    if pg_distrib_dir:
+        pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
+
    with pageserver_toml.open("w") as f:
        toml.dump(pageserver_config, f)

@@ -238,7 +253,10 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
        sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
        sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])

-    with (snapshot_config_toml).open("w") as f:
+    if pg_distrib_dir:
+        snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
+
+    with snapshot_config_toml.open("w") as f:
        toml.dump(snapshot_config, f)

    # Ensure that snapshot doesn't contain references to the original path
--- a/test_runner/regress/test_compute_ctl.py
+++ b/test_runner/regress/test_compute_ctl.py
@@ -179,7 +179,16 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    # run compute_ctl and wait for 10s
    try:
        ctl.raw_cli(
-            ["--connstr", ps_connstr, "--pgdata", pgdata, "--spec", spec, "--pgbin", pg_bin_path],
+            [
+                "--connstr",
+                "postgres://invalid/",
+                "--pgdata",
+                pgdata,
+                "--spec",
+                spec,
+                "--pgbin",
+                pg_bin_path,
+            ],
            timeout=10,
        )
    except TimeoutExpired as exc:
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -1,3 +1,4 @@
+import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin


@@ -7,8 +8,14 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 # normally restarts after it. Also, there should be GC ERRORs in the log,
 # but the fixture checks the log for any unexpected ERRORs after every
 # test anyway, so it doesn't need any special attention here.
+@pytest.mark.timeout(600)
 def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()
+
+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+
    pageserver_http = env.pageserver.http_client()

    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
@@ -30,10 +37,9 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):

    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))

-    for i in range(5):
-        try:
-            pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr])
-        except Exception:
-            env.pageserver.stop()
-            env.pageserver.start()
-            pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
+    for _ in range(5):
+        with pytest.raises(Exception):
+            pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
+        env.pageserver.stop()
+        env.pageserver.start()
+        pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -76,6 +76,26 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
    env = neon_env_builder.init_start()
    env.pageserver.http_client().tenant_create(tenant)

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*error importing base backup .*",
+            ".*Timeline got dropped without initializing, cleaning its files.*",
+            ".*Removing intermediate uninit mark file.*",
+            ".*InternalServerError.*timeline not found.*",
+            ".*InternalServerError.*Tenant .* not found.*",
+            ".*InternalServerError.*Timeline .* not found.*",
+            ".*InternalServerError.*Cannot delete timeline which has child timelines.*",
+        ]
+    )
+
+    # FIXME: we should clean up pageserver to not print this
+    env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*")
+
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    def import_tar(base, wal):
        env.neon_cli.raw_cli(
            [
@@ -122,6 +142,11 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu
    neon_env_builder.enable_local_fs_remote_storage()
    env = neon_env_builder.init_start()

+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    timeline = env.neon_cli.create_branch("test_import_from_pageserver_small")
    pg = env.postgres.create_start("test_import_from_pageserver_small")

--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -67,6 +67,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+
    # Use a tiny checkpoint distance, to create a lot of layers quickly.
    # That allows us to stress the compaction and layer flushing logic more.
    tenant, _ = env.neon_cli.create_tenant(
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -8,11 +8,11 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres


-def test_proxy_select_1(static_proxy):
+def test_proxy_select_1(static_proxy: NeonProxy):
    static_proxy.safe_psql("select 1", options="project=generic-project-name")


-def test_password_hack(static_proxy):
+def test_password_hack(static_proxy: NeonProxy):
    user = "borat"
    password = "password"
    static_proxy.safe_psql(
@@ -24,7 +24,7 @@ def test_password_hack(static_proxy):
    static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)

    # Must also check that invalid magic won't be accepted.
-    with pytest.raises(psycopg2.errors.OperationalError):
+    with pytest.raises(psycopg2.OperationalError):
        magic = "broken"
        static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)

@@ -135,7 +135,7 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx


 # Pass extra options to the server.
-def test_proxy_options(static_proxy):
+def test_proxy_options(static_proxy: NeonProxy):
    with static_proxy.connect(options="project=irrelevant -cproxytest.option=value") as conn:
        with conn.cursor() as cur:
            cur.execute("SHOW proxytest.option")
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -143,6 +143,8 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
    env = neon_simple_env
    env.neon_cli.create_branch("test_read_validation_neg", "empty")

+    env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")
+
    pg = env.postgres.create_start("test_read_validation_neg")
    log.info("postgres is running on 'test_read_validation_neg' branch")

--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -17,6 +17,8 @@ def test_readonly_node(neon_simple_env: NeonEnv):
    pgmain = env.postgres.create_start("test_readonly_node")
    log.info("postgres is running on 'test_readonly_node' branch")

+    env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*")
+
    main_pg_conn = pgmain.connect()
    main_cur = main_pg_conn.cursor()

--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -17,6 +17,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):

    neon_env_builder.start()

+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+
    # Create a branch for us
    env.neon_cli.create_branch("test_pageserver_recovery", "main")

--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -56,6 +56,17 @@ def test_remote_storage_backup_and_restore(

    ##### First start, insert secret data and upload it to the remote storage
    env = neon_env_builder.init_start()
+
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+
+    env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*")
+    env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
+    env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*")
+
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,3 +1,4 @@
+import time
 from threading import Thread

 import pytest
@@ -11,15 +12,30 @@ def do_gc_target(
 ):
    """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
    try:
+        log.info("sending gc http request")
        pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
    except Exception as e:
        log.error("do_gc failed: %s", e)
+    finally:
+        log.info("gc http thread returning")


+@pytest.mark.skip(
+    reason="""
+Commit 'make test_tenant_detach_smoke fail reproducibly' adds failpoint to make this test fail reproducibly.
+Fix in https://github.com/neondatabase/neon/pull/2851 will come as part of
+https://github.com/neondatabase/neon/pull/2785 .
+"""
+)
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

+    env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found in the local state")
+    # FIXME: we have a race condition between GC and detach. GC might fail with this
+    # error. Similar to https://github.com/neondatabase/neon/issues/2671
+    env.pageserver.allowed_errors.append(".*InternalServerError\\(No such file or directory.*")
+
    # first check for non existing tenant
    tenant_id = TenantId.generate()
    with pytest.raises(
@@ -28,6 +44,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    ):
        pageserver_http.tenant_detach(tenant_id)

+    # the error will be printed to the log too
+    env.pageserver.allowed_errors.append(".*Tenant not found for id.*")
+
    # create new nenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

@@ -43,32 +62,34 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
        ]
    )

-    # gc should not try to even start
+    # gc should not try to even start on a timeline that doesn't exist
    with pytest.raises(
        expected_exception=PageserverApiException, match="gc target timeline does not exist"
    ):
        bogus_timeline_id = TimelineId.generate()
        pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)

-    # try to concurrently run gc and detach
+    # the error will be printed to the log too
+    env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
+
+    # Detach while running manual GC.
+    # It should wait for manual GC to finish (right now it doesn't, which is why this test fails sometimes)
+    pageserver_http.configure_failpoints(
+        ("gc_iteration_internal_after_getting_gc_timelines", "return(2000)")
+    )
    gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
    gc_thread.start()
+    time.sleep(1)
+    # By now the gc task has been spawned but is still sleeping for another second due to the failpoint.

-    last_error = None
-    for i in range(3):
-        try:
-            pageserver_http.tenant_detach(tenant_id)
-        except Exception as e:
-            last_error = e
-            log.error(f"try {i} error detaching tenant: {e}")
-            continue
-        else:
-            break
-    # else is called if the loop finished without reaching "break"
-    else:
-        pytest.fail(f"could not detach tenant: {last_error}")
+    log.info("detaching tenant")
+    pageserver_http.tenant_detach(tenant_id)
+    log.info("tenant detached without error")

+    log.info("wait for gc thread to return")
    gc_thread.join(timeout=10)
+    assert not gc_thread.is_alive()
+    log.info("gc thread returned")

    # check that nothing is left on disk for deleted tenant
    assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -259,6 +259,11 @@ def test_tenant_relocation(

    env = neon_env_builder.init_start()

+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    # create folder for remote storage mock
    remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage"

--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -166,6 +166,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder

    env = neon_env_builder.init_start()

+    # FIXME: we have a race condition between GC and delete timeline. GC might fail with this
+    # error. Similar to https://github.com/neondatabase/neon/issues/2671
+    env.pageserver.allowed_errors.append(".*InternalServerError\\(No such file or directory.*")
+
    tenant_id = env.initial_tenant
    main_branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0]

@@ -263,6 +267,8 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
        except PageserverApiException as e:
            # compaction is ok but just retry if this fails; related to #2442
            if "cannot lock compaction critical section" in str(e):
+                # also ignore it in the log
+                env.pageserver.allowed_errors.append(".*cannot lock compaction critical section.*")
                time.sleep(1)
                continue
            raise
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -25,6 +25,13 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
    )
    initial_tenant_dirs = [d for d in tenants_dir.iterdir()]

+    neon_simple_env.pageserver.allowed_errors.extend(
+        [
+            ".*Failed to create directory structure for tenant .*, cleaning tmp data.*",
+            ".*Failed to fsync removed temporary tenant directory .*",
+        ]
+    )
+
    pageserver_http = neon_simple_env.pageserver.http_client()
    pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return"))
    with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"):
@@ -206,6 +213,13 @@ def test_pageserver_with_empty_tenants(
    )

    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(
+        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+    env.pageserver.allowed_errors.append(".*Tenant .* has no timelines directory.*")
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+
    client = env.pageserver.http_client()

    tenant_without_timelines_dir = env.initial_tenant
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -66,6 +66,11 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem

    env = neon_env_builder.init_start()

+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    tenants_pgs: List[Tuple[TenantId, Postgres]] = []

    for _ in range(1, 5):
@@ -117,6 +122,13 @@ def test_tenants_attached_after_download(

    ##### First start, insert secret data and upload it to the remote storage
    env = neon_env_builder.init_start()
+
+    # FIXME: Are these expected?
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+    env.pageserver.allowed_errors.append(
+        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

@@ -209,6 +221,16 @@ def test_tenant_upgrades_index_json_from_v0(
    # launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
    # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
    env = neon_env_builder.init_start()
+
+    # FIXME: Are these expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+    env.pageserver.allowed_errors.append(
+        ".*Failed to get local tenant state: Tenant .* not found in the local state.*"
+    )
+
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

@@ -315,6 +337,20 @@ def test_tenant_redownloads_truncated_file_on_startup(
    )

    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(
+        ".*Redownloading locally existing .* due to size mismatch.*"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*Downloaded layer exists already but layer file metadata mismatches.*"
+    )
+
+    # FIXME: Are these expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -7,6 +7,11 @@ from fixtures.utils import wait_until
 def test_timeline_delete(neon_simple_env: NeonEnv):
    env = neon_simple_env

+    env.pageserver.allowed_errors.append(".*Timeline .* was not found.*")
+    env.pageserver.allowed_errors.append(".*timeline not found.*")
+    env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*")
+    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
+
    ps_http = env.pageserver.http_client()

    # first try to delete non existing timeline
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -263,6 +263,12 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    env.neon_cli.create_branch("test_broker", "main")
+
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    pg = env.postgres.create_start("test_broker")
    pg.safe_psql("CREATE TABLE t(key int primary key, value text)")

@@ -306,6 +312,11 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
    env = neon_env_builder.init_start()

+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    env.neon_cli.create_branch("test_safekeepers_wal_removal")
    pg = env.postgres.create_start("test_safekeepers_wal_removal")

@@ -538,6 +549,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
    )

    pg.stop_and_destroy()
+    ps_cli.timeline_delete(tenant_id, timeline_id)

    # Also delete and manually create timeline on safekeepers -- this tests
    # scenario of manual recovery on different set of safekeepers.
@@ -562,7 +574,6 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
        shutil.copy(f_partial_saved, f_partial_path)

    # recreate timeline on pageserver from scratch
-    ps_cli.timeline_delete(tenant_id, timeline_id)
    ps_cli.timeline_create(tenant_id, timeline_id)

    wait_lsn_timeout = 60 * 3
@@ -1081,6 +1092,14 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
    env = neon_env_builder.init_start()

+    # FIXME: are these expected?
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*Failed to process query for timeline .*: Timeline .* was not found in global map.*",
+            ".*end streaming to Some.*",
+        ]
+    )
+
    # Create two tenants: one will be deleted, other should be preserved.
    tenant_id = env.initial_tenant
    timeline_id_1 = env.neon_cli.create_branch("br1")  # Active, delete explicitly
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -22,6 +22,8 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese
 # as a zombie process.
 def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
+    # We intentionally test for a non-existent tenant.
+    env.pageserver.allowed_errors.append(".*Tenant not found.*")
    pageserver_http = env.pageserver.http_client()

    pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15