Merge pull request #4000 from neondatabase/releases/2023-04-11

Release 2023-04-11
Anastasia Lubennikova
2023-04-11 17:47:41 +03:00
committed by GitHub
85 changed files with 2521 additions and 1375 deletions

View File

@@ -15,10 +15,32 @@ outputs:
report-url:
description: 'Allure report URL'
value: ${{ steps.generate-report.outputs.report-url }}
report-json-url:
description: 'Allure report JSON URL'
value: ${{ steps.generate-report.outputs.report-json-url }}
runs:
using: "composite"
steps:
# We're using some of these env variables quite often, so let's set them once.
#
# It would be nice to have them set in a common runs.env[0] section, but it doesn't work[1]
#
# - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
# - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
#
- name: Set common environment variables
shell: bash -euxo pipefail {0}
run: |
echo "BUILD_TYPE=${BUILD_TYPE}" >> $GITHUB_ENV
echo "BUCKET=${BUCKET}" >> $GITHUB_ENV
echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
env:
BUILD_TYPE: ${{ inputs.build_type }}
BUCKET: neon-github-public-dev
TEST_OUTPUT: /tmp/test_output
- name: Validate input parameters
shell: bash -euxo pipefail {0}
run: |
@@ -76,16 +98,14 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.19.0
ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
ALLURE_VERSION: 2.21.0
ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c
- name: Upload Allure results
if: ${{ inputs.action == 'store' }}
env:
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
shell: bash -euxo pipefail {0}
run: |
@@ -104,7 +124,7 @@ runs:
EOF
cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
TEST_SELECTION=${{ inputs.test_selection }}
BUILD_TYPE=${{ inputs.build_type }}
BUILD_TYPE=${BUILD_TYPE}
EOF
ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
@@ -113,13 +133,12 @@ runs:
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
# Potentially we could have several running builds for the same key (for example, for the main branch), so we use an improvised lock for this
- name: Acquire Allure lock
if: ${{ inputs.action == 'generate' }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
LOCK_TIMEOUT=300 # seconds
@@ -149,8 +168,6 @@ runs:
env:
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
shell: bash -euxo pipefail {0}
run: |
# Get previously uploaded data for this run
@@ -186,24 +203,24 @@ runs:
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
# Generate redirect
cat <<EOF > ./index.html
cat <<EOF > ${TEST_OUTPUT}/allure/index.html
<!DOCTYPE html>
<meta charset="utf-8">
<title>Redirecting to ${REPORT_URL}</title>
<meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
EOF
aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
- name: Release Allure lock
if: ${{ inputs.action == 'generate' && always() }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
@@ -212,11 +229,16 @@ runs:
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi
- name: Cleanup
if: always()
shell: bash -euxo pipefail {0}
run: |
rm -rf ${TEST_OUTPUT}/allure
- uses: actions/github-script@v6
if: ${{ inputs.action == 'generate' && always() }}
env:
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
BUILD_TYPE: ${{ inputs.build_type }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |

View File

@@ -14,6 +14,12 @@ inputs:
api_host:
description: 'Neon API host'
default: console.stage.neon.tech
provisioner:
description: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling; for k8s-pod, Min and Max should be equal'
default: '[1, 1]'
outputs:
dsn:
@@ -31,6 +37,10 @@ runs:
# A shell without `set -x`, so as not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
fi
project=$(curl \
"https://${API_HOST}/api/v2/projects" \
--fail \
@@ -42,6 +52,9 @@ runs:
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"pg_version\": ${POSTGRES_VERSION},
\"region_id\": \"${REGION_ID}\",
\"provisioner\": \"${PROVISIONER}\",
\"autoscaling_limit_min_cu\": ${MIN_CU},
\"autoscaling_limit_max_cu\": ${MAX_CU},
\"settings\": { }
}
}")
@@ -62,3 +75,6 @@ runs:
API_KEY: ${{ inputs.api_key }}
REGION_ID: ${{ inputs.region_id }}
POSTGRES_VERSION: ${{ inputs.postgres_version }}
PROVISIONER: ${{ inputs.provisioner }}
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}

View File

@@ -44,6 +44,10 @@ inputs:
description: 'Secret access key'
required: false
default: ''
rerun_flaky:
description: 'Whether to rerun flaky tests'
required: false
default: 'false'
runs:
using: "composite"
@@ -101,6 +105,7 @@ runs:
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FLAKY: ${{ inputs.rerun_flaky }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -143,6 +148,13 @@ runs:
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
if [ "${RERUN_FLAKY}" == "true" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
fi
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then

View File

@@ -3,6 +3,8 @@
# fetch params from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
INSTANCE_TYPE=$(curl -s http://169.254.169.254/latest/meta-data/instance-type)
DISK_SIZE=$(df -B1 /storage | tail -1 | awk '{print $2}')
# store fqdn hostname in var
HOST=$(hostname -f)
@@ -18,7 +20,9 @@ cat <<EOF | tee /tmp/payload
"http_host": "${HOST}",
"http_port": 9898,
"active": false,
"availability_zone_id": "${AZ_ID}"
"availability_zone_id": "${AZ_ID}",
"disk_size": ${DISK_SIZE},
"instance_type": "${INSTANCE_TYPE}"
}
EOF

View File

@@ -23,6 +23,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"]
sentryEnvironment: "staging"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"

View File

@@ -24,6 +24,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
extraDomains: ["*.ap-southeast-1.retooldb.com", "*.ap-southeast-1.postgres.vercel-storage.com"]
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"

View File

@@ -24,6 +24,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
extraDomains: ["*.eu-central-1.retooldb.com", "*.eu-central-1.postgres.vercel-storage.com"]
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"

View File

@@ -24,6 +24,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
extraDomains: ["*.us-east-2.retooldb.com", "*.us-east-2.postgres.vercel-storage.com"]
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"

View File

@@ -24,6 +24,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
domain: "*.us-west-2.aws.neon.tech"
extraDomains: ["*.us-west-2.retooldb.com", "*.us-west-2.postgres.vercel-storage.com"]
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"

View File

@@ -107,25 +107,65 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
generate-matrices:
# Create matrices for the benchmarking jobs, so that we run benchmarks on RDS only once a week (on Saturday)
#
# Available platforms:
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
runs-on: ubuntu-latest
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
steps:
- name: Generate matrix for pgbench benchmark
id: pgbench-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-new",
"neon-captest-reuse"
],
"db_size": [ "10gb" ],
"include": [
{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" }
]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
- name: Generate matrix for OLAP benchmarks
id: olap-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-reuse"
]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
fi
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
pgbench-compare:
needs: [ generate-matrices ]
strategy:
fail-fast: false
matrix:
# neon-captest-new: Run pgbench in a freshly created project
# neon-captest-reuse: Same, but reusing existing project
# neon-captest-prefetch: Same, with prefetching enabled (new project)
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
db_size: [ 10gb ]
runner: [ us-east-2 ]
include:
- platform: neon-captest-prefetch
db_size: 50gb
runner: us-east-2
- platform: rds-aurora
db_size: 50gb
runner: us-east-2
matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}}
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
@@ -137,7 +177,7 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, "${{ matrix.runner }}", x64 ]
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -160,13 +200,14 @@ jobs:
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
- name: Set up Connection String
id: set-up-connstr
@@ -175,7 +216,7 @@ jobs:
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neon-captest-new | neon-captest-prefetch)
neon-captest-new | neon-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -185,7 +226,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -194,17 +235,6 @@ jobs:
psql ${CONNSTR} -c "SELECT version();"
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Benchmark init
uses: ./.github/actions/run-python-test-set
with:
@@ -276,15 +306,11 @@ jobs:
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: success() || failure()
needs: [ pgbench-compare ]
needs: [ generate-matrices, pgbench-compare ]
strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -320,7 +346,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
;;
rds-aurora)
@@ -330,7 +356,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -339,17 +365,6 @@ jobs:
psql ${CONNSTR} -c "SELECT version();"
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
with:
@@ -387,15 +402,11 @@ jobs:
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: success() || failure()
needs: [ clickbench-compare ]
needs: [ generate-matrices, clickbench-compare ]
strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -431,7 +442,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
;;
rds-aurora)
@@ -441,7 +452,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -450,17 +461,6 @@ jobs:
psql ${CONNSTR} -c "SELECT version();"
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
with:
@@ -492,15 +492,11 @@ jobs:
user-examples-compare:
if: success() || failure()
needs: [ tpch-compare ]
needs: [ generate-matrices, tpch-compare ]
strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -536,7 +532,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
;;
rds-aurora)
@@ -546,7 +542,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -555,17 +551,6 @@ jobs:
psql ${CONNSTR} -c "SELECT version();"
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Run user examples
uses: ./.github/actions/run-python-test-set
with:

View File

@@ -335,6 +335,10 @@ jobs:
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
rerun_flaky: true
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
@@ -371,42 +375,90 @@ jobs:
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
merge-allure-report:
create-test-report:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: ${{ !cancelled() }}
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: Create Allure report
id: create-allure-report
steps:
- uses: actions/checkout@v3
- name: Create Allure report (debug)
if: ${{ !cancelled() }}
id: create-allure-report-debug
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ matrix.build_type }}
build_type: debug
- name: Create Allure report (release)
if: ${{ !cancelled() }}
id: create-allure-report-release
uses: ./.github/actions/allure-report
with:
action: generate
build_type: release
- uses: actions/github-script@v6
if: >
!cancelled() &&
github.event_name == 'pull_request' && (
steps.create-allure-report-debug.outputs.report-url ||
steps.create-allure-report-release.outputs.report-url
)
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const reports = [{
buildType: "debug",
reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
jsonUrl: "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
}, {
buildType: "release",
reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
jsonUrl: "${{ steps.create-allure-report-release.outputs.report-json-url }}",
}]
const script = require("./scripts/pr-comment-test-report.js")
await script({
github,
context,
fetch,
reports,
})
- name: Store Allure test stat in the DB
if: ${{ steps.create-allure-report.outputs.report-url }}
if: >
!cancelled() && (
steps.create-allure-report-debug.outputs.report-url ||
steps.create-allure-report-release.outputs.report-url
)
env:
BUILD_TYPE: ${{ matrix.build_type }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
run: |
curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
./scripts/pysync
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
if [ -z "$report_url" ]; then
continue
fi
if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
BUILD_TYPE=debug
else
BUILD_TYPE=release
fi
curl --fail --output suites.json "${report_url}"
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
done
coverage-report:
runs-on: [ self-hosted, gen3, small ]

Cargo.lock (generated)
View File

@@ -841,6 +841,18 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "compute_api"
version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"serde",
"serde_json",
"serde_with",
"workspace_hack",
]
[[package]]
name = "compute_tools"
version = "0.1.0"
@@ -848,6 +860,7 @@ dependencies = [
"anyhow",
"chrono",
"clap 4.1.4",
"compute_api",
"futures",
"hyper",
"notify",
@@ -3354,6 +3367,7 @@ dependencies = [
"tempfile",
"thiserror",
"tokio",
"tokio-io-timeout",
"tokio-postgres",
"toml_edit",
"tracing",

View File

@@ -101,6 +101,7 @@ test-context = "0.1"
thiserror = "1.0"
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.9.0"
tokio-rustls = "0.23"
tokio-stream = "0.1"
@@ -132,6 +133,7 @@ tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df6
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
## Local libraries
compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }

View File

@@ -38,6 +38,7 @@ RUN cd postgres && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
@@ -300,6 +301,27 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
#########################################################################################
#
# Layer "timescaledb-pg-build"
# compile timescaledb extension
#
#########################################################################################
FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN apt-get update && \
apt-get install -y cmake && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
cd build && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control
#########################################################################################
#
# Layer "rust extensions"
@@ -404,6 +426,7 @@ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \

View File

@@ -27,4 +27,5 @@ tracing-subscriber.workspace = true
tracing-utils.workspace = true
url.workspace = true
compute_api.workspace = true
workspace_hack.workspace = true

View File

@@ -34,22 +34,24 @@ use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{Arc, RwLock};
use std::sync::{Arc, Condvar, Mutex};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use tracing::{error, info};
use url::Url;
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState};
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::pg_helpers::*;
use compute_tools::spec::*;
use url::Url;
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -62,7 +64,7 @@ fn main() -> Result<()> {
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec = matches.get_one::<String>("spec");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let compute_id = matches.get_one::<String>("compute-id");
@@ -71,40 +73,106 @@ fn main() -> Result<()> {
// Try to use just 'postgres' if no path is provided
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let spec: ComputeSpec = match spec {
let mut spec = Default::default();
let mut spec_set = false;
let mut live_config_allowed = false;
match spec_json {
// First, try to get cluster spec from the cli argument
Some(json) => serde_json::from_str(json)?,
Some(json) => {
spec = serde_json::from_str(json)?;
spec_set = true;
}
None => {
// Second, try to read it from the file if path is provided
if let Some(sp) = spec_path {
let path = Path::new(sp);
let file = File::open(path)?;
serde_json::from_reader(file)?
spec = serde_json::from_reader(file)?;
spec_set = true;
} else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri {
let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec");
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
Ok(v) => v,
Err(_) => "".to_string(),
};
reqwest::blocking::Client::new()
.get(cp_uri)
.header("Authorization", jwt)
.send()?
.json()?
live_config_allowed = true;
if let Ok(s) = get_spec_from_control_plane(cp_base, id) {
spec = s;
spec_set = true;
}
} else {
panic!(
"must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"",
control_plane_uri, compute_id
);
panic!("must specify both --control-plane-uri and --compute-id or none");
}
} else {
panic!("compute spec should be provided via --spec or --spec-path argument");
panic!(
"compute spec should be provided by one of the following ways: \
--spec OR --spec-path OR --control-plane-uri and --compute-id"
);
}
}
};
let mut new_state = ComputeState::new();
if spec_set {
new_state.spec = spec;
}
let compute_node = ComputeNode {
start_time: Utc::now(),
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
};
let compute = Arc::new(compute_node);
// Launch the HTTP service first, so that we are able to serve control-plane
// requests while configuration is still in progress.
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
if state.status == ComputeStatus::ConfigurationPending {
info!("got spec, continue configuration");
// Spec is already set by the http server handler.
break;
}
}
}
// We got all we need, fill in the state.
let mut state = compute.state.lock().unwrap();
let pageserver_connstr = state
.spec
.cluster
.settings
.find("neon.pageserver_connstring")
.expect("pageserver connstr should be provided");
let storage_auth_token = state.spec.storage_auth_token.clone();
let tenant = state
.spec
.cluster
.settings
.find("neon.tenant_id")
.expect("tenant id should be provided");
let timeline = state
.spec
.cluster
.settings
.find("neon.timeline_id")
.expect("tenant id should be provided");
let startup_tracing_context = state.spec.startup_tracing_context.clone();
state.pageserver_connstr = pageserver_connstr;
state.storage_auth_token = storage_auth_token;
state.tenant = tenant;
state.timeline = timeline;
state.status = ComputeStatus::Init;
compute.state_changed.notify_all();
drop(state);
// Extract OpenTelemetry context for the startup actions from the spec, and
// attach it to the current tracing context.
//
@@ -120,7 +188,7 @@ fn main() -> Result<()> {
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context {
let startup_context_guard = if let Some(ref carrier) = startup_tracing_context {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
Some(TraceContextPropagator::new().extract(carrier).attach())
@@ -128,41 +196,7 @@ fn main() -> Result<()> {
None
};
let pageserver_connstr = spec
.cluster
.settings
.find("neon.pageserver_connstring")
.expect("pageserver connstr should be provided");
let storage_auth_token = spec.storage_auth_token.clone();
let tenant = spec
.cluster
.settings
.find("neon.tenant_id")
.expect("tenant id should be provided");
let timeline = spec
.cluster
.settings
.find("neon.timeline_id")
.expect("tenant id should be provided");
let compute_state = ComputeNode {
start_time: Utc::now(),
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
spec,
tenant,
timeline,
pageserver_connstr,
storage_auth_token,
metrics: ComputeMetrics::default(),
state: RwLock::new(ComputeState::new()),
};
let compute = Arc::new(compute_state);
// Launch service threads first, so we were able to serve availability
// requests, while configuration is still in progress.
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
// Start Postgres
@@ -172,7 +206,7 @@ fn main() -> Result<()> {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:?}", err);
let mut state = compute.state.write().unwrap();
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed;
drop(state);
@@ -262,7 +296,7 @@ fn cli() -> clap::Command {
Arg::new("control-plane-uri")
.short('p')
.long("control-plane-uri")
.value_name("CONTROL_PLANE"),
.value_name("CONTROL_PLANE_API_BASE_URI"),
)
}
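
The new startup flow in compute_ctl above hinges on a `Mutex<ComputeState>` paired with a `Condvar`: when no spec is provided, the binary launches the HTTP server and then blocks on the condition variable until the `/configure` handler stores a spec and flips the status to `ConfigurationPending`. Below is a minimal, self-contained sketch of that wait/notify pattern; the names (`Node`, `Status`) are simplified stand-ins, not the actual compute_tools types.

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::thread;
use std::time::Duration;

#[derive(Clone, Copy, PartialEq, Debug)]
enum Status {
    Empty,
    ConfigurationPending,
}

struct Node {
    state: Mutex<Status>,
    state_changed: Condvar,
}

fn main() {
    let node = Arc::new(Node {
        state: Mutex::new(Status::Empty),
        state_changed: Condvar::new(),
    });

    // Stand-in for the HTTP handler: it receives a spec, updates the
    // status under the lock, and notifies all waiters.
    let n = Arc::clone(&node);
    thread::spawn(move || {
        thread::sleep(Duration::from_millis(100));
        *n.state.lock().unwrap() = Status::ConfigurationPending;
        n.state_changed.notify_all();
    });

    // Stand-in for the main thread: block until a spec has arrived.
    let mut status = node.state.lock().unwrap();
    while *status != Status::ConfigurationPending {
        status = node.state_changed.wait(status).unwrap();
    }
    println!("got spec, continue configuration ({:?})", *status);
}
```

Note that the real loop above re-checks the status after each wakeup, which is the right shape for a `Condvar` since wakeups can be spurious.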

View File

@@ -19,16 +19,17 @@ use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::RwLock;
use std::sync::{Condvar, Mutex};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
use serde::{Serialize, Serializer};
use tokio_postgres;
use tracing::{info, instrument, warn};
use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::ComputeSpec;
use crate::checker::create_writability_check_data;
use crate::config;
use crate::pg_helpers::*;
@@ -41,41 +42,54 @@ pub struct ComputeNode {
pub connstr: url::Url,
pub pgdata: String,
pub pgbin: String,
/// We should only allow live re-/configuration of the compute node if
/// it uses the 'pull model', i.e. it can go to the control plane and fetch
/// the latest configuration. Otherwise, there could be a case:
/// - we start compute with some spec provided as argument
/// - we push new spec and it does reconfiguration
/// - but then something happens and compute pod / VM is destroyed,
/// so k8s controller starts it again with the **old** spec
/// and the same for empty computes:
/// - we started compute without any spec
/// - we push spec and it does configuration
/// - but then it is restarted without any spec again
pub live_config_allowed: bool,
/// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
/// To allow the HTTP API server to serve status requests while configuration
/// is in progress, the lock should be held only for short periods of time to do
/// a read/write, not for the whole configuration process.
pub state: Mutex<ComputeState>,
/// `Condvar` to allow notifying waiters about state changes.
pub state_changed: Condvar,
}
#[derive(Clone, Debug)]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
pub last_active: DateTime<Utc>,
pub error: Option<String>,
pub spec: ComputeSpec,
pub tenant: String,
pub timeline: String,
pub pageserver_connstr: String,
pub storage_auth_token: Option<String>,
pub metrics: ComputeMetrics,
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
/// to allow HTTP API server to serve status requests, while configuration
/// is in progress.
pub state: RwLock<ComputeState>,
}
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
x.to_rfc3339().serialize(s)
}
#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: DateTime<Utc>,
pub error: Option<String>,
}
impl ComputeState {
pub fn new() -> Self {
Self {
status: ComputeStatus::Init,
status: ComputeStatus::Empty,
last_active: Utc::now(),
error: None,
spec: ComputeSpec::default(),
tenant: String::new(),
timeline: String::new(),
pageserver_connstr: String::new(),
storage_auth_token: None,
metrics: ComputeMetrics::default(),
}
}
}
@@ -86,29 +100,15 @@ impl Default for ComputeState {
}
}
#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
Init,
Running,
Failed,
}
#[derive(Default, Serialize)]
pub struct ComputeMetrics {
pub sync_safekeepers_ms: AtomicU64,
pub basebackup_ms: AtomicU64,
pub config_ms: AtomicU64,
pub total_startup_ms: AtomicU64,
}
impl ComputeNode {
pub fn set_status(&self, status: ComputeStatus) {
self.state.write().unwrap().status = status;
let mut state = self.state.lock().unwrap();
state.status = status;
self.state_changed.notify_all();
}
pub fn get_status(&self) -> ComputeStatus {
self.state.read().unwrap().status
self.state.lock().unwrap().status
}
// Remove `pgdata` directory and create it again with right permissions.
@@ -124,15 +124,15 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip(self))]
fn get_basebackup(&self, lsn: &str) -> Result<()> {
#[instrument(skip(self, compute_state))]
fn get_basebackup(&self, compute_state: &ComputeState, lsn: &str) -> Result<()> {
let start_time = Utc::now();
let mut config = postgres::Config::from_str(&self.pageserver_connstr)?;
let mut config = postgres::Config::from_str(&compute_state.pageserver_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
if let Some(storage_auth_token) = &self.storage_auth_token {
if let Some(storage_auth_token) = &compute_state.storage_auth_token {
info!("Got storage auth token from spec file");
config.password(storage_auth_token);
} else {
@@ -141,8 +141,14 @@ impl ComputeNode {
let mut client = config.connect(NoTls)?;
let basebackup_cmd = match lsn {
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
"0/0" => format!(
"basebackup {} {}",
&compute_state.tenant, &compute_state.timeline
), // First start of the compute
_ => format!(
"basebackup {} {} {}",
&compute_state.tenant, &compute_state.timeline, lsn
),
};
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
@@ -155,28 +161,24 @@ impl ComputeNode {
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata)?;
self.metrics.basebackup_ms.store(
Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64,
Ordering::Relaxed,
);
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
Ok(())
}
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip(self))]
fn sync_safekeepers(&self) -> Result<String> {
#[instrument(skip(self, storage_auth_token))]
fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<String> {
let start_time = Utc::now();
let sync_handle = Command::new(&self.pgbin)
.args(["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
.envs(if let Some(storage_auth_token) = &storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
} else {
vec![]
@@ -201,14 +203,11 @@ impl ComputeNode {
);
}
self.metrics.sync_safekeepers_ms.store(
Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64,
Ordering::Relaxed,
);
self.state.lock().unwrap().metrics.sync_safekeepers_ms = Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
@@ -217,9 +216,9 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip(self))]
pub fn prepare_pgdata(&self) -> Result<()> {
let spec = &self.spec;
#[instrument(skip(self, compute_state))]
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
let spec = &compute_state.spec;
let pgdata_path = Path::new(&self.pgdata);
// Remove/create an empty pgdata directory and put configuration there.
@@ -228,18 +227,18 @@ impl ComputeNode {
info!("starting safekeepers syncing");
let lsn = self
.sync_safekeepers()
.sync_safekeepers(compute_state.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?;
info!("safekeepers synced at LSN {}", lsn);
info!(
"getting basebackup@{} from pageserver {}",
lsn, &self.pageserver_connstr
lsn, &compute_state.pageserver_connstr
);
self.get_basebackup(&lsn).with_context(|| {
self.get_basebackup(compute_state, &lsn).with_context(|| {
format!(
"failed to get basebackup@{} from pageserver {}",
lsn, &self.pageserver_connstr
lsn, &compute_state.pageserver_connstr
)
})?;
@@ -252,13 +251,16 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip(self))]
pub fn start_postgres(&self) -> Result<std::process::Child> {
pub fn start_postgres(
&self,
storage_auth_token: Option<String>,
) -> Result<std::process::Child> {
let pgdata_path = Path::new(&self.pgdata);
// Run postgres as a child process.
let mut pg = Command::new(&self.pgbin)
.args(["-D", &self.pgdata])
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
.envs(if let Some(storage_auth_token) = &storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
} else {
vec![]
@@ -271,8 +273,9 @@ impl ComputeNode {
Ok(pg)
}
#[instrument(skip(self))]
pub fn apply_config(&self) -> Result<()> {
/// Do initial configuration of the already started Postgres.
#[instrument(skip(self, compute_state))]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
// If connection fails,
// it may be the old node with `zenith_admin` superuser.
//
@@ -303,19 +306,19 @@ impl ComputeNode {
};
// Proceed with post-startup configuration. Note, that order of operations is important.
handle_roles(&self.spec, &mut client)?;
handle_databases(&self.spec, &mut client)?;
handle_role_deletions(self, &mut client)?;
handle_grants(self, &mut client)?;
handle_roles(&compute_state.spec, &mut client)?;
handle_databases(&compute_state.spec, &mut client)?;
handle_role_deletions(&compute_state.spec, self.connstr.as_str(), &mut client)?;
handle_grants(&compute_state.spec, self.connstr.as_str(), &mut client)?;
create_writability_check_data(&mut client)?;
handle_extensions(&self.spec, &mut client)?;
handle_extensions(&compute_state.spec, &mut client)?;
// 'Close' connection
drop(client);
info!(
"finished configuration of compute for project {}",
self.spec.cluster.cluster_id
compute_state.spec.cluster.cluster_id
);
Ok(())
@@ -323,40 +326,37 @@ impl ComputeNode {
#[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
self.spec.cluster.cluster_id,
self.spec.operation_uuid.as_ref().unwrap(),
self.tenant,
self.timeline,
compute_state.spec.cluster.cluster_id,
compute_state.spec.operation_uuid.as_ref().unwrap(),
compute_state.tenant,
compute_state.timeline,
);
self.prepare_pgdata()?;
self.prepare_pgdata(&compute_state)?;
let start_time = Utc::now();
let pg = self.start_postgres()?;
let pg = self.start_postgres(compute_state.storage_auth_token.clone())?;
self.apply_config()?;
self.apply_config(&compute_state)?;
let startup_end_time = Utc::now();
self.metrics.config_ms.store(
startup_end_time
{
let mut state = self.state.lock().unwrap();
state.metrics.config_ms = startup_end_time
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64,
Ordering::Relaxed,
);
self.metrics.total_startup_ms.store(
startup_end_time
.as_millis() as u64;
state.metrics.total_startup_ms = startup_end_time
.signed_duration_since(self.start_time)
.to_std()
.unwrap()
.as_millis() as u64,
Ordering::Relaxed,
);
.as_millis() as u64;
}
self.set_status(ComputeStatus::Running);
Ok(pg)
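
A related change in the hunks above: the startup metrics moved from `AtomicU64` fields on `ComputeNode` into `ComputeState::metrics`, so a timing is now recorded by briefly taking the state mutex instead of issuing an atomic store. Here is a minimal sketch of that pattern with simplified, hypothetical types (the real code computes the durations from `chrono` timestamps):

```rust
use std::sync::Mutex;
use std::time::Instant;

#[derive(Default, Debug)]
struct Metrics {
    basebackup_ms: u64,
}

#[derive(Default, Debug)]
struct State {
    metrics: Metrics,
}

struct Node {
    state: Mutex<State>,
}

impl Node {
    // Hold the lock only long enough to write the measured value, mirroring
    // how the code above assigns to `state.metrics.*_ms` under the mutex.
    fn record_basebackup(&self, started: Instant) {
        let elapsed_ms = started.elapsed().as_millis() as u64;
        self.state.lock().unwrap().metrics.basebackup_ms = elapsed_ms;
    }
}

fn main() {
    let node = Node { state: Mutex::new(State::default()) };
    let started = Instant::now();
    // ... the work being timed would happen here ...
    node.record_basebackup(started);
    println!("{:?}", node.state.lock().unwrap().metrics);
}
```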

View File

@@ -6,7 +6,7 @@ use std::path::Path;
use anyhow::Result;
use crate::pg_helpers::PgOptionsSerialize;
use crate::spec::ComputeSpec;
use compute_api::spec::ComputeSpec;
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.

View File

@@ -3,15 +3,29 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::compute::ComputeNode;
use crate::compute::{ComputeNode, ComputeState};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info};
use tracing_utils::http::OtelName;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
tenant: state.tenant.clone(),
timeline: state.timeline.clone(),
status: state.status,
last_active: state.last_active,
error: state.error.clone(),
}
}
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
//
@@ -23,26 +37,45 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
// Serialized compute state.
(&Method::GET, "/status") => {
info!("serving /status GET request");
let state = compute.state.read().unwrap();
Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
let state = compute.state.lock().unwrap();
let status_response = status_response_from_state(&state);
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
}
// Startup metrics in JSON format. Keep /metrics reserved for a possible
// future use for Prometheus metrics format.
(&Method::GET, "/metrics.json") => {
info!("serving /metrics.json GET request");
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
let metrics = compute.state.lock().unwrap().metrics.clone();
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
}
// Collect Postgres current usage insights
(&Method::GET, "/insights") => {
info!("serving /insights GET request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!("compute is not running, current status: {:?}", status);
error!(msg);
return Response::new(Body::from(msg));
}
let insights = compute.collect_insights().await;
Response::new(Body::from(insights))
}
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for check_writability request: {:?}",
status
);
error!(msg);
return Response::new(Body::from(msg));
}
let res = crate::checker::check_writability(compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
@@ -61,6 +94,23 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
))
}
// Accept spec in JSON format and request compute configuration. If
// anything goes wrong after we set the compute status to `ConfigurationPending`
// and update the compute state with the new spec, we basically leave the compute
// in a potentially wrong state. That said, it's the control plane's
// responsibility to watch the compute state after a reconfiguration request
// and to do a clean restart in case of errors.
(&Method::POST, "/configure") => {
info!("serving /configure POST request");
match handle_configure_request(req, compute).await {
Ok(msg) => Response::new(Body::from(msg)),
Err((msg, code)) => {
error!("error handling /configure request: {msg}");
render_json_error(&msg, code)
}
}
}
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));
@@ -70,6 +120,88 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
async fn handle_configure_request(
req: Request<Body>,
compute: &Arc<ComputeNode>,
) -> Result<String, (String, StatusCode)> {
if !compute.live_config_allowed {
return Err((
"live configuration is not allowed for this compute node".to_string(),
StatusCode::PRECONDITION_FAILED,
));
}
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
let spec = request.spec;
// XXX: wrap the state update under the lock in a code block. Otherwise,
// we will try to `Send` `mut state` into the spawned thread
// below, which will cause an error:
// ```
// error: future cannot be sent between threads safely
// ```
{
let mut state = compute.state.lock().unwrap();
if state.status != ComputeStatus::Empty {
let msg = format!(
"invalid compute status for configuration request: {:?}",
state.status.clone()
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.spec = spec;
state.status = ComputeStatus::ConfigurationPending;
compute.state_changed.notify_all();
drop(state);
info!("set new spec and notified waiters");
}
// Spawn a blocking thread to wait for compute to become Running.
// This is needed so that we do not block the main pool of workers and
// can serve other requests while some particular request
// is waiting for the compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become Running, current status: {:?}",
state.status
);
if state.status == ComputeStatus::Failed {
let err = state.error.as_ref().map_or("unknown error", |x| x);
let msg = format!("compute configuration failed: {:?}", err);
return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
}
}
Ok(())
})
.await
.unwrap()?;
// Return current compute state if everything went well.
let state = compute.state.lock().unwrap().clone();
let status_response = status_response_from_state(&state);
Ok(serde_json::to_string(&status_response).unwrap())
} else {
Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
}
}
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
let error = GenericAPIError {
error: e.to_string(),
};
Response::builder()
.status(status)
.body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(state: Arc<ComputeNode>) {
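
The `/configure` handler above combines the async hyper server with the std `Mutex`/`Condvar` state by pushing the blocking `Condvar` wait onto `task::spawn_blocking`, so a tokio worker thread is never parked. A rough sketch of that technique with simplified names (not the actual handler; assumes tokio with the `macros` and `rt-multi-thread` features):

```rust
use std::sync::{Arc, Condvar, Mutex};
use tokio::task;

#[derive(Clone, Copy, PartialEq, Debug)]
enum Status {
    ConfigurationPending,
    Running,
    Failed,
}

struct Node {
    state: Mutex<Status>,
    state_changed: Condvar,
}

async fn wait_until_running(node: Arc<Node>) -> Result<(), String> {
    // A std Condvar wait blocks the current thread, so it must not run on a
    // tokio worker; hand it off to the dedicated blocking thread pool.
    task::spawn_blocking(move || {
        let mut status = node.state.lock().unwrap();
        while *status != Status::Running {
            status = node.state_changed.wait(status).unwrap();
            if *status == Status::Failed {
                return Err("compute configuration failed".to_string());
            }
        }
        Ok(())
    })
    .await
    .expect("blocking task panicked")
}

#[tokio::main]
async fn main() {
    let node = Arc::new(Node {
        state: Mutex::new(Status::ConfigurationPending),
        state_changed: Condvar::new(),
    });

    // Simulate configuration finishing on another thread.
    let n = Arc::clone(&node);
    std::thread::spawn(move || {
        *n.state.lock().unwrap() = Status::Running;
        n.state_changed.notify_all();
    });

    println!("result: {:?}", wait_until_running(node).await);
}
```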

View File

@@ -11,7 +11,7 @@ paths:
get:
tags:
- Info
summary: Get compute node internal status
summary: Get compute node internal status.
description: ""
operationId: getComputeStatus
responses:
@@ -26,7 +26,7 @@ paths:
get:
tags:
- Info
summary: Get compute node startup metrics in JSON format
summary: Get compute node startup metrics in JSON format.
description: ""
operationId: getComputeMetricsJSON
responses:
@@ -41,9 +41,9 @@ paths:
get:
tags:
- Info
summary: Get current compute insights in JSON format
summary: Get current compute insights in JSON format.
description: |
Note, that this doesn't include any historical data
Note that this doesn't include any historical data.
operationId: getComputeInsights
responses:
200:
@@ -56,12 +56,12 @@ paths:
/info:
get:
tags:
- "info"
summary: Get info about the compute Pod/VM
- Info
summary: Get info about the compute pod / VM.
description: ""
operationId: getInfo
responses:
"200":
200:
description: Info
content:
application/json:
@@ -72,7 +72,7 @@ paths:
post:
tags:
- Check
summary: Check that we can write new data on this compute
summary: Check that we can write new data on this compute.
description: ""
operationId: checkComputeWritability
responses:
@@ -82,9 +82,64 @@ paths:
text/plain:
schema:
type: string
description: Error text or 'true' if check passed
description: Error text or 'true' if check passed.
example: "true"
/configure:
post:
tags:
- Configure
summary: Perform compute node configuration.
description: |
This is a blocking API endpoint, i.e. it blocks waiting until
the compute has finished configuration and is in the `Running` state.
An optional non-blocking mode could be added later.
operationId: configureCompute
requestBody:
description: Configuration request.
required: true
content:
application/json:
schema:
type: object
required:
- spec
properties:
spec:
# XXX: I don't want to describe the current spec in the OpenAPI format,
# as it could change really soon. Consider doing it later.
type: object
responses:
200:
description: Compute configuration finished.
content:
application/json:
schema:
$ref: "#/components/schemas/ComputeState"
400:
description: Provided spec is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
412:
description: |
It's not possible to do live configuration of the compute.
It's either in the wrong state, or the compute doesn't use the
pull mode of configuration.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: |
The compute configuration request was processed, but an error
occurred. The compute will likely shut down soon.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:
JWT:
@@ -95,7 +150,7 @@ components:
schemas:
ComputeMetrics:
type: object
description: Compute startup metrics
description: Compute startup metrics.
required:
- sync_safekeepers_ms
- basebackup_ms
@@ -113,7 +168,7 @@ components:
Info:
type: object
description: Information about VM/Pod
description: Information about VM/Pod.
required:
- num_cpus
properties:
@@ -130,17 +185,26 @@ components:
$ref: '#/components/schemas/ComputeStatus'
last_active:
type: string
description: The last detected compute activity timestamp in UTC and RFC3339 format
description: The last detected compute activity timestamp in UTC and RFC3339 format.
example: "2022-10-12T07:20:50.52Z"
error:
type: string
description: Text of the error during compute startup, if any
description: Text of the error during compute startup, if any.
example: ""
tenant:
type: string
description: Identifier of the current tenant served by compute node, if any.
example: c9269c359e9a199fad1ea0981246a78f
timeline:
type: string
description: Identifier of the current timeline served by compute node, if any.
example: ece7de74d4b8cbe5433a68ce4d1b97b4
ComputeInsights:
type: object
properties:
pg_stat_statements:
description: Contains raw output from pg_stat_statements in JSON format
description: Contains raw output from pg_stat_statements in JSON format.
type: array
items:
type: object
@@ -151,6 +215,19 @@ components:
- init
- failed
- running
example: running
#
# Errors
#
GenericError:
type: object
required:
- error
properties:
error:
type: string
security:
- JWT: []
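For illustration, a minimal client-side sketch of calling the blocking /configure endpoint described above. The base URL, error handling, and the availability of `reqwest` (with blocking and json features), `serde_json`, and `anyhow` are assumptions; the spec body is abbreviated and not part of the OpenAPI document itself.

// Hypothetical caller; compute_ctl address and spec contents are made up.
fn configure_compute(base_url: &str, spec: serde_json::Value) -> anyhow::Result<()> {
    let resp = reqwest::blocking::Client::new()
        .post(format!("{base_url}/configure"))
        .json(&serde_json::json!({ "spec": spec }))
        .send()?;
    match resp.status().as_u16() {
        // 200 carries a ComputeState body; we ignore it in this sketch.
        200 => Ok(()),
        // 400 / 412 / 500 carry a GenericError body with an "error" field.
        400 | 412 | 500 => anyhow::bail!("configuration failed: {}", resp.text()?),
        other => anyhow::bail!("unexpected status {other}"),
    }
}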

View File

@@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
&[],
);
let mut last_active = compute.state.read().unwrap().last_active;
let mut last_active = compute.state.lock().unwrap().last_active;
if let Ok(backs) = backends {
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
@@ -87,7 +87,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
// Update the last activity in the shared state if we got a more recent one.
let mut state = compute.state.write().unwrap();
let mut state = compute.state.lock().unwrap();
if last_active > state.last_active {
state.last_active = last_active;
debug!("set the last compute activity time to: {}", last_active);
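The read()/write() to lock() changes above reflect the shared compute state apparently moving from an RwLock to a Mutex. A minimal sketch of the resulting update pattern, with simplified, hypothetical types:

use chrono::{DateTime, Utc};
use std::sync::{Arc, Mutex};

struct State {
    last_active: DateTime<Utc>,
}

// Hypothetical, simplified version of the shared-state update in the monitor loop.
fn bump_last_active(state: &Arc<Mutex<State>>, last_active: DateTime<Utc>) {
    let mut state = state.lock().unwrap();
    if last_active > state.last_active {
        state.last_active = last_active;
    }
}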

View File

@@ -10,43 +10,12 @@ use std::time::{Duration, Instant};
use anyhow::{bail, Result};
use notify::{RecursiveMode, Watcher};
use postgres::{Client, Transaction};
use serde::Deserialize;
use tracing::{debug, instrument};
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
/// Rust representation of Postgres role info with only those fields
/// that matter for us.
#[derive(Clone, Deserialize)]
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
pub options: GenericOptions,
}
/// Rust representation of Postgres database info with only those fields
/// that matter for us.
#[derive(Clone, Deserialize)]
pub struct Database {
pub name: PgIdent,
pub owner: PgIdent,
pub options: GenericOptions,
}
/// Common type representing both SQL statement params with or without value,
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
/// options like `wal_level = logical`.
#[derive(Clone, Deserialize)]
pub struct GenericOption {
pub name: String,
pub value: Option<String>,
pub vartype: String,
}
/// Optional collection of `GenericOption`'s. Type alias allows us to
/// declare a `trait` on it.
pub type GenericOptions = Option<Vec<GenericOption>>;
/// Escape a string for including it in a SQL literal
fn escape_literal(s: &str) -> String {
s.replace('\'', "''").replace('\\', "\\\\")
@@ -58,9 +27,14 @@ fn escape_conf_value(s: &str) -> String {
s.replace('\'', "''").replace('\\', "\\\\")
}
impl GenericOption {
trait GenericOptionExt {
fn to_pg_option(&self) -> String;
fn to_pg_setting(&self) -> String;
}
impl GenericOptionExt for GenericOption {
/// Represent `GenericOption` as SQL statement parameter.
pub fn to_pg_option(&self) -> String {
fn to_pg_option(&self) -> String {
if let Some(val) = &self.value {
match self.vartype.as_ref() {
"string" => format!("{} '{}'", self.name, escape_literal(val)),
@@ -72,7 +46,7 @@ impl GenericOption {
}
/// Represent `GenericOption` as configuration option.
pub fn to_pg_setting(&self) -> String {
fn to_pg_setting(&self) -> String {
if let Some(val) = &self.value {
match self.vartype.as_ref() {
"string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
@@ -131,10 +105,14 @@ impl GenericOptionsSearch for GenericOptions {
}
}
impl Role {
pub trait RoleExt {
fn to_pg_options(&self) -> String;
}
impl RoleExt for Role {
/// Serialize a list of role parameters into a Postgres-acceptable
/// string of arguments.
pub fn to_pg_options(&self) -> String {
fn to_pg_options(&self) -> String {
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
// For now, we do not use generic `options` for roles. Once used, add
// `self.options.as_pg_options()` somewhere here.
@@ -159,21 +137,17 @@ impl Role {
}
}
impl Database {
pub fn new(name: PgIdent, owner: PgIdent) -> Self {
Self {
name,
owner,
options: None,
}
}
pub trait DatabaseExt {
fn to_pg_options(&self) -> String;
}
impl DatabaseExt for Database {
/// Serialize a list of database parameters into a Postgres-acceptable
/// string of arguments.
/// NB: `TEMPLATE` is actually also an identifier, but so far we only need
/// to use `template0` and `template1`, so it is not a problem. Yet in the future
/// it may require a proper quoting too.
pub fn to_pg_options(&self) -> String {
fn to_pg_options(&self) -> String {
let mut params: String = self.options.as_pg_options();
write!(params, " OWNER {}", &self.owner.pg_quote())
.expect("String is documented to not to error during write operations");
@@ -182,10 +156,6 @@ impl Database {
}
}
/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;
/// Generic trait used to provide quoting / encoding for strings used in the
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
@@ -226,7 +196,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
&[],
)?
.iter()
.map(|row| Database::new(row.get("datname"), row.get("owner")))
.map(|row| Database {
name: row.get("datname"),
owner: row.get("owner"),
options: None,
})
.collect();
Ok(postgres_dbs)

View File

@@ -1,57 +1,38 @@
use std::collections::HashMap;
use std::path::Path;
use std::str::FromStr;
use anyhow::Result;
use postgres::config::Config;
use postgres::{Client, NoTls};
use serde::Deserialize;
use tracing::{info, info_span, instrument, span_enabled, warn, Level};
use crate::compute::ComputeNode;
use crate::config;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Deserialize)]
pub struct ComputeSpec {
pub format_version: f32,
pub timestamp: String,
pub operation_uuid: Option<String>,
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
pub storage_auth_token: Option<String>,
/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
/// env variable is set, it will be used for authorization.
pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeSpec> {
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
Ok(v) => v,
Err(_) => "".to_string(),
};
info!("getting spec from control plane: {}", cp_uri);
pub startup_tracing_context: Option<HashMap<String, String>>,
}
// TODO: check the response. We should distinguish cases when it's
// - network error, then retry
// - no spec for compute yet, then wait
// - compute id is unknown or any other error, then bail out
let spec = reqwest::blocking::Client::new()
.get(cp_uri)
.header("Authorization", jwt)
.send()?
.json()?;
/// Cluster state seen from the perspective of the external tools
/// like Rails web console.
#[derive(Clone, Deserialize)]
pub struct Cluster {
pub cluster_id: String,
pub name: String,
pub state: Option<String>,
pub roles: Vec<Role>,
pub databases: Vec<Database>,
pub settings: GenericOptions,
}
/// Single cluster state changing operation that could not be represented as
/// a static `Cluster` structure. For example:
/// - DROP DATABASE
/// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Deserialize)]
pub struct DeltaOp {
pub action: String,
pub name: PgIdent,
pub new_name: Option<PgIdent>,
Ok(spec)
}
/// It takes cluster specification and does the following:
@@ -226,8 +207,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Reassign all dependent objects and delete requested roles.
#[instrument(skip_all)]
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
if let Some(ops) = &node.spec.delta_operations {
pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
if let Some(ops) = &spec.delta_operations {
// First, reassign all dependent objects to db owners.
info!("reassigning dependent objects of to-be-deleted roles");
@@ -244,7 +225,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
// Check that role is still present in Postgres, as this could be a
// restart with the same spec after role deletion.
if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
reassign_owned_objects(node, &op.name)?;
reassign_owned_objects(spec, connstr, &op.name)?;
}
}
@@ -268,10 +249,10 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
}
// Reassign all owned objects in all databases to the owner of the database.
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
for db in &node.spec.cluster.databases {
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
for db in &spec.cluster.databases {
if db.owner != *role_name {
let mut conf = Config::from_str(node.connstr.as_str())?;
let mut conf = Config::from_str(connstr)?;
conf.dbname(&db.name);
let mut client = conf.connect(NoTls)?;
@@ -416,9 +397,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
#[instrument(skip_all)]
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;
pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
info!("cluster spec grants:");
// We now have a separate `web_access` role to connect to the database
@@ -450,8 +429,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
// Do some per-database access adjustments. We'd better do this at db creation time,
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
// atomically.
for db in &node.spec.cluster.databases {
let mut conf = Config::from_str(node.connstr.as_str())?;
for db in &spec.cluster.databases {
let mut conf = Config::from_str(connstr)?;
conf.dbname(&db.name);
let mut db_client = conf.connect(NoTls)?;

View File

@@ -1,14 +1,13 @@
#[cfg(test)]
mod pg_helpers_tests {
use std::fs::File;
use compute_api::spec::{ComputeSpec, GenericOption, GenericOptions, PgIdent};
use compute_tools::pg_helpers::*;
use compute_tools::spec::ComputeSpec;
#[test]
fn params_serialize() {
let file = File::open("tests/cluster_spec.json").unwrap();
let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap();
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
assert_eq!(
@@ -23,7 +22,7 @@ mod pg_helpers_tests {
#[test]
fn settings_serialize() {
let file = File::open("tests/cluster_spec.json").unwrap();
let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap();
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
assert_eq!(

View File

@@ -0,0 +1,14 @@
[package]
name = "compute_api"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
chrono.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
workspace_hack.workspace = true

View File

@@ -0,0 +1,3 @@
pub mod requests;
pub mod responses;
pub mod spec;

View File

@@ -0,0 +1,14 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use crate::spec::ComputeSpec;
use serde::Deserialize;
/// Request of the /configure API
///
/// We now pass only `spec` in the configuration request, but later we can
/// extend it with something like `restart: bool`. So put `spec` into a
/// struct from the start to be more flexible in the future.
#[derive(Deserialize, Debug)]
pub struct ConfigurationRequest {
pub spec: ComputeSpec,
}
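For illustration, a sketch of parsing a /configure request body into this struct. The field values are made up, and optional spec fields are simply omitted.

use compute_api::requests::ConfigurationRequest;

#[test]
fn parse_configure_request() {
    let body = r#"{
      "spec": {
        "format_version": 1.0,
        "timestamp": "2023-04-11T00:00:00Z",
        "cluster": { "cluster_id": "c1", "name": "main", "roles": [], "databases": [] }
      }
    }"#;
    let req: ConfigurationRequest = serde_json::from_str(body).expect("valid request body");
    assert_eq!(req.spec.cluster.name, "main");
}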

View File

@@ -0,0 +1,66 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use chrono::{DateTime, Utc};
use serde::{Serialize, Serializer};
#[derive(Serialize, Debug)]
pub struct GenericAPIError {
pub error: String,
}
/// Response of the /status API
#[derive(Serialize, Debug)]
#[serde(rename_all = "snake_case")]
pub struct ComputeStatusResponse {
pub tenant: String,
pub timeline: String,
pub status: ComputeStatus,
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: DateTime<Utc>,
pub error: Option<String>,
}
#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: DateTime<Utc>,
pub error: Option<String>,
}
#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
// Spec wasn't provided at start, waiting for it to be
// provided by control-plane.
Empty,
// Compute configuration was requested.
ConfigurationPending,
// Compute node has spec and initial startup and
// configuration is in progress.
Init,
// Compute is configured and running.
Running,
// Either startup or configuration failed,
// compute will exit soon or is waiting for
// control-plane to terminate it.
Failed,
}
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
x.to_rfc3339().serialize(s)
}
/// Response of the /metrics.json API
#[derive(Clone, Debug, Default, Serialize)]
pub struct ComputeMetrics {
pub sync_safekeepers_ms: u64,
pub basebackup_ms: u64,
pub config_ms: u64,
pub total_startup_ms: u64,
}
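A short sketch of what these response types serialize to. The tenant and timeline IDs reuse the examples from the OpenAPI spec; availability of `serde_json` in the caller is an assumption.

use chrono::Utc;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};

#[test]
fn status_response_shape() {
    let resp = ComputeStatusResponse {
        tenant: "c9269c359e9a199fad1ea0981246a78f".to_string(),
        timeline: "ece7de74d4b8cbe5433a68ce4d1b97b4".to_string(),
        status: ComputeStatus::Running,
        last_active: Utc::now(),
        error: None,
    };
    // `status` serializes as "running" (snake_case) and `last_active` as an RFC 3339 string.
    let json = serde_json::to_string(&resp).unwrap();
    assert!(json.contains(r#""status":"running""#));
}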

View File

@@ -0,0 +1,94 @@
//! `ComputeSpec` represents the contents of the spec.json file.
//!
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
//! all the information needed to start up the right version of PostgreSQL,
//! and connect it to the storage nodes.
use serde::Deserialize;
use std::collections::HashMap;
/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Debug, Default, Deserialize)]
pub struct ComputeSpec {
pub format_version: f32,
pub timestamp: String,
pub operation_uuid: Option<String>,
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
pub storage_auth_token: Option<String>,
pub startup_tracing_context: Option<HashMap<String, String>>,
}
#[derive(Clone, Debug, Default, Deserialize)]
pub struct Cluster {
pub cluster_id: String,
pub name: String,
pub state: Option<String>,
pub roles: Vec<Role>,
pub databases: Vec<Database>,
pub settings: GenericOptions,
}
/// Single cluster state changing operation that could not be represented as
/// a static `Cluster` structure. For example:
/// - DROP DATABASE
/// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Debug, Deserialize)]
pub struct DeltaOp {
pub action: String,
pub name: PgIdent,
pub new_name: Option<PgIdent>,
}
/// Rust representation of Postgres role info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize)]
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
pub options: GenericOptions,
}
/// Rust representation of Postgres database info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize)]
pub struct Database {
pub name: PgIdent,
pub owner: PgIdent,
pub options: GenericOptions,
}
/// Common type representing both SQL statement params with or without value,
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
/// options like `wal_level = logical`.
#[derive(Clone, Debug, Deserialize)]
pub struct GenericOption {
pub name: String,
pub value: Option<String>,
pub vartype: String,
}
/// Optional collection of `GenericOption`'s. Type alias allows us to
/// declare a `trait` on it.
pub type GenericOptions = Option<Vec<GenericOption>>;
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
#[test]
fn parse_spec_file() {
let file = File::open("tests/cluster_spec.json").unwrap();
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
}
}

View File

@@ -54,7 +54,7 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
use io::ErrorKind::*;
matches!(
e.kind(),
ConnectionRefused | ConnectionAborted | ConnectionReset
ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
)
}
@@ -320,9 +320,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
if let ProtoState::Closed = self.state {
Ok(None)
} else {
let m = self.framed.read_message().await?;
trace!("read msg {:?}", m);
Ok(m)
match self.framed.read_message().await {
Ok(m) => {
trace!("read msg {:?}", m);
Ok(m)
}
Err(e) => {
// remember not to try to read anymore
self.state = ProtoState::Closed;
Err(e)
}
}
}
}
@@ -493,7 +501,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
MaybeWriteOnly::Full(framed) => {
let (reader, writer) = framed.split();
self.framed = MaybeWriteOnly::WriteOnly(writer);
Ok(PostgresBackendReader(reader))
Ok(PostgresBackendReader {
reader,
closed: false,
})
}
MaybeWriteOnly::WriteOnly(_) => {
anyhow::bail!("PostgresBackend is already split")
@@ -510,8 +521,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
anyhow::bail!("PostgresBackend is not split")
}
MaybeWriteOnly::WriteOnly(writer) => {
let joined = Framed::unsplit(reader.0, writer);
let joined = Framed::unsplit(reader.reader, writer);
self.framed = MaybeWriteOnly::Full(joined);
// if reader encountered connection error, do not attempt reading anymore
if reader.closed {
self.state = ProtoState::Closed;
}
Ok(())
}
MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"),
@@ -797,15 +812,25 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
}
}
pub struct PostgresBackendReader<IO>(FramedReader<MaybeTlsStream<IO>>);
pub struct PostgresBackendReader<IO> {
reader: FramedReader<MaybeTlsStream<IO>>,
closed: bool, // true if received error closing the connection
}
impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackendReader<IO> {
/// Read full message or return None if connection is cleanly closed with no
/// unprocessed data.
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
let m = self.0.read_message().await?;
trace!("read msg {:?}", m);
Ok(m)
match self.reader.read_message().await {
Ok(m) => {
trace!("read msg {:?}", m);
Ok(m)
}
Err(e) => {
self.closed = true;
Err(e)
}
}
}
/// Get CopyData contents of the next message in COPY stream or error
@@ -923,7 +948,7 @@ pub enum CopyStreamHandlerEnd {
#[error("EOF on COPY stream")]
EOF,
/// The connection was lost
#[error(transparent)]
#[error("connection error: {0}")]
Disconnected(#[from] ConnectionError),
/// Some other error
#[error(transparent)]

View File

@@ -293,6 +293,9 @@ impl FeStartupPacket {
// We shouldn't advance `buf` as probably full message is not there yet,
// so can't directly use Bytes::get_u32 etc.
let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
// The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
// which is less readable
#[allow(clippy::manual_range_contains)]
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
return Err(ProtocolError::Protocol(format!(
"invalid startup packet message length {}",

View File

@@ -13,7 +13,6 @@ use std::{
collections::HashMap,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
ops::Deref,
path::{Path, PathBuf},
pin::Pin,
sync::Arc,
@@ -90,7 +89,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// Streams the local file contents into remote into the remote storage entry.
async fn upload(
&self,
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
// S3 PUT request requires the content length to be specified,
// otherwise it starts to fail with the concurrent connection count increasing.
data_size_bytes: usize,
@@ -161,14 +160,67 @@ pub enum GenericRemoteStorage {
Unreliable(Arc<UnreliableWrapper>),
}
impl Deref for GenericRemoteStorage {
type Target = dyn RemoteStorage;
fn deref(&self) -> &Self::Target {
impl GenericRemoteStorage {
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
GenericRemoteStorage::LocalFs(local_fs) => local_fs,
GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
GenericRemoteStorage::Unreliable(s) => s.as_ref(),
Self::LocalFs(s) => s.list_prefixes(prefix).await,
Self::AwsS3(s) => s.list_prefixes(prefix).await,
Self::Unreliable(s) => s.list_prefixes(prefix).await,
}
}
pub async fn upload(
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
}
}
pub async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
match self {
Self::LocalFs(s) => s.download(from).await,
Self::AwsS3(s) => s.download(from).await,
Self::Unreliable(s) => s.download(from).await,
}
}
pub async fn download_byte_range(
&self,
from: &RemotePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
) -> Result<Download, DownloadError> {
match self {
Self::LocalFs(s) => {
s.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
Self::AwsS3(s) => {
s.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
Self::Unreliable(s) => {
s.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
}
}
pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.delete(path).await,
Self::AwsS3(s) => s.delete(path).await,
Self::Unreliable(s) => s.delete(path).await,
}
}
}
@@ -199,7 +251,7 @@ impl GenericRemoteStorage {
/// this path is used for the remote object id conversion only.
pub async fn upload_storage_object(
&self,
from: Box<dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static>,
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
from_size_bytes: usize,
to: &RemotePath,
) -> anyhow::Result<()> {
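To illustrate the switch from boxed readers to `impl AsyncRead` in these upload methods, a minimal caller-side sketch. The helper name, the use of a `Cursor`, and importing the crate as `remote_storage` are assumptions.

use remote_storage::{GenericRemoteStorage, RemotePath};

// Hypothetical helper: any AsyncRead + Unpin + Send + Sync + 'static value can
// now be passed directly, without boxing it first.
async fn upload_bytes(
    storage: &GenericRemoteStorage,
    to: &RemotePath,
    bytes: Vec<u8>,
) -> anyhow::Result<()> {
    let len = bytes.len();
    storage
        .upload(std::io::Cursor::new(bytes), len, to, None)
        .await
}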

View File

@@ -118,7 +118,7 @@ impl RemoteStorage for LocalFs {
async fn upload(
&self,
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,

View File

@@ -343,7 +343,7 @@ impl RemoteStorage for S3Bucket {
async fn upload(
&self,
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
from_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,

View File

@@ -84,7 +84,7 @@ impl RemoteStorage for UnreliableWrapper {
async fn upload(
&self,
data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
// S3 PUT request requires the content length to be specified,
// otherwise it starts to fail with the concurrent connection count increasing.
data_size_bytes: usize,

View File

@@ -11,6 +11,14 @@ use serde::{Deserialize, Serialize};
pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8);
impl Percent {
pub const fn new(pct: u8) -> Option<Self> {
if pct <= 100 {
Some(Percent(pct))
} else {
None
}
}
pub fn get(&self) -> u8 {
self.0
}
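A quick sketch of the constructor contract added here, using the same import path as the eviction test below:

use utils::serde_percent::Percent;

#[test]
fn percent_constructor_contract() {
    // Values above 100 are rejected; valid ones round-trip through get().
    assert_eq!(Percent::new(85).map(|p| p.get()), Some(85));
    assert!(Percent::new(101).is_none());
}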

View File

@@ -639,7 +639,7 @@ mod filesystem_level_usage {
),
(
"max_usage_pct",
usage_pct > self.config.max_usage_pct.get() as u64,
usage_pct >= self.config.max_usage_pct.get() as u64,
),
];
@@ -686,4 +686,43 @@ mod filesystem_level_usage {
avail_bytes,
})
}
#[test]
fn max_usage_pct_pressure() {
use super::Usage as _;
use std::time::Duration;
use utils::serde_percent::Percent;
let mut usage = Usage {
config: &DiskUsageEvictionTaskConfig {
max_usage_pct: Percent::new(85).unwrap(),
min_avail_bytes: 0,
period: Duration::MAX,
#[cfg(feature = "testing")]
mock_statvfs: None,
},
total_bytes: 100_000,
avail_bytes: 0,
};
assert!(usage.has_pressure(), "expected pressure at 100%");
usage.add_available_bytes(14_000);
assert!(usage.has_pressure(), "expected pressure at 86%");
usage.add_available_bytes(999);
assert!(usage.has_pressure(), "expected pressure at 85.001%");
usage.add_available_bytes(1);
assert!(usage.has_pressure(), "expected pressure at precisely 85%");
usage.add_available_bytes(1);
assert!(!usage.has_pressure(), "no pressure at 84.999%");
usage.add_available_bytes(999);
assert!(!usage.has_pressure(), "no pressure at 84%");
usage.add_available_bytes(16_000);
assert!(!usage.has_pressure());
}
}

poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -79,37 +79,35 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
[[package]]
name = "allure-pytest"
version = "2.10.0"
version = "2.13.1"
description = "Allure pytest integration"
category = "main"
optional = false
python-versions = "*"
files = [
{file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"},
{file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"},
{file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"},
{file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"},
]
[package.dependencies]
allure-python-commons = "2.10.0"
allure-python-commons = "2.13.1"
pytest = ">=4.5.0"
six = ">=1.9.0"
[[package]]
name = "allure-python-commons"
version = "2.10.0"
version = "2.13.1"
description = "Common module for integrate allure with python-based frameworks"
category = "main"
optional = false
python-versions = ">=3.5"
python-versions = ">=3.6"
files = [
{file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"},
{file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"},
{file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"},
{file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"},
]
[package.dependencies]
attrs = ">=16.0.0"
pluggy = ">=0.4.0"
six = ">=1.9.0"
[[package]]
name = "async-timeout"
@@ -1932,6 +1930,22 @@ pytest = [
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
]
[[package]]
name = "pytest-rerunfailures"
version = "11.1.2"
description = "pytest plugin to re-run tests to eliminate flaky failures"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
{file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
]
[package.dependencies]
packaging = ">=17.1"
pytest = ">=5.3"
[[package]]
name = "pytest-timeout"
version = "2.1.0"
@@ -2597,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91"
content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1"

View File

@@ -53,7 +53,7 @@ pub async fn password_hack(
.await?;
info!(project = &payload.project, "received missing parameter");
creds.project = Some(payload.project.into());
creds.project = Some(payload.project);
let mut node = api.wake_compute(extra, creds).await?;
node.config.password(payload.password);

View File

@@ -2,7 +2,7 @@
use crate::error::UserFacingError;
use pq_proto::StartupMessageParams;
use std::borrow::Cow;
use std::collections::HashSet;
use thiserror::Error;
use tracing::info;
@@ -19,11 +19,10 @@ pub enum ClientCredsParseError {
InconsistentProjectNames { domain: String, option: String },
#[error(
"SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
SNI should be formatted as '<project-name>.{}'.",
.sni, .cn, .cn,
"Common name inferred from SNI ('{}') is not known",
.cn,
)]
InconsistentSni { sni: String, cn: String },
UnknownCommonName { cn: String },
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
MalformedProjectName(String),
@@ -37,7 +36,7 @@ impl UserFacingError for ClientCredsParseError {}
pub struct ClientCredentials<'a> {
pub user: &'a str,
// TODO: this is a severe misnomer! We should think of a new name ASAP.
pub project: Option<Cow<'a, str>>,
pub project: Option<String>,
}
impl ClientCredentials<'_> {
@@ -51,7 +50,7 @@ impl<'a> ClientCredentials<'a> {
pub fn parse(
params: &'a StartupMessageParams,
sni: Option<&str>,
common_name: Option<&str>,
common_names: Option<HashSet<String>>,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
@@ -60,37 +59,43 @@ impl<'a> ClientCredentials<'a> {
let user = get_param("user")?;
// Project name might be passed via PG's command-line options.
let project_option = params.options_raw().and_then(|mut options| {
options
.find_map(|opt| opt.strip_prefix("project="))
.map(Cow::Borrowed)
});
let project_option = params
.options_raw()
.and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project=")))
.map(|name| name.to_string());
// Alternative project name is in fact a subdomain from SNI.
// NOTE: we do not consider SNI if `common_name` is missing.
let project_domain = sni
.zip(common_name)
.map(|(sni, cn)| {
subdomain_from_sni(sni, cn)
.ok_or_else(|| InconsistentSni {
sni: sni.into(),
cn: cn.into(),
let project_from_domain = if let Some(sni_str) = sni {
if let Some(cn) = common_names {
let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain);
let project = common_name_from_sni
.and_then(|domain| {
if cn.contains(domain) {
subdomain_from_sni(sni_str, domain)
} else {
None
}
})
.map(Cow::<'static, str>::Owned)
})
.transpose()?;
.ok_or_else(|| UnknownCommonName {
cn: common_name_from_sni.unwrap_or("").into(),
})?;
let project = match (project_option, project_domain) {
Some(project)
} else {
None
}
} else {
None
};
let project = match (project_option, project_from_domain) {
// Invariant: if we have both project name variants, they should match.
(Some(option), Some(domain)) if option != domain => {
Some(Err(InconsistentProjectNames {
domain: domain.into(),
option: option.into(),
}))
Some(Err(InconsistentProjectNames { domain, option }))
}
// Invariant: project name may not contain certain characters.
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
false => Err(MalformedProjectName(name.into())),
false => Err(MalformedProjectName(name)),
true => Ok(name),
}),
}
@@ -149,9 +154,9 @@ mod tests {
let options = StartupMessageParams::new([("user", "john_doe")]);
let sni = Some("foo.localhost");
let common_name = Some("localhost");
let common_names = Some(["localhost".into()].into());
let creds = ClientCredentials::parse(&options, sni, common_name)?;
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -177,24 +182,41 @@ mod tests {
let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
let sni = Some("baz.localhost");
let common_name = Some("localhost");
let common_names = Some(["localhost".into()].into());
let creds = ClientCredentials::parse(&options, sni, common_name)?;
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("baz"));
Ok(())
}
#[test]
fn parse_multi_common_names() -> anyhow::Result<()> {
let options = StartupMessageParams::new([("user", "john_doe")]);
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com");
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
Ok(())
}
#[test]
fn parse_projects_different() {
let options =
StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]);
let sni = Some("second.localhost");
let common_name = Some("localhost");
let common_names = Some(["localhost".into()].into());
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
assert_eq!(option, "first");
@@ -209,13 +231,12 @@ mod tests {
let options = StartupMessageParams::new([("user", "john_doe")]);
let sni = Some("project.localhost");
let common_name = Some("example.com");
let common_names = Some(["example.com".into()].into());
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
match err {
InconsistentSni { sni, cn } => {
assert_eq!(sni, "project.localhost");
assert_eq!(cn, "example.com");
UnknownCommonName { cn } => {
assert_eq!(cn, "localhost");
}
_ => panic!("bad error: {err:?}"),
}

View File

@@ -1,6 +1,12 @@
use crate::auth;
use anyhow::{bail, ensure, Context};
use std::{str::FromStr, sync::Arc, time::Duration};
use anyhow::{bail, ensure, Context, Ok};
use rustls::sign;
use std::{
collections::{HashMap, HashSet},
str::FromStr,
sync::Arc,
time::Duration,
};
pub struct ProxyConfig {
pub tls_config: Option<TlsConfig>,
@@ -16,7 +22,7 @@ pub struct MetricCollectionConfig {
pub struct TlsConfig {
pub config: Arc<rustls::ServerConfig>,
pub common_name: Option<String>,
pub common_names: Option<HashSet<String>>,
}
impl TlsConfig {
@@ -26,28 +32,34 @@ impl TlsConfig {
}
/// Configure TLS for the main endpoint.
pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
let key = {
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
pub fn configure_tls(
key_path: &str,
cert_path: &str,
certs_dir: Option<&String>,
) -> anyhow::Result<TlsConfig> {
let mut cert_resolver = CertResolver::new();
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
keys.pop().map(rustls::PrivateKey).unwrap()
};
// add default certificate
cert_resolver.add_cert(key_path, cert_path)?;
let cert_chain_bytes = std::fs::read(cert_path)
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
// add extra certificates
if let Some(certs_dir) = certs_dir {
for entry in std::fs::read_dir(certs_dir)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
// file names aligned with default cert-manager names
let key_path = path.join("tls.key");
let cert_path = path.join("tls.crt");
if key_path.exists() && cert_path.exists() {
cert_resolver
.add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?;
}
}
}
}
let cert_chain = {
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
.context(format!(
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
))?
.into_iter()
.map(rustls::Certificate)
.collect()
};
let common_names = cert_resolver.get_common_names();
let config = rustls::ServerConfig::builder()
.with_safe_default_cipher_suites()
@@ -55,27 +67,116 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfi
// allow TLS 1.2 to be compatible with older client libraries
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
.with_no_client_auth()
.with_single_cert(cert_chain, key)?
.with_cert_resolver(Arc::new(cert_resolver))
.into();
// determine common name from tls-cert (-c server.crt param).
// used in asserting project name formatting invariant.
let common_name = {
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
.context(format!(
"Failed to parse PEM object from bytes from file at '{cert_path}'."
))?
.1;
let common_name = pem.parse_x509()?.subject().to_string();
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
};
Ok(TlsConfig {
config,
common_name,
common_names: Some(common_names),
})
}
struct CertResolver {
certs: HashMap<String, Arc<rustls::sign::CertifiedKey>>,
}
impl CertResolver {
fn new() -> Self {
Self {
certs: HashMap::new(),
}
}
fn add_cert(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> {
let priv_key = {
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
keys.pop().map(rustls::PrivateKey).unwrap()
};
let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
let cert_chain_bytes = std::fs::read(cert_path)
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
let cert_chain = {
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
.context(format!(
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
))?
.into_iter()
.map(rustls::Certificate)
.collect()
};
let common_name = {
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
.context(format!(
"Failed to parse PEM object from bytes from file at '{cert_path}'."
))?
.1;
let common_name = pem.parse_x509()?.subject().to_string();
// We only use non-wildcard certificates in the link proxy, so it seems okay to treat them the same as
// wildcard ones, as we don't use SNI there. That treatment only affects certificate selection, so
// verify-full will still check the wildcard match. The old code simply ignored non-wildcard common names
// and passed None instead, which blows up the number of cases downstream code has to handle. A better
// approach would avoid Option for common_names altogether and do wildcard-based certificate selection
// instead of cutting off the '*.' parts.
if common_name.starts_with("CN=*.") {
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
} else {
common_name.strip_prefix("CN=").map(|s| s.to_string())
}
}
.context(format!(
"Failed to parse common name from certificate at '{cert_path}'."
))?;
self.certs.insert(
common_name,
Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)),
);
Ok(())
}
fn get_common_names(&self) -> HashSet<String> {
self.certs.keys().map(|s| s.to_string()).collect()
}
}
impl rustls::server::ResolvesServerCert for CertResolver {
fn resolve(
&self,
_client_hello: rustls::server::ClientHello,
) -> Option<Arc<rustls::sign::CertifiedKey>> {
// Loop here, cutting off more and more subdomains until we find
// a match, to get proper wildcard support. OTOH, we do not use
// nested domains now, so keep this simple for now.
//
// With the current code foo.com will match *.foo.com, which
// repeats the behavior of the old code.
if let Some(mut sni_name) = _client_hello.server_name() {
loop {
if let Some(cert) = self.certs.get(sni_name) {
return Some(cert.clone());
}
if let Some((_, rest)) = sni_name.split_once('.') {
sni_name = rest;
} else {
return None;
}
}
} else {
None
}
}
}
/// Helper for cmdline cache options parsing.
pub struct CacheOptions {
/// Max number of entries.

View File

@@ -132,7 +132,11 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
args.get_one::<String>("tls-key"),
args.get_one::<String>("tls-cert"),
) {
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?),
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
key_path,
cert_path,
args.get_one::<String>("certs-dir"),
)?),
(None, None) => None,
_ => bail!("either both or neither tls-key and tls-cert must be specified"),
};
@@ -254,6 +258,12 @@ fn cli() -> clap::Command {
.alias("ssl-cert") // backwards compatibility
.help("path to TLS cert for client postgres connections"),
)
// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
.arg(
Arg::new("certs-dir")
.long("certs-dir")
.help("path to directory with TLS certificates for client postgres connections"),
)
.arg(
Arg::new("metric-collection-endpoint")
.long("metric-collection-endpoint")

View File

@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use serde::Serialize;
use std::collections::HashMap;
use tracing::{debug, error, info, instrument, trace};
use tracing::{error, info, instrument, trace, warn};
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
@@ -84,10 +84,14 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
let value = ms.get_counter().get_value() as u64;
debug!(
"branch_id {} endpoint_id {} val: {}",
branch_id, endpoint_id, value
);
// Report if the metric value is suspiciously large
if value > (1u64 << 40) {
warn!(
"potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
branch_id, endpoint_id, value
);
}
current_metrics.push((
Ids {
endpoint_id: endpoint_id.to_string(),
@@ -124,11 +128,15 @@ async fn collect_metrics_iteration(
let mut value = *curr_val;
if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
// Only send metrics updates if the metric has changed
if curr_val - prev_val > 0 {
// Only send metrics updates if the metric has increased
if curr_val > prev_val {
value = curr_val - prev_val;
start_time = *prev_time;
} else {
if curr_val < prev_val {
error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
prev_val, curr_val, curr_key);
}
return None;
}
};
@@ -189,7 +197,7 @@ async fn collect_metrics_iteration(
})
// update cached value (add delta) and time
.and_modify(|e| {
e.0 += send_metric.value;
e.0 = e.0.saturating_add(send_metric.value);
e.1 = stop_time
})
// cache new metric
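The hunk above boils down to a small delta rule: report only strict increases, drop unchanged or decreased counters, and accumulate the cached value with a saturating add. A standalone sketch of that rule (hypothetical helper, not code from the PR):

/// Report only strictly positive increases of a monotonic counter;
/// unchanged or decreased values produce no event (a decrease is
/// additionally logged as an error in the PR).
fn metric_delta(prev: u64, curr: u64) -> Option<u64> {
    if curr > prev {
        Some(curr - prev)
    } else {
        None
    }
}

#[test]
fn metric_delta_rule() {
    assert_eq!(metric_delta(100, 150), Some(50));
    assert_eq!(metric_delta(150, 150), None);
    assert_eq!(metric_delta(150, 100), None);
}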

View File

@@ -124,11 +124,11 @@ pub async fn handle_ws_client(
// Extract credentials which we're going to use for auth.
let creds = {
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
let common_names = tls.and_then(|tls| tls.common_names.clone());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_name))
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_names))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?
@@ -163,11 +163,11 @@ async fn handle_client(
// Extract credentials which we're going to use for auth.
let creds = {
let sni = stream.get_ref().sni_hostname();
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
let common_names = tls.and_then(|tls| tls.common_names.clone());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
.map(|_| auth::ClientCredentials::parse(&params, sni, common_names))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?

View File

@@ -54,9 +54,11 @@ fn generate_tls_config<'a>(
.with_single_cert(vec![cert], key)?
.into();
let common_names = Some([common_name.to_owned()].iter().cloned().collect());
TlsConfig {
config,
common_name: Some(common_name.to_string()),
common_names,
}
};

View File

@@ -26,7 +26,7 @@ prometheus-client = "^0.14.1"
pytest-timeout = "^2.1.0"
Werkzeug = "^2.2.3"
pytest-order = "^1.0.1"
allure-pytest = "^2.10.0"
allure-pytest = "^2.13.1"
pytest-asyncio = "^0.19.0"
toml = "^0.10.2"
psutil = "^5.9.4"
@@ -34,6 +34,7 @@ types-psutil = "^5.9.5.4"
types-toml = "^0.10.8"
pytest-httpserver = "^1.0.6"
aiohttp = "3.7.4"
pytest-rerunfailures = "^11.1.2"
[tool.poetry.group.dev.dependencies]
black = "^23.1.0"
@@ -69,6 +70,9 @@ strict = true
module = [
"asyncpg.*",
"pg8000.*",
"allure.*",
"allure_commons.*",
"allure_pytest.*",
]
ignore_missing_imports = true

View File

@@ -8,13 +8,7 @@
# warnings and errors right in the editor.
# In vscode, this setting is Rust-analyzer>Check On Save:Command
# manual-range-contains wants
# !(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)
# instead of
# len < 4 || len > MAX_STARTUP_PACKET_LENGTH
# , let's disagree.
# * `-A unknown_lints` do not warn about unknown lint suppressions
# that people with newer toolchains might use
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -A clippy::manual-range-contains -D warnings
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings

View File

@@ -30,6 +30,7 @@ serde_with.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["fs"] }
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
toml_edit.workspace = true
tracing.workspace = true

View File

@@ -674,7 +674,8 @@ impl Timeline {
bail!(TimelineError::Cancelled(self.ttid));
}
self.write_shared_state().sk.inmem.backup_lsn = backup_lsn;
let mut state = self.write_shared_state();
state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn);
// we should check whether to shut down offloader, but this will be done
// soon by peer communication anyway.
Ok(())

View File

@@ -323,7 +323,8 @@ impl WalBackupTask {
}
match backup_lsn_range(
backup_lsn,
&self.timeline,
&mut backup_lsn,
commit_lsn,
self.wal_seg_size,
&self.timeline_dir,
@@ -331,13 +332,7 @@ impl WalBackupTask {
)
.await
{
Ok(backup_lsn_result) => {
backup_lsn = backup_lsn_result;
let res = self.timeline.set_wal_backup_lsn(backup_lsn_result);
if let Err(e) = res {
error!("failed to set wal_backup_lsn: {}", e);
return;
}
Ok(()) => {
retry_attempt = 0;
}
Err(e) => {
@@ -354,20 +349,25 @@ impl WalBackupTask {
}
pub async fn backup_lsn_range(
start_lsn: Lsn,
timeline: &Arc<Timeline>,
backup_lsn: &mut Lsn,
end_lsn: Lsn,
wal_seg_size: usize,
timeline_dir: &Path,
workspace_dir: &Path,
) -> Result<Lsn> {
let mut res = start_lsn;
) -> Result<()> {
let start_lsn = *backup_lsn;
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
for s in &segments {
backup_single_segment(s, timeline_dir, workspace_dir)
.await
.with_context(|| format!("offloading segno {}", s.seg_no))?;
res = s.end_lsn;
let new_backup_lsn = s.end_lsn;
timeline
.set_wal_backup_lsn(new_backup_lsn)
.context("setting wal_backup_lsn")?;
*backup_lsn = new_backup_lsn;
}
info!(
"offloaded segnos {:?} up to {}, previous backup_lsn {}",
@@ -375,7 +375,7 @@ pub async fn backup_lsn_range(
end_lsn,
start_lsn,
);
Ok(res)
Ok(())
}
async fn backup_single_segment(

View File

@@ -4,8 +4,9 @@
//!
use anyhow::{Context, Result};
use postgres_backend::QueryError;
use std::{future, thread};
use std::{future, thread, time::Duration};
use tokio::net::TcpStream;
use tokio_io_timeout::TimeoutReader;
use tracing::*;
use utils::measured_stream::MeasuredStream;
@@ -67,41 +68,52 @@ fn handle_socket(
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let local = tokio::task::LocalSet::new();
socket.set_nodelay(true)?;
let peer_addr = socket.peer_addr()?;
let traffic_metrics = TrafficMetrics::new();
if let Some(current_az) = conf.availability_zone.as_deref() {
traffic_metrics.set_sk_az(current_az);
}
// TimeoutReader wants async runtime during creation.
runtime.block_on(async move {
// Set a timeout on reading from the socket. It prevents hung connections
// when the client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
// default, and tokio doesn't provide the ability to set it out of the box.
let mut socket = TimeoutReader::new(socket);
let wal_service_timeout = Duration::from_secs(60 * 10);
socket.set_timeout(Some(wal_service_timeout));
// pin! is here because TimeoutReader (due to storing sleep future inside)
// is not Unpin, and all pgbackend/framed/tokio dependencies require stream
// to be Unpin. Which is reasonable, as indeed something like TimeoutReader
// shouldn't be moved.
tokio::pin!(socket);
let socket = MeasuredStream::new(
socket,
|cnt| {
traffic_metrics.observe_read(cnt);
},
|cnt| {
traffic_metrics.observe_write(cnt);
},
);
let traffic_metrics = TrafficMetrics::new();
if let Some(current_az) = conf.availability_zone.as_deref() {
traffic_metrics.set_sk_az(current_az);
}
let auth_type = match conf.auth {
None => AuthType::Trust,
Some(_) => AuthType::NeonJWT,
};
let mut conn_handler =
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
// libpq protocol between safekeeper and walproposer / pageserver
// We don't use shutdown.
local.block_on(
&runtime,
pgbackend.run(&mut conn_handler, future::pending::<()>),
)?;
let socket = MeasuredStream::new(
socket,
|cnt| {
traffic_metrics.observe_read(cnt);
},
|cnt| {
traffic_metrics.observe_write(cnt);
},
);
Ok(())
let auth_type = match conf.auth {
None => AuthType::Trust,
Some(_) => AuthType::NeonJWT,
};
let mut conn_handler =
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
// libpq protocol between safekeeper and walproposer / pageserver
// We don't use shutdown.
pgbackend
.run(&mut conn_handler, future::pending::<()>)
.await
})
}
/// Unique WAL service connection ids are logged in spans for observability.

scripts/flaky_tests.py Executable file
View File

@@ -0,0 +1,87 @@
#! /usr/bin/env python3
import argparse
import json
import logging
from collections import defaultdict
from typing import DefaultDict, Dict
import psycopg2
import psycopg2.extras
# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
FLAKY_TESTS_QUERY = """
SELECT
DISTINCT parent_suite, suite, test
FROM
(
SELECT
revision,
jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
FROM
regress_test_results
WHERE
reference = 'refs/heads/main'
) data
WHERE
timestamp > CURRENT_DATE - INTERVAL '%s' day
AND status::text IN ('"failed"', '"broken"')
;
"""
def main(args: argparse.Namespace):
connstr = args.connstr
interval_days = args.days
output = args.output
res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
res = defaultdict(lambda: defaultdict(dict))
logging.info("connecting to the database...")
with psycopg2.connect(connstr, connect_timeout=10) as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
logging.info("fetching flaky tests...")
cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
rows = cur.fetchall()
for row in rows:
logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
res[row["parent_suite"]][row["suite"]][row["test"]] = True
logging.info(f"saving results to {output.name}")
json.dump(res, output, indent=2)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days")
parser.add_argument(
"--output",
type=argparse.FileType("w"),
default="flaky.json",
help="path to output json file (default: flaky.json)",
)
parser.add_argument(
"--days",
required=False,
default=10,
type=int,
help="how many days to look back for flaky tests (default: 10)",
)
parser.add_argument(
"connstr",
help="connection string to the test results database",
)
args = parser.parse_args()
level = logging.INFO
logging.basicConfig(
format="%(message)s",
level=level,
)
main(args)

View File

@@ -0,0 +1,125 @@
//
// The script parses Allure reports and posts a comment with a summary of the test results to the PR.
// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated).
//
// The comment is updated on each run with the latest results.
//
// It is designed to be used with actions/github-script from GitHub Workflows:
// - uses: actions/github-script@v6
// with:
// script: |
// const script = require("./scripts/pr-comment-test-report.js")
// await script({
// github,
// context,
// fetch,
// reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties
// })
//
module.exports = async ({ github, context, fetch, reports }) => {
// Marker to find the comment in the subsequent runs
const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
// GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
const githubActionsBotId = 41898282
// The latest commit in the PR URL
const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}`
// Comment body itself
let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n`
// Common parameters for GitHub API requests
const ownerRepoParams = {
owner: context.repo.owner,
repo: context.repo.repo,
}
for (const report of reports) {
const {buildType, reportUrl, jsonUrl} = report
if (!reportUrl || !jsonUrl) {
console.warn(`"reportUrl" or "jsonUrl" aren't set for ${buildType} build`)
continue
}
const suites = await (await fetch(jsonUrl)).json()
// Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
// For this report it's ok to treat them in the same way (as failed).
failedTests = []
passedTests = []
skippedTests = []
retriedTests = []
retriedStatusChangedTests = []
for (const parentSuite of suites.children) {
for (const suite of parentSuite.children) {
for (const test of suite.children) {
pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}`
test.pytestName = pytestName
if (test.status === "passed") {
passedTests.push(test);
} else if (test.status === "failed" || test.status === "broken") {
failedTests.push(test);
} else if (test.status === "skipped") {
skippedTests.push(test);
}
if (test.retriesCount > 0) {
retriedTests.push(test);
if (test.retriedStatusChangedTests) {
retriedStatusChangedTests.push(test);
}
}
}
}
}
const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length
commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n`
if (failedTests.length > 0) {
commentBody += `Failed tests:\n`
for (const test of failedTests) {
const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
commentBody += `- [\`${test.pytestName}\`](${allureLink})`
if (test.retriesCount > 0) {
commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
}
commentBody += "\n"
}
commentBody += "\n"
}
if (retriedStatusChangedTests.length > 0) {
commentBody += `Flaky tests:\n`
for (const test of retriedStatusChangedTests) {
const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
}
commentBody += "\n"
}
commentBody += "___\n"
}
const { data: comments } = await github.rest.issues.listComments({
issue_number: context.payload.number,
...ownerRepoParams,
})
const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker))
if (comment) {
await github.rest.issues.updateComment({
comment_id: comment.id,
body: commentBody,
...ownerRepoParams,
})
} else {
await github.rest.issues.createComment({
issue_number: context.payload.number,
body: commentBody,
...ownerRepoParams,
})
}
}

View File

@@ -4,4 +4,5 @@ pytest_plugins = (
"fixtures.pg_stats",
"fixtures.compare_fixtures",
"fixtures.slow",
"fixtures.flaky",
)

View File

@@ -0,0 +1,58 @@
import json
from pathlib import Path
from typing import List
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from allure_commons.types import LabelType
from allure_pytest.utils import allure_name, allure_suite_labels
from fixtures.log_helper import log
"""
The plugin reruns flaky tests.
It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py`
Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers
"""
def pytest_addoption(parser: Parser):
parser.addoption(
"--flaky-tests-json",
action="store",
type=Path,
help="Path to json file with flaky tests generated by scripts/flaky_tests.py",
)
def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]):
if not config.getoption("--flaky-tests-json"):
return
# Any error while getting the flaky tests isn't critical, so just don't rerun any tests
flaky_json = config.getoption("--flaky-tests-json")
if not flaky_json.exists():
return
content = flaky_json.read_text()
try:
flaky_tests = json.loads(content)
except ValueError:
log.error(f"Can't parse {content} as json")
return
for item in items:
# Use the same logic for constructing the test name as Allure does (we store Allure-provided data in the DB)
# Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100
allure_labels = dict(allure_suite_labels(item))
parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE))
suite = str(allure_labels.get(LabelType.SUITE))
params = item.callspec.params if hasattr(item, "callspec") else {}
name = allure_name(item, params)
if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False):
# Rerun 3 times = 1 original run + 2 reruns
log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
item.add_marker(pytest.mark.flaky(reruns=2))
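A minimal sketch (not part of the changed files) of the JSON shape the plugin above looks up: a nested mapping of Allure parent suite -> suite -> test name -> flag, inferred from the `flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False)` lookup. The file path and test names are hypothetical examples.

import json
from pathlib import Path

# Hypothetical flaky-tests file in the shape the plugin expects:
# parent suite -> suite -> Allure test name (including parametrization) -> bool.
flaky = {
    "test_runner.regress": {
        "test_example_module": {
            "test_example[debug]": True,
        }
    }
}
flaky_json = Path("/tmp/flaky.json")
flaky_json.write_text(json.dumps(flaky))

# A run would then be started with something like:
#   pytest --flaky-tests-json=/tmp/flaky.json ...
# and every matching test gets pytest.mark.flaky(reruns=2) added to it.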

View File

@@ -14,9 +14,9 @@ import tempfile
import textwrap
import time
import uuid
from collections import defaultdict
from contextlib import closing, contextmanager
from dataclasses import dataclass, field
from datetime import datetime
from enum import Flag, auto
from functools import cached_property
from itertools import chain, product
@@ -43,11 +43,12 @@ from psycopg2.extensions import make_dsn, parse_dsn
from typing_extensions import Literal
from fixtures.log_helper import log
from fixtures.metrics import Metrics, parse_metrics
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
Fn,
allure_add_grafana_links,
allure_attach_from_dir,
get_self_dir,
subprocess_capture,
@@ -1118,538 +1119,6 @@ def neon_env_builder(
yield builder
class PageserverApiException(Exception):
def __init__(self, message, status_code: int):
super().__init__(message)
self.status_code = status_code
class PageserverHttpClient(requests.Session):
def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
super().__init__()
self.port = port
self.auth_token = auth_token
self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
if auth_token is not None:
self.headers["Authorization"] = f"Bearer {auth_token}"
def verbose_error(self, res: requests.Response):
try:
res.raise_for_status()
except requests.RequestException as e:
try:
msg = res.json()["msg"]
except: # noqa: E722
msg = ""
raise PageserverApiException(msg, res.status_code) from e
def check_status(self):
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
self.is_testing_enabled_or_skip()
if isinstance(config_strings, tuple):
pairs = [config_strings]
else:
pairs = config_strings
log.info(f"Requesting config failpoints: {repr(pairs)}")
res = self.put(
f"http://localhost:{self.port}/v1/failpoints",
json=[{"name": name, "actions": actions} for name, actions in pairs],
)
log.info(f"Got failpoints request response code {res.status_code}")
self.verbose_error(res)
res_json = res.json()
assert res_json is None
return res_json
def tenant_list(self) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/tenant")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId:
res = self.post(
f"http://localhost:{self.port}/v1/tenant",
json={
"new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
},
)
self.verbose_error(res)
if res.status_code == 409:
raise Exception(f"could not create tenant: already exists for id {new_tenant_id}")
new_tenant_id = res.json()
assert isinstance(new_tenant_id, str)
return TenantId(new_tenant_id)
def tenant_attach(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
self.verbose_error(res)
def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
params = {}
if detach_ignored:
params["detach_ignored"] = "true"
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
self.verbose_error(res)
def tenant_load(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
self.verbose_error(res)
def tenant_ignore(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
self.verbose_error(res)
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
self.verbose_error(res)
return TenantConfig.from_json(res.json())
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
assert "tenant_id" not in config.keys()
res = self.put(
f"http://localhost:{self.port}/v1/tenant/config",
json={**config, "tenant_id": str(tenant_id)},
)
self.verbose_error(res)
def patch_tenant_config_client_side(
self,
tenant_id: TenantId,
inserts: Optional[Dict[str, Any]] = None,
removes: Optional[List[str]] = None,
):
current = self.tenant_config(tenant_id).tenant_specific_overrides
if inserts is not None:
current.update(inserts)
if removes is not None:
for key in removes:
del current[key]
self.set_tenant_config(tenant_id, current)
def tenant_size(self, tenant_id: TenantId) -> int:
return self.tenant_size_and_modelinputs(tenant_id)[0]
def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
"""
Returns the tenant size, together with the model inputs as the second tuple item.
"""
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
self.verbose_error(res)
res = res.json()
assert isinstance(res, dict)
assert TenantId(res["id"]) == tenant_id
size = res["size"]
assert type(size) == int
inputs = res["inputs"]
assert type(inputs) is dict
return (size, inputs)
def tenant_size_debug(self, tenant_id: TenantId) -> str:
"""
Returns the tenant size debug info, as an HTML string
"""
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
headers={"Accept": "text/html"},
)
return res.text
def timeline_list(
self,
tenant_id: TenantId,
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
) -> List[Dict[str, Any]]:
params = {}
if include_non_incremental_logical_size:
params["include-non-incremental-logical-size"] = "true"
if include_timeline_dir_layer_file_size_sum:
params["include-timeline-dir-layer-file-size-sum"] = "true"
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def timeline_create(
self,
tenant_id: TenantId,
new_timeline_id: Optional[TimelineId] = None,
ancestor_timeline_id: Optional[TimelineId] = None,
ancestor_start_lsn: Optional[Lsn] = None,
) -> Dict[Any, Any]:
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline",
json={
"new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
"ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
"ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
},
)
self.verbose_error(res)
if res.status_code == 409:
raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def timeline_detail(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
**kwargs,
) -> Dict[Any, Any]:
params = {}
if include_non_incremental_logical_size:
params["include-non-incremental-logical-size"] = "true"
if include_timeline_dir_layer_file_size_sum:
params["include-timeline-dir-layer-file-size-sum"] = "true"
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
params=params,
**kwargs,
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
)
self.verbose_error(res)
res_json = res.json()
assert res_json is None
def timeline_gc(
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
) -> dict[str, Any]:
self.is_testing_enabled_or_skip()
log.info(
f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
)
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
json={"gc_horizon": gc_horizon},
)
log.info(f"Got GC request response code: {res.status_code}")
self.verbose_error(res)
res_json = res.json()
assert res_json is not None
assert isinstance(res_json, dict)
return res_json
def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
)
log.info(f"Got compact request response code: {res.status_code}")
self.verbose_error(res)
res_json = res.json()
assert res_json is None
def timeline_get_lsn_by_timestamp(
self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
):
log.info(
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
)
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
)
self.verbose_error(res)
res_json = res.json()
return res_json
def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
)
log.info(f"Got checkpoint request response code: {res.status_code}")
self.verbose_error(res)
res_json = res.json()
assert res_json is None
def timeline_spawn_download_remote_layers(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
max_concurrent_downloads: int,
) -> dict[str, Any]:
body = {
"max_concurrent_downloads": max_concurrent_downloads,
}
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
json=body,
)
self.verbose_error(res)
res_json = res.json()
assert res_json is not None
assert isinstance(res_json, dict)
return res_json
def timeline_poll_download_remote_layers_status(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
spawn_response: dict[str, Any],
poll_state=None,
) -> None | dict[str, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
)
self.verbose_error(res)
res_json = res.json()
assert res_json is not None
assert isinstance(res_json, dict)
# assumption in this API client here is that nobody else spawns the task
assert res_json["task_id"] == spawn_response["task_id"]
if poll_state is None or res_json["state"] == poll_state:
return res_json
return None
def timeline_download_remote_layers(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
max_concurrent_downloads: int,
errors_ok=False,
at_least_one_download=True,
):
res = self.timeline_spawn_download_remote_layers(
tenant_id, timeline_id, max_concurrent_downloads
)
while True:
completed = self.timeline_poll_download_remote_layers_status(
tenant_id, timeline_id, res, poll_state="Completed"
)
if not completed:
time.sleep(0.1)
continue
if not errors_ok:
assert completed["failed_download_count"] == 0
if at_least_one_download:
assert completed["successful_download_count"] > 0
return completed
def get_metrics_str(self) -> str:
"""You probably want to use get_metrics() instead."""
res = self.get(f"http://localhost:{self.port}/metrics")
self.verbose_error(res)
return res.text
def get_metrics(self) -> Metrics:
res = self.get_metrics_str()
return parse_metrics(res)
def get_timeline_metric(
self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str
) -> float:
metrics = self.get_metrics()
return metrics.query_one(
metric_name,
filter={
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
},
).value
def get_remote_timeline_client_metric(
self,
metric_name: str,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: str,
op_kind: str,
) -> Optional[float]:
metrics = self.get_metrics()
matches = metrics.query_all(
name=metric_name,
filter={
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
)
if len(matches) == 0:
value = None
elif len(matches) == 1:
value = matches[0].value
assert value is not None
else:
assert len(matches) < 2, "above filter should uniquely identify metric"
return value
def get_metric_value(
self, name: str, filter: Optional[Dict[str, str]] = None
) -> Optional[float]:
metrics = self.get_metrics()
results = metrics.query_all(name, filter=filter)
if not results:
log.info(f'could not find metric "{name}"')
return None
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def layer_map_info(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> LayerMapInfo:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/",
)
self.verbose_error(res)
return LayerMapInfo.from_json(res.json())
def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
)
self.verbose_error(res)
assert res.status_code == 200
def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
)
self.verbose_error(res)
assert res.status_code == 200
def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
info = self.layer_map_info(tenant_id, timeline_id)
for layer in info.historic_layers:
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
def disk_usage_eviction_run(self, request: dict[str, Any]):
res = self.put(
f"http://localhost:{self.port}/v1/disk_usage_eviction/run",
json=request,
)
self.verbose_error(res)
return res.json()
def tenant_break(self, tenant_id: TenantId):
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
self.verbose_error(res)
@dataclass
class TenantConfig:
tenant_specific_overrides: Dict[str, Any]
effective_config: Dict[str, Any]
@classmethod
def from_json(cls, d: Dict[str, Any]) -> TenantConfig:
return TenantConfig(
tenant_specific_overrides=d["tenant_specific_overrides"],
effective_config=d["effective_config"],
)
@dataclass
class LayerMapInfo:
in_memory_layers: List[InMemoryLayerInfo]
historic_layers: List[HistoricLayerInfo]
@classmethod
def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo:
info = LayerMapInfo(in_memory_layers=[], historic_layers=[])
json_in_memory_layers = d["in_memory_layers"]
assert isinstance(json_in_memory_layers, List)
for json_in_memory_layer in json_in_memory_layers:
info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer))
json_historic_layers = d["historic_layers"]
assert isinstance(json_historic_layers, List)
for json_historic_layer in json_historic_layers:
info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer))
return info
def kind_count(self) -> Dict[str, int]:
counts: Dict[str, int] = defaultdict(int)
for inmem_layer in self.in_memory_layers:
counts[inmem_layer.kind] += 1
for hist_layer in self.historic_layers:
counts[hist_layer.kind] += 1
return counts
@dataclass
class InMemoryLayerInfo:
kind: str
lsn_start: str
lsn_end: Optional[str]
@classmethod
def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo:
return InMemoryLayerInfo(
kind=d["kind"],
lsn_start=d["lsn_start"],
lsn_end=d.get("lsn_end"),
)
@dataclass(frozen=True)
class HistoricLayerInfo:
kind: str
layer_file_name: str
layer_file_size: Optional[int]
lsn_start: str
lsn_end: Optional[str]
remote: bool
@classmethod
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
return HistoricLayerInfo(
kind=d["kind"],
layer_file_name=d["layer_file_name"],
layer_file_size=d.get("layer_file_size"),
lsn_start=d["lsn_start"],
lsn_end=d.get("lsn_end"),
remote=d["remote"],
)
@dataclass
class PageserverPort:
pg: int
@@ -2436,10 +1905,16 @@ def remote_pg(
connstr = os.getenv("BENCHMARK_CONNSTR")
if connstr is None:
raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable")
start_ms = int(datetime.utcnow().timestamp() * 1000)
with RemotePostgres(pg_bin, connstr) as remote_pg:
yield remote_pg
end_ms = int(datetime.utcnow().timestamp() * 1000)
host = parse_dsn(connstr).get("host", "")
if host.endswith(".neon.build"):
# Add 10s margin to the start and end times
allure_add_grafana_links(host, start_ms - 10_000, end_ms + 10_000)
class PSQL:
"""
@@ -3378,151 +2853,6 @@ def check_restored_datadir_content(
assert (mismatch, error) == ([], [])
def wait_until(number_of_iterations: int, interval: float, func):
"""
Wait until 'func' returns successfully, without exception. Returns the
last return value from the function.
"""
last_exception = None
for i in range(number_of_iterations):
try:
res = func()
except Exception as e:
log.info("waiting for %s iteration %s failed", func, i + 1)
last_exception = e
time.sleep(interval)
continue
return res
raise Exception("timed out while waiting for %s" % func) from last_exception
def wait_while(number_of_iterations: int, interval: float, func):
"""
Wait until 'func' returns false, or throws an exception.
"""
for i in range(number_of_iterations):
try:
if not func():
return
log.info("waiting for %s iteration %s failed", func, i + 1)
time.sleep(interval)
continue
except Exception:
return
raise Exception("timed out while waiting for %s" % func)
def assert_tenant_status(
pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str
):
tenant_status = pageserver_http_client.tenant_status(tenant)
log.info(f"tenant_status: {tenant_status}")
assert tenant_status["state"] == expected_status, tenant_status
def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId):
tenants = ps_http.tenant_list()
matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
assert len(matching) < 2
if len(matching) == 0:
return None
return matching[0]
def remote_consistent_lsn(
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
if detail["remote_consistent_lsn"] is None:
# No remote information at all. This happens right after creating
# a timeline, before any part of it has been uploaded to remote
# storage yet.
return Lsn(0)
else:
lsn_str = detail["remote_consistent_lsn"]
assert isinstance(lsn_str, str)
return Lsn(lsn_str)
def wait_for_upload(
pageserver_http_client: PageserverHttpClient,
tenant: TenantId,
timeline: TimelineId,
lsn: Lsn,
):
"""waits for local timeline upload up to specified lsn"""
for i in range(20):
current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
if current_lsn >= lsn:
log.info("wait finished")
return
log.info(
"waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
)
)
time.sleep(1)
raise Exception(
"timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
lsn, current_lsn
)
)
# Does not use `wait_until` for debugging purposes
def wait_until_tenant_state(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
expected_state: str,
iterations: int,
) -> bool:
for _ in range(iterations):
try:
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
log.debug(f"Tenant {tenant_id} data: {tenant}")
if tenant["state"] == expected_state:
return True
except Exception as e:
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
time.sleep(1)
raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
def last_record_lsn(
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
lsn_str = detail["last_record_lsn"]
assert isinstance(lsn_str, str)
return Lsn(lsn_str)
def wait_for_last_record_lsn(
pageserver_http_client: PageserverHttpClient,
tenant: TenantId,
timeline: TimelineId,
lsn: Lsn,
) -> Lsn:
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
for i in range(10):
current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline)
if current_lsn >= lsn:
return current_lsn
log.info(
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
)
)
time.sleep(1)
raise Exception(
"timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
)
def wait_for_last_flush_lsn(
env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId
) -> Lsn:
@@ -3584,23 +2914,3 @@ def wait_for_sk_commit_lsn_to_reach_remote_storage(
ps_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
return lsn
def wait_for_upload_queue_empty(
pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
):
ps_http = pageserver.http_client()
while True:
all_metrics = ps_http.get_metrics()
tl = all_metrics.query_all(
"pageserver_remote_timeline_client_calls_unfinished",
{
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
},
)
assert len(tl) > 0
log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
if all(m.value == 0 for m in tl):
return
time.sleep(0.2)

View File

@@ -0,0 +1,545 @@
from __future__ import annotations
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import requests
from fixtures.log_helper import log
from fixtures.metrics import Metrics, parse_metrics
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import Fn
class PageserverApiException(Exception):
def __init__(self, message, status_code: int):
super().__init__(message)
self.status_code = status_code
@dataclass
class InMemoryLayerInfo:
kind: str
lsn_start: str
lsn_end: Optional[str]
@classmethod
def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo:
return InMemoryLayerInfo(
kind=d["kind"],
lsn_start=d["lsn_start"],
lsn_end=d.get("lsn_end"),
)
@dataclass(frozen=True)
class HistoricLayerInfo:
kind: str
layer_file_name: str
layer_file_size: Optional[int]
lsn_start: str
lsn_end: Optional[str]
remote: bool
@classmethod
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
return HistoricLayerInfo(
kind=d["kind"],
layer_file_name=d["layer_file_name"],
layer_file_size=d.get("layer_file_size"),
lsn_start=d["lsn_start"],
lsn_end=d.get("lsn_end"),
remote=d["remote"],
)
@dataclass
class LayerMapInfo:
in_memory_layers: List[InMemoryLayerInfo]
historic_layers: List[HistoricLayerInfo]
@classmethod
def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo:
info = LayerMapInfo(in_memory_layers=[], historic_layers=[])
json_in_memory_layers = d["in_memory_layers"]
assert isinstance(json_in_memory_layers, List)
for json_in_memory_layer in json_in_memory_layers:
info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer))
json_historic_layers = d["historic_layers"]
assert isinstance(json_historic_layers, List)
for json_historic_layer in json_historic_layers:
info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer))
return info
def kind_count(self) -> Dict[str, int]:
counts: Dict[str, int] = defaultdict(int)
for inmem_layer in self.in_memory_layers:
counts[inmem_layer.kind] += 1
for hist_layer in self.historic_layers:
counts[hist_layer.kind] += 1
return counts
@dataclass
class TenantConfig:
tenant_specific_overrides: Dict[str, Any]
effective_config: Dict[str, Any]
@classmethod
def from_json(cls, d: Dict[str, Any]) -> TenantConfig:
return TenantConfig(
tenant_specific_overrides=d["tenant_specific_overrides"],
effective_config=d["effective_config"],
)
class PageserverHttpClient(requests.Session):
def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
super().__init__()
self.port = port
self.auth_token = auth_token
self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
if auth_token is not None:
self.headers["Authorization"] = f"Bearer {auth_token}"
def verbose_error(self, res: requests.Response):
try:
res.raise_for_status()
except requests.RequestException as e:
try:
msg = res.json()["msg"]
except: # noqa: E722
msg = ""
raise PageserverApiException(msg, res.status_code) from e
def check_status(self):
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
self.is_testing_enabled_or_skip()
if isinstance(config_strings, tuple):
pairs = [config_strings]
else:
pairs = config_strings
log.info(f"Requesting config failpoints: {repr(pairs)}")
res = self.put(
f"http://localhost:{self.port}/v1/failpoints",
json=[{"name": name, "actions": actions} for name, actions in pairs],
)
log.info(f"Got failpoints request response code {res.status_code}")
self.verbose_error(res)
res_json = res.json()
assert res_json is None
return res_json
def tenant_list(self) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/tenant")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId:
res = self.post(
f"http://localhost:{self.port}/v1/tenant",
json={
"new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
},
)
self.verbose_error(res)
if res.status_code == 409:
raise Exception(f"could not create tenant: already exists for id {new_tenant_id}")
new_tenant_id = res.json()
assert isinstance(new_tenant_id, str)
return TenantId(new_tenant_id)
def tenant_attach(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
self.verbose_error(res)
def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
params = {}
if detach_ignored:
params["detach_ignored"] = "true"
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
self.verbose_error(res)
def tenant_load(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
self.verbose_error(res)
def tenant_ignore(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
self.verbose_error(res)
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
self.verbose_error(res)
return TenantConfig.from_json(res.json())
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
assert "tenant_id" not in config.keys()
res = self.put(
f"http://localhost:{self.port}/v1/tenant/config",
json={**config, "tenant_id": str(tenant_id)},
)
self.verbose_error(res)
def patch_tenant_config_client_side(
self,
tenant_id: TenantId,
inserts: Optional[Dict[str, Any]] = None,
removes: Optional[List[str]] = None,
):
current = self.tenant_config(tenant_id).tenant_specific_overrides
if inserts is not None:
current.update(inserts)
if removes is not None:
for key in removes:
del current[key]
self.set_tenant_config(tenant_id, current)
def tenant_size(self, tenant_id: TenantId) -> int:
return self.tenant_size_and_modelinputs(tenant_id)[0]
def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
"""
Returns the tenant size, together with the model inputs as the second tuple item.
"""
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
self.verbose_error(res)
res = res.json()
assert isinstance(res, dict)
assert TenantId(res["id"]) == tenant_id
size = res["size"]
assert type(size) == int
inputs = res["inputs"]
assert type(inputs) is dict
return (size, inputs)
def tenant_size_debug(self, tenant_id: TenantId) -> str:
"""
Returns the tenant size debug info, as an HTML string
"""
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
headers={"Accept": "text/html"},
)
return res.text
def timeline_list(
self,
tenant_id: TenantId,
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
) -> List[Dict[str, Any]]:
params = {}
if include_non_incremental_logical_size:
params["include-non-incremental-logical-size"] = "true"
if include_timeline_dir_layer_file_size_sum:
params["include-timeline-dir-layer-file-size-sum"] = "true"
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def timeline_create(
self,
tenant_id: TenantId,
new_timeline_id: Optional[TimelineId] = None,
ancestor_timeline_id: Optional[TimelineId] = None,
ancestor_start_lsn: Optional[Lsn] = None,
) -> Dict[Any, Any]:
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline",
json={
"new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
"ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
"ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
},
)
self.verbose_error(res)
if res.status_code == 409:
raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def timeline_detail(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
**kwargs,
) -> Dict[Any, Any]:
params = {}
if include_non_incremental_logical_size:
params["include-non-incremental-logical-size"] = "true"
if include_timeline_dir_layer_file_size_sum:
params["include-timeline-dir-layer-file-size-sum"] = "true"
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
params=params,
**kwargs,
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
)
self.verbose_error(res)
res_json = res.json()
assert res_json is None
def timeline_gc(
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
) -> dict[str, Any]:
self.is_testing_enabled_or_skip()
log.info(
f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
)
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
json={"gc_horizon": gc_horizon},
)
log.info(f"Got GC request response code: {res.status_code}")
self.verbose_error(res)
res_json = res.json()
assert res_json is not None
assert isinstance(res_json, dict)
return res_json
def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
)
log.info(f"Got compact request response code: {res.status_code}")
self.verbose_error(res)
res_json = res.json()
assert res_json is None
def timeline_get_lsn_by_timestamp(
self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
):
log.info(
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
)
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
)
self.verbose_error(res)
res_json = res.json()
return res_json
def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
)
log.info(f"Got checkpoint request response code: {res.status_code}")
self.verbose_error(res)
res_json = res.json()
assert res_json is None
def timeline_spawn_download_remote_layers(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
max_concurrent_downloads: int,
) -> dict[str, Any]:
body = {
"max_concurrent_downloads": max_concurrent_downloads,
}
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
json=body,
)
self.verbose_error(res)
res_json = res.json()
assert res_json is not None
assert isinstance(res_json, dict)
return res_json
def timeline_poll_download_remote_layers_status(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
spawn_response: dict[str, Any],
poll_state=None,
) -> None | dict[str, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
)
self.verbose_error(res)
res_json = res.json()
assert res_json is not None
assert isinstance(res_json, dict)
# assumption in this API client here is that nobody else spawns the task
assert res_json["task_id"] == spawn_response["task_id"]
if poll_state is None or res_json["state"] == poll_state:
return res_json
return None
def timeline_download_remote_layers(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
max_concurrent_downloads: int,
errors_ok=False,
at_least_one_download=True,
):
res = self.timeline_spawn_download_remote_layers(
tenant_id, timeline_id, max_concurrent_downloads
)
while True:
completed = self.timeline_poll_download_remote_layers_status(
tenant_id, timeline_id, res, poll_state="Completed"
)
if not completed:
time.sleep(0.1)
continue
if not errors_ok:
assert completed["failed_download_count"] == 0
if at_least_one_download:
assert completed["successful_download_count"] > 0
return completed
def get_metrics_str(self) -> str:
"""You probably want to use get_metrics() instead."""
res = self.get(f"http://localhost:{self.port}/metrics")
self.verbose_error(res)
return res.text
def get_metrics(self) -> Metrics:
res = self.get_metrics_str()
return parse_metrics(res)
def get_timeline_metric(
self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str
) -> float:
metrics = self.get_metrics()
return metrics.query_one(
metric_name,
filter={
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
},
).value
def get_remote_timeline_client_metric(
self,
metric_name: str,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: str,
op_kind: str,
) -> Optional[float]:
metrics = self.get_metrics()
matches = metrics.query_all(
name=metric_name,
filter={
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
)
if len(matches) == 0:
value = None
elif len(matches) == 1:
value = matches[0].value
assert value is not None
else:
assert len(matches) < 2, "above filter should uniquely identify metric"
return value
def get_metric_value(
self, name: str, filter: Optional[Dict[str, str]] = None
) -> Optional[float]:
metrics = self.get_metrics()
results = metrics.query_all(name, filter=filter)
if not results:
log.info(f'could not find metric "{name}"')
return None
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def layer_map_info(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> LayerMapInfo:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/",
)
self.verbose_error(res)
return LayerMapInfo.from_json(res.json())
def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
)
self.verbose_error(res)
assert res.status_code == 200
def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
)
self.verbose_error(res)
assert res.status_code == 200
def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
info = self.layer_map_info(tenant_id, timeline_id)
for layer in info.historic_layers:
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
def disk_usage_eviction_run(self, request: dict[str, Any]):
res = self.put(
f"http://localhost:{self.port}/v1/disk_usage_eviction/run",
json=request,
)
self.verbose_error(res)
return res.json()
def tenant_break(self, tenant_id: TenantId):
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
self.verbose_error(res)
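A minimal usage sketch (not part of the changed files) of the client above, assuming a pageserver management API listening on port 9898 with auth disabled; the response fields read from the listed timelines are also assumptions.

from fixtures.pageserver.http import PageserverHttpClient

# Assumed: a running pageserver on port 9898 without an auth token.
client = PageserverHttpClient(port=9898, is_testing_enabled_or_skip=lambda: None)
client.check_status()                   # raises if /v1/status is not OK
tenant_id = client.tenant_create()      # let the pageserver pick the tenant id
print(client.tenant_status(tenant_id))  # e.g. inspect the "state" field
for timeline in client.timeline_list(tenant_id):
    # Assumes each entry carries "timeline_id" and "last_record_lsn" fields.
    print(timeline["timeline_id"], timeline["last_record_lsn"])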

View File

@@ -0,0 +1,145 @@
import time
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import Lsn, TenantId, TimelineId
def assert_tenant_status(
pageserver_http: PageserverHttpClient, tenant: TenantId, expected_status: str
):
tenant_status = pageserver_http.tenant_status(tenant)
log.info(f"tenant_status: {tenant_status}")
assert tenant_status["state"] == expected_status, tenant_status
def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
tenants = pageserver_http.tenant_list()
matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
assert len(matching) < 2
if len(matching) == 0:
return None
return matching[0]
def remote_consistent_lsn(
pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:
detail = pageserver_http.timeline_detail(tenant, timeline)
if detail["remote_consistent_lsn"] is None:
# No remote information at all. This happens right after creating
# a timeline, before any part of it has been uploaded to remote
# storage yet.
return Lsn(0)
else:
lsn_str = detail["remote_consistent_lsn"]
assert isinstance(lsn_str, str)
return Lsn(lsn_str)
def wait_for_upload(
pageserver_http: PageserverHttpClient,
tenant: TenantId,
timeline: TimelineId,
lsn: Lsn,
):
"""waits for local timeline upload up to specified lsn"""
for i in range(20):
current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
if current_lsn >= lsn:
log.info("wait finished")
return
log.info(
"waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
)
)
time.sleep(1)
raise Exception(
"timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
lsn, current_lsn
)
)
def wait_until_tenant_state(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
expected_state: str,
iterations: int,
) -> bool:
"""
Does not use `wait_until` for debugging purposes
"""
for _ in range(iterations):
try:
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
log.debug(f"Tenant {tenant_id} data: {tenant}")
if tenant["state"] == expected_state:
return True
except Exception as e:
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
time.sleep(1)
raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
def wait_until_tenant_active(
pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int = 30
):
wait_until_tenant_state(
pageserver_http, tenant_id, expected_state="Active", iterations=iterations
)
def last_record_lsn(
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
lsn_str = detail["last_record_lsn"]
assert isinstance(lsn_str, str)
return Lsn(lsn_str)
def wait_for_last_record_lsn(
pageserver_http: PageserverHttpClient,
tenant: TenantId,
timeline: TimelineId,
lsn: Lsn,
) -> Lsn:
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
for i in range(10):
current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
if current_lsn >= lsn:
return current_lsn
log.info(
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
)
)
time.sleep(1)
raise Exception(
"timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
)
def wait_for_upload_queue_empty(
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
while True:
all_metrics = pageserver_http.get_metrics()
tl = all_metrics.query_all(
"pageserver_remote_timeline_client_calls_unfinished",
{
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
},
)
assert len(tl) > 0
log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
if all(m.value == 0 for m in tl):
return
time.sleep(0.2)
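A sketch of how these helpers are typically chained by the tests touched later in this PR: wait for ingestion up to an LSN, force a checkpoint, then wait for the upload to remote storage. The helper name below is illustrative, not part of the diff.

from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId


def checkpoint_and_wait_for_upload(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
):
    # Wait for the pageserver to ingest WAL up to `lsn`, force a checkpoint so the
    # data is written to layer files, then wait until remote_consistent_lsn catches
    # up, i.e. the layers have reached remote storage.
    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)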

View File

@@ -1,4 +1,5 @@
import contextlib
import json
import os
import re
import subprocess
@@ -6,8 +7,9 @@ import tarfile
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple, TypeVar
from urllib.parse import urlencode
import allure # type: ignore
import allure
from psycopg2.extensions import cursor
from fixtures.log_helper import log
@@ -184,6 +186,46 @@ def allure_attach_from_dir(dir: Path):
allure.attach.file(source, name, attachment_type, extension)
DATASOURCE_ID = "xHHYY0dVz"
def allure_add_grafana_links(host: str, start_ms: int, end_ms: int):
"""Add links to server logs in Grafana to Allure report"""
# We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build
endpoint_id, region_id, _ = host.split(".", 2)
expressions = {
"compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}',
"k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"',
"console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"',
"proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}',
}
params: Dict[str, Any] = {
"datasource": DATASOURCE_ID,
"queries": [
{
"expr": "<PUT AN EXPRESSION HERE>",
"refId": "A",
"datasource": {"type": "loki", "uid": DATASOURCE_ID},
"editorMode": "code",
"queryType": "range",
}
],
"range": {
"from": str(start_ms),
"to": str(end_ms),
},
}
for name, expr in expressions.items():
params["queries"][0]["expr"] = expr
query_string = urlencode({"orgId": 1, "left": json.dumps(params)})
link = f"https://neonprod.grafana.net/explore?{query_string}"
allure.dynamic.link(link, name=name)
log.info(f"{name}: {link}")
def start_in_background(
command: list[str], cwd: Path, log_file_name: str, is_started: Fn
) -> subprocess.Popen[bytes]:
@@ -236,3 +278,19 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
continue
return res
raise Exception("timed out while waiting for %s" % func) from last_exception
def wait_while(number_of_iterations: int, interval: float, func):
"""
Wait until 'func' returns false, or throws an exception.
"""
for i in range(number_of_iterations):
try:
if not func():
return
log.info("waiting for %s iteration %s failed", func, i + 1)
time.sleep(interval)
continue
except Exception:
return
raise Exception("timed out while waiting for %s" % func)

View File

@@ -10,7 +10,7 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import NeonCompare
from fixtures.log_helper import log
from fixtures.neon_fixtures import wait_for_last_record_lsn
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn

View File

@@ -1,7 +1,8 @@
from contextlib import closing
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PgProtocol
from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
from fixtures.pageserver.http import PageserverApiException
from fixtures.types import TenantId

View File

@@ -10,12 +10,11 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonCli,
NeonEnvBuilder,
PageserverHttpClient,
PgBin,
PortDistributor,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn
from pytest import FixtureRequest
@@ -34,9 +33,15 @@ from pytest import FixtureRequest
# - check_neon_works performs the test itself, feel free to add more checks there.
#
check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
os.environ.get("CHECK_ONDISK_DATA_COMPATIBILITY") is None,
reason="CHECK_ONDISK_DATA_COMPATIBILITY env is not set",
)
# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
@@ -81,6 +86,7 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o
# Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
def test_backward_compatibility(
@@ -134,6 +140,7 @@ def test_backward_compatibility(
), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
def test_forward_compatibility(

View File

@@ -11,14 +11,14 @@ from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnv,
NeonEnvBuilder,
PageserverHttpClient,
PgBin,
RemoteStorageKind,
wait_for_last_flush_lsn,
wait_for_upload_queue_empty,
wait_until,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
@@ -138,7 +138,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
# remove the initial tenant
## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865
assert env.initial_timeline
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, env.initial_timeline)
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, env.initial_timeline)
pageserver_http.tenant_detach(env.initial_tenant)
assert isinstance(env.remote_storage, LocalFsStorage)
tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
@@ -182,7 +182,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
# after stopping the safekeepers, we know that no new WAL will be coming in
for tenant_id, timeline_id in timelines:
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id)
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]

View File

@@ -13,9 +13,8 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
Postgres,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import subprocess_capture

View File

@@ -6,10 +6,9 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
RemoteStorageKind,
wait_for_last_flush_lsn,
wait_for_last_record_lsn,
wait_for_sk_commit_lsn_to_reach_remote_storage,
wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar

View File

@@ -5,8 +5,8 @@ from fixtures.neon_fixtures import (
DEFAULT_BRANCH_NAME,
NeonEnv,
NeonEnvBuilder,
PageserverHttpClient,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import TenantId, TimelineId

View File

@@ -1,6 +1,7 @@
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PageserverHttpClient
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.pageserver.http import PageserverHttpClient
def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):

View File

@@ -10,20 +10,20 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverApiException,
PageserverHttpClient,
RemoteStorageKind,
assert_tenant_status,
available_remote_storages,
wait_for_last_flush_lsn,
wait_for_last_record_lsn,
wait_for_sk_commit_lsn_to_reach_remote_storage,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
assert_tenant_status,
wait_for_last_record_lsn,
wait_for_upload,
wait_until,
wait_until_tenant_state,
)
from fixtures.types import Lsn
from fixtures.utils import query_scalar
from fixtures.utils import query_scalar, wait_until
def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id):

View File

@@ -6,8 +6,8 @@ from fixtures.neon_fixtures import (
DEFAULT_BRANCH_NAME,
NeonEnv,
NeonEnvBuilder,
PageserverHttpClient,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until

View File

@@ -1,6 +1,7 @@
from contextlib import closing
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar

View File

@@ -1,6 +1,7 @@
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn
from fixtures.neon_fixtures import NeonEnv
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn
from fixtures.utils import query_scalar

View File

@@ -13,13 +13,15 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnvBuilder,
PageserverApiException,
PageserverHttpClient,
RemoteStorageKind,
available_remote_storages,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
wait_until_tenant_active,
wait_until_tenant_state,
)
from fixtures.types import Lsn, TenantId, TimelineId
@@ -172,15 +174,10 @@ def test_remote_storage_backup_and_restore(
client.tenant_attach(tenant_id)
log.info("waiting for tenant to become active. this should be quick with on-demand download")
def tenant_active():
all_states = client.tenant_list()
[tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
assert tenant["state"] == "Active"
wait_until(
number_of_iterations=5,
interval=1,
func=tenant_active,
wait_until_tenant_active(
pageserver_http=client,
tenant_id=tenant_id,
iterations=5,
)
detail = client.timeline_detail(tenant_id, timeline_id)
@@ -357,12 +354,7 @@ def test_remote_storage_upload_queue_retries(
client.tenant_attach(tenant_id)
def tenant_active():
all_states = client.tenant_list()
[tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
assert tenant["state"] == "Active"
wait_until(30, 1, tenant_active)
wait_until_tenant_active(client, tenant_id)
log.info("restarting postgres to validate")
pg = env.postgres.create_start("main", tenant_id=tenant_id)
@@ -497,12 +489,7 @@ def test_remote_timeline_client_calls_started_metric(
client.tenant_attach(tenant_id)
def tenant_active():
all_states = client.tenant_list()
[tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
assert tenant["state"] == "Active"
wait_until(30, 1, tenant_active)
wait_until_tenant_active(client, tenant_id)
log.info("restarting postgres to validate")
pg = env.postgres.create_start("main", tenant_id=tenant_id)

View File

@@ -6,9 +6,8 @@ from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnvBuilder,
RemoteStorageKind,
assert_tenant_status,
wait_for_upload,
)
from fixtures.pageserver.utils import assert_tenant_status, wait_for_upload
from fixtures.types import Lsn
from fixtures.utils import wait_until

View File

@@ -9,18 +9,18 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PageserverApiException,
PageserverHttpClient,
Postgres,
RemoteStorageKind,
available_remote_storages,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
wait_until,
wait_until_tenant_state,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar
from fixtures.utils import query_scalar, wait_until
def do_gc_target(

View File

@@ -10,18 +10,24 @@ from fixtures.neon_fixtures import (
NeonBroker,
NeonEnv,
NeonEnvBuilder,
PageserverHttpClient,
PortDistributor,
Postgres,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
assert_tenant_status,
tenant_exists,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
query_scalar,
start_in_background,
subprocess_capture,
wait_until,
wait_while,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, start_in_background, subprocess_capture
def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):

View File

@@ -6,11 +6,11 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PageserverHttpClient,
Postgres,
wait_for_last_flush_lsn,
wait_for_wal_insert_lsn,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import Lsn, TenantId, TimelineId

View File

@@ -20,10 +20,12 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
Postgres,
RemoteStorageKind,
assert_tenant_status,
available_remote_storages,
wait_for_last_record_lsn,
wait_for_sk_commit_lsn_to_reach_remote_storage,
)
from fixtures.pageserver.utils import (
assert_tenant_status,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.types import Lsn, TenantId, TimelineId

View File

@@ -1,5 +1,6 @@
import pytest
from fixtures.neon_fixtures import NeonEnv, PageserverApiException
from fixtures.neon_fixtures import NeonEnv
from fixtures.pageserver.http import PageserverApiException
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until

View File

@@ -14,20 +14,21 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PageserverApiException,
PageserverHttpClient,
PgBin,
PortDistributor,
Postgres,
RemoteStorageKind,
VanillaPostgres,
assert_tenant_status,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
assert_tenant_status,
wait_for_upload_queue_empty,
wait_until,
wait_until_tenant_active,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import get_timeline_dir_size
from fixtures.utils import get_timeline_dir_size, wait_until
def test_timeline_size(neon_simple_env: NeonEnv):
@@ -246,12 +247,7 @@ def test_timeline_initial_logical_size_calculation_cancellation(
extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
)
def tenant_active():
all_states = client.tenant_list()
[tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
assert tenant["state"] == "Active"
wait_until(30, 1, tenant_active)
wait_until_tenant_active(client, tenant_id)
# kick off initial size calculation task (the response we get here is the estimated size)
def assert_size_calculation_not_done():
@@ -425,7 +421,7 @@ def test_timeline_physical_size_post_compaction(
pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
if remote_storage_kind is not None:
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id)
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
@@ -478,7 +474,7 @@ def test_timeline_physical_size_post_gc(
pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
if remote_storage_kind is not None:
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id)
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
@@ -584,7 +580,7 @@ def test_tenant_physical_size(
tenant, timeline = env.neon_cli.create_tenant()
if remote_storage_kind is not None:
wait_for_upload_queue_empty(env.pageserver, tenant, timeline)
wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
def get_timeline_resident_physical_size(timeline: TimelineId):
sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
@@ -609,7 +605,7 @@ def test_tenant_physical_size(
pageserver_http.timeline_checkpoint(tenant, timeline)
if remote_storage_kind is not None:
wait_for_upload_queue_empty(env.pageserver, tenant, timeline)
wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
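The call-site changes above switch wait_for_upload_queue_empty from taking the NeonPageserver object (env.pageserver) to taking the PageserverHttpClient directly. A hedged usage sketch under that reading; obtaining the client via env.pageserver.http_client() is the usual fixture pattern but is an assumption here, not something shown in this hunk:

# Illustrative only: the helper now receives the HTTP client, not env.pageserver.
from fixtures.pageserver.utils import wait_for_upload_queue_empty

pageserver_http = env.pageserver.http_client()  # assumed setup, as in other tests
pageserver_http.timeline_checkpoint(tenant, timeline)
if remote_storage_kind is not None:
    # old signature: wait_for_upload_queue_empty(env.pageserver, tenant, timeline)
    wait_for_upload_queue_empty(pageserver_http, tenant, timeline)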

View File

@@ -30,9 +30,8 @@ from fixtures.neon_fixtures import (
SafekeeperHttpClient,
SafekeeperPort,
available_remote_storages,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import get_dir_size, query_scalar, start_in_background

View File

@@ -3,7 +3,8 @@ import time
import psutil
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.http import PageserverApiException
from fixtures.types import TenantId
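The final hunks relocate PageserverApiException in the same way, so it now lives next to the client that raises it. A hedged sketch of a call site after the move; only the import path comes from these hunks, while the test body, the .generate() helpers, and http_client() are assumptions for illustration:

import pytest
from fixtures.neon_fixtures import NeonEnv
from fixtures.pageserver.http import PageserverApiException
from fixtures.types import TenantId, TimelineId


def test_unknown_timeline_detail_raises(neon_simple_env: NeonEnv):
    # Hypothetical test: a detail request for a tenant/timeline that was never
    # created is expected to surface as PageserverApiException from the client.
    client = neon_simple_env.pageserver.http_client()
    with pytest.raises(PageserverApiException):
        client.timeline_detail(TenantId.generate(), TimelineId.generate())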