From c6172dae4784580df58e5424a99813d4f74859a7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 22 Sep 2021 18:29:35 +0300 Subject: [PATCH] implement performance tests against our staging environment Tests are based on a self-hosted runner which is physically close to our staging deployment in AWS; currently the tests consist of various configurations of pgbench runs. These changes also rework the benchmark fixture by removing globals, allowing reports with the desired metrics to be collected and dumped to JSON for further analysis. This also applies to the usual performance tests which use local zenith binaries. --- .circleci/config.yml | 2 +- .github/workflows/benchmarking.yml | 158 +++++++++ scripts/generate_perf_report_page.py | 218 ++++++++++++ scripts/perf_report_template.html | 52 +++ test_runner/Pipfile | 3 +- test_runner/Pipfile.lock | 150 ++++++-- test_runner/fixtures/benchmark_fixture.py | 320 +++++++++++++----- test_runner/fixtures/zenith_fixtures.py | 60 +++- test_runner/performance/__init__.py | 0 test_runner/performance/conftest.py | 8 + test_runner/performance/test_bulk_insert.py | 14 +- .../performance/test_bulk_tenant_create.py | 6 +- test_runner/performance/test_gist_build.py | 11 +- test_runner/performance/test_perf_pgbench.py | 12 +- .../performance/test_perf_pgbench_remote.py | 125 +++++++ .../performance/test_write_amplification.py | 6 +- test_runner/pytest.ini | 4 + 17 files changed, 1017 insertions(+), 132 deletions(-) create mode 100644 .github/workflows/benchmarking.yml create mode 100755 scripts/generate_perf_report_page.py create mode 100644 scripts/perf_report_template.html create mode 100644 test_runner/performance/__init__.py create mode 100644 test_runner/performance/conftest.py create mode 100644 test_runner/performance/test_perf_pgbench_remote.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 9e4b9ea4ba..1ba8a62d4d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -267,7 +267,7 @@ jobs: # -n4 uses four processes to run tests via pytest-xdist # -s is not used to prevent pytest from capturing output, because tests are running # in parallel and logs are mixed between different tests - pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS + pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -m "not remote_cluster" -rA $TEST_SELECTION $EXTRA_PARAMS - run: # CircleCI artifacts are preserved one file at a time, so skipping # this step isn't a good idea. 
If you want to extract the diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml new file mode 100644 index 0000000000..c87a22afc1 --- /dev/null +++ b/.github/workflows/benchmarking.yml @@ -0,0 +1,158 @@ +name: benchmarking + +on: + # uncomment to run on push for debugging your PR + # push: + # branches: [ mybranch ] + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '36 7 * * *' # run once a day, timezone is UTC + +env: + BASE_URL: "https://console.zenith.tech" + +jobs: + bench: + # this workflow runs on a self-hosted runner + # its environment is quite different from the usual github runner + # probably the most important difference is that it doesn't start from a clean workspace each time + # e.g. if you install system packages they are not cleaned up, since you install them directly on the host machine + # not in a container or something + # See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners + runs-on: [self-hosted, zenith-benchmarker] + + steps: + - name: Checkout zenith repo + uses: actions/checkout@v2 + + - name: Checkout zenith-perf-data repo + uses: actions/checkout@v2 + with: + repository: zenithdb/zenith-perf-data + token: ${{ secrets.VIP_VAP_ACCESS_TOKEN }} + ref: testing # TODO replace with master once everything is ready + path: zenith-perf-data + + # actions/setup-python@v2 is not working correctly on self-hosted runners + # see https://github.com/actions/setup-python/issues/162 + # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular + # so the simplest solution is to use the already installed system python and spin up virtualenvs for job runs. + # Python 3.7.10 is already installed on the machine, so use it to install pipenv and then use pipenv's virtualenvs + - name: Install pipenv & deps + run: | + python3 -m pip install --upgrade pipenv wheel + # since pip/pipenv caches are reused there shouldn't be any trouble with installing every time + pipenv install + + - name: Show versions + run: | + echo Python + python3 --version + pipenv run python3 --version + echo Pipenv + pipenv --version + echo Pgbench + pgbench --version + + # FIXME cluster setup is skipped due to various changes in the console API + # for now a pre-created cluster is used. When the API gains some stability + # after the massive changes, dynamic cluster setup will be revived. + # So use the pre-created cluster. 
It needs to be started manually, but stop is automatic after 5 minutes of inactivity + - name: Setup cluster + env: + BENCHMARK_CONSOLE_USER_PASSWORD: "${{ secrets.BENCHMARK_CONSOLE_USER_PASSWORD }}" + BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}" + # USERNAME: "benchmark" + shell: bash + run: | + set -e + # echo "Creating cluster" + + # CLUSTER=$(curl -s --fail --show-error $BASE_URL/api/v1/clusters.json \ + # -H 'Content-Type: application/json; charset=utf-8' \ + # -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN" \ + # --data-binary @- << EOF + # { + # "cluster": { + # "name": "default_cluster", + # "region_id": "2", + # "instance_type_id": 7, + # "settings": {} + # }, + # "database": {"name": "benchmark"}, + # "role": {"name": "$USERNAME", "password": "$BENCHMARK_CONSOLE_USER_PASSWORD"} + # } + # EOF + # ) + + # echo "Created cluster" + + echo "Starting cluster" + CLUSTER_ID=285 + CLUSTER=$(curl -s --fail --show-error -X POST $BASE_URL/api/v1/clusters/$CLUSTER_ID/start \ + -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN") + echo $CLUSTER | python -m json.tool + + echo "Waiting for cluster to become ready" + sleep 10 + + # # note that jq is installed on host system + # CLUSTER_ID=$(echo $CLUSTER| jq ".id") + echo "CLUSTER_ID=$CLUSTER_ID" >> $GITHUB_ENV + # echo "Constructing connstr" + # CLUSTER=$(curl -s --fail --show-error -X GET $BASE_URL/api/v1/clusters/$CLUSTER_ID.json \ + # -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN") + + # echo $CLUSTER | python -m json.tool + # CONNSTR=$(echo $CLUSTER | jq -r ".| \"postgresql://$USERNAME:$BENCHMARK_CONSOLE_USER_PASSWORD@\(.public_ip_address):\(.public_pg_port)/benchmark\"") + # echo "BENCHMARK_CONNSTR=$CONNSTR" >> $GITHUB_ENV + + - name: Run benchmark + # pgbench is installed system wide from official repo + # https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/ + # via + # sudo tee /etc/yum.repos.d/pgdg.repo< Tuple[List[Tuple[str, str]], List[str]]: + value_columns = [] + common_columns = [] + for item in values: + if item['name'] in KEY_EXCLUDE_FIELDS: + continue + if item['report'] != 'test_param': + value_columns.append(cast(str, item['name'])) + else: + common_columns.append( + (cast(str, item['name']), cast(str, item['value']))) + value_columns.sort() + common_columns.sort(key=lambda x: x[0]) # sort by name + return common_columns, value_columns + + +def format_ratio(ratio: float, report: str) -> Tuple[str, str]: + color = '' + sign = '+' if ratio > 0 else '' + if abs(ratio) < 0.05: + return f' ({sign}{ratio:.2f})', color + + if report not in {'test_param', 'higher_is_better', 'lower_is_better'}: + raise ValueError(f'Unknown report type: {report}') + + if report == 'test_param': + return f'{ratio:.2f}', color + + if ratio > 0: + if report == 'higher_is_better': + color = POSITIVE_COLOR + elif report == 'lower_is_better': + color = NEGATIVE_COLOR + elif ratio < 0: + if report == 'higher_is_better': + color = NEGATIVE_COLOR + elif report == 'lower_is_better': + color = POSITIVE_COLOR + + return f' ({sign}{ratio:.2f})', color + + +def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: + for item in suit_run.values['data']: + if item['name'] == name: + return item + return None + + +def get_row_values(columns: List[str], run_result: SuitRun, + prev_result: Optional[SuitRun]) -> List[RowValue]: + row_values = [] + for column in columns: + current_value = extract_value(column, run_result) + if current_value is None: + # should never happen 
+ raise ValueError(f'{column} not found in {run_result.values}') + + value = current_value["value"] + if isinstance(value, float): + value = f'{value:.2f}' + + if prev_result is None: + row_values.append(RowValue(value, '', '')) + continue + + prev_value = extract_value(column, prev_result) + if prev_value is None: + # this might happen when new metric is added and there is no value for it in previous run + # let this be here, TODO add proper handling when this actually happens + raise ValueError(f'{column} not found in previous result') + ratio = float(value) / float(prev_value['value']) - 1 + ratio_display, color = format_ratio(ratio, + current_value['report']) + row_values.append(RowValue(value, color, ratio_display)) + return row_values + + +@dataclass +class SuiteRunTableRow: + revision: str + values: List[RowValue] + + +def prepare_rows_from_runs(value_columns: List[str], + runs: List[SuitRun]) -> List[SuiteRunTableRow]: + rows = [] + prev_run = None + for run in runs: + rows.append( + SuiteRunTableRow(revision=run.revision, + values=get_row_values(value_columns, run, + prev_run))) + prev_run = run + + return rows + + +def main(args: argparse.Namespace) -> None: + input_dir = Path(args.input_dir) + grouped_runs = {} + # we have files in form: _.json + # fill them in the hashmap so we have grouped items for the + # same run configuration (scale, duration etc.) ordered by counter. + for item in sorted(input_dir.iterdir(), + key=lambda x: int(x.name.split('_')[0])): + run_data = json.loads(item.read_text()) + revision = run_data['revision'] + + for suit_result in run_data['result']: + key = "{}{}".format(run_data['platform'], suit_result['suit']) + # pack total duration as a synthetic value + total_duration = suit_result['total_duration'] + suit_result['data'].append({ + 'name': 'total_duration', + 'value': total_duration, + 'unit': 's', + 'report': 'lower_is_better', + }) + common_columns, value_columns = get_columns(suit_result['data']) + + grouped_runs.setdefault( + key, + SuitRuns( + platform=run_data['platform'], + suit=suit_result['suit'], + common_columns=common_columns, + value_columns=value_columns, + runs=[], + ), + ) + + grouped_runs[key].runs.append( + SuitRun(revision=revision, values=suit_result)) + context = {} + for result in grouped_runs.values(): + suit = result.suit + context[suit] = { + 'common_columns': + result.common_columns, + 'value_columns': + result.value_columns, + 'platform': + result.platform, + # reverse the order so newest results are on top of the table + 'rows': + reversed(prepare_rows_from_runs(result.value_columns, + result.runs)), + } + + template = Template((Path(__file__).parent / 'perf_report_template.html').read_text()) + + Path(args.out).write_text(template.render(context=context)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input-dir', + dest='input_dir', + required=True, + help='Directory with jsons generated by the test suite', + ) + parser.add_argument('--out', required=True, help='Output html file path') + args = parser.parse_args() + main(args) diff --git a/scripts/perf_report_template.html b/scripts/perf_report_template.html new file mode 100644 index 0000000000..2847e75a00 --- /dev/null +++ b/scripts/perf_report_template.html @@ -0,0 +1,52 @@ + + + + + + +

Zenith Performance Tests

+ + {% for suit_name, suit_data in context.items() %} +

Runs for {{ suit_name }}

+ platform: {{ suit_data.platform }}
+ {% for common_column_name, common_column_value in suit_data.common_columns %} + {{ common_column_name }}: {{ common_column_value }}
+ {% endfor %} +
+ + + + + {% for column_name in suit_data.value_columns %} + + {% endfor %} + + {% for row in suit_data.rows %} + + + {% for column_value in row.values %} + + {% endfor %} + + {% endfor %} +
revision{{ column_name }}
{{ row.revision[:6] }}{{ column_value.value }}{{column_value.ratio}}
+ {% endfor %} + + + + diff --git a/test_runner/Pipfile b/test_runner/Pipfile index e179553dbd..572ff82ec8 100644 --- a/test_runner/Pipfile +++ b/test_runner/Pipfile @@ -5,13 +5,14 @@ name = "pypi" [packages] pytest = ">=6.0.0" -psycopg2 = "*" typing-extensions = "*" pyjwt = {extras = ["crypto"], version = "*"} requests = "*" pytest-xdist = "*" asyncpg = "*" cached-property = "*" +psycopg2-binary = "*" +jinja2 = "*" [dev-packages] # Behavior may change slightly between versions. These are run continuously, diff --git a/test_runner/Pipfile.lock b/test_runner/Pipfile.lock index 185b91189b..1524bbe584 100644 --- a/test_runner/Pipfile.lock +++ b/test_runner/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "63b72760ef37375186a638066ba0ad5804dbace99ddc503ea654e9749070ab24" + "sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b" }, "pipfile-spec": 6, "requires": { @@ -177,13 +177,96 @@ ], "version": "==1.1.1" }, - "packaging": { + "jinja2": { "hashes": [ - "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7", - "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14" + "sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45", + "sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c" + ], + "index": "pypi", + "version": "==3.0.2" + }, + "markupsafe": { + "hashes": [ + "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298", + "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64", + "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b", + "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194", + "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567", + "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff", + "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724", + "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74", + "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646", + "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35", + "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6", + "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a", + "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6", + "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad", + "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26", + "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38", + "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac", + "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7", + "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6", + "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047", + "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75", + "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f", + "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b", + "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135", + "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8", + "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a", + "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a", + "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1", + 
"sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9", + "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864", + "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914", + "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee", + "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f", + "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18", + "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8", + "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2", + "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d", + "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b", + "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b", + "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86", + "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6", + "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f", + "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb", + "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833", + "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28", + "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e", + "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415", + "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902", + "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f", + "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d", + "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9", + "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d", + "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145", + "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066", + "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c", + "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1", + "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a", + "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207", + "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f", + "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53", + "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd", + "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134", + "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85", + "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9", + "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5", + "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94", + "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509", + "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51", + "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872" ], "markers": "python_version >= '3.6'", - "version": "==21.0" + "version": "==2.0.1" + }, + "packaging": { + "hashes": [ + "sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966", + "sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0" + ], + "markers": "python_version >= '3.6'", + "version": "==21.2" }, "pluggy": { "hashes": [ @@ -193,17 +276,42 @@ 
"markers": "python_version >= '3.6'", "version": "==1.0.0" }, - "psycopg2": { + "psycopg2-binary": { "hashes": [ - "sha256:079d97fc22de90da1d370c90583659a9f9a6ee4007355f5825e5f1c70dffc1fa", - "sha256:2087013c159a73e09713294a44d0c8008204d06326006b7f652bef5ace66eebb", - "sha256:2c992196719fadda59f72d44603ee1a2fdcc67de097eea38d41c7ad9ad246e62", - "sha256:7640e1e4d72444ef012e275e7b53204d7fab341fb22bc76057ede22fe6860b25", - "sha256:7f91312f065df517187134cce8e395ab37f5b601a42446bdc0f0d51773621854", - "sha256:830c8e8dddab6b6716a4bf73a09910c7954a92f40cf1d1e702fb93c8a919cc56", - "sha256:89409d369f4882c47f7ea20c42c5046879ce22c1e4ea20ef3b00a4dfc0a7f188", - "sha256:bf35a25f1aaa8a3781195595577fcbb59934856ee46b4f252f56ad12b8043bcf", - "sha256:de5303a6f1d0a7a34b9d40e4d3bef684ccc44a49bbe3eb85e3c0bffb4a131b7c" + "sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975", + "sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd", + "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616", + "sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2", + "sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90", + "sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a", + "sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e", + "sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d", + "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f", + "sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed", + "sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a", + "sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140", + "sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32", + "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759", + "sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31", + "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e", + "sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a", + "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c", + "sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917", + "sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf", + "sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7", + "sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0", + "sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72", + "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698", + "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773", + "sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68", + "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76", + "sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4", + "sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f", + "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a", + "sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34", + "sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce", + "sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a", + "sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e" ], "index": "pypi", "version": "==2.9.1" @@ -334,11 +442,11 @@ }, "filelock": { "hashes": [ - 
"sha256:2b5eb3589e7fdda14599e7eb1a50e09b4cc14f34ed98b8ba56d33bfaafcbef2f", - "sha256:34a9f35f95c441e7b38209775d6e0337f9a3759f3565f6c5798f19618527c76f" + "sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8", + "sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b" ], "markers": "python_version >= '3.6'", - "version": "==3.3.1" + "version": "==3.3.2" }, "flake8": { "hashes": [ @@ -510,11 +618,11 @@ }, "virtualenv": { "hashes": [ - "sha256:10062e34c204b5e4ec5f62e6ef2473f8ba76513a9a617e873f1f8fb4a519d300", - "sha256:bcc17f0b3a29670dd777d6f0755a4c04f28815395bca279cdcb213b97199a6b8" + "sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814", + "sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==20.8.1" + "version": "==20.10.0" }, "virtualenv-clone": { "hashes": [ diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 8f5deef690..a83cbc95dd 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -1,29 +1,22 @@ +import dataclasses +import json import os +from pathlib import Path import re +import subprocess import timeit -import pathlib -import uuid -import psycopg2 +import calendar +import enum +from datetime import datetime import pytest from _pytest.config import Config -from _pytest.runner import CallInfo from _pytest.terminal import TerminalReporter -import shutil -import signal -import subprocess -import time +import warnings from contextlib import contextmanager -from contextlib import closing -from pathlib import Path -from dataclasses import dataclass # Type-related stuff -from psycopg2.extensions import connection as PgConnection -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast -from typing_extensions import Literal - -from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture) +from typing import Iterator, Optional """ This file contains fixtures for micro-benchmarks. @@ -39,7 +32,7 @@ def test_mybench(zenith_simple_env: env, zenbenchmark): # Initialize the test ... - + # Run the test, timing how long it takes with zenbenchmark.record_duration('test_query'): cur.execute('SELECT test_query(...)') @@ -55,36 +48,91 @@ in the test initialization, or measure disk usage after the test query. """ -# TODO: It would perhaps be better to store the results as additional -# properties in the pytest TestReport objects, to make them visible to -# other pytest tools. -class ZenithBenchmarkResults: - """ An object for recording benchmark results. """ - def __init__(self): - self.results = [] +@dataclasses.dataclass +class PgBenchRunResult: + scale: int + number_of_clients: int + number_of_threads: int + number_of_transactions_actually_processed: int + latency_average: float + latency_stddev: float + tps_including_connection_time: float + tps_excluding_connection_time: float + init_duration: float + init_start_timestamp: int + init_end_timestamp: int + run_duration: float + run_start_timestamp: int + run_end_timestamp: int - def record(self, test_name: str, metric_name: str, metric_value: float, unit: str): - """ - Record a benchmark result. 
- """ + # TODO progress - self.results.append((test_name, metric_name, metric_value, unit)) + @classmethod + def parse_from_output( + cls, + out: 'subprocess.CompletedProcess[str]', + init_duration: float, + init_start_timestamp: int, + init_end_timestamp: int, + run_duration: float, + run_start_timestamp: int, + run_end_timestamp: int, + ): + stdout_lines = out.stdout.splitlines() + # we know significant parts of these values from test input + # but to be precise take them from output + # scaling factor: 5 + assert "scaling factor" in stdout_lines[1] + scale = int(stdout_lines[1].split()[-1]) + # number of clients: 1 + assert "number of clients" in stdout_lines[3] + number_of_clients = int(stdout_lines[3].split()[-1]) + # number of threads: 1 + assert "number of threads" in stdout_lines[4] + number_of_threads = int(stdout_lines[4].split()[-1]) + # number of transactions actually processed: 1000/1000 + assert "number of transactions actually processed" in stdout_lines[6] + number_of_transactions_actually_processed = int(stdout_lines[6].split("/")[1]) + # latency average = 19.894 ms + assert "latency average" in stdout_lines[7] + latency_average = stdout_lines[7].split()[-2] + # latency stddev = 3.387 ms + assert "latency stddev" in stdout_lines[8] + latency_stddev = stdout_lines[8].split()[-2] + # tps = 50.219689 (including connections establishing) + assert "(including connections establishing)" in stdout_lines[9] + tps_including_connection_time = stdout_lines[9].split()[2] + # tps = 50.264435 (excluding connections establishing) + assert "(excluding connections establishing)" in stdout_lines[10] + tps_excluding_connection_time = stdout_lines[10].split()[2] + + return cls( + scale=scale, + number_of_clients=number_of_clients, + number_of_threads=number_of_threads, + number_of_transactions_actually_processed=number_of_transactions_actually_processed, + latency_average=float(latency_average), + latency_stddev=float(latency_stddev), + tps_including_connection_time=float(tps_including_connection_time), + tps_excluding_connection_time=float(tps_excluding_connection_time), + init_duration=init_duration, + init_start_timestamp=init_start_timestamp, + init_end_timestamp=init_end_timestamp, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) -# Will be recreated in each session. -zenbenchmark_results: ZenithBenchmarkResults = ZenithBenchmarkResults() - - -# Session scope fixture that initializes the results object -@pytest.fixture(autouse=True, scope='session') -def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]: - """ - This is a python decorator for benchmark fixtures - """ - global zenbenchmark_results - zenbenchmark_results = ZenithBenchmarkResults() - - yield zenbenchmark_results +@enum.unique +class MetricReport(str, enum.Enum): # str is a hack to make it json serializable + # this means that this is a constant test parameter + # like number of transactions, or number of clients + TEST_PARAM = 'test_param' + # reporter can use it to mark test runs with higher values as improvements + HIGHER_IS_BETTER = 'higher_is_better' + # the same but for lower values + LOWER_IS_BETTER = 'lower_is_better' class ZenithBenchmarker: @@ -92,30 +140,109 @@ class ZenithBenchmarker: An object for recording benchmark results. 
This is created for each test function by the zenbenchmark fixture """ - def __init__(self, results, request): - self.results = results - self.request = request + def __init__(self, property_recorder): + # property recorder here is a pytest fixture provided by junitxml module + # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property + self.property_recorder = property_recorder - def record(self, metric_name: str, metric_value: float, unit: str): + def record( + self, + metric_name: str, + metric_value: float, + unit: str, + report: MetricReport, + ): """ Record a benchmark result. """ - self.results.record(self.request.node.name, metric_name, metric_value, unit) + # just to namespace the value + name = f"zenith_benchmarker_{metric_name}" + self.property_recorder( + name, + { + "name": metric_name, + "value": metric_value, + "unit": unit, + "report": report, + }, + ) @contextmanager - def record_duration(self, metric_name): + def record_duration(self, metric_name: str): """ Record a duration. Usage: - + with zenbenchmark.record_duration('foobar_runtime'): foobar() # measure this - """ start = timeit.default_timer() yield end = timeit.default_timer() - self.results.record(self.request.node.name, metric_name, end - start, 's') + self.record( + metric_name=metric_name, + metric_value=end - start, + unit="s", + report=MetricReport.LOWER_IS_BETTER, + ) + + def record_pg_bench_result(self, pg_bench_result: PgBenchRunResult): + self.record("scale", pg_bench_result.scale, '', MetricReport.TEST_PARAM) + self.record("number_of_clients", + pg_bench_result.number_of_clients, + '', + MetricReport.TEST_PARAM) + self.record("number_of_threads", + pg_bench_result.number_of_threads, + '', + MetricReport.TEST_PARAM) + self.record( + "number_of_transactions_actually_processed", + pg_bench_result.number_of_transactions_actually_processed, + '', + # thats because this is predefined by test matrix and doesnt change across runs + report=MetricReport.TEST_PARAM, + ) + self.record("latency_average", + pg_bench_result.latency_average, + unit="ms", + report=MetricReport.LOWER_IS_BETTER) + self.record("latency_stddev", + pg_bench_result.latency_stddev, + unit="ms", + report=MetricReport.LOWER_IS_BETTER) + self.record("tps_including_connection_time", + pg_bench_result.tps_including_connection_time, + '', + report=MetricReport.HIGHER_IS_BETTER) + self.record("tps_excluding_connection_time", + pg_bench_result.tps_excluding_connection_time, + '', + report=MetricReport.HIGHER_IS_BETTER) + self.record("init_duration", + pg_bench_result.init_duration, + unit="s", + report=MetricReport.LOWER_IS_BETTER) + self.record("init_start_timestamp", + pg_bench_result.init_start_timestamp, + '', + MetricReport.TEST_PARAM) + self.record("init_end_timestamp", + pg_bench_result.init_end_timestamp, + '', + MetricReport.TEST_PARAM) + self.record("run_duration", + pg_bench_result.run_duration, + unit="s", + report=MetricReport.LOWER_IS_BETTER) + self.record("run_start_timestamp", + pg_bench_result.run_start_timestamp, + '', + MetricReport.TEST_PARAM) + self.record("run_end_timestamp", + pg_bench_result.run_end_timestamp, + '', + MetricReport.TEST_PARAM) def get_io_writes(self, pageserver) -> int: """ @@ -149,7 +276,7 @@ class ZenithBenchmarker: assert matches return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str): + def get_timeline_size(self, repo_dir: Path, tenantid: str, timelineid: str): """ Calculate the on-disk size of a timeline """ @@ -171,47 
+298,82 @@ class ZenithBenchmarker: yield after = self.get_io_writes(pageserver) - self.results.record(self.request.node.name, - metric_name, - round((after - before) / (1024 * 1024)), - 'MB') + self.record(metric_name, + round((after - before) / (1024 * 1024)), + "MB", + report=MetricReport.LOWER_IS_BETTER) -@pytest.fixture(scope='function') -def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]: +@pytest.fixture(scope="function") +def zenbenchmark(record_property) -> Iterator[ZenithBenchmarker]: """ This is a python decorator for benchmark fixtures. It contains functions for recording measurements, and prints them out at the end. """ - benchmarker = ZenithBenchmarker(zenbenchmark_global, request) + benchmarker = ZenithBenchmarker(record_property) yield benchmarker +def get_out_path(target_dir: Path, revision: str) -> Path: + """ + get output file path + if running in the CI uses commit revision + to avoid duplicates uses counter + """ + # use UTC timestamp as a counter marker to avoid weird behaviour + # when for example files are deleted + ts = calendar.timegm(datetime.utcnow().utctimetuple()) + path = target_dir / f"{ts}_{revision}.json" + assert not path.exists() + return path + + # Hook to print the results at the end @pytest.hookimpl(hookwrapper=True) def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config): yield + revision = os.getenv("GITHUB_SHA", "local") + platform = os.getenv("PLATFORM", "local") - global zenbenchmark_results + terminalreporter.section("Benchmark results", "-") - if not zenbenchmark_results: + result = [] + for test_report in terminalreporter.stats.get("passed", []): + result_entry = [] + + for _, recorded_property in test_report.user_properties: + terminalreporter.write("{}.{}: ".format(test_report.head_line, + recorded_property["name"])) + unit = recorded_property["unit"] + value = recorded_property["value"] + if unit == "MB": + terminalreporter.write("{0:,.0f}".format(value), green=True) + elif unit in ("s", "ms") and isinstance(value, float): + terminalreporter.write("{0:,.3f}".format(value), green=True) + elif isinstance(value, float): + terminalreporter.write("{0:,.4f}".format(value), green=True) + else: + terminalreporter.write(str(value), green=True) + terminalreporter.line(" {}".format(unit)) + + result_entry.append(recorded_property) + + result.append({ + "suit": test_report.nodeid, + "total_duration": test_report.duration, + "data": result_entry, + }) + + out_dir = config.getoption("out_dir") + if out_dir is None: + warnings.warn("no out dir provided to store performance test results") return - terminalreporter.section('Benchmark results', "-") + if not result: + warnings.warn("no results to store (no passed test suites)") + return - for result in zenbenchmark_results.results: - func = result[0] - metric_name = result[1] - metric_value = result[2] - unit = result[3] - - terminalreporter.write("{}.{}: ".format(func, metric_name)) - - if unit == 'MB': - terminalreporter.write("{0:,.0f}".format(metric_value), green=True) - elif unit == 's': - terminalreporter.write("{0:,.3f}".format(metric_value), green=True) - else: - terminalreporter.write("{0:,.4f}".format(metric_value), green=True) - - terminalreporter.line(" {}".format(unit)) + get_out_path(Path(out_dir), revision=revision).write_text( + json.dumps({ + "revision": revision, "platform": platform, "result": result + }, indent=4)) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 
4622cf64d4..48faf47a6d 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -6,6 +6,7 @@ import asyncpg import os import pathlib import uuid +import warnings import jwt import json import psycopg2 @@ -26,6 +27,7 @@ from dataclasses import dataclass from psycopg2.extensions import connection as PgConnection from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast from typing_extensions import Literal +import pytest import requests @@ -58,6 +60,16 @@ DEFAULT_POSTGRES_DIR = 'tmp_install' BASE_PORT = 15000 WORKER_PORT_NUM = 100 + +def pytest_addoption(parser): + parser.addoption( + "--skip-interfering-proc-check", + dest="skip_interfering_proc_check", + action="store_true", + help="skip check for interferring processes", + ) + + # These are set in pytest_configure() base_dir = "" zenith_binpath = "" @@ -65,14 +77,10 @@ pg_distrib_dir = "" top_output_dir = "" -def pytest_configure(config): - """ - Ensure that no unwanted daemons are running before we start testing. - Check that we do not owerflow available ports range. - """ - numprocesses = config.getoption('numprocesses') - if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports - raise Exception('Too many workers configured. Cannot distrubute ports for services.') +def check_interferring_processes(config): + if config.getoption("skip_interfering_proc_check"): + warnings.warn("interferring process check is skipped") + return # does not use -c as it is not supported on macOS cmd = ['pgrep', 'pageserver|postgres|safekeeper'] @@ -86,11 +94,36 @@ def pytest_configure(config): 'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.' ) + +def pytest_configure(config): + """ + Ensure that no unwanted daemons are running before we start testing. + Check that we do not owerflow available ports range. + """ + check_interferring_processes(config) + + numprocesses = config.getoption('numprocesses') + if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports + raise Exception('Too many workers configured. Cannot distrubute ports for services.') + # find the base directory (currently this is the git root) global base_dir base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..')) log.info(f'base_dir is {base_dir}') + # Compute the top-level directory for all tests. + global top_output_dir + env_test_output = os.environ.get('TEST_OUTPUT') + if env_test_output is not None: + top_output_dir = env_test_output + else: + top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) + mkdir_if_needed(top_output_dir) + + if os.getenv("REMOTE_ENV"): + # we are in remote env and do not have zenith binaries locally + # this is the case for benchmarks run on self-hosted runner + return # Find the zenith binaries. global zenith_binpath env_zenith_bin = os.environ.get('ZENITH_BIN') @@ -100,7 +133,7 @@ def pytest_configure(config): zenith_binpath = os.path.join(base_dir, 'target/debug') log.info(f'zenith_binpath is {zenith_binpath}') if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')): - raise Exception('zenith binaries not found at "{}"'.format(zenith_dir)) + raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath)) # Find the postgres installation. 
global pg_distrib_dir @@ -113,15 +146,6 @@ def pytest_configure(config): if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')): raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) - # Compute the top-level directory for all tests. - global top_output_dir - env_test_output = os.environ.get('TEST_OUTPUT') - if env_test_output is not None: - top_output_dir = env_test_output - else: - top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) - mkdir_if_needed(top_output_dir) - def zenfixture(func: Fn) -> Fn: """ diff --git a/test_runner/performance/__init__.py b/test_runner/performance/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/performance/conftest.py b/test_runner/performance/conftest.py new file mode 100644 index 0000000000..cd8b40ca82 --- /dev/null +++ b/test_runner/performance/conftest.py @@ -0,0 +1,8 @@ +# pytest some has quirks with discovering plugins, so having it there just works +# probably we should create custom plugin and add it to pytest config to always have needed things at hand +def pytest_addoption(parser): + parser.addoption( + "--out-dir", + dest="out_dir", + help="Directory to ouput performance tests results to.", + ) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 46dcb01c71..9892a70516 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,7 +1,7 @@ -import os from contextlib import closing from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log +from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") @@ -16,7 +16,7 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") # 3. Disk space used # 4. 
Peak memory usage # -def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark): +def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark: ZenithBenchmarker): env = zenith_simple_env # Create a branch for us env.zenith_cli(["branch", "test_bulk_insert", "empty"]) @@ -47,10 +47,16 @@ def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark): pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") # Record peak memory usage - zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB') + zenbenchmark.record("peak_mem", + zenbenchmark.get_peak_mem(env.pageserver) / 1024, + 'MB', + report=MetricReport.LOWER_IS_BETTER) # Report disk space used by the repository timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') + zenbenchmark.record('size', + timeline_size / (1024 * 1024), + 'MB', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index e913afc27c..f2ccb1dc34 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -1,4 +1,5 @@ import timeit +from fixtures.benchmark_fixture import MetricReport import pytest from fixtures.zenith_fixtures import ZenithEnvBuilder @@ -54,4 +55,7 @@ def test_bulk_tenant_create( pg_tenant.stop() - zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's') + zenbenchmark.record('tenant_creation_time', + sum(time_slices) / len(time_slices), + 's', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index b078c820b0..daa8c71df1 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -1,5 +1,6 @@ import os from contextlib import closing +from fixtures.benchmark_fixture import MetricReport from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log @@ -48,10 +49,16 @@ def test_gist_buffering_build(zenith_simple_env: ZenithEnv, zenbenchmark): pscur.execute(f"do_gc {env.initial_tenant} {timeline} 1000000") # Record peak memory usage - zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB') + zenbenchmark.record("peak_mem", + zenbenchmark.get_peak_mem(env.pageserver) / 1024, + 'MB', + report=MetricReport.LOWER_IS_BETTER) # Report disk space used by the repository timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') + zenbenchmark.record('size', + timeline_size / (1024 * 1024), + 'MB', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index dc50587a82..307dfb3559 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -1,6 +1,7 @@ -import os from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.zenith_fixtures import PgBin, ZenithEnv + +from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") @@ -15,7 +16,7 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") # 2. 
Time to run 5000 pgbench transactions # 3. Disk space used # -def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark): +def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin: PgBin, zenbenchmark: ZenithBenchmarker): env = zenith_simple_env # Create a branch for us env.zenith_cli(["branch", "test_pgbench_perf", "empty"]) @@ -55,4 +56,7 @@ def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark): # Report disk space used by the repository timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') + zenbenchmark.record('size', + timeline_size / (1024 * 1024), + 'MB', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_perf_pgbench_remote.py b/test_runner/performance/test_perf_pgbench_remote.py new file mode 100644 index 0000000000..2d64a39a95 --- /dev/null +++ b/test_runner/performance/test_perf_pgbench_remote.py @@ -0,0 +1,125 @@ +import dataclasses +import os +import subprocess +from typing import List +from fixtures.benchmark_fixture import PgBenchRunResult, ZenithBenchmarker +import pytest +from datetime import datetime +import calendar +import timeit +import os + +pytest_plugins = ("fixtures.benchmark_fixture", ) + + +def utc_now_timestamp() -> int: + return calendar.timegm(datetime.utcnow().utctimetuple()) + + +@dataclasses.dataclass +class PgBenchRunner: + connstr: str + scale: int + transactions: int + pgbench_bin_path: str = "pgbench" + + def invoke(self, args: List[str]) -> 'subprocess.CompletedProcess[str]': + return subprocess.run([self.pgbench_bin_path, *args], + check=True, + text=True, + capture_output=True) + + def init(self, vacuum: bool = True) -> 'subprocess.CompletedProcess[str]': + args = [] + if not vacuum: + args.append("--no-vacuum") + args.extend([f"--scale={self.scale}", "--initialize", self.connstr]) + return self.invoke(args) + + def run(self, jobs: int = 1, clients: int = 1): + return self.invoke([ + f"--transactions={self.transactions}", + f"--jobs={jobs}", + f"--client={clients}", + "--progress=2", # print progress every two seconds + self.connstr, + ]) + + +@pytest.fixture +def connstr(): + res = os.getenv("BENCHMARK_CONNSTR") + if res is None: + raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") + return res + + +def get_transactions_matrix(): + transactions = os.getenv("TEST_PG_BENCH_TRANSACTIONS_MATRIX") + if transactions is None: + return [10**4, 10**5] + return list(map(int, transactions.split(","))) + + +def get_scales_matrix(): + scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX") + if scales is None: + return [10, 20] + return list(map(int, scales.split(","))) + + +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("transactions", get_transactions_matrix()) +@pytest.mark.remote_cluster +def test_pg_bench_remote_cluster(zenbenchmark: ZenithBenchmarker, + connstr: str, + scale: int, + transactions: int): + """ + The best way is to run same pack of tests both, for local zenith + and against staging, but currently local tests heavily depend on + things available only locally e.g. zenith binaries, pageserver api, etc. + Also separate test allows to run pgbench workload against vanilla postgres + or other systems that support postgres protocol. + + Also now this is more of a liveness test because it stresses pageserver internals, + so we clearly see what goes wrong in more "real" environment. 
+ """ + pg_bin = os.getenv("PG_BIN") + if pg_bin is not None: + pgbench_bin_path = os.path.join(pg_bin, "pgbench") + else: + pgbench_bin_path = "pgbench" + + runner = PgBenchRunner( + connstr=connstr, + scale=scale, + transactions=transactions, + pgbench_bin_path=pgbench_bin_path, + ) + # calculate timestamps and durations separately + # timestamp is intended to be used for linking to grafana and logs + # duration is actually a metric and uses float instead of int for timestamp + init_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + runner.init() + init_duration = timeit.default_timer() - t0 + init_end_timestamp = utc_now_timestamp() + + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = runner.run() # TODO handle failures + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + + res = PgBenchRunResult.parse_from_output( + out=out, + init_duration=init_duration, + init_start_timestamp=init_start_timestamp, + init_end_timestamp=init_end_timestamp, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + + zenbenchmark.record_pg_bench_result(res) diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index a5850e98f6..46e8ac5266 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -12,6 +12,7 @@ # Amplification problem at its finest. import os from contextlib import closing +from fixtures.benchmark_fixture import MetricReport from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log @@ -76,4 +77,7 @@ def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark): timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') + zenbenchmark.record('size', + timeline_size / (1024 * 1024), + 'MB', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/pytest.ini b/test_runner/pytest.ini index 7ea2ae5dfb..b7d42dfe46 100644 --- a/test_runner/pytest.ini +++ b/test_runner/pytest.ini @@ -1,4 +1,8 @@ [pytest] +addopts = + -m 'not remote_cluster' +markers = + remote_cluster minversion = 6.0 log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S