diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9e4b9ea4ba..1ba8a62d4d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -267,7 +267,7 @@ jobs:
            # -n4 uses four processes to run tests via pytest-xdist
            # -s is not used to prevent pytest from capturing output, because tests are running
            # in parallel and logs are mixed between different tests
-           pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
+           pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -m "not remote_cluster" -rA $TEST_SELECTION $EXTRA_PARAMS
      - run:
          # CircleCI artifacts are preserved one file at a time, so skipping
          # this step isn't a good idea. If you want to extract the
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
new file mode 100644
index 0000000000..c87a22afc1
--- /dev/null
+++ b/.github/workflows/benchmarking.yml
@@ -0,0 +1,158 @@
+name: benchmarking
+
+on:
+  # uncomment to run on push for debugging your PR
+  # push:
+  #   branches: [ mybranch ]
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    # ┌───────────── minute (0 - 59)
+    # │ ┌───────────── hour (0 - 23)
+    # │ │ ┌───────────── day of the month (1 - 31)
+    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron: '36 7 * * *' # run once a day, timezone is UTC
+
+env:
+  BASE_URL: "https://console.zenith.tech"
+
+jobs:
+  bench:
+    # This workflow runs on a self-hosted runner.
+    # Its environment is quite different from the usual GitHub runner;
+    # probably the most important difference is that it doesn't start from a clean workspace each time,
+    # e.g. if you install system packages they are not cleaned up, since you install them
+    # directly on the host machine, not in a container.
+    # See the documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
+    runs-on: [self-hosted, zenith-benchmarker]
+
+    steps:
+      - name: Checkout zenith repo
+        uses: actions/checkout@v2
+
+      - name: Checkout zenith-perf-data repo
+        uses: actions/checkout@v2
+        with:
+          repository: zenithdb/zenith-perf-data
+          token: ${{ secrets.VIP_VAP_ACCESS_TOKEN }}
+          ref: testing # TODO replace with master once everything is ready
+          path: zenith-perf-data
+
+      # actions/setup-python@v2 is not working correctly on self-hosted runners,
+      # see https://github.com/actions/setup-python/issues/162
+      # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular,
+      # so the simplest solution is to use the already installed system Python and spin up virtualenvs for job runs.
+      # Python 3.7.10 is already installed on the machine, so use it to install pipenv and then use pipenv's virtualenvs.
+      - name: Install pipenv & deps
+        run: |
+          python3 -m pip install --upgrade pipenv wheel
+          # since pip/pipenv caches are reused there shouldn't be any trouble with installing every time
+          pipenv install
+
+      - name: Show versions
+        run: |
+          echo Python
+          python3 --version
+          pipenv run python3 --version
+          echo Pipenv
+          pipenv --version
+          echo Pgbench
+          pgbench --version
+
+      # FIXME cluster setup is skipped due to various changes in the console API;
+      # for now a pre-created cluster is used. When the API gains some stability
+      # after the massive changes, dynamic cluster setup will be revived.
+      # So use the pre-created cluster.
It needs to be started manually, but stop is automatic after 5 minutes of inactivity + - name: Setup cluster + env: + BENCHMARK_CONSOLE_USER_PASSWORD: "${{ secrets.BENCHMARK_CONSOLE_USER_PASSWORD }}" + BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}" + # USERNAME: "benchmark" + shell: bash + run: | + set -e + # echo "Creating cluster" + + # CLUSTER=$(curl -s --fail --show-error $BASE_URL/api/v1/clusters.json \ + # -H 'Content-Type: application/json; charset=utf-8' \ + # -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN" \ + # --data-binary @- << EOF + # { + # "cluster": { + # "name": "default_cluster", + # "region_id": "2", + # "instance_type_id": 7, + # "settings": {} + # }, + # "database": {"name": "benchmark"}, + # "role": {"name": "$USERNAME", "password": "$BENCHMARK_CONSOLE_USER_PASSWORD"} + # } + # EOF + # ) + + # echo "Created cluster" + + echo "Starting cluster" + CLUSTER_ID=285 + CLUSTER=$(curl -s --fail --show-error -X POST $BASE_URL/api/v1/clusters/$CLUSTER_ID/start \ + -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN") + echo $CLUSTER | python -m json.tool + + echo "Waiting for cluster to become ready" + sleep 10 + + # # note that jq is installed on host system + # CLUSTER_ID=$(echo $CLUSTER| jq ".id") + echo "CLUSTER_ID=$CLUSTER_ID" >> $GITHUB_ENV + # echo "Constructing connstr" + # CLUSTER=$(curl -s --fail --show-error -X GET $BASE_URL/api/v1/clusters/$CLUSTER_ID.json \ + # -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN") + + # echo $CLUSTER | python -m json.tool + # CONNSTR=$(echo $CLUSTER | jq -r ".| \"postgresql://$USERNAME:$BENCHMARK_CONSOLE_USER_PASSWORD@\(.public_ip_address):\(.public_pg_port)/benchmark\"") + # echo "BENCHMARK_CONNSTR=$CONNSTR" >> $GITHUB_ENV + + - name: Run benchmark + # pgbench is installed system wide from official repo + # https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/ + # via + # sudo tee /etc/yum.repos.d/pgdg.repo< Tuple[List[Tuple[str, str]], List[str]]: + value_columns = [] + common_columns = [] + for item in values: + if item['name'] in KEY_EXCLUDE_FIELDS: + continue + if item['report'] != 'test_param': + value_columns.append(cast(str, item['name'])) + else: + common_columns.append( + (cast(str, item['name']), cast(str, item['value']))) + value_columns.sort() + common_columns.sort(key=lambda x: x[0]) # sort by name + return common_columns, value_columns + + +def format_ratio(ratio: float, report: str) -> Tuple[str, str]: + color = '' + sign = '+' if ratio > 0 else '' + if abs(ratio) < 0.05: + return f' ({sign}{ratio:.2f})', color + + if report not in {'test_param', 'higher_is_better', 'lower_is_better'}: + raise ValueError(f'Unknown report type: {report}') + + if report == 'test_param': + return f'{ratio:.2f}', color + + if ratio > 0: + if report == 'higher_is_better': + color = POSITIVE_COLOR + elif report == 'lower_is_better': + color = NEGATIVE_COLOR + elif ratio < 0: + if report == 'higher_is_better': + color = NEGATIVE_COLOR + elif report == 'lower_is_better': + color = POSITIVE_COLOR + + return f' ({sign}{ratio:.2f})', color + + +def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: + for item in suit_run.values['data']: + if item['name'] == name: + return item + return None + + +def get_row_values(columns: List[str], run_result: SuitRun, + prev_result: Optional[SuitRun]) -> List[RowValue]: + row_values = [] + for column in columns: + current_value = extract_value(column, run_result) + if current_value is None: + # should never happen 
+ raise ValueError(f'{column} not found in {run_result.values}') + + value = current_value["value"] + if isinstance(value, float): + value = f'{value:.2f}' + + if prev_result is None: + row_values.append(RowValue(value, '', '')) + continue + + prev_value = extract_value(column, prev_result) + if prev_value is None: + # this might happen when new metric is added and there is no value for it in previous run + # let this be here, TODO add proper handling when this actually happens + raise ValueError(f'{column} not found in previous result') + ratio = float(value) / float(prev_value['value']) - 1 + ratio_display, color = format_ratio(ratio, + current_value['report']) + row_values.append(RowValue(value, color, ratio_display)) + return row_values + + +@dataclass +class SuiteRunTableRow: + revision: str + values: List[RowValue] + + +def prepare_rows_from_runs(value_columns: List[str], + runs: List[SuitRun]) -> List[SuiteRunTableRow]: + rows = [] + prev_run = None + for run in runs: + rows.append( + SuiteRunTableRow(revision=run.revision, + values=get_row_values(value_columns, run, + prev_run))) + prev_run = run + + return rows + + +def main(args: argparse.Namespace) -> None: + input_dir = Path(args.input_dir) + grouped_runs = {} + # we have files in form: _.json + # fill them in the hashmap so we have grouped items for the + # same run configuration (scale, duration etc.) ordered by counter. + for item in sorted(input_dir.iterdir(), + key=lambda x: int(x.name.split('_')[0])): + run_data = json.loads(item.read_text()) + revision = run_data['revision'] + + for suit_result in run_data['result']: + key = "{}{}".format(run_data['platform'], suit_result['suit']) + # pack total duration as a synthetic value + total_duration = suit_result['total_duration'] + suit_result['data'].append({ + 'name': 'total_duration', + 'value': total_duration, + 'unit': 's', + 'report': 'lower_is_better', + }) + common_columns, value_columns = get_columns(suit_result['data']) + + grouped_runs.setdefault( + key, + SuitRuns( + platform=run_data['platform'], + suit=suit_result['suit'], + common_columns=common_columns, + value_columns=value_columns, + runs=[], + ), + ) + + grouped_runs[key].runs.append( + SuitRun(revision=revision, values=suit_result)) + context = {} + for result in grouped_runs.values(): + suit = result.suit + context[suit] = { + 'common_columns': + result.common_columns, + 'value_columns': + result.value_columns, + 'platform': + result.platform, + # reverse the order so newest results are on top of the table + 'rows': + reversed(prepare_rows_from_runs(result.value_columns, + result.runs)), + } + + template = Template((Path(__file__).parent / 'perf_report_template.html').read_text()) + + Path(args.out).write_text(template.render(context=context)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input-dir', + dest='input_dir', + required=True, + help='Directory with jsons generated by the test suite', + ) + parser.add_argument('--out', required=True, help='Output html file path') + args = parser.parse_args() + main(args) diff --git a/scripts/perf_report_template.html b/scripts/perf_report_template.html new file mode 100644 index 0000000000..2847e75a00 --- /dev/null +++ b/scripts/perf_report_template.html @@ -0,0 +1,52 @@ + + + + + + +

+    <h1>Zenith Performance Tests</h1>
+
+    {% for suit_name, suit_data in context.items() %}
+    <h2>Runs for {{ suit_name }}</h2>
+    <p>
+        platform: {{ suit_data.platform }}<br>
+        {% for common_column_name, common_column_value in suit_data.common_columns %}
+        {{ common_column_name }}: {{ common_column_value }}<br>
+        {% endfor %}
+    </p>
+    <table>
+        <tr>
+            <th>revision</th>
+            {% for column_name in suit_data.value_columns %}
+            <th>{{ column_name }}</th>
+            {% endfor %}
+        </tr>
+        {% for row in suit_data.rows %}
+        <tr>
+            <td>{{ row.revision[:6] }}</td>
+            {% for column_value in row.values %}
+            <td>{{ column_value.value }}{{column_value.ratio}}</td>
+            {% endfor %}
+        </tr>
+        {% endfor %}
+    </table>
+ {% endfor %} + + + + diff --git a/test_runner/Pipfile b/test_runner/Pipfile index e179553dbd..572ff82ec8 100644 --- a/test_runner/Pipfile +++ b/test_runner/Pipfile @@ -5,13 +5,14 @@ name = "pypi" [packages] pytest = ">=6.0.0" -psycopg2 = "*" typing-extensions = "*" pyjwt = {extras = ["crypto"], version = "*"} requests = "*" pytest-xdist = "*" asyncpg = "*" cached-property = "*" +psycopg2-binary = "*" +jinja2 = "*" [dev-packages] # Behavior may change slightly between versions. These are run continuously, diff --git a/test_runner/Pipfile.lock b/test_runner/Pipfile.lock index 185b91189b..1524bbe584 100644 --- a/test_runner/Pipfile.lock +++ b/test_runner/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "63b72760ef37375186a638066ba0ad5804dbace99ddc503ea654e9749070ab24" + "sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b" }, "pipfile-spec": 6, "requires": { @@ -177,13 +177,96 @@ ], "version": "==1.1.1" }, - "packaging": { + "jinja2": { "hashes": [ - "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7", - "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14" + "sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45", + "sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c" + ], + "index": "pypi", + "version": "==3.0.2" + }, + "markupsafe": { + "hashes": [ + "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298", + "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64", + "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b", + "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194", + "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567", + "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff", + "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724", + "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74", + "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646", + "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35", + "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6", + "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a", + "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6", + "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad", + "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26", + "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38", + "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac", + "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7", + "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6", + "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047", + "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75", + "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f", + "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b", + "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135", + "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8", + "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a", + "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a", + "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1", + 
"sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9", + "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864", + "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914", + "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee", + "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f", + "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18", + "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8", + "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2", + "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d", + "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b", + "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b", + "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86", + "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6", + "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f", + "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb", + "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833", + "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28", + "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e", + "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415", + "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902", + "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f", + "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d", + "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9", + "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d", + "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145", + "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066", + "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c", + "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1", + "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a", + "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207", + "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f", + "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53", + "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd", + "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134", + "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85", + "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9", + "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5", + "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94", + "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509", + "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51", + "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872" ], "markers": "python_version >= '3.6'", - "version": "==21.0" + "version": "==2.0.1" + }, + "packaging": { + "hashes": [ + "sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966", + "sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0" + ], + "markers": "python_version >= '3.6'", + "version": "==21.2" }, "pluggy": { "hashes": [ @@ -193,17 +276,42 @@ 
"markers": "python_version >= '3.6'", "version": "==1.0.0" }, - "psycopg2": { + "psycopg2-binary": { "hashes": [ - "sha256:079d97fc22de90da1d370c90583659a9f9a6ee4007355f5825e5f1c70dffc1fa", - "sha256:2087013c159a73e09713294a44d0c8008204d06326006b7f652bef5ace66eebb", - "sha256:2c992196719fadda59f72d44603ee1a2fdcc67de097eea38d41c7ad9ad246e62", - "sha256:7640e1e4d72444ef012e275e7b53204d7fab341fb22bc76057ede22fe6860b25", - "sha256:7f91312f065df517187134cce8e395ab37f5b601a42446bdc0f0d51773621854", - "sha256:830c8e8dddab6b6716a4bf73a09910c7954a92f40cf1d1e702fb93c8a919cc56", - "sha256:89409d369f4882c47f7ea20c42c5046879ce22c1e4ea20ef3b00a4dfc0a7f188", - "sha256:bf35a25f1aaa8a3781195595577fcbb59934856ee46b4f252f56ad12b8043bcf", - "sha256:de5303a6f1d0a7a34b9d40e4d3bef684ccc44a49bbe3eb85e3c0bffb4a131b7c" + "sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975", + "sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd", + "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616", + "sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2", + "sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90", + "sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a", + "sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e", + "sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d", + "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f", + "sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed", + "sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a", + "sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140", + "sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32", + "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759", + "sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31", + "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e", + "sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a", + "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c", + "sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917", + "sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf", + "sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7", + "sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0", + "sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72", + "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698", + "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773", + "sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68", + "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76", + "sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4", + "sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f", + "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a", + "sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34", + "sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce", + "sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a", + "sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e" ], "index": "pypi", "version": "==2.9.1" @@ -334,11 +442,11 @@ }, "filelock": { "hashes": [ - 
"sha256:2b5eb3589e7fdda14599e7eb1a50e09b4cc14f34ed98b8ba56d33bfaafcbef2f", - "sha256:34a9f35f95c441e7b38209775d6e0337f9a3759f3565f6c5798f19618527c76f" + "sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8", + "sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b" ], "markers": "python_version >= '3.6'", - "version": "==3.3.1" + "version": "==3.3.2" }, "flake8": { "hashes": [ @@ -510,11 +618,11 @@ }, "virtualenv": { "hashes": [ - "sha256:10062e34c204b5e4ec5f62e6ef2473f8ba76513a9a617e873f1f8fb4a519d300", - "sha256:bcc17f0b3a29670dd777d6f0755a4c04f28815395bca279cdcb213b97199a6b8" + "sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814", + "sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==20.8.1" + "version": "==20.10.0" }, "virtualenv-clone": { "hashes": [ diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 8f5deef690..a83cbc95dd 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -1,29 +1,22 @@ +import dataclasses +import json import os +from pathlib import Path import re +import subprocess import timeit -import pathlib -import uuid -import psycopg2 +import calendar +import enum +from datetime import datetime import pytest from _pytest.config import Config -from _pytest.runner import CallInfo from _pytest.terminal import TerminalReporter -import shutil -import signal -import subprocess -import time +import warnings from contextlib import contextmanager -from contextlib import closing -from pathlib import Path -from dataclasses import dataclass # Type-related stuff -from psycopg2.extensions import connection as PgConnection -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast -from typing_extensions import Literal - -from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture) +from typing import Iterator, Optional """ This file contains fixtures for micro-benchmarks. @@ -39,7 +32,7 @@ def test_mybench(zenith_simple_env: env, zenbenchmark): # Initialize the test ... - + # Run the test, timing how long it takes with zenbenchmark.record_duration('test_query'): cur.execute('SELECT test_query(...)') @@ -55,36 +48,91 @@ in the test initialization, or measure disk usage after the test query. """ -# TODO: It would perhaps be better to store the results as additional -# properties in the pytest TestReport objects, to make them visible to -# other pytest tools. -class ZenithBenchmarkResults: - """ An object for recording benchmark results. """ - def __init__(self): - self.results = [] +@dataclasses.dataclass +class PgBenchRunResult: + scale: int + number_of_clients: int + number_of_threads: int + number_of_transactions_actually_processed: int + latency_average: float + latency_stddev: float + tps_including_connection_time: float + tps_excluding_connection_time: float + init_duration: float + init_start_timestamp: int + init_end_timestamp: int + run_duration: float + run_start_timestamp: int + run_end_timestamp: int - def record(self, test_name: str, metric_name: str, metric_value: float, unit: str): - """ - Record a benchmark result. 
- """ + # TODO progress - self.results.append((test_name, metric_name, metric_value, unit)) + @classmethod + def parse_from_output( + cls, + out: 'subprocess.CompletedProcess[str]', + init_duration: float, + init_start_timestamp: int, + init_end_timestamp: int, + run_duration: float, + run_start_timestamp: int, + run_end_timestamp: int, + ): + stdout_lines = out.stdout.splitlines() + # we know significant parts of these values from test input + # but to be precise take them from output + # scaling factor: 5 + assert "scaling factor" in stdout_lines[1] + scale = int(stdout_lines[1].split()[-1]) + # number of clients: 1 + assert "number of clients" in stdout_lines[3] + number_of_clients = int(stdout_lines[3].split()[-1]) + # number of threads: 1 + assert "number of threads" in stdout_lines[4] + number_of_threads = int(stdout_lines[4].split()[-1]) + # number of transactions actually processed: 1000/1000 + assert "number of transactions actually processed" in stdout_lines[6] + number_of_transactions_actually_processed = int(stdout_lines[6].split("/")[1]) + # latency average = 19.894 ms + assert "latency average" in stdout_lines[7] + latency_average = stdout_lines[7].split()[-2] + # latency stddev = 3.387 ms + assert "latency stddev" in stdout_lines[8] + latency_stddev = stdout_lines[8].split()[-2] + # tps = 50.219689 (including connections establishing) + assert "(including connections establishing)" in stdout_lines[9] + tps_including_connection_time = stdout_lines[9].split()[2] + # tps = 50.264435 (excluding connections establishing) + assert "(excluding connections establishing)" in stdout_lines[10] + tps_excluding_connection_time = stdout_lines[10].split()[2] + + return cls( + scale=scale, + number_of_clients=number_of_clients, + number_of_threads=number_of_threads, + number_of_transactions_actually_processed=number_of_transactions_actually_processed, + latency_average=float(latency_average), + latency_stddev=float(latency_stddev), + tps_including_connection_time=float(tps_including_connection_time), + tps_excluding_connection_time=float(tps_excluding_connection_time), + init_duration=init_duration, + init_start_timestamp=init_start_timestamp, + init_end_timestamp=init_end_timestamp, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) -# Will be recreated in each session. -zenbenchmark_results: ZenithBenchmarkResults = ZenithBenchmarkResults() - - -# Session scope fixture that initializes the results object -@pytest.fixture(autouse=True, scope='session') -def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]: - """ - This is a python decorator for benchmark fixtures - """ - global zenbenchmark_results - zenbenchmark_results = ZenithBenchmarkResults() - - yield zenbenchmark_results +@enum.unique +class MetricReport(str, enum.Enum): # str is a hack to make it json serializable + # this means that this is a constant test parameter + # like number of transactions, or number of clients + TEST_PARAM = 'test_param' + # reporter can use it to mark test runs with higher values as improvements + HIGHER_IS_BETTER = 'higher_is_better' + # the same but for lower values + LOWER_IS_BETTER = 'lower_is_better' class ZenithBenchmarker: @@ -92,30 +140,109 @@ class ZenithBenchmarker: An object for recording benchmark results. 
This is created for each test function by the zenbenchmark fixture """ - def __init__(self, results, request): - self.results = results - self.request = request + def __init__(self, property_recorder): + # property recorder here is a pytest fixture provided by junitxml module + # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property + self.property_recorder = property_recorder - def record(self, metric_name: str, metric_value: float, unit: str): + def record( + self, + metric_name: str, + metric_value: float, + unit: str, + report: MetricReport, + ): """ Record a benchmark result. """ - self.results.record(self.request.node.name, metric_name, metric_value, unit) + # just to namespace the value + name = f"zenith_benchmarker_{metric_name}" + self.property_recorder( + name, + { + "name": metric_name, + "value": metric_value, + "unit": unit, + "report": report, + }, + ) @contextmanager - def record_duration(self, metric_name): + def record_duration(self, metric_name: str): """ Record a duration. Usage: - + with zenbenchmark.record_duration('foobar_runtime'): foobar() # measure this - """ start = timeit.default_timer() yield end = timeit.default_timer() - self.results.record(self.request.node.name, metric_name, end - start, 's') + self.record( + metric_name=metric_name, + metric_value=end - start, + unit="s", + report=MetricReport.LOWER_IS_BETTER, + ) + + def record_pg_bench_result(self, pg_bench_result: PgBenchRunResult): + self.record("scale", pg_bench_result.scale, '', MetricReport.TEST_PARAM) + self.record("number_of_clients", + pg_bench_result.number_of_clients, + '', + MetricReport.TEST_PARAM) + self.record("number_of_threads", + pg_bench_result.number_of_threads, + '', + MetricReport.TEST_PARAM) + self.record( + "number_of_transactions_actually_processed", + pg_bench_result.number_of_transactions_actually_processed, + '', + # thats because this is predefined by test matrix and doesnt change across runs + report=MetricReport.TEST_PARAM, + ) + self.record("latency_average", + pg_bench_result.latency_average, + unit="ms", + report=MetricReport.LOWER_IS_BETTER) + self.record("latency_stddev", + pg_bench_result.latency_stddev, + unit="ms", + report=MetricReport.LOWER_IS_BETTER) + self.record("tps_including_connection_time", + pg_bench_result.tps_including_connection_time, + '', + report=MetricReport.HIGHER_IS_BETTER) + self.record("tps_excluding_connection_time", + pg_bench_result.tps_excluding_connection_time, + '', + report=MetricReport.HIGHER_IS_BETTER) + self.record("init_duration", + pg_bench_result.init_duration, + unit="s", + report=MetricReport.LOWER_IS_BETTER) + self.record("init_start_timestamp", + pg_bench_result.init_start_timestamp, + '', + MetricReport.TEST_PARAM) + self.record("init_end_timestamp", + pg_bench_result.init_end_timestamp, + '', + MetricReport.TEST_PARAM) + self.record("run_duration", + pg_bench_result.run_duration, + unit="s", + report=MetricReport.LOWER_IS_BETTER) + self.record("run_start_timestamp", + pg_bench_result.run_start_timestamp, + '', + MetricReport.TEST_PARAM) + self.record("run_end_timestamp", + pg_bench_result.run_end_timestamp, + '', + MetricReport.TEST_PARAM) def get_io_writes(self, pageserver) -> int: """ @@ -149,7 +276,7 @@ class ZenithBenchmarker: assert matches return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str): + def get_timeline_size(self, repo_dir: Path, tenantid: str, timelineid: str): """ Calculate the on-disk size of a timeline """ @@ -171,47 
+298,82 @@ class ZenithBenchmarker: yield after = self.get_io_writes(pageserver) - self.results.record(self.request.node.name, - metric_name, - round((after - before) / (1024 * 1024)), - 'MB') + self.record(metric_name, + round((after - before) / (1024 * 1024)), + "MB", + report=MetricReport.LOWER_IS_BETTER) -@pytest.fixture(scope='function') -def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]: +@pytest.fixture(scope="function") +def zenbenchmark(record_property) -> Iterator[ZenithBenchmarker]: """ This is a python decorator for benchmark fixtures. It contains functions for recording measurements, and prints them out at the end. """ - benchmarker = ZenithBenchmarker(zenbenchmark_global, request) + benchmarker = ZenithBenchmarker(record_property) yield benchmarker +def get_out_path(target_dir: Path, revision: str) -> Path: + """ + get output file path + if running in the CI uses commit revision + to avoid duplicates uses counter + """ + # use UTC timestamp as a counter marker to avoid weird behaviour + # when for example files are deleted + ts = calendar.timegm(datetime.utcnow().utctimetuple()) + path = target_dir / f"{ts}_{revision}.json" + assert not path.exists() + return path + + # Hook to print the results at the end @pytest.hookimpl(hookwrapper=True) def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config): yield + revision = os.getenv("GITHUB_SHA", "local") + platform = os.getenv("PLATFORM", "local") - global zenbenchmark_results + terminalreporter.section("Benchmark results", "-") - if not zenbenchmark_results: + result = [] + for test_report in terminalreporter.stats.get("passed", []): + result_entry = [] + + for _, recorded_property in test_report.user_properties: + terminalreporter.write("{}.{}: ".format(test_report.head_line, + recorded_property["name"])) + unit = recorded_property["unit"] + value = recorded_property["value"] + if unit == "MB": + terminalreporter.write("{0:,.0f}".format(value), green=True) + elif unit in ("s", "ms") and isinstance(value, float): + terminalreporter.write("{0:,.3f}".format(value), green=True) + elif isinstance(value, float): + terminalreporter.write("{0:,.4f}".format(value), green=True) + else: + terminalreporter.write(str(value), green=True) + terminalreporter.line(" {}".format(unit)) + + result_entry.append(recorded_property) + + result.append({ + "suit": test_report.nodeid, + "total_duration": test_report.duration, + "data": result_entry, + }) + + out_dir = config.getoption("out_dir") + if out_dir is None: + warnings.warn("no out dir provided to store performance test results") return - terminalreporter.section('Benchmark results', "-") + if not result: + warnings.warn("no results to store (no passed test suites)") + return - for result in zenbenchmark_results.results: - func = result[0] - metric_name = result[1] - metric_value = result[2] - unit = result[3] - - terminalreporter.write("{}.{}: ".format(func, metric_name)) - - if unit == 'MB': - terminalreporter.write("{0:,.0f}".format(metric_value), green=True) - elif unit == 's': - terminalreporter.write("{0:,.3f}".format(metric_value), green=True) - else: - terminalreporter.write("{0:,.4f}".format(metric_value), green=True) - - terminalreporter.line(" {}".format(unit)) + get_out_path(Path(out_dir), revision=revision).write_text( + json.dumps({ + "revision": revision, "platform": platform, "result": result + }, indent=4)) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 
4622cf64d4..48faf47a6d 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -6,6 +6,7 @@ import asyncpg import os import pathlib import uuid +import warnings import jwt import json import psycopg2 @@ -26,6 +27,7 @@ from dataclasses import dataclass from psycopg2.extensions import connection as PgConnection from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast from typing_extensions import Literal +import pytest import requests @@ -58,6 +60,16 @@ DEFAULT_POSTGRES_DIR = 'tmp_install' BASE_PORT = 15000 WORKER_PORT_NUM = 100 + +def pytest_addoption(parser): + parser.addoption( + "--skip-interfering-proc-check", + dest="skip_interfering_proc_check", + action="store_true", + help="skip check for interferring processes", + ) + + # These are set in pytest_configure() base_dir = "" zenith_binpath = "" @@ -65,14 +77,10 @@ pg_distrib_dir = "" top_output_dir = "" -def pytest_configure(config): - """ - Ensure that no unwanted daemons are running before we start testing. - Check that we do not owerflow available ports range. - """ - numprocesses = config.getoption('numprocesses') - if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports - raise Exception('Too many workers configured. Cannot distrubute ports for services.') +def check_interferring_processes(config): + if config.getoption("skip_interfering_proc_check"): + warnings.warn("interferring process check is skipped") + return # does not use -c as it is not supported on macOS cmd = ['pgrep', 'pageserver|postgres|safekeeper'] @@ -86,11 +94,36 @@ def pytest_configure(config): 'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.' ) + +def pytest_configure(config): + """ + Ensure that no unwanted daemons are running before we start testing. + Check that we do not owerflow available ports range. + """ + check_interferring_processes(config) + + numprocesses = config.getoption('numprocesses') + if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports + raise Exception('Too many workers configured. Cannot distrubute ports for services.') + # find the base directory (currently this is the git root) global base_dir base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..')) log.info(f'base_dir is {base_dir}') + # Compute the top-level directory for all tests. + global top_output_dir + env_test_output = os.environ.get('TEST_OUTPUT') + if env_test_output is not None: + top_output_dir = env_test_output + else: + top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) + mkdir_if_needed(top_output_dir) + + if os.getenv("REMOTE_ENV"): + # we are in remote env and do not have zenith binaries locally + # this is the case for benchmarks run on self-hosted runner + return # Find the zenith binaries. global zenith_binpath env_zenith_bin = os.environ.get('ZENITH_BIN') @@ -100,7 +133,7 @@ def pytest_configure(config): zenith_binpath = os.path.join(base_dir, 'target/debug') log.info(f'zenith_binpath is {zenith_binpath}') if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')): - raise Exception('zenith binaries not found at "{}"'.format(zenith_dir)) + raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath)) # Find the postgres installation. 
global pg_distrib_dir @@ -113,15 +146,6 @@ def pytest_configure(config): if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')): raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) - # Compute the top-level directory for all tests. - global top_output_dir - env_test_output = os.environ.get('TEST_OUTPUT') - if env_test_output is not None: - top_output_dir = env_test_output - else: - top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) - mkdir_if_needed(top_output_dir) - def zenfixture(func: Fn) -> Fn: """ diff --git a/test_runner/performance/__init__.py b/test_runner/performance/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/performance/conftest.py b/test_runner/performance/conftest.py new file mode 100644 index 0000000000..cd8b40ca82 --- /dev/null +++ b/test_runner/performance/conftest.py @@ -0,0 +1,8 @@ +# pytest some has quirks with discovering plugins, so having it there just works +# probably we should create custom plugin and add it to pytest config to always have needed things at hand +def pytest_addoption(parser): + parser.addoption( + "--out-dir", + dest="out_dir", + help="Directory to ouput performance tests results to.", + ) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 46dcb01c71..9892a70516 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,7 +1,7 @@ -import os from contextlib import closing from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log +from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") @@ -16,7 +16,7 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") # 3. Disk space used # 4. 
Peak memory usage # -def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark): +def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark: ZenithBenchmarker): env = zenith_simple_env # Create a branch for us env.zenith_cli(["branch", "test_bulk_insert", "empty"]) @@ -47,10 +47,16 @@ def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark): pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") # Record peak memory usage - zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB') + zenbenchmark.record("peak_mem", + zenbenchmark.get_peak_mem(env.pageserver) / 1024, + 'MB', + report=MetricReport.LOWER_IS_BETTER) # Report disk space used by the repository timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') + zenbenchmark.record('size', + timeline_size / (1024 * 1024), + 'MB', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index e913afc27c..f2ccb1dc34 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -1,4 +1,5 @@ import timeit +from fixtures.benchmark_fixture import MetricReport import pytest from fixtures.zenith_fixtures import ZenithEnvBuilder @@ -54,4 +55,7 @@ def test_bulk_tenant_create( pg_tenant.stop() - zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's') + zenbenchmark.record('tenant_creation_time', + sum(time_slices) / len(time_slices), + 's', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index b078c820b0..daa8c71df1 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -1,5 +1,6 @@ import os from contextlib import closing +from fixtures.benchmark_fixture import MetricReport from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log @@ -48,10 +49,16 @@ def test_gist_buffering_build(zenith_simple_env: ZenithEnv, zenbenchmark): pscur.execute(f"do_gc {env.initial_tenant} {timeline} 1000000") # Record peak memory usage - zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB') + zenbenchmark.record("peak_mem", + zenbenchmark.get_peak_mem(env.pageserver) / 1024, + 'MB', + report=MetricReport.LOWER_IS_BETTER) # Report disk space used by the repository timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') + zenbenchmark.record('size', + timeline_size / (1024 * 1024), + 'MB', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index dc50587a82..307dfb3559 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -1,6 +1,7 @@ -import os from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.zenith_fixtures import PgBin, ZenithEnv + +from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker from fixtures.log_helper import log pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") @@ -15,7 +16,7 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture") # 2. 
Time to run 5000 pgbench transactions # 3. Disk space used # -def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark): +def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin: PgBin, zenbenchmark: ZenithBenchmarker): env = zenith_simple_env # Create a branch for us env.zenith_cli(["branch", "test_pgbench_perf", "empty"]) @@ -55,4 +56,7 @@ def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark): # Report disk space used by the repository timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') + zenbenchmark.record('size', + timeline_size / (1024 * 1024), + 'MB', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_perf_pgbench_remote.py b/test_runner/performance/test_perf_pgbench_remote.py new file mode 100644 index 0000000000..2d64a39a95 --- /dev/null +++ b/test_runner/performance/test_perf_pgbench_remote.py @@ -0,0 +1,125 @@ +import dataclasses +import os +import subprocess +from typing import List +from fixtures.benchmark_fixture import PgBenchRunResult, ZenithBenchmarker +import pytest +from datetime import datetime +import calendar +import timeit +import os + +pytest_plugins = ("fixtures.benchmark_fixture", ) + + +def utc_now_timestamp() -> int: + return calendar.timegm(datetime.utcnow().utctimetuple()) + + +@dataclasses.dataclass +class PgBenchRunner: + connstr: str + scale: int + transactions: int + pgbench_bin_path: str = "pgbench" + + def invoke(self, args: List[str]) -> 'subprocess.CompletedProcess[str]': + return subprocess.run([self.pgbench_bin_path, *args], + check=True, + text=True, + capture_output=True) + + def init(self, vacuum: bool = True) -> 'subprocess.CompletedProcess[str]': + args = [] + if not vacuum: + args.append("--no-vacuum") + args.extend([f"--scale={self.scale}", "--initialize", self.connstr]) + return self.invoke(args) + + def run(self, jobs: int = 1, clients: int = 1): + return self.invoke([ + f"--transactions={self.transactions}", + f"--jobs={jobs}", + f"--client={clients}", + "--progress=2", # print progress every two seconds + self.connstr, + ]) + + +@pytest.fixture +def connstr(): + res = os.getenv("BENCHMARK_CONNSTR") + if res is None: + raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") + return res + + +def get_transactions_matrix(): + transactions = os.getenv("TEST_PG_BENCH_TRANSACTIONS_MATRIX") + if transactions is None: + return [10**4, 10**5] + return list(map(int, transactions.split(","))) + + +def get_scales_matrix(): + scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX") + if scales is None: + return [10, 20] + return list(map(int, scales.split(","))) + + +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("transactions", get_transactions_matrix()) +@pytest.mark.remote_cluster +def test_pg_bench_remote_cluster(zenbenchmark: ZenithBenchmarker, + connstr: str, + scale: int, + transactions: int): + """ + The best way is to run same pack of tests both, for local zenith + and against staging, but currently local tests heavily depend on + things available only locally e.g. zenith binaries, pageserver api, etc. + Also separate test allows to run pgbench workload against vanilla postgres + or other systems that support postgres protocol. + + Also now this is more of a liveness test because it stresses pageserver internals, + so we clearly see what goes wrong in more "real" environment. 
+ """ + pg_bin = os.getenv("PG_BIN") + if pg_bin is not None: + pgbench_bin_path = os.path.join(pg_bin, "pgbench") + else: + pgbench_bin_path = "pgbench" + + runner = PgBenchRunner( + connstr=connstr, + scale=scale, + transactions=transactions, + pgbench_bin_path=pgbench_bin_path, + ) + # calculate timestamps and durations separately + # timestamp is intended to be used for linking to grafana and logs + # duration is actually a metric and uses float instead of int for timestamp + init_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + runner.init() + init_duration = timeit.default_timer() - t0 + init_end_timestamp = utc_now_timestamp() + + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = runner.run() # TODO handle failures + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + + res = PgBenchRunResult.parse_from_output( + out=out, + init_duration=init_duration, + init_start_timestamp=init_start_timestamp, + init_end_timestamp=init_end_timestamp, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + + zenbenchmark.record_pg_bench_result(res) diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index a5850e98f6..46e8ac5266 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -12,6 +12,7 @@ # Amplification problem at its finest. import os from contextlib import closing +from fixtures.benchmark_fixture import MetricReport from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log @@ -76,4 +77,7 @@ def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark): timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline) - zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB') + zenbenchmark.record('size', + timeline_size / (1024 * 1024), + 'MB', + report=MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/pytest.ini b/test_runner/pytest.ini index 7ea2ae5dfb..b7d42dfe46 100644 --- a/test_runner/pytest.ini +++ b/test_runner/pytest.ini @@ -1,4 +1,8 @@ [pytest] +addopts = + -m 'not remote_cluster' +markers = + remote_cluster minversion = 6.0 log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S