Move logic for ingest benchmark from GitHub workflow into python testcase (#9762)

## Problem

The first version of the ingest benchmark had some parsing and reporting
logic in shell script inside the GitHub workflow.
It is better to move that logic into a Python testcase so that we can
also run it locally.

## Summary of changes

- Create a new Python testcase
- Invoke pgcopydb inside the Python testcase
- Move the following logic into the Python testcase (see the sketch after this list)
  - determine backpressure
  - invoke pgcopydb and report its progress
  - parse the pgcopydb log and extract metrics
  - insert the metrics into the perf test database
- Add an additional column to the perf test database that receives the endpoint
  ID used for the pgcopydb run, so that it is available in the Grafana dashboard
  when retrieving other metrics for that endpoint
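
As a minimal sketch (assuming the `zenbenchmark` fixture from `fixtures.benchmark_fixture`; the log line and endpoint ID are example values), a parsed pgcopydb duration now ends up in the perf test database like this:

```python
import re

from fixtures.benchmark_fixture import MetricReport
from fixtures.utils import humantime_to_ms

# example line from the pgcopydb summary table
line = "Total Wall Clock Duration    both    16h49m    20"
duration = re.search(r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s", line).group(0)  # "16h49m"
# rewrite "16h49m" as "16h 49m", the rust-like humantime format the fixture parses
seconds = humantime_to_ms(" ".join(re.findall(r"\d+[a-zA-Z]+", duration))) / 1000.0
zenbenchmark.record(
    "TOTAL_DURATION",
    seconds,
    "s",
    MetricReport.LOWER_IS_BETTER,
    {"endpoint_id": "ep-icy-union-w25qd5pj"},  # stored in the new labels column
)
```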

## Example run


https://github.com/neondatabase/neon/actions/runs/11860622170/job/33056264386
Peter Bendel authored on 2024-11-19 10:46:46 +01:00, committed by GitHub
parent 9b6af2bcad, commit 982cb1c15d
6 changed files with 324 additions and 248 deletions

View File

@@ -48,6 +48,10 @@ inputs:
description: 'benchmark durations JSON'
required: false
default: '{}'
aws_oicd_role_arn:
description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role'
required: false
default: ''
runs:
using: "composite"
@@ -222,6 +226,13 @@ runs:
# (for example if we didn't run the test for non build-and-test workflow)
skip-if-does-not-exist: true
- name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ inputs.aws_oicd_role_arn }}
role-duration-seconds: 3600 # 1 hour should be more than enough to upload report
- name: Upload test results
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-store

View File

@@ -133,6 +133,7 @@ jobs:
--ignore test_runner/performance/test_perf_pgvector_queries.py
--ignore test_runner/performance/test_logical_replication.py
--ignore test_runner/performance/test_physical_replication.py
--ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

View File

@@ -1,4 +1,4 @@
name: Benchmarking
name: benchmarking ingest
on:
# uncomment to run on push for debugging your PR
@@ -74,18 +74,16 @@ jobs:
compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Initialize Neon project and retrieve current backpressure seconds
- name: Initialize Neon project
if: ${{ matrix.target_project == 'new_empty_project' }}
env:
NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
run: |
echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}"
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV
- name: Create Neon Branch for large tenant
if: ${{ matrix.target_project == 'large_existing_project' }}
@@ -95,266 +93,55 @@ jobs:
project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Initialize Neon project and retrieve current backpressure seconds
- name: Initialize Neon project
if: ${{ matrix.target_project == 'large_existing_project' }}
env:
NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }}
BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }}
NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
run: |
echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}"
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
# Extract the part before the database name
base_connstr="${NEW_PROJECT_CONNSTR%/*}"
base_connstr="${BENCHMARK_INGEST_TARGET_CONNSTR%/*}"
# Extract the query parameters (if any) after the database name
query_params="${NEW_PROJECT_CONNSTR#*\?}"
query_params="${BENCHMARK_INGEST_TARGET_CONNSTR#*\?}"
# Reconstruct the new connection string
if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
if [ "$query_params" != "$BENCHMARK_INGEST_TARGET_CONNSTR" ]; then
new_connstr="${base_connstr}/neondb?${query_params}"
else
new_connstr="${base_connstr}/neondb"
fi
${PSQL} "${new_connstr}" -c "drop database ludicrous;"
${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;"
if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}"
if [ "$query_params" != "$BENCHMARK_INGEST_TARGET_CONNSTR" ]; then
BENCHMARK_INGEST_TARGET_CONNSTR="${base_connstr}/ludicrous?${query_params}"
else
NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous"
BENCHMARK_INGEST_TARGET_CONNSTR="${base_connstr}/ludicrous"
fi
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
- name: Create pgcopydb filter file
run: |
cat << EOF > /tmp/pgcopydb_filter.txt
[include-only-table]
public.events
public.emails
public.email_transmissions
public.payments
public.editions
public.edition_modules
public.sp_content
public.email_broadcasts
public.user_collections
public.devices
public.user_accounts
public.lessons
public.lesson_users
public.payment_methods
public.orders
public.course_emails
public.modules
public.users
public.module_users
public.courses
public.payment_gateway_keys
public.accounts
public.roles
public.payment_gateways
public.management
public.event_names
EOF
${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV
- name: Invoke pgcopydb
- name: Invoke pgcopydb
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: performance/test_perf_ingest_using_pgcopydb.py
run_in_parallel: false
extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb
pg_version: v16
save_perf_report: true
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }}
run: |
export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH}
export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}"
export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}"
export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
${PG_CONFIG} --bindir
${PGCOPYDB} --version
${PGCOPYDB} clone --skip-vacuum --no-owner --no-acl --skip-db-properties --table-jobs 4 \
--index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \
--use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log
BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }}
TARGET_PROJECT_TYPE: ${{ matrix.target_project }}
# we report PLATFORM in zenbenchmark NeonBenchmarker perf database and want to distinguish between new project and large tenant
PLATFORM: "${{ matrix.target_project }}-us-east-2-staging"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
# create dummy pgcopydb log to test parsing
# - name: create dummy log for parser test
# run: |
# cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log
# 2024-11-04 18:00:53.433 500861 INFO main.c:136 Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb"
# 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1225 [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
# 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1226 [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
# 2024-11-04 18:00:53.442 500861 INFO copydb.c:105 Using work dir "/tmp/pgcopydb"
# 2024-11-04 18:00:53.541 500861 INFO snapshot.c:107 Exported snapshot "00000008-00000033-1" from the source database
# 2024-11-04 18:00:53.556 500865 INFO cli_clone_follow.c:543 STEP 1: fetch source database tables, indexes, and sequences
# 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:716 Splitting source candidate tables larger than 10 GB
# 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:829 Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID
# 2024-11-04 18:01:05.538 500865 INFO copydb_schema.c:905 Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid.
# 2024-11-04 18:01:05.564 500865 INFO copydb_schema.c:905 Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.584 500865 INFO copydb_schema.c:905 Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:905 Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:761 Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk
# 2024-11-04 18:01:05.687 500865 INFO copydb_schema.c:968 Fetched information for 57 indexes (supporting 25 constraints)
# 2024-11-04 18:01:05.753 500865 INFO sequences.c:78 Fetching information for 24 sequences
# 2024-11-04 18:01:05.903 500865 INFO copydb_schema.c:1122 Fetched information for 4 extensions
# 2024-11-04 18:01:06.178 500865 INFO copydb_schema.c:1538 Found 0 indexes (supporting 0 constraints) in the target database
# 2024-11-04 18:01:06.184 500865 INFO cli_clone_follow.c:584 STEP 2: dump the source database schema (pre/post data)
# 2024-11-04 18:01:06.186 500865 INFO pgcmd.c:468 /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60'
# 2024-11-04 18:01:06.952 500865 INFO cli_clone_follow.c:592 STEP 3: restore the pre-data section to the target database
# 2024-11-04 18:01:07.004 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump
# 2024-11-04 18:01:07.438 500874 INFO table-data.c:656 STEP 4: starting 4 table-data COPY processes
# 2024-11-04 18:01:07.451 500877 INFO vacuum.c:139 STEP 8: skipping VACUUM jobs per --skip-vacuum
# 2024-11-04 18:01:07.457 500875 INFO indexes.c:182 STEP 6: starting 4 CREATE INDEX processes
# 2024-11-04 18:01:07.457 500875 INFO indexes.c:183 STEP 7: constraints are built by the CREATE INDEX processes
# 2024-11-04 18:01:07.507 500865 INFO blobs.c:74 Skipping large objects: none found.
# 2024-11-04 18:01:07.509 500865 INFO sequences.c:194 STEP 9: reset sequences values
# 2024-11-04 18:01:07.510 500886 INFO sequences.c:290 Set sequences values on the target database
# 2024-11-04 20:49:00.587 500865 INFO cli_clone_follow.c:608 STEP 10: restore the post-data section to the target database
# 2024-11-04 20:49:00.600 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump
# 2024-11-05 10:50:58.508 500865 INFO cli_clone_follow.c:639 All step are now done, 16h49m elapsed
# 2024-11-05 10:50:58.508 500865 INFO summary.c:3155 Printing summary for 26 tables and 57 indexes
# OID | Schema | Name | Parts | copy duration | transmitted bytes | indexes | create index duration
# ------+--------+----------------------+-------+---------------+-------------------+---------+----------------------
# 24654 | public | events | 10 | 1d11h | 878 GB | 1 | 1h41m
# 24623 | public | email_transmissions | 4 | 4h46m | 99 GB | 3 | 2h04m
# 24665 | public | lessons | 4 | 4h42m | 161 GB | 4 | 1m11s
# 24661 | public | lesson_users | 3 | 2h46m | 49 GB | 3 | 39m35s
# 24631 | public | emails | 1 | 34m07s | 10 GB | 2 | 17s
# 24739 | public | payments | 1 | 5m47s | 1848 MB | 4 | 4m40s
# 24681 | public | module_users | 1 | 4m57s | 1610 MB | 3 | 1m50s
# 24694 | public | orders | 1 | 2m50s | 835 MB | 3 | 1m05s
# 24597 | public | devices | 1 | 1m45s | 498 MB | 2 | 40s
# 24723 | public | payment_methods | 1 | 1m24s | 548 MB | 2 | 31s
# 24765 | public | user_collections | 1 | 2m17s | 1005 MB | 2 | 968ms
# 24774 | public | users | 1 | 52s | 291 MB | 4 | 27s
# 24760 | public | user_accounts | 1 | 16s | 172 MB | 3 | 16s
# 24606 | public | edition_modules | 1 | 8s983 | 46 MB | 3 | 4s749
# 24583 | public | course_emails | 1 | 8s526 | 26 MB | 2 | 996ms
# 24685 | public | modules | 1 | 1s592 | 21 MB | 3 | 1s696
# 24610 | public | editions | 1 | 2s199 | 7483 kB | 2 | 1s032
# 24755 | public | sp_content | 1 | 1s555 | 4177 kB | 0 | 0ms
# 24619 | public | email_broadcasts | 1 | 744ms | 2645 kB | 2 | 677ms
# 24590 | public | courses | 1 | 387ms | 1540 kB | 2 | 367ms
# 24704 | public | payment_gateway_keys | 1 | 1s972 | 164 kB | 2 | 27ms
# 24576 | public | accounts | 1 | 58ms | 24 kB | 1 | 14ms
# 24647 | public | event_names | 1 | 32ms | 397 B | 1 | 8ms
# 24716 | public | payment_gateways | 1 | 1s675 | 117 B | 1 | 11ms
# 24748 | public | roles | 1 | 71ms | 173 B | 1 | 8ms
# 24676 | public | management | 1 | 33ms | 40 B | 1 | 19ms
# Step Connection Duration Transfer Concurrency
# -------------------------------------------------- ---------- ---------- ---------- ------------
# Catalog Queries (table ordering, filtering, etc) source 12s 1
# Dump Schema source 765ms 1
# Prepare Schema target 466ms 1
# COPY, INDEX, CONSTRAINTS, VACUUM (wall clock) both 2h47m 12
# COPY (cumulative) both 7h46m 1225 GB 4
# CREATE INDEX (cumulative) target 4h36m 4
# CONSTRAINTS (cumulative) target 8s493 4
# VACUUM (cumulative) target 0ms 4
# Reset Sequences both 60ms 1
# Large Objects (cumulative) (null) 0ms 0
# Finalize Schema both 14h01m 4
# -------------------------------------------------- ---------- ---------- ---------- ------------
# Total Wall Clock Duration both 16h49m 20
# EOF
- name: show tables sizes and retrieve current backpressure seconds
- name: show tables sizes after ingest
run: |
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+"
BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV
- name: Parse pgcopydb log and report performance metrics
env:
PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }}
run: |
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
# Define the log file path
LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log"
# Get the current git commit hash
git config --global --add safe.directory /__w/neon/neon
COMMIT_HASH=$(git rev-parse --short HEAD)
# Define the platform and test suite
PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging"
SUIT="pgcopydb_ingest_bench"
# Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds
convert_to_seconds() {
local duration=$1
local total_seconds=0
# Check for hours (h)
if [[ "$duration" =~ ([0-9]+)h ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600))
fi
# Check for seconds (s)
if [[ "$duration" =~ ([0-9]+)s ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0}))
fi
# Check for milliseconds (ms) (if applicable)
if [[ "$duration" =~ ([0-9]+)ms ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000))
duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m
fi
# Check for minutes (m) - must be checked after ms because m is contained in ms
if [[ "$duration" =~ ([0-9]+)m ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60))
fi
echo $total_seconds
}
# Calculate the backpressure difference in seconds
BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}")
# Insert the backpressure time difference into the performance database
if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then
PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now());
\""
echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds"
eval $PSQL_CMD
fi
# Extract and process log lines
while IFS= read -r line; do
METRIC_NAME=""
# Match each desired line and extract the relevant information
if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then
METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)"
elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then
METRIC_NAME="COPY (cumulative)"
elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then
METRIC_NAME="CREATE INDEX (cumulative)"
elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then
METRIC_NAME="CONSTRAINTS (cumulative)"
elif [[ "$line" =~ Finalize\ Schema.* ]]; then
METRIC_NAME="Finalize Schema"
elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then
METRIC_NAME="Total Wall Clock Duration"
fi
# If a metric was matched, insert it into the performance database
if [ -n "$METRIC_NAME" ]; then
DURATION=$(echo "$line" | grep -oP '\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1)
METRIC_VALUE=$(convert_to_seconds "$DURATION")
PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now());
\""
echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds"
eval $PSQL_CMD
fi
done < "$LOG_FILE"
${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+"
- name: Delete Neon Project
if: ${{ always() && matrix.target_project == 'new_empty_project' }}

View File

@@ -25,7 +25,8 @@ CREATE TABLE IF NOT EXISTS perf_test_results (
metric_value NUMERIC,
metric_unit VARCHAR(10),
metric_report_type TEXT,
recorded_at_timestamp TIMESTAMP WITH TIME ZONE DEFAULT NOW()
recorded_at_timestamp TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
labels JSONB DEFAULT '{}'
)
"""
@@ -91,6 +92,7 @@ def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int)
"metric_unit": metric["unit"],
"metric_report_type": metric["report"],
"recorded_at_timestamp": datetime.utcfromtimestamp(recorded_at_timestamp),
"labels": json.dumps(metric.get("labels")),
}
args_list.append(values)
@@ -105,7 +107,8 @@ def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int)
metric_value,
metric_unit,
metric_report_type,
recorded_at_timestamp
recorded_at_timestamp,
labels
) VALUES %s
""",
args_list,
@@ -117,7 +120,8 @@ def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int)
%(metric_value)s,
%(metric_unit)s,
%(metric_report_type)s,
%(recorded_at_timestamp)s
%(recorded_at_timestamp)s,
%(labels)s
)""",
)
return len(args_list)
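
With the new `labels` column in place, a Grafana panel can retrieve the endpoint ID alongside the other metrics. A hypothetical lookup (the psycopg2 usage and the exact query are illustrative, not part of this commit):

```python
import os

import psycopg2

# read ingest metrics together with the endpoint ID stored in the JSONB labels column
with psycopg2.connect(os.environ["PERF_TEST_RESULT_CONNSTR"]) as conn:
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT metric_name, metric_value, labels->>'endpoint_id'
            FROM public.perf_test_results
            WHERE suit = %s
            ORDER BY recorded_at_timestamp DESC
            """,
            ("pgcopydb_ingest_bench",),
        )
        for name, value, endpoint_id in cur.fetchall():
            print(name, value, endpoint_id)
```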

View File

@@ -256,12 +256,17 @@ class NeonBenchmarker:
metric_value: float,
unit: str,
report: MetricReport,
labels: Optional[
dict[str, str]
] = None, # use this to associate additional key/value pairs in json format for associated Neon object IDs like project ID with the metric
):
"""
Record a benchmark result.
"""
# just to namespace the value
name = f"{self.PROPERTY_PREFIX}_{metric_name}"
if labels is None:
labels = {}
self.property_recorder(
name,
{
@@ -269,6 +274,7 @@ class NeonBenchmarker:
"value": metric_value,
"unit": unit,
"report": report,
"labels": labels,
},
)
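
Usage stays backward compatible; a short sketch (the metric name and value are invented for illustration):

```python
# existing callers are unchanged: labels defaults to {} when omitted
zenbenchmark.record("copy_duration", 1234.5, "s", MetricReport.LOWER_IS_BETTER)

# new: attach Neon object IDs to a metric, e.g. the compute endpoint ID
zenbenchmark.record(
    "copy_duration",
    1234.5,
    "s",
    MetricReport.LOWER_IS_BETTER,
    labels={"endpoint_id": "ep-icy-union-w25qd5pj"},
)
```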

View File

@@ -0,0 +1,267 @@
import os
import re
import subprocess
import sys
import textwrap
from pathlib import Path
from typing import cast
from urllib.parse import urlparse

import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.utils import humantime_to_ms


def setup_environment():
"""Set up necessary environment variables for pgcopydb execution.
Expects the following variables to be set in the environment:
- PG_CONFIG: e.g. /tmp/neon/pg_install/v16/bin/pg_config
- PSQL: e.g. /tmp/neon/pg_install/v16/bin/psql
- PG_16_LIB_PATH: e.g. /tmp/neon/pg_install/v16/lib
- PGCOPYDB: e.g. /pgcopydb/bin/pgcopydb
- PGCOPYDB_LIB_PATH: e.g. /pgcopydb/lib
- BENCHMARK_INGEST_SOURCE_CONNSTR
- BENCHMARK_INGEST_TARGET_CONNSTR
- PERF_TEST_RESULT_CONNSTR
- TARGET_PROJECT_TYPE
"""
# Ensure required environment variables are set
required_env_vars = [
"PGCOPYDB",
"PGCOPYDB_LIB_PATH",
"PG_CONFIG",
"PSQL",
"PG_16_LIB_PATH",
"BENCHMARK_INGEST_SOURCE_CONNSTR",
"BENCHMARK_INGEST_TARGET_CONNSTR",
"PERF_TEST_RESULT_CONNSTR",
"TARGET_PROJECT_TYPE",
]
for var in required_env_vars:
if not os.getenv(var):
raise OSError(f"Required environment variable '{var}' is not set.")


def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path):
"""Builds the pgcopydb command to execute using existing environment variables."""
pgcopydb_executable = os.getenv("PGCOPYDB")
if not pgcopydb_executable:
raise OSError("PGCOPYDB environment variable is not set.")
return [
pgcopydb_executable,
"clone",
"--dir",
str(test_output_dir),
"--skip-vacuum",
"--no-owner",
"--no-acl",
"--skip-db-properties",
"--table-jobs",
"4",
"--index-jobs",
"4",
"--restore-jobs",
"4",
"--split-tables-larger-than",
"10GB",
"--skip-extensions",
"--use-copy-binary",
"--filters",
str(pgcopydb_filter_file),
]


@pytest.fixture()  # must be function scoped because test_output_dir is function scoped
def pgcopydb_filter_file(test_output_dir: Path) -> Path:
"""Creates the pgcopydb_filter.txt file required by pgcopydb."""
filter_content = textwrap.dedent("""\
[include-only-table]
public.events
public.emails
public.email_transmissions
public.payments
public.editions
public.edition_modules
public.sp_content
public.email_broadcasts
public.user_collections
public.devices
public.user_accounts
public.lessons
public.lesson_users
public.payment_methods
public.orders
public.course_emails
public.modules
public.users
public.module_users
public.courses
public.payment_gateway_keys
public.accounts
public.roles
public.payment_gateways
public.management
public.event_names
""")
filter_path = test_output_dir / "pgcopydb_filter.txt"
filter_path.write_text(filter_content)
return filter_path


def get_backpressure_time(connstr):
"""Executes a query to get the backpressure throttling time in seconds."""
query = "select backpressure_throttling_time()/1000000;"
psql_path = os.getenv("PSQL")
if psql_path is None:
raise OSError("The PSQL environment variable is not set.")
result = subprocess.run(
[psql_path, connstr, "-t", "-c", query], capture_output=True, text=True, check=True
)
return float(result.stdout.strip())


def run_command_and_log_output(command, log_file_path: Path):
"""
Runs a command and logs output to both a file and GitHub Actions console.
Args:
command (list): The command to execute.
log_file_path (Path): Path object for the log file where output is written.
"""
# Define a list of necessary environment variables for pgcopydb
custom_env_vars = {
"LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}",
"PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")),
"PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")),
"PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
}
# Combine the current environment with custom variables
env = os.environ.copy()
env.update(custom_env_vars)
with log_file_path.open("w") as log_file:
process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env
)
assert process.stdout is not None, "process.stdout should not be None"
# Stream output to both log file and console
for line in process.stdout:
print(line, end="") # Stream to GitHub Actions log
sys.stdout.flush()
log_file.write(line) # Write to log file
process.wait() # Wait for the process to finish
if process.returncode != 0:
raise subprocess.CalledProcessError(process.returncode, command)


def parse_log_and_report_metrics(
zenbenchmark: NeonBenchmarker, log_file_path: Path, backpressure_time_diff: float
):
"""Parses the pgcopydb log file for performance metrics and reports them to the database."""
metrics = {"backpressure_time": backpressure_time_diff}
# Define regex patterns to capture metrics
metric_patterns = {
"COPY_INDEX_CONSTRAINTS_VACUUM": re.compile(
r"COPY, INDEX, CONSTRAINTS, VACUUM \(wall clock\).*"
),
"COPY_CUMULATIVE": re.compile(r"COPY \(cumulative\).*"),
"CREATE_INDEX_CUMULATIVE": re.compile(r"CREATE INDEX \(cumulative\).*"),
"CONSTRAINTS_CUMULATIVE": re.compile(r"CONSTRAINTS \(cumulative\).*"),
"FINALIZE_SCHEMA": re.compile(r"Finalize Schema.*"),
"TOTAL_DURATION": re.compile(r"Total Wall Clock Duration.*"),
}
# Parse log file
with log_file_path.open("r") as log_file:
for line in log_file:
for metric_name, pattern in metric_patterns.items():
if pattern.search(line):
# Extract duration and convert it to seconds
duration_match = re.search(r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s", line)
if duration_match:
duration_str = duration_match.group(0)
parts = re.findall(r"\d+[a-zA-Z]+", duration_str)
rust_like_humantime = " ".join(parts)
duration_seconds = humantime_to_ms(rust_like_humantime) / 1000.0
metrics[metric_name] = duration_seconds
endpoint_id = {"endpoint_id": get_endpoint_id()}
for metric_name, duration_seconds in metrics.items():
zenbenchmark.record(
metric_name, duration_seconds, "s", MetricReport.LOWER_IS_BETTER, endpoint_id
)


def get_endpoint_id():
"""Extracts and returns the first segment of the hostname from the PostgreSQL URI stored in BENCHMARK_INGEST_TARGET_CONNSTR."""
connstr = os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")
if connstr is None:
raise OSError("BENCHMARK_INGEST_TARGET_CONNSTR environment variable is not set.")
# Parse the URI
parsed_url = urlparse(connstr)
# Extract the hostname and split to get the first segment
hostname = parsed_url.hostname
if hostname is None:
raise ValueError("Unable to parse hostname from BENCHMARK_INGEST_TARGET_CONNSTR")
# Split the hostname by dots and take the first segment
endpoint_id = hostname.split(".")[0]
return endpoint_id


@pytest.fixture()  # must be function scoped because test_output_dir is function scoped
def log_file_path(test_output_dir):
"""Fixture to provide a temporary log file path."""
if not os.getenv("TARGET_PROJECT_TYPE"):
raise OSError("Required environment variable 'TARGET_PROJECT_TYPE' is not set.")
return (test_output_dir / os.getenv("TARGET_PROJECT_TYPE")).with_suffix(".log")


@pytest.mark.remote_cluster
def test_ingest_performance_using_pgcopydb(
zenbenchmark: NeonBenchmarker,
log_file_path: Path,
pgcopydb_filter_file: Path,
test_output_dir: Path,
):
"""
Simulate project migration from another PostgreSQL provider to Neon.
Measure performance for Neon ingest steps
- COPY
- CREATE INDEX
- CREATE CONSTRAINT
- VACUUM ANALYZE
- create foreign keys
Use pgcopydb to copy data from the source database to the destination database.
"""
# Set up environment and create filter file
setup_environment()
# Get backpressure time before ingest
backpressure_time_before = get_backpressure_time(os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR"))
# Build and run the pgcopydb command
command = build_pgcopydb_command(pgcopydb_filter_file, test_output_dir)
try:
run_command_and_log_output(command, log_file_path)
except subprocess.CalledProcessError as e:
pytest.fail(f"pgcopydb command failed with error: {e}")
# Get backpressure time after ingest and calculate the difference
backpressure_time_after = get_backpressure_time(os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR"))
backpressure_time_diff = backpressure_time_after - backpressure_time_before
# Parse log file and report metrics, including backpressure time difference
parse_log_and_report_metrics(zenbenchmark, log_file_path, backpressure_time_diff)
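
Since the point of the move is to be able to run the benchmark locally, a hypothetical local invocation could look like this (paths follow the examples in the docstring above; the connection strings are placeholders, and `--timeout` assumes the pytest-timeout plugin used by the test suite):

```python
import os

import pytest

os.environ.update(
    {
        "PG_CONFIG": "/tmp/neon/pg_install/v16/bin/pg_config",
        "PSQL": "/tmp/neon/pg_install/v16/bin/psql",
        "PG_16_LIB_PATH": "/tmp/neon/pg_install/v16/lib",
        "PGCOPYDB": "/pgcopydb/bin/pgcopydb",
        "PGCOPYDB_LIB_PATH": "/pgcopydb/lib",
        "BENCHMARK_INGEST_SOURCE_CONNSTR": "postgres://...",  # placeholder
        "BENCHMARK_INGEST_TARGET_CONNSTR": "postgres://...",  # placeholder
        "PERF_TEST_RESULT_CONNSTR": "postgres://...",  # placeholder
        "TARGET_PROJECT_TYPE": "new_empty_project",
    }
)
pytest.main(
    [
        "-s",
        "-m",
        "remote_cluster",
        "--timeout=86400",
        "-k",
        "test_ingest_performance_using_pgcopydb",
        "test_runner/performance/test_perf_ingest_using_pgcopydb.py",
    ]
)
```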