From 87fc0a03747c4c1e24b2a8ec1479ea97150fa433 Mon Sep 17 00:00:00 2001
From: Peter Bendel
Date: Fri, 23 May 2025 11:37:19 +0200
Subject: [PATCH] periodic pagebench on hetzner runners (#11963)

## Problem

- The periodic pagebench benchmark produced inconsistent results even when run with the same commit hash. The hypothesis is that this was caused by running on a dedicated but virtualized EC2 instance with varying CPU frequency.
- The dedicated instance type used for the benchmark is quite "old", and we increasingly get `An error occurred (InsufficientInstanceCapacity) when calling the StartInstances operation (reached max retries: 2): Insufficient capacity.`
- Periodic pagebench uses a snapshot of pageserver timelines so that every run sees the same layer structure and yields consistent performance. Re-creating that snapshot was a painful manual process (see https://github.com/neondatabase/cloud/issues/27051 and https://github.com/neondatabase/cloud/issues/27653).

## Summary of changes

- Run the periodic pagebench on a custom Hetzner GitHub runner with a large NVMe disk and the CPU governor set to a defined performance profile.
- Provide a manual dispatch option for the workflow that allows creating a new snapshot.
- Keep the manual dispatch option to specify a commit hash, which is useful for bisecting regressions.
- Always use the newest created snapshot (the S3 key carries a date suffix, for example `s3://neon-github-public-dev/performance/pagebench/shared-snapshots-2025-05-17/`).
- `--ignore` `test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py` in the regular benchmark run for each commit.
- Speed up snapshot copying by invoking `cp` in a subprocess instead of traversing the tree in Python.

## Example runs with code in this PR:

- Run that creates a new snapshot: https://github.com/neondatabase/neon/actions/runs/15083408849/job/42402986376#step:19:55
- Run that uses the latest snapshot: https://github.com/neondatabase/neon/actions/runs/15084907676/job/42406240745#step:11:65
---
 .github/workflows/build_and_test.yml | 3 +-
 .github/workflows/periodic_pagebench.yml | 281 ++++++++++++------
 scripts/benchmark_durations.py | 6 -
 test_runner/fixtures/neon_fixtures.py | 7 +-
 ...er_max_throughput_getpage_at_latest_lsn.py | 8 +-
 5 files changed, 197 insertions(+), 108 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a887db2ab1..9f2fa3d52c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -314,7 +314,8 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} + # test_pageserver_max_throughput_getpage_at_latest_lsn is run in separate workflow periodic_pagebench.yml because it needs snapshots + extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} --ignore=test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 532da435c2..317db94052 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -1,4 +1,4 @@ -name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region +name: Periodic pagebench performance test on unit-perf hetzner
runner on: schedule: @@ -8,7 +8,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 */3 * * *' # Runs every 3 hours + - cron: '0 */4 * * *' # Runs every 4 hours workflow_dispatch: # Allows manual triggering of the workflow inputs: commit_hash: @@ -16,6 +16,11 @@ on: description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' required: false default: '' + recreate_snapshots: + type: boolean + description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.' + required: false + default: false defaults: run: @@ -29,13 +34,13 @@ permissions: contents: read jobs: - trigger_bench_on_ec2_machine_in_eu_central_1: + run_periodic_pagebench_test: permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, small ] + runs-on: [ self-hosted, unit-perf ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: @@ -44,10 +49,13 @@ jobs: options: --init timeout-minutes: 360 # Set the timeout to 6 hours env: - API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} RUN_ID: ${{ github.run_id }} - AWS_DEFAULT_REGION : "eu-central-1" - AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + DEFAULT_PG_VERSION: 16 + BUILD_TYPE: release + RUST_BACKTRACE: 1 + # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container + S3_BUCKET: neon-github-public-dev + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" steps: # we don't need the neon source code because we run everything remotely # however we still need the local github actions to run the allure step below @@ -56,99 +64,194 @@ jobs: with: egress-policy: audit - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive + id: set-env + shell: bash -euxo pipefail {0} + run: | + { + echo "NEON_DIR=${RUNNER_TEMP}/neon" + echo "NEON_BIN=${RUNNER_TEMP}/neon/bin" + echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install" + echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib" + echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots" + echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output" + echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local" + echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results" + echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results" + } >> "$GITHUB_ENV" - - name: Show my own (github runner) external IP address - usefull for IP allowlisting - run: curl https://ifconfig.me + echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT" - - name: Assume AWS OIDC role that allows to manage (start/stop/describe... 
EC machine) - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} - role-duration-seconds: 3600 - - - name: Start EC2 instance and wait for the instance to boot up - run: | - aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID - aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID - sleep 60 # sleep some time to allow cloudinit and our API server to start up - - - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US - run: | - public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) - echo "Public IP of the EC2 instance: $public_ip" - echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV - + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built) - name: Determine commit hash + id: commit_hash + shell: bash -euxo pipefail {0} env: INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} run: | - if [ -z "$INPUT_COMMIT_HASH" ]; then - echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + if [[ -z "${INPUT_COMMIT_HASH}" ]]; then + COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha') + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV + echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT" echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV else - echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + COMMIT_HASH="${INPUT_COMMIT_HASH}" + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV + echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT" echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV fi + - name: Checkout the neon repository at given commit hash + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ steps.commit_hash.outputs.commit_hash }} - - name: Start Bench with run_id + # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash + # example artifact + # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst + - name: Determine artifact S3_KEY for given commit hash and download and extract artifact + id: artifact_prefix + shell: bash -euxo pipefail {0} + env: + ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst + COMMIT_HASH: ${{ env.COMMIT_HASH }} + COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }} run: | - curl -k -X 'POST' \ - "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -H "Authorization: Bearer $API_KEY" \ - -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}" + attempt=0 + max_attempts=24 # 5 minutes * 24 = 2 hours - - name: Poll Test Status - id: poll_step - run: | - status="" - while [[ "$status" != "failure" && "$status" != "success" ]]; do - response=$(curl -k -X 'GET' \ - "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H "Authorization: Bearer $API_KEY") - 
echo "Response: $response" - set +x - status=$(echo $response | jq -r '.status') - echo "Test status: $status" - if [[ "$status" == "failure" ]]; then - echo "Test failed" - exit 1 # Fail the job step if status is failure - elif [[ "$status" == "success" || "$status" == "null" ]]; then + while [[ $attempt -lt $max_attempts ]]; do + # the following command will fail until the artifacts are available ... + S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \ + | jq -r '.Contents[]?.Key' \ + | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \ + | sort --version-sort \ + | tail -1) || true # ... thus ignore errors from the command + if [[ -n "${S3_KEY}" ]]; then + echo "Artifact found: $S3_KEY" + echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV break - elif [[ "$status" == "too_many_runs" ]]; then - echo "Too many runs already running" - echo "too_many_runs=true" >> "$GITHUB_OUTPUT" - exit 1 fi - - sleep 60 # Poll every 60 seconds + + # Increment attempt counter and sleep for 5 minutes + attempt=$((attempt + 1)) + echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..." + sleep 300 # Sleep for 5 minutes done - - name: Retrieve Test Logs - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - run: | - curl -k -X 'GET' \ - "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ - -H 'accept: application/gzip' \ - -H "Authorization: Bearer $API_KEY" \ - --output "test_log_${GITHUB_RUN_ID}.gz" + if [[ -z "${S3_KEY}" ]]; then + echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH" after 2 hours + else + mkdir -p $(dirname $ARCHIVE) + time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE} + mkdir -p ${NEON_DIR} + time tar -xf ${ARCHIVE} -C ${NEON_DIR} + rm -f ${ARCHIVE} + fi - - name: Unzip Test Log and Print it into this job's log - if: always() && steps.poll_step.outputs.too_many_runs != 'true' + - name: Download snapshots from S3 + if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }} + id: download_snapshots + shell: bash -euxo pipefail {0} run: | - gzip -d "test_log_${GITHUB_RUN_ID}.gz" - cat "test_log_${GITHUB_RUN_ID}" + # Download the snapshots from S3 + mkdir -p ${TEST_OUTPUT} + mkdir -p $BACKUP_DIR + cd $BACKUP_DIR + mkdir parts + cd parts + PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \ + | jq -r '.Contents[]?.Key' \ + | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \ + | sort \ + | tail -1) + echo "Latest PART: $PART" + if [[ -z "$PART" ]]; then + echo "ERROR: No matching S3 key found" >&2 + exit 1 + fi + S3_KEY=$(dirname $PART) + time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ . 
+ cd $TEST_OUTPUT + time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions + rm -rf ${BACKUP_DIR} + + - name: Cache poetry deps + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -euxo pipefail {0} + run: ./scripts/pysync + + # we need high number of open files for pagebench + - name: show ulimits + shell: bash -euxo pipefail {0} + run: | + ulimit -a + + - name: Run pagebench testcase + shell: bash -euxo pipefail {0} + env: + CI: false # need to override this env variable set by github to enforce using snapshots + run: | + export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE} + # report the commit hash of the neon repository in the revision of the test results + export GITHUB_SHA=${COMMIT_HASH} + rm -rf ${PERF_REPORT_DIR} + rm -rf ${ALLURE_RESULTS_DIR} + mkdir -p ${PERF_REPORT_DIR} + mkdir -p ${ALLURE_RESULTS_DIR} + PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA" + EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json" + # run only two selected tests + # environment set by parent: + # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release + ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS} + ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS} + + - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results + shell: bash -euxo pipefail {0} + run: | + export REPORT_FROM="$PERF_REPORT_DIR" + export GITHUB_SHA=${COMMIT_HASH} + time ./scripts/generate_and_push_perf_report.sh + + - name: Upload test results + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-store + with: + report-dir: ${{ steps.set-env.outputs.allure_results_dir }} + unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Upload snapshots + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }} + id: upload_snapshots + shell: bash -euxo pipefail {0} + run: | + mkdir -p $BACKUP_DIR + cd $TEST_OUTPUT + tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst + cd $BACKUP_DIR + mkdir parts + split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part. + SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD + cd parts + time aws s3 cp --recursive . 
s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/ + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 @@ -157,26 +260,22 @@ jobs: slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - + - name: Cleanup Test Resources if: always() + shell: bash -euxo pipefail {0} + env: + ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst run: | - curl -k -X 'POST' \ - "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H "Authorization: Bearer $API_KEY" \ - -d '' + # Cleanup the test resources + if [[ -d "${BACKUP_DIR}" ]]; then + rm -rf ${BACKUP_DIR} + fi + if [[ -d "${TEST_OUTPUT}" ]]; then + rm -rf ${TEST_OUTPUT} + fi + if [[ -d "${NEON_DIR}" ]]; then + rm -rf ${NEON_DIR} + fi + rm -rf $(dirname $ARCHIVE) - - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine) - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} - role-duration-seconds: 3600 - - - name: Stop EC2 instance and wait for the instance to be stopped - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - run: | - aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID - aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index a9a90c7370..c74ef9d899 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -32,12 +32,6 @@ BENCHMARKS_DURATION_QUERY = """ # the total duration varies from 8 to 40 minutes. # We use some pre-collected durations as a fallback to have a better distribution. 
FALLBACK_DURATION = { - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135, "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036, "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104, "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e413b3c6d2..5c92f2e2d0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -682,7 +682,7 @@ class NeonEnvBuilder: log.info( f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" ) - shutil.copytree(tenants_from_dir, tenants_to_dir) + subprocess.run(["cp", "-a", tenants_from_dir, tenants_to_dir], check=True) else: log.info( f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" @@ -698,8 +698,9 @@ class NeonEnvBuilder: shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) if self.test_overlay_dir is None: log.info("Copying local_fs_remote_storage directory from snapshot") - shutil.copytree( - repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + subprocess.run( + ["cp", "-a", f"{repo_dir / 'local_fs_remote_storage'}", f"{self.repo_dir}"], + check=True, ) else: log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot") diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 8874fe663b..41696bf887 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -14,7 +14,7 @@ from fixtures.neon_fixtures import ( PgBin, wait_for_last_flush_lsn, ) -from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci +from fixtures.utils import get_scale_for_db, humantime_to_ms from performance.pageserver.util import setup_pageserver_with_tenants @@ -36,9 +36,6 @@ if TYPE_CHECKING: @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [500]) @pytest.mark.timeout(10000) -@skip_on_ci( - "This test needs lot of resources and should run on dedicated HW, not in github action runners as 
part of CI" -) def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, @@ -63,9 +60,6 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("n_clients", [1, 64]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) -@skip_on_ci( - "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" -) def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker,