From 59e393aef35fea56bbbf5dd1feeebfb3c518731d Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:28:39 +0200 Subject: [PATCH] Enable parallel execution of extension tests (#12118) ## Problem Extension tests were previously run sequentially, resulting in unnecessary wait time and underutilization of available CPU cores. ## Summary of changes Tests are now executed in a customizable number of parallel threads using separate database branches. This reduces overall test time by approximately 50% (e.g., on my laptop, parallel test lasts 173s, while sequential one lasts 340s) and increases the load on the pageserver, providing better test coverage. --------- Co-authored-by: Alexander Bayandin Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Alexey Masterov --- .github/workflows/build_and_test.yml | 1 + .gitignore | 1 + compute/compute-node.Dockerfile | 4 +- .../compute_wrapper/shell/compute.sh | 18 ++-- docker-compose/docker-compose.yml | 25 +++-- docker-compose/docker_compose_test.sh | 92 +++++++++++++------ docker-compose/run-tests.sh | 69 +++++++++----- docker-compose/test_extensions_upgrade.sh | 6 +- 8 files changed, 144 insertions(+), 72 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0ceaa96fb0..864abad574 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -986,6 +986,7 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 60 env: + PARALLEL_COMPUTES: 3 TAG: >- ${{ needs.meta.outputs.run-kind == 'compute-rc-pr' diff --git a/.gitignore b/.gitignore index 6574d7b9de..4857972f1d 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ neon.iml /.neon /integration_tests/.neon compaction-suite-results.* +docker-compose/docker-compose-parallel.yml # Coverage *.profraw diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0dd32011fb..39136fe573 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1915,10 +1915,10 @@ RUN cd /ext-src/pg_repack-src && patch -p1 /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig -RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \ +RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq parallel \ && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/* ENV PATH=/usr/local/pgsql/bin:$PATH -ENV PGHOST=compute +ENV PGHOST=compute1 ENV PGPORT=55433 ENV PGUSER=cloud_admin ENV PGDATABASE=postgres diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 1e62e91fd0..6f36b4358e 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -54,14 +54,16 @@ else printf '%s\n' "${result}" | jq . fi - echo "Check if a timeline present" - PARAMS=( - -X GET - -H "Content-Type: application/json" - "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" - ) - timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) - if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then + if [[ "${RUN_PARALLEL:-false}" != "true" ]]; then + echo "Check if a timeline present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" + ) + timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) + fi + if [[ -z "${timeline_id:-}" || "${timeline_id:-}" = null ]]; then generate_id timeline_id PARAMS=( -sbf diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 2519b75c7f..19c3bc74e2 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -142,7 +142,7 @@ services: - "storage_broker" - "--listen-addr=0.0.0.0:50051" - compute: + compute1: restart: always build: context: ./compute_wrapper/ @@ -152,6 +152,7 @@ services: - TAG=${COMPUTE_TAG:-${TAG:-latest}} - http_proxy=${http_proxy:-} - https_proxy=${https_proxy:-} + image: built-compute environment: - PG_VERSION=${PG_VERSION:-16} - TENANT_ID=${TENANT_ID:-} @@ -166,6 +167,11 @@ services: - 3080:3080 # http endpoints entrypoint: - "/shell/compute.sh" + # Ad an alias for compute1 for compatibility + networks: + default: + aliases: + - compute depends_on: - safekeeper1 - safekeeper2 @@ -174,15 +180,20 @@ services: compute_is_ready: image: postgres:latest + environment: + - PARALLEL_COMPUTES=1 entrypoint: - - "/bin/bash" + - "/bin/sh" - "-c" command: - - "until pg_isready -h compute -p 55433 -U cloud_admin ; do - echo 'Waiting to start compute...' && sleep 1; - done" + - "for i in $(seq 1 $${PARALLEL_COMPUTES}); do + until pg_isready -h compute$$i -p 55433 -U cloud_admin ; do + sleep 1; + done; + done; + echo All computes are started" depends_on: - - compute + - compute1 neon-test-extensions: profiles: ["test-extensions"] @@ -196,4 +207,4 @@ services: command: - sleep 3600 depends_on: - - compute + - compute1 diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 6edf90ca8d..063b8dee85 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # A basic test to ensure Docker images are built correctly. # Build a wrapper around the compute, start all services and runs a simple SQL query. @@ -13,9 +13,36 @@ # set -eux -o pipefail +cd "$(dirname "${0}")" export COMPOSE_FILE='docker-compose.yml' export COMPOSE_PROFILES=test-extensions -cd "$(dirname "${0}")" +export PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1} +READY_MESSAGE="All computes are started" +COMPUTES=() +for i in $(seq 1 "${PARALLEL_COMPUTES}"); do + COMPUTES+=("compute${i}") +done +CURRENT_TMPDIR=$(mktemp -d) +trap 'rm -rf ${CURRENT_TMPDIR} docker-compose-parallel.yml' EXIT +if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then + export COMPOSE_FILE=docker-compose-parallel.yml + cp docker-compose.yml docker-compose-parallel.yml + # Replace the environment variable PARALLEL_COMPUTES with the actual value + yq eval -i ".services.compute_is_ready.environment |= map(select(. | test(\"^PARALLEL_COMPUTES=\") | not)) + [\"PARALLEL_COMPUTES=${PARALLEL_COMPUTES}\"]" ${COMPOSE_FILE} + for i in $(seq 2 "${PARALLEL_COMPUTES}"); do + # Duplicate compute1 as compute${i} for parallel execution + yq eval -i ".services.compute${i} = .services.compute1" ${COMPOSE_FILE} + # We don't need these sections, so delete them + yq eval -i "(del .services.compute${i}.build) | (del .services.compute${i}.ports) | (del .services.compute${i}.networks)" ${COMPOSE_FILE} + # Let the compute 1 be the only dependence + yq eval -i ".services.compute${i}.depends_on = [\"compute1\"]" ${COMPOSE_FILE} + # Set RUN_PARALLEL=true for compute2. They will generate tenant_id and timeline_id to avoid using the same as other computes + yq eval -i ".services.compute${i}.environment += [\"RUN_PARALLEL=true\"]" ${COMPOSE_FILE} + # Remove TENANT_ID and TIMELINE_ID from the environment variables of the generated computes + # They will create new TENANT_ID and TIMELINE_ID anyway. + yq eval -i ".services.compute${i}.environment |= map(select(. | (test(\"^TENANT_ID=\") or test(\"^TIMELINE_ID=\")) | not))" ${COMPOSE_FILE} + done +fi PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" function cleanup() { @@ -27,11 +54,11 @@ function cleanup() { for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do pg_version=${pg_version/v/} - echo "clean up containers if exists" + echo "clean up containers if exist" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull --build -d - + PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose build compute1 + PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 while sleep 3; do @@ -41,45 +68,50 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "timeout before the compute is ready." exit 1 fi - if docker compose logs "compute_is_ready" | grep -q "accepting connections"; then + if docker compose logs compute_is_ready | grep -q "${READY_MESSAGE}"; then echo "OK. The compute is ready to connect." echo "execute simple queries." - docker compose exec compute /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'" + for compute in "${COMPUTES[@]}"; do + docker compose exec "${compute}" /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'" + done break fi done if [[ ${pg_version} -ge 16 ]]; then - # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail - # It cannot be moved to Dockerfile now because the database directory is created after the start of the container - echo Adding dummy config - docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf - # Prepare for the PostGIS test - docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp - TMPDIR=$(mktemp -d) - docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}" - docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}" - docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install - docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test - docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress - rm -rf "${TMPDIR}" - # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment - TMPDIR=$(mktemp -d) - docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data" - docker compose cp "${TMPDIR}/data" compute:/ext-src/pg_hint_plan-src/ - rm -rf "${TMPDIR}" - # The following block does the same for the contrib/file_fdw test - TMPDIR=$(mktemp -d) - docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${TMPDIR}/data" - docker compose cp "${TMPDIR}/data" compute:/postgres/contrib/file_fdw/data - rm -rf "${TMPDIR}" + mkdir "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src} + docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${CURRENT_TMPDIR}/postgis-src/test" + docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${CURRENT_TMPDIR}/postgis-src/00-regress-install" + docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${CURRENT_TMPDIR}/pg_hint_plan-src/data" + docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${CURRENT_TMPDIR}/file_fdw/data" + + for compute in "${COMPUTES[@]}"; do + # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail + # It cannot be moved to Dockerfile now because the database directory is created after the start of the container + echo Adding dummy config on "${compute}" + docker compose exec "${compute}" touch /var/db/postgres/compute/compute_ctl_temp_override.conf + # Prepare for the PostGIS test + docker compose exec "${compute}" mkdir -p /tmp/pgis_reg/pgis_reg_tmp /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install + docker compose cp "${CURRENT_TMPDIR}/postgis-src/test" "${compute}":/ext-src/postgis-src/raster/test + docker compose cp "${CURRENT_TMPDIR}/postgis-src/00-regress-install" "${compute}":/ext-src/postgis-src/regress + # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment + docker compose cp "${CURRENT_TMPDIR}/pg_hint_plan-src/data" "${compute}":/ext-src/pg_hint_plan-src/ + # The following block does the same for the contrib/file_fdw test + docker compose cp "${CURRENT_TMPDIR}/file_fdw/data" "${compute}":/postgres/contrib/file_fdw/data + done # Apply patches docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch" # We are running tests now rm -f testout.txt testout_contrib.txt + # We want to run the longest tests first to better utilize parallelization and reduce overall test time. + # Tests listed in the RUN_FIRST variable will be run before others. + # If parallelization is not used, this environment variable will be ignored. + docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \ + -e RUN_FIRST=hll-src,postgis-src,pgtap-src -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \ neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ + -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \ neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then CONTRIB_FAILED= diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 930402ce66..b37b9363fa 100755 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -x if [[ -v BENCHMARK_CONNSTR ]]; then @@ -26,8 +26,9 @@ if [[ -v BENCHMARK_CONNSTR ]]; then fi fi REGULAR_USER=false -while getopts r arg; do - case $arg in +PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1} +while getopts pr arg; do + case ${arg} in r) REGULAR_USER=true shift $((OPTIND-1)) @@ -41,26 +42,49 @@ extdir=${1} cd "${extdir}" || exit 2 FAILED= -LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) -for d in ${LIST}; do - [ -d "${d}" ] || continue - if ! psql -w -c "select 1" >/dev/null; then - FAILED="${d} ${FAILED}" - break - fi - if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then - "${d}/regular-test.sh" || FAILED="${d} ${FAILED}" - continue - fi +export FAILED_FILE=/tmp/failed +rm -f ${FAILED_FILE} +mapfile -t LIST < <( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) +if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then + # Avoid errors if RUN_FIRST is not defined + RUN_FIRST=${RUN_FIRST:-} + # Move entries listed in the RUN_FIRST variable to the beginning + ORDERED_LIST=$(printf "%s\n" "${LIST[@]}" | grep -x -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"); printf "%s\n" "${LIST[@]}" | grep -vx -Ff <(echo -e "${RUN_FIRST//,/$'\n'}")) + parallel -j"${PARALLEL_COMPUTES}" "[[ -d {} ]] || exit 0 + export PGHOST=compute{%} + if ! psql -c 'select 1'>/dev/null; then + exit 1 + fi + echo Running on \${PGHOST} + if [[ -f ${extdir}/{}/neon-test.sh ]]; then + echo Running from script + ${extdir}/{}/neon-test.sh || echo {} >> ${FAILED_FILE}; + else + echo Running using make; + USE_PGXS=1 make -C {} installcheck || echo {} >> ${FAILED_FILE}; + fi" ::: ${ORDERED_LIST} + [[ ! -f ${FAILED_FILE} ]] && exit 0 +else + for d in "${LIST[@]}"; do + [ -d "${d}" ] || continue + if ! psql -w -c "select 1" >/dev/null; then + FAILED="${d} ${FAILED}" + break + fi + if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then + "${d}/regular-test.sh" || FAILED="${d} ${FAILED}" + continue + fi - if [ -f "${d}/neon-test.sh" ]; then - "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" - else - USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" - fi -done -[ -z "${FAILED}" ] && exit 0 -for d in ${FAILED}; do + if [ -f "${d}/neon-test.sh" ]; then + "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" + else + USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" + fi + done + [[ -z ${FAILED} ]] && exit 0 +fi +for d in ${FAILED} $([[ ! -f ${FAILED_FILE} ]] || cat ${FAILED_FILE}); do cat "$(find $d -name regression.diffs)" done for postgis_diff in /tmp/pgis_reg/*_diff; do @@ -68,4 +92,5 @@ for postgis_diff in /tmp/pgis_reg/*_diff; do cat "${postgis_diff}" done echo "${FAILED}" +cat ${FAILED_FILE} exit 1 diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index f1cf17f531..1d39fc029e 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -eux -o pipefail cd "$(dirname "${0}")" # Takes a variable name as argument. The result is stored in that variable. @@ -60,8 +60,8 @@ function check_timeline() { # Restarts the compute node with the required compute tag and timeline. # Accepts the tag for the compute node and the timeline as parameters. function restart_compute() { - docker compose down compute compute_is_ready - COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready + docker compose down compute1 compute_is_ready + COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute1 compute_is_ready wait_for_ready check_timeline ${2} }