From 968c20ca5f060eccc0930dc98ea358ab67df62c6 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 13 Jul 2022 21:22:44 +0300 Subject: [PATCH 1/5] Add zenith-1-ps-3 to prod inventory (#2084) --- .github/ansible/production.hosts | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ansible/production.hosts b/.github/ansible/production.hosts index 6a3a7791ad..d22ce0e37e 100644 --- a/.github/ansible/production.hosts +++ b/.github/ansible/production.hosts @@ -1,6 +1,7 @@ [pageservers] #zenith-1-ps-1 console_region_id=1 zenith-1-ps-2 console_region_id=1 +zenith-1-ps-3 console_region_id=1 [safekeepers] zenith-1-sk-1 console_region_id=1 From 9a7427c203f919891e3a8713a21672e3ed1da03b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 14 Jul 2022 02:15:43 +0300 Subject: [PATCH 2/5] Fill build-args for Docker builds via GH Actions context --- .github/workflows/build_and_test.yml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 857e9e3533..e4858c1fe9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -441,14 +441,14 @@ jobs: fi id: legacy-build-tag - - name: Build compute-tools Docker image + - name: Build neon Docker image uses: docker/build-push-action@v2 with: context: . build-args: | - GIT_VERSION="${GITHUB_SHA}" - AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" pull: true push: true tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}} @@ -508,8 +508,9 @@ jobs: with: context: . 
build-args: | - AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" push: false file: Dockerfile.compute-tools tags: neondatabase/compute-tools:local @@ -519,8 +520,9 @@ jobs: with: context: . build-args: | - AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" - AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" push: true file: Dockerfile.compute-tools tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}} From 12bac9c12bdcddfb73fec68fbe65a9013af3e588 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 13 Jul 2022 15:52:04 +0200 Subject: [PATCH 3/5] Wait for compute image before deploy in GitHub Action We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly. --- .github/workflows/build_and_test.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e4858c1fe9..75f5828ef4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -560,7 +560,11 @@ jobs: deploy: runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ docker-image, calculate-deploy-targets ] + # We need both storage **and** compute images for deploy, because control plane + # picks the compute version based on the storage version. If it notices a fresh + # storage it may bump the compute version. 
And if compute image failed to build + # it may break things badly. + needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -603,7 +607,9 @@ jobs: deploy-proxy: runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ docker-image, calculate-deploy-targets ] + # Compute image isn't strictly required for proxy deploy, but let's still wait for it + # to run all deploy jobs consistently. + needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' From 1b6a80a38f3961f95c8b96361367f9d827e39ae6 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 8 Jul 2022 20:18:24 +0300 Subject: [PATCH 4/5] Fix flaky test_concurrent_computes * Wait for all computes (except one) to complete before proceeding with the single compute. * It previously waited for too few seconds. As the test is randomized, it was not failing all the time, but only in specific unlucky cases. E.g. when there were no successful queries by concurrent computes, and the single node had big timeouts and spent lots of time making the transaction. See https://github.com/neondatabase/neon/runs/7234456482?check_suite_focus=true (around line 980). * Wait for exactly one extra transaction by the single compute. 
--- .../batch_others/test_wal_acceptor_async.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 4664c332fc..d74ef8840a 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -302,6 +302,8 @@ def test_compute_restarts(neon_env_builder: NeonEnvBuilder): class BackgroundCompute(object): + MAX_QUERY_GAP_SECONDS = 2 + def __init__(self, index: int, env: NeonEnv, branch: str): self.index = index self.env = env @@ -339,7 +341,7 @@ class BackgroundCompute(object): # With less sleep, there is a very big chance of not committing # anything or only 1 xact during test run. - await asyncio.sleep(2 * random.random()) + await asyncio.sleep(random.uniform(0, self.MAX_QUERY_GAP_SECONDS)) self.running = False @@ -356,20 +358,34 @@ async def run_concurrent_computes(env: NeonEnv, background_tasks = [asyncio.create_task(compute.run()) for compute in computes] await asyncio.sleep(run_seconds) + log.info("stopping all tasks but one") for compute in computes[1:]: compute.stopped = True + await asyncio.gather(*background_tasks[1:]) log.info("stopped all tasks but one") # work for some time with only one compute -- it should be able to make some xacts - await asyncio.sleep(8) + TIMEOUT_SECONDS = computes[0].MAX_QUERY_GAP_SECONDS + 3 + initial_queries_by_0 = len(computes[0].successful_queries) + log.info(f'Waiting for another query by computes[0], ' + f'it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s') + for _ in range(10 * TIMEOUT_SECONDS): + current_queries_by_0 = len(computes[0].successful_queries) - initial_queries_by_0 + if current_queries_by_0 >= 1: + log.info(f'Found {current_queries_by_0} successful queries ' + f'by computes[0], completing the test') + break + await asyncio.sleep(0.1) + else: + assert False, "Timed out while waiting for another 
query by computes[0]" computes[0].stopped = True - await asyncio.gather(*background_tasks) + await asyncio.gather(background_tasks[0]) result = await exec_compute_query(env, branch, 'SELECT * FROM query_log') # we should have inserted something while single compute was running - assert len(result) >= 4 - log.info(f'Executed {len(result)} queries') + log.info(f'Executed {len(result)} queries, {current_queries_by_0} of them ' + f'by computes[0] after we started stopping the others') for row in result: log.info(f'{row[0]} {row[1]} {row[2]}') From c004a6d62fc9b45c4ef6f8ed6ba1016101d4807d Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 14 Jul 2022 12:46:38 +0300 Subject: [PATCH 5/5] Do not cancel in-progress checks on the `main` branch See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#concurrency * Previously there was a single concurrency group per branch. As the `main` branch was pushed to frequently, very few commits got tested to the end. This resulted in a "broken" `main` branch as there were no fully successful workflow runs. Now the `main` branch gets a separate concurrency group for each commit. * As GitHub Actions syntax does not have the conditional operator, it is emulated via logical and/or operations. Although undocumented, they return one of their operands instead of plain true/false. * Replace 3-space indentation with 2-space indentation while we are here to be consistent with the rest of the file. 
--- .github/workflows/build_and_test.yml | 5 +++-- .github/workflows/codestyle.yml | 5 +++-- .github/workflows/pg_clients.yml | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 75f5828ef4..3a12d19428 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -11,8 +11,9 @@ defaults: shell: bash -ex {0} concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true env: RUST_BACKTRACE: 1 diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 2b8a01e94e..345c1d5397 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -11,8 +11,9 @@ defaults: shell: bash -ex {0} concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true env: RUST_BACKTRACE: 1 diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index fe4dbea8ac..4ff31ac508 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -13,8 +13,9 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true jobs: test-postgres-client-libs: