From 2ba414525e7605c8570eadaccef576321d601571 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 13 Jun 2024 22:56:01 +0300 Subject: [PATCH 1/3] Install rust binaries before running rust tests. cargo test (or nextest) might rebuild the binaries with different features/flags, so do install immediately after the build. Triggered by the particular case of nextest invocations missing $CARGO_FEATURES, which recompiled safekeeper without 'testing' feature which made python tests needing it (failpoints) not run in the CI. Also add CARGO_FEATURES to the nextest runs anyway because there doesn't seem to be an important reason not to. --- .github/workflows/build_and_test.yml | 56 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 703fc8d145..bd2996ec4c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -337,34 +337,8 @@ jobs: run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - name: Run rust tests - env: - NEXTEST_RETRIES: 3 - run: | - #nextest does not yet support running doctests - cargo test --doc $CARGO_FLAGS $CARGO_FEATURES - - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES - done - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)' - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)' - + # Do install *before* running rust tests because they might recompile the + # binaries with different features/flags. 
- name: Install rust binaries run: | # Install target binaries @@ -405,6 +379,32 @@ jobs: done fi + - name: Run rust tests + env: + NEXTEST_RETRIES: 3 + run: | + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests + export REMOTE_STORAGE_S3_REGION=eu-central-1 + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + - name: Install postgres binaries run: cp -a pg_install /tmp/neon/pg_install From 16d80128eea32b0f2fd1051c90e93e8e0d537381 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 17 Jun 2024 11:40:35 +0100 Subject: [PATCH 2/3] storcon: handle entire cluster going unavailable correctly (#8060) ## Problem A period of unavailability for all pageservers in a cluster produced the following fallout in staging: all tenants became detached and required manual operation to re-attach. Manually restarting the storage controller re-attached all tenants due to a consistency bug. Turns out there are two related bugs which caused the issue: 1. Pageserver re-attach can be processed before the first heartbeat. Hence, when handling the availability delta produced by the heartbeater, `Node::get_availability_transition` claims that there's no need to reconfigure the node. 2. We would still attempt to reschedule tenant shards when handling offline transitions even if the entire cluster is down. This puts tenant shards into a state where the reconciler believes they have to be detached (no pageserver shows up in their intent state). This is doubly wrong because we don't mark the tenant shards as detached in the database, thus causing memory vs database consistency issues. Luckily, this bug allowed all tenant shards to re-attach after restart. ## Summary of changes * For (1), abuse the fact that re-attach requests do not contain an utilisation score and use that to differentiate from a node that replied to heartbeats. * For (2), introduce a special case that skips any rescheduling if the entire cluster is unavailable. * Update the storage controller heartbeat test with an extra scenario where the entire cluster goes for lunch. 
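As a rough illustration of point (1), the sketch below shows how a node that has only re-attached (and therefore reported no utilisation) can be told apart from one that has answered a heartbeat, by treating the worst possible utilisation score as a sentinel. The types here are simplified stand-ins rather than the storage controller's actual `Node`/`NodeAvailability` definitions; only the guarded match arm is meant to mirror the change in `node.rs` below.

```rust
// Simplified stand-ins; only the comparison logic mirrors the patch.
#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct UtilizationScore(u64);

impl UtilizationScore {
    // A node that re-attached but has not yet answered a heartbeat is
    // recorded with the worst possible score, which acts as a sentinel.
    fn worst() -> Self {
        UtilizationScore(u64::MAX)
    }
}

#[derive(Clone, Copy)]
enum NodeAvailability {
    Active(UtilizationScore),
    Offline,
}

enum AvailabilityTransition {
    ToActive,
    ToOffline,
    Unchanged,
}

fn transition(current: NodeAvailability, next: NodeAvailability) -> AvailabilityTransition {
    use AvailabilityTransition::*;
    use NodeAvailability::*;
    match (current, next) {
        (Offline, Active(_)) => ToActive,
        (Active(_), Offline) => ToOffline,
        // Re-attach recorded the sentinel score; the first real heartbeat
        // carries a better one, so treat it as a transition to active and
        // reconcile shards with unknown observed locations.
        (Active(lhs), Active(rhs))
            if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() =>
        {
            ToActive
        }
        _ => Unchanged,
    }
}

fn main() {
    let after_reattach = NodeAvailability::Active(UtilizationScore::worst());
    let after_heartbeat = NodeAvailability::Active(UtilizationScore(10));
    assert!(matches!(
        transition(after_reattach, after_heartbeat),
        AvailabilityTransition::ToActive
    ));
    println!("re-attach followed by a heartbeat is treated as a transition to active");
}
```

Using the sentinel score avoids adding a separate "has answered a heartbeat yet" flag to the node state.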
Fixes https://github.com/neondatabase/neon/issues/8044 --- storage_controller/src/heartbeater.rs | 4 + storage_controller/src/node.rs | 12 ++- storage_controller/src/service.rs | 88 +++++++++++++---- .../regress/test_storage_controller.py | 94 ++++++++++++------- 4 files changed, 141 insertions(+), 57 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 1ef97e78eb..14cda0a289 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -31,6 +31,7 @@ pub(crate) enum PageserverState { Available { last_seen_at: Instant, utilization: PageserverUtilization, + new: bool, }, Offline, } @@ -127,6 +128,7 @@ impl HeartbeaterTask { heartbeat_futs.push({ let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); + let new_node = !self.state.contains_key(node_id); // Clone the node and mark it as available such that the request // goes through to the pageserver even when the node is marked offline. @@ -159,6 +161,7 @@ impl HeartbeaterTask { PageserverState::Available { last_seen_at: Instant::now(), utilization, + new: new_node, } } else { PageserverState::Offline @@ -220,6 +223,7 @@ impl HeartbeaterTask { } }, Vacant(_) => { + // This is a new node. Don't generate a delta for it. deltas.push((node_id, ps_state.clone())); } } diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 7b5513c908..34dcf0c642 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, + TenantLocateResponseShard, UtilizationScore, }, shard::TenantShardId, }; @@ -116,6 +116,16 @@ impl Node { match (self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, + // Consider the case when the storage controller handles the re-attach of a node + // before the heartbeats detect that the node is back online. We still need + // [`Service::node_configure`] to attempt reconciliations for shards with an + // unknown observed location. + // The unsavoury match arm below handles this situation. + (Active(lhs), Active(rhs)) + if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() => + { + ToActive + } _ => Unchanged, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cf6a95bf0b..926332f946 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -12,7 +12,7 @@ use crate::{ id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, - scheduler::{ScheduleContext, ScheduleMode}, + scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction, }, @@ -747,29 +747,61 @@ impl Service { let res = self.heartbeater.heartbeat(nodes).await; if let Ok(deltas) = res { for (node_id, state) in deltas.0 { - let new_availability = match state { - PageserverState::Available { utilization, .. } => NodeAvailability::Active( - UtilizationScore(utilization.utilization_score), + let (new_node, new_availability) = match state { + PageserverState::Available { + utilization, new, .. 
+ } => ( + new, + NodeAvailability::Active(UtilizationScore( + utilization.utilization_score, + )), ), - PageserverState::Offline => NodeAvailability::Offline, + PageserverState::Offline => (false, NodeAvailability::Offline), }; - let res = self - .node_configure(node_id, Some(new_availability), None) - .await; - match res { - Ok(()) => {} - Err(ApiError::NotFound(_)) => { - // This should be rare, but legitimate since the heartbeats are done - // on a snapshot of the nodes. - tracing::info!("Node {} was not found after heartbeat round", node_id); + if new_node { + // When the heartbeats detect a newly added node, we don't wish + // to attempt to reconcile the shards assigned to it. The node + // is likely handling it's re-attach response, so reconciling now + // would be counterproductive. + // + // Instead, update the in-memory state with the details learned about the + // node. + let mut locked = self.inner.write().unwrap(); + let (nodes, _tenants, scheduler) = locked.parts_mut(); + + let mut new_nodes = (**nodes).clone(); + + if let Some(node) = new_nodes.get_mut(&node_id) { + node.set_availability(new_availability); + scheduler.node_upsert(node); } - Err(err) => { - tracing::error!( - "Failed to update node {} after heartbeat round: {}", - node_id, - err - ); + + locked.nodes = Arc::new(new_nodes); + } else { + // This is the code path for geniune availability transitions (i.e node + // goes unavailable and/or comes back online). + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; + + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!( + "Node {} was not found after heartbeat round", + node_id + ); + } + Err(err) => { + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); + } } } } @@ -4316,6 +4348,16 @@ impl Service { continue; } + if !new_nodes + .values() + .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) + { + // Special case for when all nodes are unavailable and/or unschedulable: there is no point + // trying to reschedule since there's nowhere else to go. Without this + // branch we incorrectly detach tenants in response to node unavailability. + continue; + } + if tenant_shard.intent.demote_attached(scheduler, node_id) { tenant_shard.sequence = tenant_shard.sequence.next(); @@ -4353,6 +4395,12 @@ impl Service { // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. for tenant_shard in locked.tenants.values_mut() { + // If a reconciliation is already in progress, rely on the previous scheduling + // decision and skip triggering a new reconciliation. 
+ if tenant_shard.reconciler.is_some() { + continue; + } + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { self.maybe_reconcile_shard(tenant_shard, &new_nodes); diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index f41468210c..8624a45f45 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -934,19 +934,27 @@ class Failure: def clear(self, env: NeonEnv): raise NotImplementedError() + def nodes(self): + raise NotImplementedError() + class NodeStop(Failure): - def __init__(self, pageserver_id, immediate): - self.pageserver_id = pageserver_id + def __init__(self, pageserver_ids, immediate): + self.pageserver_ids = pageserver_ids self.immediate = immediate def apply(self, env: NeonEnv): - pageserver = env.get_pageserver(self.pageserver_id) - pageserver.stop(immediate=self.immediate) + for ps_id in self.pageserver_ids: + pageserver = env.get_pageserver(ps_id) + pageserver.stop(immediate=self.immediate) def clear(self, env: NeonEnv): - pageserver = env.get_pageserver(self.pageserver_id) - pageserver.start() + for ps_id in self.pageserver_ids: + pageserver = env.get_pageserver(ps_id) + pageserver.start() + + def nodes(self): + return self.pageserver_ids class PageserverFailpoint(Failure): @@ -962,6 +970,9 @@ class PageserverFailpoint(Failure): pageserver = env.get_pageserver(self.pageserver_id) pageserver.http_client().configure_failpoints((self.failpoint, "off")) + def nodes(self): + return [self.pageserver_id] + def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: tenants = env.storage_controller.tenant_list() @@ -985,8 +996,9 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: @pytest.mark.parametrize( "failure", [ - NodeStop(pageserver_id=1, immediate=False), - NodeStop(pageserver_id=1, immediate=True), + NodeStop(pageserver_ids=[1], immediate=False), + NodeStop(pageserver_ids=[1], immediate=True), + NodeStop(pageserver_ids=[1, 2], immediate=True), PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), ], ) @@ -1039,33 +1051,50 @@ def test_storage_controller_heartbeats( wait_until(10, 1, tenants_placed) # ... then we apply the failure - offline_node_id = failure.pageserver_id - online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop() - env.get_pageserver(offline_node_id).allowed_errors.append( - # In the case of the failpoint failure, the impacted pageserver - # still believes it has the tenant attached since location - # config calls into it will fail due to being marked offline. - ".*Dropped remote consistent LSN updates.*", - ) + offline_node_ids = set(failure.nodes()) + online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids + + for node_id in offline_node_ids: + env.get_pageserver(node_id).allowed_errors.append( + # In the case of the failpoint failure, the impacted pageserver + # still believes it has the tenant attached since location + # config calls into it will fail due to being marked offline. + ".*Dropped remote consistent LSN updates.*", + ) + + if len(offline_node_ids) > 1: + env.get_pageserver(node_id).allowed_errors.append( + ".*Scheduling error when marking pageserver.*offline.*", + ) failure.apply(env) # ... 
expecting the heartbeats to mark it offline - def node_offline(): + def nodes_offline(): nodes = env.storage_controller.node_list() log.info(f"{nodes=}") - target = next(n for n in nodes if n["id"] == offline_node_id) - assert target["availability"] == "Offline" + for node in nodes: + if node["id"] in offline_node_ids: + assert node["availability"] == "Offline" # A node is considered offline if the last successful heartbeat # was more than 10 seconds ago (hardcoded in the storage controller). - wait_until(20, 1, node_offline) + wait_until(20, 1, nodes_offline) # .. expecting the tenant on the offline node to be migrated def tenant_migrated(): + if len(online_node_ids) == 0: + time.sleep(5) + return + node_to_tenants = build_node_to_tenants_map(env) log.info(f"{node_to_tenants=}") - assert set(node_to_tenants[online_node_id]) == set(tenant_ids) + + observed_tenants = set() + for node_id in online_node_ids: + observed_tenants |= set(node_to_tenants[node_id]) + + assert observed_tenants == set(tenant_ids) wait_until(10, 1, tenant_migrated) @@ -1073,31 +1102,24 @@ def test_storage_controller_heartbeats( failure.clear(env) # ... expecting the offline node to become active again - def node_online(): + def nodes_online(): nodes = env.storage_controller.node_list() - target = next(n for n in nodes if n["id"] == offline_node_id) - assert target["availability"] == "Active" + for node in nodes: + if node["id"] in online_node_ids: + assert node["availability"] == "Active" - wait_until(10, 1, node_online) + wait_until(10, 1, nodes_online) time.sleep(5) - # ... then we create a new tenant - tid = TenantId.generate() - env.storage_controller.tenant_create(tid) - - # ... expecting it to be placed on the node that just came back online - tenants = env.storage_controller.tenant_list() - newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid)) - locations = list(newest_tenant["observed"]["locations"].keys()) - locations = [int(node_id) for node_id in locations] - assert locations == [offline_node_id] + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"Back online: {node_to_tenants=}") # ... expecting the storage controller to reach a consistent state def storage_controller_consistent(): env.storage_controller.consistency_check() - wait_until(10, 1, storage_controller_consistent) + wait_until(30, 1, storage_controller_consistent) def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): From b6e1c09c733a699cbaf76ff8508f9552eec1db7f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 17 Jun 2024 12:47:20 +0100 Subject: [PATCH 3/3] CI(check-build-tools-image): change build-tools image persistent tag (#8059) ## Problem We don't rebuild `build-tools` image for changes in a workflow that builds this image itself (`.github/workflows/build-build-tools-image.yml`) or in a workflow that determines which tag to use (`.github/workflows/check-build-tools-image.yml`) ## Summary of changes - Use a hash of `Dockerfile.build-tools` and workflow files as a persistent tag instead of using a commit sha. 
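For context, `hashFiles` folds the contents of the listed files into a single digest, so the resulting tag stays stable across commits until one of those files changes. The sketch below approximates that behaviour; it uses Rust's standard hasher purely for illustration (the real `hashFiles` expression produces SHA-256 digests, so the tags will not match), and the file paths are simply the ones named above.

```rust
use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};

// Files whose content should invalidate the cached build-tools image tag;
// these paths come from the commit message above.
const TAG_INPUTS: &[&str] = &[
    "Dockerfile.build-tools",
    ".github/workflows/check-build-tools-image.yml",
    ".github/workflows/build-build-tools-image.yml",
];

fn main() -> std::io::Result<()> {
    let mut hasher = DefaultHasher::new();
    for path in TAG_INPUTS {
        // Hash the path as well as the bytes so a rename also changes the tag.
        path.hash(&mut hasher);
        fs::read(path)?.hash(&mut hasher);
    }
    // Any change to the Dockerfile or the two workflows yields a new tag;
    // otherwise the tag is stable across commits, unlike a commit SHA.
    println!("image-tag={:016x}", hasher.finish());
    Ok(())
}
```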
--- .github/workflows/build-build-tools-image.yml | 1 - .github/workflows/check-build-tools-image.yml | 23 ++++++------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 2c994b08ae..6e90a80ab7 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -30,7 +30,6 @@ jobs: check-image: uses: ./.github/workflows/check-build-tools-image.yml - # This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions) build-image: needs: [ check-image ] if: needs.check-image.outputs.found == 'false' diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 97116940a0..807a9ef3bd 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -25,26 +25,17 @@ jobs: found: ${{ steps.check-image.outputs.found }} steps: + - uses: actions/checkout@v4 + - name: Get build-tools image tag for the current commit id: get-build-tools-tag env: - # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, - # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly. - COMMIT_SHA: ${{ github.sha }} - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_TAG: | + ${{ hashFiles('Dockerfile.build-tools', + '.github/workflows/check-build-tools-image.yml', + '.github/workflows/build-build-tools-image.yml') }} run: | - LAST_BUILD_TOOLS_SHA=$( - gh api \ - -H "Accept: application/vnd.github+json" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - --method GET \ - --field path=Dockerfile.build-tools \ - --field sha=${COMMIT_SHA} \ - --field per_page=1 \ - --jq ".[0].sha" \ - "/repos/${GITHUB_REPOSITORY}/commits" - ) - echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT + echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT - name: Check if such tag found in the registry id: check-image