From 62978433176ca6a9679baea769aa751c48fa037d Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 6 Feb 2024 12:49:41 +0000
Subject: [PATCH] tests: flakiness fixes in pageserver tests (#6632)

Fix several test flakes:

- test_sharding_service_smoke had log failures on "Dropped remote consistent
  LSN updates"
- test_emergency_mode had log failures on a deletion queue shutdown check,
  where the check was incorrect because it expected the channel receiver to
  stay alive after the cancellation token had fired.
- test_secondary_mode_eviction had racing heatmap uploads, because the test
  was using a live migration hook to set up locations: that migration was
  itself uploading heatmaps and generally making the situation more complex
  than it needed to be.

These are the failure modes that I saw when spot-checking the last few
failures of each test. This will mostly/completely address #6511, but I'll
leave that ticket open for a couple of days and then check whether either of
the tests named in that ticket is still flaky.

Related #6511
---
 pageserver/src/deletion_queue.rs              |  6 ++--
 test_runner/fixtures/neon_fixtures.py         |  3 +-
 .../regress/test_disk_usage_eviction.py       | 30 ++++++++++---------
 test_runner/regress/test_sharding_service.py  |  5 ++++
 4 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 6a820e1bdc..da1da9331a 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -700,8 +700,6 @@ impl DeletionQueue {
     }
 
     pub async fn shutdown(&mut self, timeout: Duration) {
-        self.cancel.cancel();
-
         match tokio::time::timeout(timeout, self.client.flush()).await {
             Ok(Ok(())) => {
                 tracing::info!("Deletion queue flushed successfully on shutdown")
@@ -715,6 +713,10 @@ impl DeletionQueue {
                 tracing::warn!("Timed out flushing deletion queue on shutdown")
             }
         }
+
+        // We only cancel _after_ flushing: otherwise we would be shutting down the
+        // components that do the flush.
+        self.cancel.cancel();
     }
 }
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5ce2fca820..bf7c6ccc14 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1162,7 +1162,8 @@ class NeonEnv:
         to the attachment service.
""" meta = self.attachment_service.inspect(tenant_id) - assert meta is not None, f"{tenant_id} attachment location not found" + if meta is None: + return None pageserver_id = meta[1] return self.get_pageserver(pageserver_id) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index dcbf8a5025..061c57c88b 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -194,8 +194,10 @@ class EvictionEnv: # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction for tenant_id, timeline_id in self.timelines: - pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client() - pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id) + tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) + # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test + if tenant_ps is not None: + tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id) def statvfs_called(): assert pageserver.log_contains(".*running mocked statvfs.*") @@ -864,18 +866,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): # Set up a situation where one pageserver _only_ has secondary locations on it, # so that when we release space we are sure it is via secondary locations. - - log.info("Setting up secondary location...") - ps_attached = env.neon_env.pageservers[0] + log.info("Setting up secondary locations...") ps_secondary = env.neon_env.pageservers[1] for tenant_id in tenant_ids: - # Migrate all attached tenants to the same pageserver, so that all the secondaries - # will run on the other pageserver. This is necessary because when we create tenants, - # they are spread over pageservers by default. 
-        env.neon_env.attachment_service.tenant_shard_migrate(
-            TenantShardId(tenant_id, 0, 0), ps_attached.id
-        )
+        # Find where the tenant is attached
+        pageserver = env.neon_env.get_tenant_pageserver(tenant_id)
+        pageserver.http_client().tenant_heatmap_upload(tenant_id)
+        # Detach it
+        pageserver.tenant_detach(tenant_id)
+
+        # Create a secondary mode location for the tenant: all tenants get their secondary on one
+        # pageserver that holds only secondary locations, where we will exercise disk usage eviction
         ps_secondary.tenant_location_configure(
             tenant_id,
             {
@@ -887,8 +889,8 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
         readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
         log.info(f"Read back conf: {readback_conf}")
 
-        # Request secondary location to download all layers that the attached location has
-        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+        # Request the secondary location to download all layers that the attached location
+        # indicated in its heatmap
         ps_secondary.http_client().tenant_secondary_download(tenant_id)
 
     # Configure the secondary pageserver to have a phony small disk size
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 5c70378ab0..ee57fcb2cf 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -35,6 +35,11 @@ def test_sharding_service_smoke(
     neon_env_builder.num_pageservers = 3
     env = neon_env_builder.init_configs()
 
+    for pageserver in env.pageservers:
+        # This test detaches tenants during migration, which can race with deletion queue
+        # operations: during detach we only do an advisory flush, and we don't wait for it.
+        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])
+
     # Start services by hand so that we can skip a pageserver (this will start + register later)
     env.broker.try_start()
     env.attachment_service.start()
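
Note for reviewers, outside the patch itself: the deletion_queue.rs hunk reorders shutdown so the
queue is flushed before the cancellation token fires, because the worker that services flush
requests exits as soon as the token is cancelled. Below is a minimal, self-contained sketch of
that ordering, not the pageserver code: it assumes tokio (with the "full" feature set) and
tokio-util as dependencies, and the QueueClient/worker names are illustrative only.

// Sketch: why shutdown must flush before cancelling. Cancelling first stops the
// worker, so a later flush would wait on a oneshot whose sender was dropped.
use std::time::Duration;
use tokio::sync::{mpsc, oneshot};
use tokio_util::sync::CancellationToken;

struct QueueClient {
    tx: mpsc::Sender<oneshot::Sender<()>>,
}

impl QueueClient {
    // Ask the worker to acknowledge a flush; fails if the worker is already gone.
    async fn flush(&self) -> Result<(), &'static str> {
        let (done_tx, done_rx) = oneshot::channel();
        self.tx.send(done_tx).await.map_err(|_| "worker has shut down")?;
        done_rx.await.map_err(|_| "worker dropped the flush request")
    }
}

// Background worker: acknowledges flush requests until the token is cancelled.
async fn worker(mut rx: mpsc::Receiver<oneshot::Sender<()>>, cancel: CancellationToken) {
    loop {
        tokio::select! {
            _ = cancel.cancelled() => break,
            maybe_req = rx.recv() => match maybe_req {
                Some(done) => { let _ = done.send(()); }
                None => break,
            },
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::channel(16);
    let cancel = CancellationToken::new();
    let client = QueueClient { tx };
    let worker_task = tokio::spawn(worker(rx, cancel.clone()));

    // Correct order: flush first, while the worker is still alive to acknowledge it...
    match tokio::time::timeout(Duration::from_secs(5), client.flush()).await {
        Ok(Ok(())) => println!("flushed successfully on shutdown"),
        Ok(Err(e)) => println!("flush failed on shutdown: {e}"),
        Err(_) => println!("timed out flushing on shutdown"),
    }

    // ...and only cancel afterwards, letting the worker exit cleanly.
    cancel.cancel();
    let _ = worker_task.await;
}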