tests: flakiness fixes in pageserver tests (#6632)

Fix several test flakes: - test_sharding_service_smoke had log failures on "Dropped LSN updates" - test_emergency_mode had log failures on a deletion queue shutdown check, where the check was incorrect because it was expecting channel receiver to stay alive after cancellation token was fired. - test_secondary_mode_eviction had racing heatmap uploads because the test was using a live migration hook to set up locations, where that migration was itself uploading heatmaps and generally making the situation more complex than it needed to be. These are the failure modes that I saw when spot checking the last few failures of each test. This will mostly/completely address #6511, but I'll leave that ticket open for a couple days and then check if either of the tests named in that ticket are flaky. Related #6511
2026-01-10 06:52:55 +00:00 · 2024-02-06 12:49:41 +00:00
parent dae56ef60c
commit 6297843317
4 changed files with 27 additions and 17 deletions
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -700,8 +700,6 @@ impl DeletionQueue {
    }

    pub async fn shutdown(&mut self, timeout: Duration) {
-        self.cancel.cancel();
-
        match tokio::time::timeout(timeout, self.client.flush()).await {
            Ok(Ok(())) => {
                tracing::info!("Deletion queue flushed successfully on shutdown")
@@ -715,6 +713,10 @@ impl DeletionQueue {
                tracing::warn!("Timed out flushing deletion queue on shutdown")
            }
        }
+
+        // We only cancel _after_ flushing: otherwise we would be shutting down the
+        // components that do the flush.
+        self.cancel.cancel();
    }
 }

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1162,7 +1162,8 @@ class NeonEnv:
        to the attachment service.
        """
        meta = self.attachment_service.inspect(tenant_id)
-        assert meta is not None, f"{tenant_id} attachment location not found"
+        if meta is None:
+            return None
        pageserver_id = meta[1]
        return self.get_pageserver(pageserver_id)

--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -17,7 +17,7 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_upload_queue_empty
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until

 GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
@@ -194,8 +194,10 @@ class EvictionEnv:

        # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
        for tenant_id, timeline_id in self.timelines:
-            pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
-            pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)
+            tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
+            # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test
+            if tenant_ps is not None:
+                tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)

        def statvfs_called():
            assert pageserver.log_contains(".*running mocked statvfs.*")
@@ -864,18 +866,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):

    # Set up a situation where one pageserver _only_ has secondary locations on it,
    # so that when we release space we are sure it is via secondary locations.
-
-    log.info("Setting up secondary location...")
-    ps_attached = env.neon_env.pageservers[0]
+    log.info("Setting up secondary locations...")
    ps_secondary = env.neon_env.pageservers[1]
    for tenant_id in tenant_ids:
-        # Migrate all attached tenants to the same pageserver, so that all the secondaries
-        # will run on the other pageserver.  This is necessary because when we create tenants,
-        # they are spread over pageservers by default.
-        env.neon_env.attachment_service.tenant_shard_migrate(
-            TenantShardId(tenant_id, 0, 0), ps_attached.id
-        )
+        # Find where it is attached
+        pageserver = env.neon_env.get_tenant_pageserver(tenant_id)
+        pageserver.http_client().tenant_heatmap_upload(tenant_id)

+        # Detach it
+        pageserver.tenant_detach(tenant_id)
+
+        # Create a secondary mode location for the tenant, all tenants on one pageserver that will only
+        # contain secondary locations: this is the one where we will exercise disk usage eviction
        ps_secondary.tenant_location_configure(
            tenant_id,
            {
@@ -887,8 +889,8 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
        readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
        log.info(f"Read back conf: {readback_conf}")

-        # Request secondary location to download all layers that the attached location has
-        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+        # Request secondary location to download all layers that the attached location indicated
+        # in its heatmap
        ps_secondary.http_client().tenant_secondary_download(tenant_id)

    # Configure the secondary pageserver to have a phony small disk size
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -35,6 +35,11 @@ def test_sharding_service_smoke(
    neon_env_builder.num_pageservers = 3
    env = neon_env_builder.init_configs()

+    for pageserver in env.pageservers:
+        # This test detaches tenants during migration, which can race with deletion queue operations,
+        # during detach we only do an advisory flush, we don't wait for it.
+        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])
+
    # Start services by hand so that we can skip a pageserver (this will start + register later)
    env.broker.try_start()
    env.attachment_service.start()