From 62978433176ca6a9679baea769aa751c48fa037d Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 6 Feb 2024 12:49:41 +0000
Subject: [PATCH] tests: flakiness fixes in pageserver tests (#6632)

Fix several test flakes:

- test_sharding_service_smoke had log failures on "Dropped remote consistent
  LSN updates"
- test_emergency_mode had log failures on a deletion queue shutdown check,
  where the check was incorrect because it expected the channel receiver to
  stay alive after the cancellation token had fired.
- test_secondary_mode_eviction had racing heatmap uploads, because the test
  was using a live migration hook to set up locations: that migration was
  itself uploading heatmaps and generally making the situation more complex
  than it needed to be.

These are the failure modes that I saw when spot-checking the last few
failures of each test. This will mostly/completely address #6511, but I'll
leave that ticket open for a couple of days and then check whether either of
the tests named in that ticket is still flaky.

Related #6511
---
 pageserver/src/deletion_queue.rs              |  6 ++--
 test_runner/fixtures/neon_fixtures.py         |  3 +-
 .../regress/test_disk_usage_eviction.py       | 30 ++++++++++---------
 test_runner/regress/test_sharding_service.py  |  5 ++++
 4 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 6a820e1bdc..da1da9331a 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -700,8 +700,6 @@ impl DeletionQueue {
     }
 
     pub async fn shutdown(&mut self, timeout: Duration) {
-        self.cancel.cancel();
-
         match tokio::time::timeout(timeout, self.client.flush()).await {
             Ok(Ok(())) => {
                 tracing::info!("Deletion queue flushed successfully on shutdown")
@@ -715,6 +713,10 @@ impl DeletionQueue {
                 tracing::warn!("Timed out flushing deletion queue on shutdown")
             }
         }
+
+        // We only cancel _after_ flushing: otherwise we would be shutting down the
+        // components that do the flush.
+        self.cancel.cancel();
     }
 }
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5ce2fca820..bf7c6ccc14 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1162,7 +1162,8 @@ class NeonEnv:
         to the attachment service.
""" meta = self.attachment_service.inspect(tenant_id) - assert meta is not None, f"{tenant_id} attachment location not found" + if meta is None: + return None pageserver_id = meta[1] return self.get_pageserver(pageserver_id) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index dcbf8a5025..061c57c88b 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -194,8 +194,10 @@ class EvictionEnv: # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction for tenant_id, timeline_id in self.timelines: - pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client() - pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id) + tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) + # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test + if tenant_ps is not None: + tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id) def statvfs_called(): assert pageserver.log_contains(".*running mocked statvfs.*") @@ -864,18 +866,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): # Set up a situation where one pageserver _only_ has secondary locations on it, # so that when we release space we are sure it is via secondary locations. - - log.info("Setting up secondary location...") - ps_attached = env.neon_env.pageservers[0] + log.info("Setting up secondary locations...") ps_secondary = env.neon_env.pageservers[1] for tenant_id in tenant_ids: - # Migrate all attached tenants to the same pageserver, so that all the secondaries - # will run on the other pageserver. This is necessary because when we create tenants, - # they are spread over pageservers by default. 
-        env.neon_env.attachment_service.tenant_shard_migrate(
-            TenantShardId(tenant_id, 0, 0), ps_attached.id
-        )
+        # Find where the tenant is attached
+        pageserver = env.neon_env.get_tenant_pageserver(tenant_id)
+        pageserver.http_client().tenant_heatmap_upload(tenant_id)
+        # Detach it
+        pageserver.tenant_detach(tenant_id)
+
+        # Create a secondary mode location for the tenant: all tenants get their secondary on one
+        # pageserver that holds only secondary locations, where we will exercise disk usage eviction
         ps_secondary.tenant_location_configure(
             tenant_id,
             {
@@ -887,8 +889,8 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
         readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
         log.info(f"Read back conf: {readback_conf}")
 
-        # Request secondary location to download all layers that the attached location has
-        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+        # Request the secondary location to download all layers that the attached location
+        # indicated in its heatmap
         ps_secondary.http_client().tenant_secondary_download(tenant_id)
 
     # Configure the secondary pageserver to have a phony small disk size
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 5c70378ab0..ee57fcb2cf 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -35,6 +35,11 @@ def test_sharding_service_smoke(
     neon_env_builder.num_pageservers = 3
     env = neon_env_builder.init_configs()
 
+    for pageserver in env.pageservers:
+        # This test detaches tenants during migration, which can race with deletion queue
+        # operations: during detach we only do an advisory flush, and we don't wait for it.
+        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])
+
     # Start services by hand so that we can skip a pageserver (this will start + register later)
     env.broker.try_start()
     env.attachment_service.start()
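
Note for reviewers, outside the patch itself: the deletion_queue.rs hunk reorders shutdown so the
queue is flushed before the cancellation token fires, because the worker that services flush
requests exits as soon as the token is cancelled. Below is a minimal, self-contained sketch of
that ordering, not the pageserver code: it assumes tokio (with the "full" feature set) and
tokio-util as dependencies, and the QueueClient/worker names are illustrative only.

// Sketch: why shutdown must flush before cancelling. Cancelling first stops the
// worker, so a later flush would wait on a oneshot whose sender was dropped.
use std::time::Duration;
use tokio::sync::{mpsc, oneshot};
use tokio_util::sync::CancellationToken;

struct QueueClient {
    tx: mpsc::Sender<oneshot::Sender<()>>,
}

impl QueueClient {
    // Ask the worker to acknowledge a flush; fails if the worker is already gone.
    async fn flush(&self) -> Result<(), &'static str> {
        let (done_tx, done_rx) = oneshot::channel();
        self.tx.send(done_tx).await.map_err(|_| "worker has shut down")?;
        done_rx.await.map_err(|_| "worker dropped the flush request")
    }
}

// Background worker: acknowledges flush requests until the token is cancelled.
async fn worker(mut rx: mpsc::Receiver<oneshot::Sender<()>>, cancel: CancellationToken) {
    loop {
        tokio::select! {
            _ = cancel.cancelled() => break,
            maybe_req = rx.recv() => match maybe_req {
                Some(done) => { let _ = done.send(()); }
                None => break,
            },
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::channel(16);
    let cancel = CancellationToken::new();
    let client = QueueClient { tx };
    let worker_task = tokio::spawn(worker(rx, cancel.clone()));

    // Correct order: flush first, while the worker is still alive to acknowledge it...
    match tokio::time::timeout(Duration::from_secs(5), client.flush()).await {
        Ok(Ok(())) => println!("flushed successfully on shutdown"),
        Ok(Err(e)) => println!("flush failed on shutdown: {e}"),
        Err(_) => println!("timed out flushing on shutdown"),
    }

    // ...and only cancel afterwards, letting the worker exit cleanly.
    cancel.cancel();
    let _ = worker_task.await;
}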