tests: flakiness fixes in pageserver tests (#6632)

Fix several test flakes:
- test_sharding_service_smoke had log failures on "Dropped LSN updates"
- test_emergency_mode had log failures on a deletion queue shutdown
check, where the check was incorrect because it was expecting channel
receiver to stay alive after cancellation token was fired.
- test_secondary_mode_eviction had racing heatmap uploads because the
test was using a live migration hook to set up locations, where that
migration was itself uploading heatmaps and generally making the
situation more complex than it needed to be.

These are the failure modes that I saw when spot checking the last few
failures of each test.

This will mostly/completely address #6511, but I'll leave that ticket
open for a couple days and then check if either of the tests named in
that ticket are flaky.

Related #6511
This commit is contained in:
John Spray
2024-02-06 12:49:41 +00:00
committed by GitHub
parent dae56ef60c
commit 6297843317
4 changed files with 27 additions and 17 deletions

View File

@@ -700,8 +700,6 @@ impl DeletionQueue {
}
pub async fn shutdown(&mut self, timeout: Duration) {
self.cancel.cancel();
match tokio::time::timeout(timeout, self.client.flush()).await {
Ok(Ok(())) => {
tracing::info!("Deletion queue flushed successfully on shutdown")
@@ -715,6 +713,10 @@ impl DeletionQueue {
tracing::warn!("Timed out flushing deletion queue on shutdown")
}
}
// We only cancel _after_ flushing: otherwise we would be shutting down the
// components that do the flush.
self.cancel.cancel();
}
}

View File

@@ -1162,7 +1162,8 @@ class NeonEnv:
to the attachment service.
"""
meta = self.attachment_service.inspect(tenant_id)
assert meta is not None, f"{tenant_id} attachment location not found"
if meta is None:
return None
pageserver_id = meta[1]
return self.get_pageserver(pageserver_id)

View File

@@ -17,7 +17,7 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
@@ -194,8 +194,10 @@ class EvictionEnv:
# we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
for tenant_id, timeline_id in self.timelines:
pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)
tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
# Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test
if tenant_ps is not None:
tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
def statvfs_called():
assert pageserver.log_contains(".*running mocked statvfs.*")
@@ -864,18 +866,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
# Set up a situation where one pageserver _only_ has secondary locations on it,
# so that when we release space we are sure it is via secondary locations.
log.info("Setting up secondary location...")
ps_attached = env.neon_env.pageservers[0]
log.info("Setting up secondary locations...")
ps_secondary = env.neon_env.pageservers[1]
for tenant_id in tenant_ids:
# Migrate all attached tenants to the same pageserver, so that all the secondaries
# will run on the other pageserver. This is necessary because when we create tenants,
# they are spread over pageservers by default.
env.neon_env.attachment_service.tenant_shard_migrate(
TenantShardId(tenant_id, 0, 0), ps_attached.id
)
# Find where it is attached
pageserver = env.neon_env.get_tenant_pageserver(tenant_id)
pageserver.http_client().tenant_heatmap_upload(tenant_id)
# Detach it
pageserver.tenant_detach(tenant_id)
# Create a secondary mode location for the tenant, all tenants on one pageserver that will only
# contain secondary locations: this is the one where we will exercise disk usage eviction
ps_secondary.tenant_location_configure(
tenant_id,
{
@@ -887,8 +889,8 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
log.info(f"Read back conf: {readback_conf}")
# Request secondary location to download all layers that the attached location has
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
# Request secondary location to download all layers that the attached location indicated
# in its heatmap
ps_secondary.http_client().tenant_secondary_download(tenant_id)
# Configure the secondary pageserver to have a phony small disk size

View File

@@ -35,6 +35,11 @@ def test_sharding_service_smoke(
neon_env_builder.num_pageservers = 3
env = neon_env_builder.init_configs()
for pageserver in env.pageservers:
# This test detaches tenants during migration, which can race with deletion queue operations,
# during detach we only do an advisory flush, we don't wait for it.
pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])
# Start services by hand so that we can skip a pageserver (this will start + register later)
env.broker.try_start()
env.attachment_service.start()