From 38ebd6e7a00bb37b221095158e3b79c6a33ba5b5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 15 Dec 2022 15:16:25 +0100 Subject: [PATCH] [3/4] make initial size estimation task sensitive to task_mgr shutdown requests This exacerbates the problem pointed out in the previous commit. Why? Because with this patch, deleting a timeline also exposes the issue. Extend the test to expose the problem. --- pageserver/src/tenant/timeline.rs | 3 +++ test_runner/regress/test_timeline_size.py | 27 +++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e957878472..b7f12609e6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1290,6 +1290,9 @@ impl Timeline { } Ok(()) }, + _ = task_mgr::shutdown_watcher() => { + anyhow::bail!("aborted because task_mgr shutdown requested"); + } new_event = async { loop { match timeline_state_updates.changed().await { diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index e881608a44..38660cefac 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -9,6 +9,7 @@ from pathlib import Path import psycopg2.errors import psycopg2.extras +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -216,7 +217,10 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" -def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("deletion_method", ["tenant_detach", "timeline_delete"]) +def test_timeline_initial_logical_size_calculation_cancellation( + neon_env_builder: NeonEnvBuilder, deletion_method: str +): env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -259,16 +263,20 @@ def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder assert_size_calculation_not_done() log.info( - "try to delete the timeline, this should cancel size computation tasks and wait for them to finish" - ) - env.pageserver.allowed_errors.append( - f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested" + f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" ) + if deletion_method == "timeline_delete": + env.pageserver.allowed_errors.append( + f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested" + ) delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) def delete_timeline_thread_fn(): try: - client.tenant_detach(tenant_id) + if deletion_method == "tenant_detach": + client.tenant_detach(tenant_id) + elif deletion_method == "timeline_delete": + client.timeline_delete(tenant_id, timeline_id) delete_timeline_success.put(True) except PageserverApiException: delete_timeline_success.put(False) @@ -278,9 +286,10 @@ def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder delete_timeline_thread.start() # give it some time to settle in the state where it waits for size computation task time.sleep(5) - assert ( - not delete_timeline_success.empty() - ), "delete timeline should be stuck waiting for size computation task" + if not delete_timeline_success.empty(): + assert ( + False + ), f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" log.info( "resume the size calculation. The failpoint checks that the timeline directory still exists."