diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ef2c1a13d9..3f0a1aab01 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -702,22 +702,16 @@ async fn timeline_gc_handler(mut request: Request) -> Result Result>, ApiError> { + let guard = tenants_state::read_tenants(); + + let tenant = guard + .get(&tenant_id) + .map(Arc::clone) + .with_context(|| format!("Tenant {tenant_id} not found")) + .map_err(ApiError::NotFound)?; + + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); + // Use tenant's pitr setting + let pitr = tenant.get_pitr_interval(); + + // Run in task_mgr to avoid race with detach operation + let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::GarbageCollector, + Some(tenant_id), + Some(timeline_id), + &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"), + false, + async move { + fail::fail_point!("immediate_gc_task_pre"); + let result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, true) + .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. 
+ match task_done.send(result) { + Ok(_) => (), + Err(result) => error!("failed to send gc result: {result:?}"), + } + Ok(()) + } + ); + + // hold the guard until after we've spawned the task so that timeline shutdown will wait for the task + drop(guard); + + Ok(wait_task_done) +} diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index d20860f25a..403d2bfb6a 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -24,7 +24,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found in the local state") + env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found") # first check for non existing tenant tenant_id = TenantId.generate() @@ -63,7 +63,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") # Detach while running manual GC. - # It should wait for manual GC to finish (right now it doesn't that's why this test fails sometimes) + # It should wait for manual GC to finish because it runs in a task associated with the tenant. pageserver_http.configure_failpoints( ("gc_iteration_internal_after_getting_gc_timelines", "return(2000)") )