mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-19 14:10:37 +00:00
run manual gc in a task_mgr task to prevent race with detach
This fixes flaky test_tenant_detach_smoke.
This commit is contained in:
committed by
Dmitry Rodionov
parent
919f2b261a
commit
66f8f686a0
@@ -702,22 +702,16 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req)?;
|
||||
let gc_result = wait_task_done
|
||||
.await
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
.context("wait for gc task")
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, result)
|
||||
|
||||
json_response(StatusCode::OK, gc_result)
|
||||
}
|
||||
|
||||
// Run compaction immediately on given timeline.
|
||||
|
||||
@@ -189,7 +189,7 @@ impl Value {
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default, Serialize)]
|
||||
#[derive(Default, Serialize, Debug)]
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
|
||||
@@ -504,3 +504,58 @@ pub async fn attach_tenant(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
||||
let guard = tenants_state::read_tenants();
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
// Run in task_mgr to avoid race with detach operation
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::GarbageCollector,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send gc result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
);
|
||||
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found in the local state")
|
||||
env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found")
|
||||
|
||||
# first check for non existing tenant
|
||||
tenant_id = TenantId.generate()
|
||||
@@ -63,7 +63,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
|
||||
|
||||
# Detach while running manual GC.
|
||||
# It should wait for manual GC to finish (right now it doesn't that's why this test fails sometimes)
|
||||
# It should wait for manual GC to finish because it runs in a task associated with the tenant.
|
||||
pageserver_http.configure_failpoints(
|
||||
("gc_iteration_internal_after_getting_gc_timelines", "return(2000)")
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user