diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 295a1e9f02..311ae5adf4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -347,7 +347,8 @@ async fn timeline_detach_handler(request: Request) -> Result anyhow::Result<()> { - self.timelines.lock().unwrap().remove(&timeline_id); + let mut timelines = self.timelines.lock().unwrap(); + ensure!( + timelines.remove(&timeline_id).is_some(), + "cannot detach timeline {timeline_id} that is not available locally" + ); Ok(()) } diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs index 7fe25ab36e..c7a2b1fd22 100644 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ b/pageserver/src/remote_storage/storage_sync/download.rs @@ -332,7 +332,7 @@ mod tests { .await; assert!( matches!( - dbg!(already_downloading_remote_timeline_download), + already_downloading_remote_timeline_download, DownloadedTimeline::Abort, ), "Should not allow downloading for remote timeline that does not expect it" diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index ace6938e6d..3e0a907d00 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -56,7 +56,7 @@ struct Tenant { /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. repo: Arc, /// Timelines, located locally in the pageserver's datadir. - /// Whatever manipulations happen, local timelines are not removed, only incremented with files. + /// Timelines can entirely be removed entirely by the `detach` operation only. /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. @@ -126,8 +126,8 @@ pub fn apply_timeline_sync_status_updates( continue; } }; - match register_new_timelines(&repo, status_updates) { - Ok(()) => info!("successfully applied tenant {tenant_id} sync status updates"), + match apply_timeline_remote_sync_status_updates(&repo, status_updates) { + Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), Err(e) => error!( "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" ), @@ -305,7 +305,11 @@ pub fn get_local_timeline_with_load( Ok(page_tline) } -pub fn detach_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { +pub fn detach_timeline( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) -> anyhow::Result<()> { // shutdown the timeline threads (this shuts down the walreceiver) thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id)); @@ -320,6 +324,14 @@ pub fn detach_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow None => bail!("Tenant {tenant_id} not found in local tenant state"), } + let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id); + std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?; + Ok(()) } @@ -386,13 +398,13 @@ fn init_local_repositories( // Lets fail here loudly to be on the safe side. // XXX: It may be a better api to actually distinguish between repository startup // and processing of newly downloaded timelines. - register_new_timelines(&repo, status_updates) + apply_timeline_remote_sync_status_updates(&repo, status_updates) .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))? } Ok(()) } -fn register_new_timelines( +fn apply_timeline_remote_sync_status_updates( repo: &LayeredRepository, status_updates: HashMap, ) -> anyhow::Result<()> { diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 8213d2526b..41907adf1a 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -217,6 +217,13 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, tenant_pg.start() + timeline_to_detach_local_path = env.repo_dir / 'tenants' / tenant.hex / 'timelines' / timeline.hex + files_before_detach = os.listdir(timeline_to_detach_local_path) + assert 'metadata' in files_before_detach, f'Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ + but got: {files_before_detach}' + assert len(files_before_detach) > 2, f'Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ + but got {files_before_detach}' + # detach tenant from old pageserver before we check # that all the data is there to be sure that old pageserver # is no longer involved, and if it is, we will see the errors @@ -238,6 +245,8 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, load_thread.join(timeout=10) log.info('load thread stopped') + assert not os.path.exists(timeline_to_detach_local_path), f'After detach, local timeline dir {timeline_to_detach_local_path} should be removed' + # bring old pageserver back for clean shutdown via zenith cli # new pageserver will be shut down by the context manager cli_config_lines = (env.repo_dir / 'config').read_text().splitlines()