diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 6380a4c6c1..1fd7c775d5 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -317,6 +317,7 @@ fn start_pageserver(
     let http_listener = tcp_listener::bind(http_addr)?;
 
     let pg_addr = &conf.listen_pg_addr;
+    info!("Starting pageserver pg protocol handler on {pg_addr}");
     let pageserver_listener = tcp_listener::bind(pg_addr)?;
 
@@ -549,7 +550,7 @@ fn start_pageserver(
         let router_state = Arc::new(
             http::routes::State::new(
                 conf,
-                tenant_manager,
+                tenant_manager.clone(),
                 http_auth.clone(),
                 remote_storage.clone(),
                 broker_client.clone(),
@@ -693,6 +694,7 @@ fn start_pageserver(
         let bg_remote_storage = remote_storage.clone();
         let bg_deletion_queue = deletion_queue.clone();
         BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+            &tenant_manager,
             bg_remote_storage.map(|_| bg_deletion_queue),
             0,
         ));
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index b00db02a1c..f947a75f61 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -31,6 +31,7 @@ pub mod walredo;
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
+use tenant::mgr::TenantManager;
 use tracing::info;
 
 /// Current storage format version
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 pub use crate::metrics::preinitialize_metrics;
 
 #[tracing::instrument(skip_all, fields(%exit_code))]
-pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
+pub async fn shutdown_pageserver(
+    tenant_manager: &TenantManager,
+    deletion_queue: Option<DeletionQueue>,
+    exit_code: i32,
+) {
     use std::time::Duration;
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
     // Shut down all the tenants. This flushes everything to disk and kills
     // the checkpoint and GC tasks.
     timed(
-        tenant::mgr::shutdown_all_tenants(),
+        tenant_manager.shutdown(),
         "shutdown all tenants",
         Duration::from_secs(5),
     )
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 275a72c0b0..69e163effa 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;
 
 use utils::id::TimelineId;
 
-use crate::shutdown_pageserver;
-
 //
 // There are four runtimes:
 //
@@ -453,7 +451,7 @@ async fn task_finish(
     }
 
     if shutdown_process {
-        shutdown_pageserver(None, 1).await;
+        std::process::exit(1);
     }
 }
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index ffb7206b1e..cab60c3111 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -296,6 +296,7 @@ impl DeleteTenantFlow {
         remote_storage: Option<GenericRemoteStorage>,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: Arc<Tenant>,
+        cancel: &CancellationToken,
     ) -> Result<(), DeleteTenantError> {
         span::debug_assert_current_span_has_tenant_id();
 
@@ -303,7 +304,9 @@ impl DeleteTenantFlow {
 
         let mut guard = Self::prepare(&tenant).await?;
 
-        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
+        if let Err(e) =
+            Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
+        {
             tenant.set_broken(format!("{e:#}")).await;
             return Err(e);
         }
@@ -322,6 +325,7 @@ impl DeleteTenantFlow {
         conf: &'static PageServerConf,
         remote_storage: Option<&GenericRemoteStorage>,
         tenant: &Tenant,
+        cancel: &CancellationToken,
     ) -> Result<(), DeleteTenantError> {
         guard.mark_in_progress()?;
 
@@ -335,15 +339,9 @@ impl DeleteTenantFlow {
         // Though sounds scary, different mark name?
         // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
         if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(
-                conf,
-                remote_storage,
-                &tenant.tenant_shard_id,
-                // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
-                &CancellationToken::new(),
-            )
-            .await
-            .context("remote_mark")?
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
+                .await
+                .context("remote_mark")?
         }
 
         fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -546,8 +544,7 @@ impl DeleteTenantFlow {
             conf,
             remote_storage.as_ref(),
             &tenant.tenant_shard_id,
-            // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
-            &CancellationToken::new(),
+            &task_mgr::shutdown_token(),
         )
         .await?;
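For context on the delete.rs change above: a token constructed on the spot with `CancellationToken::new()` has no owner that will ever call `cancel()`, so the remote call it guards can never be interrupted during shutdown; threading a caller-provided token (or `task_mgr::shutdown_token()`) fixes that. Below is a minimal sketch of the pattern, using `tokio_util::sync::CancellationToken` with invented helper names, not the pageserver's actual API:

```rust
use tokio_util::sync::CancellationToken;

// Illustrative stand-in for a remote-storage call such as `create_remote_delete_mark`.
async fn upload_marker() {
    tokio::time::sleep(std::time::Duration::from_millis(50)).await;
}

// A deletion step that respects the token it was handed: if the caller
// (e.g. a manager-level shutdown) fires the token, the step returns promptly
// instead of holding up shutdown behind a retrying remote operation.
async fn delete_step(cancel: &CancellationToken) -> Result<(), &'static str> {
    tokio::select! {
        _ = cancel.cancelled() => Err("cancelled during shutdown"),
        _ = upload_marker() => Ok(()),
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    // Passing `&CancellationToken::new()` directly into `delete_step` would make
    // the select above uncancellable, which is exactly what the diff removes.
    println!("{:?}", delete_step(&cancel).await);
}
```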
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 7cf03d8fd6..3aaab6e4ef 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -102,7 +102,7 @@ pub(crate) enum TenantsMap {
     /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
     /// New tenants can be added using [`tenant_map_acquire_slot`].
     Open(BTreeMap<TenantShardId, TenantSlot>),
-    /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
+    /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
     /// Existing tenants are still accessible, but no new tenants can be created.
     ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }
@@ -261,6 +261,12 @@ pub struct TenantManager {
     // See https://github.com/neondatabase/neon/issues/5796
     tenants: &'static std::sync::RwLock<TenantsMap>,
     resources: TenantSharedResources,
+
+    // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
+    // This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime),
+    // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
+    // when the tenant detaches.
+    cancel: CancellationToken,
 }
 
 fn emergency_generations(
@@ -620,6 +626,7 @@ pub async fn init_tenant_mgr(
         conf,
         tenants: &TENANTS,
         resources,
+        cancel: CancellationToken::new(),
     })
 }
 
@@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn(
     Ok(tenant)
 }
 
-///
-/// Shut down all tenants. This runs as part of pageserver shutdown.
-///
-/// NB: We leave the tenants in the map, so that they remain accessible through
-/// the management API until we shut it down. If we removed the shut-down tenants
-/// from the tenants map, the management API would return 404 for these tenants,
-/// because TenantsMap::get() now returns `None`.
-/// That could be easily misinterpreted by control plane, the consumer of the
-/// management API. For example, it could attach the tenant on a different pageserver.
-/// We would then be in split-brain once this pageserver restarts.
-#[instrument(skip_all)]
-pub(crate) async fn shutdown_all_tenants() {
-    shutdown_all_tenants0(&TENANTS).await
-}
-
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
     let mut join_set = JoinSet::new();
 
@@ -1428,6 +1420,7 @@ impl TenantManager {
             self.resources.remote_storage.clone(),
             &TENANTS,
             tenant,
+            &self.cancel,
         )
         .await;
 
@@ -1817,6 +1810,23 @@ impl TenantManager {
 
         Ok(())
     }
+
+    ///
+    /// Shut down all tenants. This runs as part of pageserver shutdown.
+    ///
+    /// NB: We leave the tenants in the map, so that they remain accessible through
+    /// the management API until we shut it down. If we removed the shut-down tenants
+    /// from the tenants map, the management API would return 404 for these tenants,
+    /// because TenantsMap::get() now returns `None`.
+    /// That could be easily misinterpreted by control plane, the consumer of the
+    /// management API. For example, it could attach the tenant on a different pageserver.
+    /// We would then be in split-brain once this pageserver restarts.
+    #[instrument(skip_all)]
+    pub(crate) async fn shutdown(&self) {
+        self.cancel.cancel();
+
+        shutdown_all_tenants0(self.tenants).await
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
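The new `TenantManager::shutdown` above follows a two-phase shape: first fire the manager-level token so that work outside any single tenant's lifetime (such as deletion flows) stops, then await the per-tenant shutdowns. A rough sketch of that shape under simplified, invented types (`FakeManager`/`FakeTenant` stand in for the real `TenantManager`/`TenantsMap` machinery):

```rust
use tokio_util::sync::CancellationToken;

// Simplified stand-in for a tenant: it owns its own token and drains itself.
// The real per-tenant shutdown also flushes data and joins background tasks.
struct FakeTenant {
    cancel: CancellationToken,
}

impl FakeTenant {
    async fn shutdown(&self) {
        self.cancel.cancel();
        // ... wait for the tenant's tasks to finish ...
    }
}

// Simplified stand-in for the manager: one token for cross-tenant operations,
// plus the tenants themselves (kept around so the management API still sees them).
struct FakeManager {
    cancel: CancellationToken,
    tenants: Vec<FakeTenant>,
}

impl FakeManager {
    async fn shutdown(&self) {
        // Phase 1: stop manager-level operations such as tenant deletion.
        self.cancel.cancel();
        // Phase 2: shut the tenants down. The real code does this concurrently
        // via a JoinSet; a sequential loop keeps the sketch short.
        for tenant in &self.tenants {
            tenant.shutdown().await;
        }
    }
}

#[tokio::main]
async fn main() {
    let mgr = FakeManager {
        cancel: CancellationToken::new(),
        tenants: vec![FakeTenant {
            cancel: CancellationToken::new(),
        }],
    };
    mgr.shutdown().await;
}
```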
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 52de889084..a164c7f60a 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
             # allow errors caused by failpoints
             f".*failpoint: {failpoint}",
             # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
             # We may leave some upload tasks in the queue. They're likely deletes.
             # For uploads we explicitly wait with `last_flush_lsn_upload` below.
             # So by ignoring these instead of waiting for empty upload queue
@@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach(
             # From deletion polling
             f".*NotFound: tenant {env.initial_tenant}.*",
             # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
             # error from http response is also logged
             ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
             '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 96a5cc491a..0eb1327c9e 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
         [
             f".*{timeline_id}.*failpoint: {failpoint}",
             # It appears when we stopped flush loop during deletion and then pageserver is stopped
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
             # This happens when we fail before scheduling background operation.
             # Timeline is left in stopping state and retry tries to stop it again.
             ".*Ignoring new state, equal to the existing one: Stopping",
@@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
             ".*failpoint: timeline-delete-before-rm",
             ".*Ignoring new state, equal to the existing one: Stopping",
             # this happens, because the stuck timeline is visible to shutdown
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
         ]
     )
@@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach(
             # allow errors caused by failpoints
             f".*failpoint: {failpoint}",
             # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
             # error from http response is also logged
             ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
             # Polling after attach may fail with this