diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 10ca96a2c1..107eed6801 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -83,12 +83,12 @@ use utils::{ // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. #[cfg(not(feature = "testing"))] -const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); // Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to // finish attaching, if calls to remote storage are slow. #[cfg(feature = "testing")] -const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); pub struct State { conf: &'static PageServerConf, @@ -571,10 +571,16 @@ async fn timeline_list_handler( parse_query_param(&request, "force-await-initial-logical-size")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b7f4723702..c765c6bacf 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -32,6 +32,7 @@ use crate::control_plane_client::{ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; +use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, 
TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ @@ -1489,6 +1490,16 @@ impl TenantManager { peek_slot.and_then(|s| s.get_attached()).cloned() }; if let Some(t) = child_shard { + // Wait for the child shard to become active: this should be very quick because it only + // has to download the index_part that we just uploaded when creating it. + if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await { + // This is not fatal: we have durably created the child shard. It just makes the + // split operation less seamless for clients, as we may detach the parent + // shard before the child shards are fully ready to serve requests. + tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}"); + continue; + } + let timelines = t.timelines.lock().unwrap().clone(); for timeline in timelines.values() { let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {