pageserver: during shard split, wait for child to activate (#6789)

## Problem

`test_sharding_split_unsharded` was flaky, with log errors about tenants not
being active. The errors occurred when the split function entered
`wait_lsn()` while a child shard was still activating. The test was flaky
rather than failing outright because activation is usually very fast.

This is also a real bug fix, because in realistic scenarios we could
proceed to detach the parent shard before the children are ready,
leading to an availability gap for clients.

## Summary of changes

- Do a short `wait_to_become_active` on the child shards before proceeding
to wait for their LSNs to advance; a sketch of this ordering follows below
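
A minimal sketch of the ordering fix, outside the pageserver codebase: the child shard's state is modelled with a tokio `watch` channel, and the caller bounds its wait for activation before doing anything that requires an active shard. All names here are illustrative stand-ins, not the real internals.

```rust
use std::time::Duration;
use tokio::sync::watch;
use tokio::time::{sleep, timeout};

// Mirrors the constant made pub(crate) in this commit.
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);

#[tokio::main]
async fn main() {
    // The child shard starts out inactive; a background task flips it to
    // active, standing in for the post-split attach and index_part download.
    let (tx, mut rx) = watch::channel(false);
    tokio::spawn(async move {
        sleep(Duration::from_millis(50)).await; // activation is usually fast
        let _ = tx.send(true);
    });

    // The fix: bound the wait for activation *before* anything that requires
    // an active shard (wait_lsn, detaching the parent), instead of racing it.
    match timeout(ACTIVE_TENANT_TIMEOUT, rx.wait_for(|active| *active)).await {
        Ok(Ok(_)) => println!("child active: safe to wait_lsn() and detach the parent"),
        _ => println!("child did not activate in time: keep the parent attached"),
    }
}
```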

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
Author: John Spray
Date: 2024-02-18 15:55:19 +00:00
Committed by: GitHub
Commit: 5667372c61 (parent 61f99d703d)

2 changed files with 20 additions and 3 deletions

pageserver/src/http/routes.rs

@@ -83,12 +83,12 @@ use utils::{
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
 #[cfg(not(feature = "testing"))]
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
 // Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
 // finish attaching, if calls to remote storage are slow.
 #[cfg(feature = "testing")]
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
+pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 
 pub struct State {
     conf: &'static PageServerConf,
@@ -571,10 +571,16 @@ async fn timeline_list_handler(
         parse_query_param(&request, "force-await-initial-logical-size")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
 
     let response_data = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id, false)?;
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
         let timelines = tenant.list_timelines();
         let mut response_data = Vec::with_capacity(timelines.len());
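
For context on the handler change above: instead of erroring out immediately when the tenant is still attaching (the old `mgr::get_tenant(tenant_shard_id, true)` behavior), the handler now waits up to `ACTIVE_TENANT_TIMEOUT`. A plausible shape for such a bounded wait, assuming tenant state is published through a tokio `watch` channel; the real `Tenant::wait_to_become_active` is more involved:

```rust
use std::time::Duration;
use tokio::sync::watch;
use tokio::time::timeout;

#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum TenantState {
    Attaching,
    Active,
    Stopping,
}

// Hypothetical helper: resolve once the tenant reports Active, or give up
// after `deadline` so API calls fail fast instead of hanging indefinitely.
async fn wait_to_become_active(
    mut state: watch::Receiver<TenantState>,
    deadline: Duration,
) -> Result<(), String> {
    match timeout(deadline, state.wait_for(|s| *s == TenantState::Active)).await {
        Ok(Ok(_)) => Ok(()),
        Ok(Err(_)) => Err("tenant was shut down before activating".to_string()),
        Err(_) => Err(format!("tenant not active after {deadline:?}")),
    }
}
```

Clients that hit the timeout simply retry, so the timeout only bounds how long a single request can stall.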

pageserver/src/tenant/mgr.rs

@@ -32,6 +32,7 @@ use crate::control_plane_client::{
     ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
+use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
@@ -1489,6 +1490,16 @@ impl TenantManager {
             peek_slot.and_then(|s| s.get_attached()).cloned()
         };
         if let Some(t) = child_shard {
+            // Wait for the child shard to become active: this should be very quick because it only
+            // has to download the index_part that we just uploaded when creating it.
+            if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await {
+                // This is not fatal: we have durably created the child shard. It just makes the
+                // split operation less seamless for clients, as we may detach the parent
+                // shard before the child shards are fully ready to serve requests.
+                tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}");
+                continue;
+            }
+
             let timelines = t.timelines.lock().unwrap().clone();
             for timeline in timelines.values() {
                 let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {