temp: planning

2026-01-15 17:32:56 +00:00 · 2024-07-17 12:06:55 +00:00
parent f398ab0264
commit 2e58ccee78
3 changed files with 25 additions and 0 deletions
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -995,6 +995,10 @@ impl Tenant {
                .remove(&timeline_id)
                .expect("just put it in above");

+            // FIXME: collect here **any** timelines which have started and not finished
+            // detach_ancestor (ignoring the ones which have started deletion instead)
+            // then later reflect it in the Tenant::detach_ancestor whatever
+
            // TODO again handle early failure
            self.load_remote_timeline(
                timeline_id,
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -150,6 +150,9 @@ pub(super) async fn prepare(
            .values()
            .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
            .map(|tl| (tl.ancestor_lsn, tl.clone()))
+            // FIXME: instead of collecting, partition based on if they are still reparentable
+            // if gc has been blocked, we can still reparent these timelines
+            //
            // Collect to avoid lock taking order problem with Tenant::timelines and
            // Timeline::remote_client
            .collect::<Vec<_>>();
@@ -176,12 +179,19 @@ pub(super) async fn prepare(
            return Err(Error::ShuttingDown);
        }

+        // FIXME: reparent the reparentable (return Progress::Prepared if there were any) -- we
+        // will need to acquire the lock as well..? so it would make sense that at load time we
+        // would detect the in-progress ness and "soft-acquire it for us".
+        // FIXME: otherwise release gc blocking if it still is there, and wait for upload
+
        let mut reparented = all_direct_children;
        // why this instead of hashset? there is a reason, but I've forgotten it many times.
        //
        // maybe if this was a hashset we would not be able to distinguish some race condition.
        reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));

+        // FIXME: add the non-reparented in to the response -- these would be the reparentable, but
+        // no longer reparentable because they appeared *after* gc blocking was released.
        return Ok(Progress::Done(AncestorDetached {
            reparented_timelines: reparented
                .into_iter()
@@ -217,6 +227,9 @@ pub(super) async fn prepare(
        *guard = Some((detached.timeline_id, barrier));
    }

+    // FIXME: modify the index part to have a "detach-ancestor: inprogress { started_at }"
+    // unsure if it should be awaited to upload yet...
+
    let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;

    utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
@@ -564,6 +577,9 @@ pub(super) async fn complete(
        )
        .await?;

+    // FIXME: assert that the persistent record of inprogress detach exists
+    // FIXME: assert that gc is still blocked
+
    let mut tasks = tokio::task::JoinSet::new();

    // because we are now keeping the slot in progress, it is unlikely that there will be any
@@ -649,6 +665,7 @@ pub(super) async fn complete(
    }

    if reparenting_candidates != reparented.len() {
+        // FIXME: we must return 503 kind of response
        tracing::info!("failed to reparent some candidates");
    }

@@ -659,5 +676,8 @@ pub(super) async fn complete(
        .map(|(_, timeline_id)| timeline_id)
        .collect();

+    // FIXME: here everything has gone peachy, the tenant will be restarted next.
+    // after restart and before returning the response, the gc blocking must be undone
+
    Ok(reparented)
 }
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2859,6 +2859,7 @@ impl Service {
                            ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
                        }
                        // rest can be mapped as usual
+                        // FIXME: this converts some 500 to 409 which is not per openapi
                        other => passthrough_api_error(&node, other),
                    }
                })