pageserver: use image consistent LSN for force image layer creation (#12547)

This is a no-op for the neon deployment.

* Introduce the concept of the image consistent LSN: the largest LSN
below which all pages have been successfully redone
* Use the image consistent LSN for forced image layer creations
* Optionally expose the image consistent LSN via the timeline describe
HTTP endpoint
* Add a sharded timeline describe endpoint to storcon that aggregates the
per-shard values (see the sketch below)
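
The aggregation rule is simple: the tenant-wide image consistent LSN is the minimum of the per-shard values, and it is absent as soon as any shard cannot report one (mirroring the storcon logic further down). A minimal sketch of that rule, not the PR's code, with `u64` standing in for the `Lsn` newtype to keep it self-contained:

    /// Sketch: fold per-shard image consistent LSNs into a tenant-wide value.
    /// `u64` stands in for the pageserver's `Lsn` newtype.
    fn aggregate_image_consistent_lsn(per_shard: &[Option<u64>]) -> Option<u64> {
        per_shard
            .iter()
            .copied()
            // `None` if any shard is missing a value...
            .collect::<Option<Vec<u64>>>()
            // ...otherwise the minimum across shards is the LSN up to which
            // every shard has consistent images.
            .and_then(|lsns| lsns.into_iter().min())
    }

    fn main() {
        assert_eq!(
            aggregate_image_consistent_lsn(&[Some(100), Some(90), Some(120)]),
            Some(90)
        );
        assert_eq!(aggregate_image_consistent_lsn(&[Some(100), None]), None);
    }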

---------

Co-authored-by: Chen Luo <chen.luo@databricks.com>
Author: Vlad Lazar
Date: 2025-07-11 12:39:51 +01:00
Committed by: GitHub
parent c34d36d8a2
commit 15f633922a
13 changed files with 567 additions and 80 deletions


@@ -850,6 +850,31 @@ async fn handle_tenant_describe(
    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}

/* BEGIN_HADRON */
async fn handle_tenant_timeline_describe(
    service: Arc<Service>,
    req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Scrubber)?;

    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;

    match maybe_forward(req).await {
        ForwardOutcome::Forwarded(res) => {
            return res;
        }
        ForwardOutcome::NotForwarded(_req) => {}
    };

    json_response(
        StatusCode::OK,
        service
            .tenant_timeline_describe(tenant_id, timeline_id)
            .await?,
    )
}
/* END_HADRON */

async fn handle_tenant_list(
    service: Arc<Service>,
    req: Request<Body>,
@@ -2480,6 +2505,13 @@ pub fn make_router(
            )
        })
        // Timeline operations
        .get("/control/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            tenant_service_handler(
                r,
                handle_tenant_timeline_describe,
                RequestName("v1_tenant_timeline_describe"),
            )
        })
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            tenant_service_handler(
                r,


@@ -86,6 +86,23 @@ impl PageserverClient {
        )
    }

    /* BEGIN_HADRON */
    pub(crate) async fn tenant_timeline_describe(
        &self,
        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
    ) -> Result<TimelineInfo> {
        measured_request!(
            "tenant_timeline_describe",
            crate::metrics::Method::Get,
            &self.node_id_label,
            self.inner
                .tenant_timeline_describe(tenant_shard_id, timeline_id)
                .await
        )
    }
    /* END_HADRON */

    pub(crate) async fn tenant_scan_remote_storage(
        &self,
        tenant_id: TenantId,


@@ -32,7 +32,7 @@ use pageserver_api::controller_api::{
    ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
    SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
    TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
    TenantShardMigrateRequest, TenantShardMigrateResponse,
    TenantShardMigrateRequest, TenantShardMigrateResponse, TenantTimelineDescribeResponse,
};
use pageserver_api::models::{
    self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease,
@@ -5486,6 +5486,92 @@ impl Service {
            .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
    }

    /* BEGIN_HADRON */
    pub(crate) async fn tenant_timeline_describe(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> Result<TenantTimelineDescribeResponse, ApiError> {
        self.tenant_remote_mutation(tenant_id, |locations| async move {
            if locations.0.is_empty() {
                return Err(ApiError::NotFound(
                    anyhow::anyhow!("Tenant not found").into(),
                ));
            };

            let locations: Vec<(TenantShardId, Node)> = locations
                .0
                .iter()
                .map(|t| (*t.0, t.1.latest.node.clone()))
                .collect();

            let mut futs = FuturesUnordered::new();
            for (shard_id, node) in locations {
                futs.push({
                    async move {
                        let result = node
                            .with_client_retries(
                                |client| async move {
                                    client
                                        .tenant_timeline_describe(&shard_id, &timeline_id)
                                        .await
                                },
                                &self.http_client,
                                &self.config.pageserver_jwt_token,
                                3,
                                3,
                                Duration::from_secs(30),
                                &self.cancel,
                            )
                            .await;
                        (result, shard_id, node.get_id())
                    }
                });
            }

            let mut results: Vec<TimelineInfo> = Vec::new();
            while let Some((result, tenant_shard_id, node_id)) = futs.next().await {
                match result {
                    Some(Ok(timeline_info)) => results.push(timeline_info),
                    Some(Err(e)) => {
                        tracing::warn!(
                            "Failed to describe tenant {} timeline {} for pageserver {}: {e}",
                            tenant_shard_id,
                            timeline_id,
                            node_id,
                        );
                        return Err(ApiError::ResourceUnavailable(format!("{e}").into()));
                    }
                    None => return Err(ApiError::Cancelled),
                }
            }

            let mut image_consistent_lsn: Option<Lsn> = Some(Lsn::MAX);
            for timeline_info in &results {
                if let Some(tline_image_consistent_lsn) = timeline_info.image_consistent_lsn {
                    image_consistent_lsn = Some(std::cmp::min(
                        image_consistent_lsn.unwrap(),
                        tline_image_consistent_lsn,
                    ));
                } else {
                    tracing::warn!(
                        "Timeline {} on shard {} does not have image consistent lsn",
                        timeline_info.timeline_id,
                        timeline_info.tenant_id
                    );
                    image_consistent_lsn = None;
                    break;
                }
            }

            Ok(TenantTimelineDescribeResponse {
                shards: results,
                image_consistent_lsn,
            })
        })
        .await?
    }
    /* END_HADRON */
    /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not
    /// avoid traversing data, it just avoids returning it. This is suitable for our purposes, since our in-memory
    /// maps are small enough to traverse fast; our pagination is just to avoid serializing huge JSON responses
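
For reference, the `TenantTimelineDescribeResponse` constructed above pairs the per-shard results with the aggregated LSN. The sketch below is inferred from that construction; the real definition lives in pageserver_api::controller_api (per the import hunk above), and the derives and exact field types here are assumptions.

    // Sketch inferred from the `Ok(TenantTimelineDescribeResponse { .. })`
    // construction above; derives and exact definition are assumptions.
    use pageserver_api::models::TimelineInfo;
    use utils::lsn::Lsn;

    #[derive(serde::Serialize)]
    pub struct TenantTimelineDescribeResponse {
        /// One TimelineInfo per shard of the tenant.
        pub shards: Vec<TimelineInfo>,
        /// Minimum image consistent LSN across all shards, or None if any
        /// shard could not report one.
        pub image_consistent_lsn: Option<Lsn>,
    }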