From 3d5e2bf685dc36e2c5b64b19f3e4d139a5bb10ea Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 12 Jun 2025 12:09:57 +0400 Subject: [PATCH] storcon: add tenant_timeline_locate handler (#12203) ## Problem Compatibility tests may be run against a compatibility snapshot generated with `--timelines-onto-safekeepers=false`. We need to start the compute without a generation (or with 0 generation) if the timeline is not storcon-managed, otherwise the compute will hang. This handler is needed to check if the timeline is storcon-managed. It's also needed for better test coverage of safekeeper migration code. - Relates to https://github.com/neondatabase/neon/pull/11712 ## Summary of changes - Implement `tenant_timeline_locate` handler in storcon to get safekeeper info from storcon's DB --- storage_controller/src/http.rs | 35 ++++++++++++++++ .../src/service/safekeeper_service.rs | 41 ++++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 11 +++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 705b81077e..7051a3326d 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1398,6 +1398,31 @@ async fn handle_timeline_import(req: Request) -> Result, Ap ) } +async fn handle_tenant_timeline_locate( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + check_permissions(&req, Scope::Admin)?; + maybe_rate_limit(&req, tenant_id).await; + + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + + json_response( + StatusCode::OK, + service + .tenant_timeline_locate(tenant_id, timeline_id) + .await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -2139,6 +2164,16 @@ pub fn make_router( ) }, ) + .get( + "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/locate", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_locate, + RequestName("v1_tenant_timeline_locate"), + ) + }, + ) .get("/debug/v1/scheduler", |r| { named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 1f673fe445..61b9ec6b6d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -17,7 +17,7 @@ use pageserver_api::controller_api::{ SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; -use safekeeper_api::membership::{MemberSet, SafekeeperId}; +use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; @@ -26,6 +26,13 @@ use utils::lsn::Lsn; use super::Service; +#[derive(serde::Serialize, serde::Deserialize, Clone)] +pub struct TimelineLocateResponse { + pub generation: SafekeeperGeneration, + pub sk_set: Vec, + pub new_sk_set: Option>, +} + impl Service { /// Timeline creation on safekeepers /// @@ -396,6 +403,38 @@ impl Service { Ok(()) } + /// Locate safekeepers for a timeline. + /// Return the generation, sk_set and new_sk_set if present. + /// If the timeline is not storcon-managed, return NotFound. + pub(crate) async fn tenant_timeline_locate( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let timeline = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + + let Some(timeline) = timeline else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Timeline {}/{} not found", tenant_id, timeline_id).into(), + )); + }; + + Ok(TimelineLocateResponse { + generation: SafekeeperGeneration::new(timeline.generation as u32), + sk_set: timeline + .sk_set + .iter() + .map(|id| NodeId(*id as u64)) + .collect(), + new_sk_set: timeline + .new_sk_set + .map(|sk_set| sk_set.iter().map(|id| NodeId(*id as u64)).collect()), + }) + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. pub(super) async fn tenant_timeline_delete_safekeepers( self: &Arc, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 84caf9e2af..df34573b12 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2223,6 +2223,17 @@ class NeonStorageController(MetricsGetter, LogUtils): shards: list[dict[str, Any]] = body["shards"] return shards + def timeline_locate(self, tenant_id: TenantId, timeline_id: TimelineId): + """ + :return: dict {"generation": int, "sk_set": [int], "new_sk_set": [int]} + """ + response = self.request( + "GET", + f"{self.api}/debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + def tenant_describe(self, tenant_id: TenantId): """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str}