From 3d5e2bf685dc36e2c5b64b19f3e4d139a5bb10ea Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 12:09:57 +0400
Subject: [PATCH] storcon: add tenant_timeline_locate handler (#12203)
## Problem
Compatibility tests may be run against a compatibility snapshot
generated with `--timelines-onto-safekeepers=false`. We need to start
the compute without a generation (or with 0 generation) if the timeline
is not storcon-managed, otherwise the compute will hang.
This handler is needed to check if the timeline is storcon-managed.
It's also needed for better test coverage of safekeeper migration code.
- Relates to https://github.com/neondatabase/neon/pull/11712
## Summary of changes
- Implement `tenant_timeline_locate` handler in storcon to get
safekeeper info from storcon's DB
---
storage_controller/src/http.rs | 35 ++++++++++++++++
.../src/service/safekeeper_service.rs | 41 ++++++++++++++++++-
test_runner/fixtures/neon_fixtures.py | 11 +++++
3 files changed, 86 insertions(+), 1 deletion(-)
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 705b81077e..7051a3326d 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1398,6 +1398,31 @@ async fn handle_timeline_import(req: Request
) -> Result, Ap
)
}
+async fn handle_tenant_timeline_locate(
+ service: Arc,
+ req: Request,
+) -> Result, ApiError> {
+ let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+ let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+ check_permissions(&req, Scope::Admin)?;
+ maybe_rate_limit(&req, tenant_id).await;
+
+ match maybe_forward(req).await {
+ ForwardOutcome::Forwarded(res) => {
+ return res;
+ }
+ ForwardOutcome::NotForwarded(_req) => {}
+ };
+
+ json_response(
+ StatusCode::OK,
+ service
+ .tenant_timeline_locate(tenant_id, timeline_id)
+ .await?,
+ )
+}
+
async fn handle_tenants_dump(req: Request) -> Result, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -2139,6 +2164,16 @@ pub fn make_router(
)
},
)
+ .get(
+ "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/locate",
+ |r| {
+ tenant_service_handler(
+ r,
+ handle_tenant_timeline_locate,
+ RequestName("v1_tenant_timeline_locate"),
+ )
+ },
+ )
.get("/debug/v1/scheduler", |r| {
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
})
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 1f673fe445..61b9ec6b6d 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -17,7 +17,7 @@ use pageserver_api::controller_api::{
SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
};
use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo};
-use safekeeper_api::membership::{MemberSet, SafekeeperId};
+use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId};
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -26,6 +26,13 @@ use utils::lsn::Lsn;
use super::Service;
+#[derive(serde::Serialize, serde::Deserialize, Clone)]
+pub struct TimelineLocateResponse {
+ pub generation: SafekeeperGeneration,
+ pub sk_set: Vec,
+ pub new_sk_set: Option>,
+}
+
impl Service {
/// Timeline creation on safekeepers
///
@@ -396,6 +403,38 @@ impl Service {
Ok(())
}
+ /// Locate safekeepers for a timeline.
+ /// Return the generation, sk_set and new_sk_set if present.
+ /// If the timeline is not storcon-managed, return NotFound.
+ pub(crate) async fn tenant_timeline_locate(
+ &self,
+ tenant_id: TenantId,
+ timeline_id: TimelineId,
+ ) -> Result {
+ let timeline = self
+ .persistence
+ .get_timeline(tenant_id, timeline_id)
+ .await?;
+
+ let Some(timeline) = timeline else {
+ return Err(ApiError::NotFound(
+ anyhow::anyhow!("Timeline {}/{} not found", tenant_id, timeline_id).into(),
+ ));
+ };
+
+ Ok(TimelineLocateResponse {
+ generation: SafekeeperGeneration::new(timeline.generation as u32),
+ sk_set: timeline
+ .sk_set
+ .iter()
+ .map(|id| NodeId(*id as u64))
+ .collect(),
+ new_sk_set: timeline
+ .new_sk_set
+ .map(|sk_set| sk_set.iter().map(|id| NodeId(*id as u64)).collect()),
+ })
+ }
+
/// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
pub(super) async fn tenant_timeline_delete_safekeepers(
self: &Arc,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 84caf9e2af..df34573b12 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2223,6 +2223,17 @@ class NeonStorageController(MetricsGetter, LogUtils):
shards: list[dict[str, Any]] = body["shards"]
return shards
+ def timeline_locate(self, tenant_id: TenantId, timeline_id: TimelineId):
+ """
+ :return: dict {"generation": int, "sk_set": [int], "new_sk_set": [int]}
+ """
+ response = self.request(
+ "GET",
+ f"{self.api}/debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate",
+ headers=self.headers(TokenScope.ADMIN),
+ )
+ return response.json()
+
def tenant_describe(self, tenant_id: TenantId):
"""
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str}