Merge remote-tracking branch 'origin/main' into HEAD

This commit is contained in:
Heikki Linnakangas
2025-07-12 16:43:57 +03:00
61 changed files with 1448 additions and 448 deletions

View File

@@ -850,6 +850,31 @@ async fn handle_tenant_describe(
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}
/* BEGIN_HADRON */
async fn handle_tenant_timeline_describe(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Scrubber)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(_req) => {}
};
json_response(
StatusCode::OK,
service
.tenant_timeline_describe(tenant_id, timeline_id)
.await?,
)
}
/* END_HADRON */
async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
@@ -2480,6 +2505,13 @@ pub fn make_router(
)
})
// Timeline operations
.get("/control/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_describe,
RequestName("v1_tenant_timeline_describe"),
)
})
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(
r,

View File

@@ -222,6 +222,9 @@ struct Cli {
/// Primarily useful for testing to reduce test execution time.
#[arg(long, default_value = "false", action=ArgAction::Set)]
kick_secondary_downloads: bool,
#[arg(long)]
shard_split_request_timeout: Option<humantime::Duration>,
}
enum StrictMode {
@@ -470,6 +473,10 @@ async fn async_main() -> anyhow::Result<()> {
timeline_safekeeper_count: args.timeline_safekeeper_count,
posthog_config: posthog_config.clone(),
kick_secondary_downloads: args.kick_secondary_downloads,
shard_split_request_timeout: args
.shard_split_request_timeout
.map(humantime::Duration::into)
.unwrap_or(Duration::MAX),
};
// Validate that we can connect to the database

View File

@@ -86,6 +86,23 @@ impl PageserverClient {
)
}
/* BEGIN_HADRON */
pub(crate) async fn tenant_timeline_describe(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Result<TimelineInfo> {
measured_request!(
"tenant_timeline_describe",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner
.tenant_timeline_describe(tenant_shard_id, timeline_id,)
.await
)
}
/* END_HADRON */
pub(crate) async fn tenant_scan_remote_storage(
&self,
tenant_id: TenantId,

View File

@@ -32,7 +32,7 @@ use pageserver_api::controller_api::{
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse,
TenantShardMigrateRequest, TenantShardMigrateResponse, TenantTimelineDescribeResponse,
};
use pageserver_api::models::{
self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease,
@@ -60,6 +60,7 @@ use tokio::sync::mpsc::error::TrySendError;
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use utils::completion::Barrier;
use utils::env;
use utils::generation::Generation;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -483,6 +484,9 @@ pub struct Config {
/// When set, actively checks and initiates heatmap downloads/uploads.
pub kick_secondary_downloads: bool,
/// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None.
pub shard_split_request_timeout: Duration,
}
impl From<DatabaseError> for ApiError {
@@ -5206,6 +5210,9 @@ impl Service {
match res {
Ok(ok) => Ok(ok),
Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT),
Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) if msg.contains("Requested tenant is missing") => {
Err(ApiError::ResourceUnavailable("Tenant migration in progress".into()))
},
Err(mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg)) => Err(ApiError::ResourceUnavailable(msg.into())),
Err(e) => {
Err(
@@ -5486,6 +5493,92 @@ impl Service {
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
}
/* BEGIN_HADRON */
pub(crate) async fn tenant_timeline_describe(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<TenantTimelineDescribeResponse, ApiError> {
self.tenant_remote_mutation(tenant_id, |locations| async move {
if locations.0.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
));
};
let locations: Vec<(TenantShardId, Node)> = locations
.0
.iter()
.map(|t| (*t.0, t.1.latest.node.clone()))
.collect();
let mut futs = FuturesUnordered::new();
for (shard_id, node) in locations {
futs.push({
async move {
let result = node
.with_client_retries(
|client| async move {
client
.tenant_timeline_describe(&shard_id, &timeline_id)
.await
},
&self.http_client,
&self.config.pageserver_jwt_token,
3,
3,
Duration::from_secs(30),
&self.cancel,
)
.await;
(result, shard_id, node.get_id())
}
});
}
let mut results: Vec<TimelineInfo> = Vec::new();
while let Some((result, tenant_shard_id, node_id)) = futs.next().await {
match result {
Some(Ok(timeline_info)) => results.push(timeline_info),
Some(Err(e)) => {
tracing::warn!(
"Failed to describe tenant {} timeline {} for pageserver {}: {e}",
tenant_shard_id,
timeline_id,
node_id,
);
return Err(ApiError::ResourceUnavailable(format!("{e}").into()));
}
None => return Err(ApiError::Cancelled),
}
}
let mut image_consistent_lsn: Option<Lsn> = Some(Lsn::MAX);
for timeline_info in &results {
if let Some(tline_image_consistent_lsn) = timeline_info.image_consistent_lsn {
image_consistent_lsn = Some(std::cmp::min(
image_consistent_lsn.unwrap(),
tline_image_consistent_lsn,
));
} else {
tracing::warn!(
"Timeline {} on shard {} does not have image consistent lsn",
timeline_info.timeline_id,
timeline_info.tenant_id
);
image_consistent_lsn = None;
break;
}
}
Ok(TenantTimelineDescribeResponse {
shards: results,
image_consistent_lsn,
})
})
.await?
}
/* END_HADRON */
/// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not
/// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory
/// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses
@@ -6317,18 +6410,39 @@ impl Service {
// TODO: issue split calls concurrently (this only matters once we're splitting
// N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
// HADRON: set a timeout for splitting individual shards on page servers.
// Currently we do not perform any retry because it's not clear if page server can handle
// partially split shards correctly.
let shard_split_timeout =
if let Some(env::DeploymentMode::Local) = env::get_deployment_mode() {
Duration::from_secs(30)
} else {
self.config.shard_split_request_timeout
};
let mut http_client_builder = reqwest::ClientBuilder::new()
.pool_max_idle_per_host(0)
.timeout(shard_split_timeout);
for ssl_ca_cert in &self.config.ssl_ca_certs {
http_client_builder = http_client_builder.add_root_certificate(ssl_ca_cert.clone());
}
let http_client = http_client_builder
.build()
.expect("Failed to construct HTTP client");
for target in &targets {
let ShardSplitTarget {
parent_id,
node,
child_ids,
} = target;
let client = PageserverClient::new(
node.get_id(),
self.http_client.clone(),
http_client.clone(),
node.base_url(),
self.config.pageserver_jwt_token.as_deref(),
);
let response = client
.tenant_shard_split(
*parent_id,

View File

@@ -25,7 +25,8 @@ use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo};
use safekeeper_api::PgVersionId;
use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration};
use safekeeper_api::models::{
PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse,
PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest,
TimelineMembershipSwitchResponse,
};
use safekeeper_api::{INITIAL_TERM, Term};
use safekeeper_client::mgmt_api;
@@ -37,21 +38,14 @@ use utils::lsn::Lsn;
use super::Service;
#[derive(serde::Serialize, serde::Deserialize, Clone)]
pub struct TimelineLocateResponse {
pub generation: SafekeeperGeneration,
pub sk_set: Vec<NodeId>,
pub new_sk_set: Option<Vec<NodeId>>,
}
impl Service {
fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, ApiError> {
fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, anyhow::Error> {
let members = safekeepers
.iter()
.map(|sk| sk.get_safekeeper_id())
.collect::<Vec<_>>();
MemberSet::new(members).map_err(ApiError::InternalServerError)
MemberSet::new(members)
}
fn get_safekeepers(&self, ids: &[i64]) -> Result<Vec<Safekeeper>, ApiError> {
@@ -86,7 +80,7 @@ impl Service {
) -> Result<Vec<NodeId>, ApiError> {
let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?;
let mset = Self::make_member_set(&safekeepers)?;
let mset = Self::make_member_set(&safekeepers).map_err(ApiError::InternalServerError)?;
let mconf = safekeeper_api::membership::Configuration::new(mset);
let req = safekeeper_api::models::TimelineCreateRequest {
@@ -1111,6 +1105,26 @@ impl Service {
}
}
if new_sk_set.is_empty() {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"new safekeeper set is empty"
)));
}
if new_sk_set.len() < self.config.timeline_safekeeper_count {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"new safekeeper set must have at least {} safekeepers",
self.config.timeline_safekeeper_count
)));
}
let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
// Construct new member set in advance to validate it.
// E.g. validates that there is no duplicate safekeepers.
let new_sk_member_set =
Self::make_member_set(&new_safekeepers).map_err(ApiError::BadRequest)?;
// TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks.
let _tenant_lock = trace_shared_lock(
&self.tenant_op_locks,
@@ -1141,6 +1155,18 @@ impl Service {
.map(|&id| NodeId(id as u64))
.collect::<Vec<_>>();
// Validate that we are not migrating to a decomissioned safekeeper.
for sk in new_safekeepers.iter() {
if !cur_sk_set.contains(&sk.get_id())
&& sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned
{
return Err(ApiError::BadRequest(anyhow::anyhow!(
"safekeeper {} is decomissioned",
sk.get_id()
)));
}
}
tracing::info!(
?cur_sk_set,
?new_sk_set,
@@ -1183,11 +1209,8 @@ impl Service {
}
let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?;
let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
let new_sk_member_set = Self::make_member_set(&new_safekeepers)?;
let cur_sk_member_set =
Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?;
let joint_config = membership::Configuration {
generation,