fix(storcon): correctly pass through lease error code (#12519)

## Problem

close LKB-199

## Summary of changes

We always return the error as 500 to the cplane if a LSN lease request
fails. This cause issues for the cplane as they don't retry on 500. This
patch correctly passes through the error and assign the error code so
that cplane can know if it is a retryable error. (TODO: look at the
cplane code and learn the retry logic).

Note that this patch does not resolve LKB-253 -- we need to handle not
found error separately in the lsn lease path, like wait until the tenant
gets attached, or return 503 so that cplane can retry.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-07-09 16:22:55 -04:00
committed by GitHub
parent 28f604d628
commit 0b639ba608

View File

@@ -4428,7 +4428,7 @@ impl Service {
.await;
let mut failed = 0;
for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) {
for (tid, (_, result)) in targeted_tenant_shards.iter().zip(results.into_iter()) {
match result {
Ok(ok) => {
if tid.is_shard_zero() {
@@ -4795,7 +4795,7 @@ impl Service {
.await;
let mut valid_until = None;
for r in res {
for (node, r) in res {
match r {
Ok(lease) => {
if let Some(ref mut valid_until) = valid_until {
@@ -4805,7 +4805,7 @@ impl Service {
}
}
Err(e) => {
return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
return Err(passthrough_api_error(&node, e));
}
}
}
@@ -4919,7 +4919,7 @@ impl Service {
max_retries: u32,
timeout: Duration,
cancel: &CancellationToken,
) -> Vec<mgmt_api::Result<T>>
) -> Vec<(Node, mgmt_api::Result<T>)>
where
O: Fn(TenantShardId, PageserverClient) -> F + Copy,
F: std::future::Future<Output = mgmt_api::Result<T>>,
@@ -4940,16 +4940,16 @@ impl Service {
cancel,
)
.await;
(idx, r)
(idx, node, r)
});
}
while let Some((idx, r)) = futs.next().await {
results.push((idx, r.unwrap_or(Err(mgmt_api::Error::Cancelled))));
while let Some((idx, node, r)) = futs.next().await {
results.push((idx, node, r.unwrap_or(Err(mgmt_api::Error::Cancelled))));
}
results.sort_by_key(|(idx, _)| *idx);
results.into_iter().map(|(_, r)| r).collect()
results.sort_by_key(|(idx, _, _)| *idx);
results.into_iter().map(|(_, node, r)| (node, r)).collect()
}
/// Helper for safely working with the shards in a tenant remotely on pageservers, for example
@@ -5862,7 +5862,7 @@ impl Service {
return;
}
for result in self
for (_, result) in self
.tenant_for_shards_api(
attached,
|tenant_shard_id, client| async move {
@@ -5881,7 +5881,7 @@ impl Service {
}
}
for result in self
for (_, result) in self
.tenant_for_shards_api(
secondary,
|tenant_shard_id, client| async move {
@@ -8768,7 +8768,7 @@ impl Service {
)
.await;
for ((tenant_shard_id, node, optimization), secondary_status) in
for ((tenant_shard_id, node, optimization), (_, secondary_status)) in
want_secondary_status.into_iter().zip(results.into_iter())
{
match secondary_status {