diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index d55823b0b7..3d863a6518 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -34,6 +34,9 @@ pub enum ApiError {
#[error("Timeout")]
Timeout(Cow<'static, str>),
+ #[error("Request cancelled")]
+ Cancelled,
+
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -74,6 +77,10 @@ impl ApiError {
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
+ ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
+ self.to_string(),
+ StatusCode::INTERNAL_SERVER_ERROR,
+ ),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
@@ -133,6 +140,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response
{
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
+ ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
_ => info!("Error processing HTTP request: {api_error:#}"),
}
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7fa6c35ad6..19bc88fbc7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -181,9 +181,7 @@ impl From for ApiError {
PageReconstructError::MissingKey(e) => {
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
}
- PageReconstructError::Cancelled => {
- ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
- }
+ PageReconstructError::Cancelled => ApiError::Cancelled,
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 0bff4be150..336d1c3fb8 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -36,6 +36,7 @@ use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
+use utils::pausable_failpoint;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -409,6 +410,8 @@ impl Timeline {
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result {
+ pausable_failpoint!("find-lsn-for-timestamp-pausable");
+
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
// We use this method to figure out the branching LSN for the new branch, but the
// GC cutoff could be before the branching point and we cannot create a new branch
@@ -424,6 +427,7 @@ impl Timeline {
let mut found_smaller = false;
let mut found_larger = false;
+
while low < high {
if cancel.is_cancelled() {
return Err(PageReconstructError::Cancelled);
diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs
index ab9127b13e..701ab58f63 100644
--- a/proxy/src/serverless/http_util.rs
+++ b/proxy/src/serverless/http_util.rs
@@ -45,6 +45,10 @@ pub fn api_error_into_response(this: ApiError) -> Response> {
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
+ ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
+ this.to_string(),
+ StatusCode::INTERNAL_SERVER_ERROR,
+ ),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index f1f96f6d5f..08bf66058a 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -630,12 +630,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
timestamp: datetime,
+ **kwargs,
):
log.info(
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
)
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z",
+ **kwargs,
)
self.verbose_error(res)
res_json = res.json()
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 83d52d4c4c..263730a823 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -1,12 +1,15 @@
import re
import time
+from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta, timezone
+import pytest
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
from fixtures.pageserver.http import PageserverApiException
-from fixtures.utils import query_scalar
+from fixtures.utils import query_scalar, wait_until
+from requests.exceptions import ReadTimeout
#
@@ -108,6 +111,52 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
assert Lsn(result["lsn"]) >= last_flush_lsn
+def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
+ """
+ Test if cancelled pageserver get_lsn_by_timestamp request is correctly handled.
+ Added as an effort to improve error handling and avoid full anyhow backtrace.
+ """
+
+ env = neon_env_builder.init_start()
+ env.pageserver.allowed_errors.extend(
+ [
+ ".*request was dropped before completing.*",
+ ".*Cancelled request finished with an error: Cancelled",
+ ]
+ )
+
+ client = env.pageserver.http_client()
+ failpoint = "find-lsn-for-timestamp-pausable"
+ client.configure_failpoints((failpoint, "pause"))
+
+ with ThreadPoolExecutor(max_workers=1) as exec:
+ # Request get_lsn_by_timestamp, hit the pausable failpoint
+ failing = exec.submit(
+ client.timeline_get_lsn_by_timestamp,
+ env.initial_tenant,
+ env.initial_timeline,
+ datetime.now(),
+ timeout=2,
+ )
+
+ _, offset = wait_until(
+ 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+ )
+
+ with pytest.raises(ReadTimeout):
+ failing.result()
+
+ client.configure_failpoints((failpoint, "off"))
+
+ _, offset = wait_until(
+ 20,
+ 0.5,
+ lambda: env.pageserver.assert_log_contains(
+ "Cancelled request finished with an error: Cancelled$", offset
+ ),
+ )
+
+
# Test pageserver get_timestamp_of_lsn API
def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
key_not_found_error = r".*could not find data for key.*"