refactor(pageserver): designated api error type for cancelled request (#7949)

Closes #7406. ## Problem When a `get_lsn_by_timestamp` request is cancelled, an anyhow error is exposed to handle that case, which verbosely logs the error. However, we don't benefit from having the full backtrace provided by anyhow in this case. ## Summary of changes This PR introduces a new `ApiError` type to handle errors caused by cancelled request more robustly. - A new enum variant `ApiError::Cancelled` - Currently the cancelled request is mapped to status code 500. - Need to handle this error in proxy's `http_util` as well. - Added a failpoint test to simulate cancelled `get_lsn_by_timestamp` request. Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2026-01-08 14:02:55 +00:00 · 2024-06-06 10:00:14 -04:00
parent 0a65333fff
commit 630cfbe420
6 changed files with 69 additions and 4 deletions
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -34,6 +34,9 @@ pub enum ApiError {
    #[error("Timeout")]
    Timeout(Cow<'static, str>),

+    #[error("Request cancelled")]
+    Cancelled,
+
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -74,6 +77,10 @@ impl ApiError {
                err.to_string(),
                StatusCode::REQUEST_TIMEOUT,
            ),
+            ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
+                self.to_string(),
+                StatusCode::INTERNAL_SERVER_ERROR,
+            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
@@ -133,6 +140,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
+        ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
        _ => info!("Error processing HTTP request: {api_error:#}"),
    }

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -181,9 +181,7 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::MissingKey(e) => {
                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
            }
-            PageReconstructError::Cancelled => {
-                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
-            }
+            PageReconstructError::Cancelled => ApiError::Cancelled,
            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -36,6 +36,7 @@ use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
+use utils::pausable_failpoint;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -409,6 +410,8 @@ impl Timeline {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<LsnForTimestamp, PageReconstructError> {
+        pausable_failpoint!("find-lsn-for-timestamp-pausable");
+
        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
        // We use this method to figure out the branching LSN for the new branch, but the
        // GC cutoff could be before the branching point and we cannot create a new branch
@@ -424,6 +427,7 @@ impl Timeline {

        let mut found_smaller = false;
        let mut found_larger = false;
+
        while low < high {
            if cancel.is_cancelled() {
                return Err(PageReconstructError::Cancelled);
--- a/proxy/src/serverless/http_util.rs
+++ b/proxy/src/serverless/http_util.rs
@@ -45,6 +45,10 @@ pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
            err.to_string(),
            StatusCode::REQUEST_TIMEOUT,
        ),
+        ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
+            this.to_string(),
+            StatusCode::INTERNAL_SERVER_ERROR,
+        ),
        ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
            err.to_string(),
            StatusCode::INTERNAL_SERVER_ERROR,
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -630,12 +630,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        tenant_id: Union[TenantId, TenantShardId],
        timeline_id: TimelineId,
        timestamp: datetime,
+        **kwargs,
    ):
        log.info(
            f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
        )
        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z",
+            **kwargs,
        )
        self.verbose_error(res)
        res_json = res.json()
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -1,12 +1,15 @@
 import re
 import time
+from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timedelta, timezone

+import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.utils import query_scalar
+from fixtures.utils import query_scalar, wait_until
+from requests.exceptions import ReadTimeout


 #
@@ -108,6 +111,52 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
        assert Lsn(result["lsn"]) >= last_flush_lsn


+def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
+    """
+    Test if cancelled pageserver get_lsn_by_timestamp request is correctly handled.
+    Added as an effort to improve error handling and avoid full anyhow backtrace.
+    """
+
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*request was dropped before completing.*",
+            ".*Cancelled request finished with an error: Cancelled",
+        ]
+    )
+
+    client = env.pageserver.http_client()
+    failpoint = "find-lsn-for-timestamp-pausable"
+    client.configure_failpoints((failpoint, "pause"))
+
+    with ThreadPoolExecutor(max_workers=1) as exec:
+        # Request get_lsn_by_timestamp, hit the pausable failpoint
+        failing = exec.submit(
+            client.timeline_get_lsn_by_timestamp,
+            env.initial_tenant,
+            env.initial_timeline,
+            datetime.now(),
+            timeout=2,
+        )
+
+        _, offset = wait_until(
+            20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+        )
+
+        with pytest.raises(ReadTimeout):
+            failing.result()
+
+        client.configure_failpoints((failpoint, "off"))
+
+        _, offset = wait_until(
+            20,
+            0.5,
+            lambda: env.pageserver.assert_log_contains(
+                "Cancelled request finished with an error: Cancelled$", offset
+            ),
+        )
+
+
 # Test pageserver get_timestamp_of_lsn API
 def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
    key_not_found_error = r".*could not find data for key.*"