From 630cfbe4206b1e3f5439cde96b048f345dce3266 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Thu, 6 Jun 2024 10:00:14 -0400
Subject: [PATCH] refactor(pageserver): designated api error type for cancelled
 request (#7949)

Closes #7406.

## Problem

When a `get_lsn_by_timestamp` request is cancelled, an anyhow error is
exposed to handle that case, which verbosely logs the error. However, we
don't benefit from having the full backtrace provided by anyhow in this
case.

## Summary of changes

This PR introduces a new `ApiError` type to handle errors caused by
cancelled request more robustly.
-  A new enum variant `ApiError::Cancelled`
- Currently the cancelled request is mapped to status code 500.
- Need to handle this error in proxy's `http_util` as well.
- Added a failpoint test to simulate cancelled `get_lsn_by_timestamp`
request.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 libs/utils/src/http/error.rs            |  8 ++++
 pageserver/src/http/routes.rs           |  4 +-
 pageserver/src/pgdatadir_mapping.rs     |  4 ++
 proxy/src/serverless/http_util.rs       |  4 ++
 test_runner/fixtures/pageserver/http.py |  2 +
 test_runner/regress/test_lsn_mapping.py | 51 ++++++++++++++++++++++++-
 6 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index d55823b0b7..3d863a6518 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -34,6 +34,9 @@ pub enum ApiError {
     #[error("Timeout")]
     Timeout(Cow<'static, str>),
 
+    #[error("Request cancelled")]
+    Cancelled,
+
     #[error(transparent)]
     InternalServerError(anyhow::Error),
 }
@@ -74,6 +77,10 @@ impl ApiError {
                 err.to_string(),
                 StatusCode::REQUEST_TIMEOUT,
             ),
+            ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
+                self.to_string(),
+                StatusCode::INTERNAL_SERVER_ERROR,
+            ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                 err.to_string(),
                 StatusCode::INTERNAL_SERVER_ERROR,
@@ -133,6 +140,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
         ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
         ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
         ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
+        ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
         _ => info!("Error processing HTTP request: {api_error:#}"),
     }
 
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7fa6c35ad6..19bc88fbc7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -181,9 +181,7 @@ impl From<PageReconstructError> for ApiError {
             PageReconstructError::MissingKey(e) => {
                 ApiError::InternalServerError(anyhow::anyhow!("{e}"))
             }
-            PageReconstructError::Cancelled => {
-                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
-            }
+            PageReconstructError::Cancelled => ApiError::Cancelled,
             PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
             PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
         }
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 0bff4be150..336d1c3fb8 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -36,6 +36,7 @@ use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
+use utils::pausable_failpoint;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
@@ -409,6 +410,8 @@ impl Timeline {
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> Result<LsnForTimestamp, PageReconstructError> {
+        pausable_failpoint!("find-lsn-for-timestamp-pausable");
+
         let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
         // We use this method to figure out the branching LSN for the new branch, but the
         // GC cutoff could be before the branching point and we cannot create a new branch
@@ -424,6 +427,7 @@ impl Timeline {
 
         let mut found_smaller = false;
         let mut found_larger = false;
+
         while low < high {
             if cancel.is_cancelled() {
                 return Err(PageReconstructError::Cancelled);
diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs
index ab9127b13e..701ab58f63 100644
--- a/proxy/src/serverless/http_util.rs
+++ b/proxy/src/serverless/http_util.rs
@@ -45,6 +45,10 @@ pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
             err.to_string(),
             StatusCode::REQUEST_TIMEOUT,
         ),
+        ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
+            this.to_string(),
+            StatusCode::INTERNAL_SERVER_ERROR,
+        ),
         ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
             err.to_string(),
             StatusCode::INTERNAL_SERVER_ERROR,
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index f1f96f6d5f..08bf66058a 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -630,12 +630,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         timestamp: datetime,
+        **kwargs,
     ):
         log.info(
             f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
         )
         res = self.get(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z",
+            **kwargs,
         )
         self.verbose_error(res)
         res_json = res.json()
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 83d52d4c4c..263730a823 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -1,12 +1,15 @@
 import re
 import time
+from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timedelta, timezone
 
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.utils import query_scalar
+from fixtures.utils import query_scalar, wait_until
+from requests.exceptions import ReadTimeout
 
 
 #
@@ -108,6 +111,52 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
         assert Lsn(result["lsn"]) >= last_flush_lsn
 
 
+def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
+    """
+    Test if cancelled pageserver get_lsn_by_timestamp request is correctly handled.
+    Added as an effort to improve error handling and avoid full anyhow backtrace.
+    """
+
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*request was dropped before completing.*",
+            ".*Cancelled request finished with an error: Cancelled",
+        ]
+    )
+
+    client = env.pageserver.http_client()
+    failpoint = "find-lsn-for-timestamp-pausable"
+    client.configure_failpoints((failpoint, "pause"))
+
+    with ThreadPoolExecutor(max_workers=1) as exec:
+        # Request get_lsn_by_timestamp, hit the pausable failpoint
+        failing = exec.submit(
+            client.timeline_get_lsn_by_timestamp,
+            env.initial_tenant,
+            env.initial_timeline,
+            datetime.now(),
+            timeout=2,
+        )
+
+        _, offset = wait_until(
+            20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+        )
+
+        with pytest.raises(ReadTimeout):
+            failing.result()
+
+        client.configure_failpoints((failpoint, "off"))
+
+        _, offset = wait_until(
+            20,
+            0.5,
+            lambda: env.pageserver.assert_log_contains(
+                "Cancelled request finished with an error: Cancelled$", offset
+            ),
+        )
+
+
 # Test pageserver get_timestamp_of_lsn API
 def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
     key_not_found_error = r".*could not find data for key.*"