diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index d55823b0b7..3d863a6518 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -34,6 +34,9 @@ pub enum ApiError { #[error("Timeout")] Timeout(Cow<'static, str>), + #[error("Request cancelled")] + Cancelled, + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -74,6 +77,10 @@ impl ApiError { err.to_string(), StatusCode::REQUEST_TIMEOUT, ), + ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status( + self.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, @@ -133,6 +140,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response { ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), ApiError::ShuttingDown => info!("Shut down while processing HTTP request"), ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"), + ApiError::Cancelled => info!("Request cancelled while processing HTTP request"), _ => info!("Error processing HTTP request: {api_error:#}"), } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7fa6c35ad6..19bc88fbc7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -181,9 +181,7 @@ impl From for ApiError { PageReconstructError::MissingKey(e) => { ApiError::InternalServerError(anyhow::anyhow!("{e}")) } - PageReconstructError::Cancelled => { - ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) - } + PageReconstructError::Cancelled => ApiError::Cancelled, PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0bff4be150..336d1c3fb8 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -36,6 +36,7 @@ use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; +use utils::pausable_failpoint; use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -409,6 +410,8 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { + pausable_failpoint!("find-lsn-for-timestamp-pausable"); + let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); // We use this method to figure out the branching LSN for the new branch, but the // GC cutoff could be before the branching point and we cannot create a new branch @@ -424,6 +427,7 @@ impl Timeline { let mut found_smaller = false; let mut found_larger = false; + while low < high { if cancel.is_cancelled() { return Err(PageReconstructError::Cancelled); diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index ab9127b13e..701ab58f63 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -45,6 +45,10 @@ pub fn api_error_into_response(this: ApiError) -> Response> { err.to_string(), StatusCode::REQUEST_TIMEOUT, ), + ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index f1f96f6d5f..08bf66058a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -630,12 +630,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, timestamp: datetime, + **kwargs, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z", + **kwargs, ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 83d52d4c4c..263730a823 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,12 +1,15 @@ import re import time +from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timedelta, timezone +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import PageserverApiException -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until +from requests.exceptions import ReadTimeout # @@ -108,6 +111,52 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): assert Lsn(result["lsn"]) >= last_flush_lsn +def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): + """ + Test if cancelled pageserver get_lsn_by_timestamp request is correctly handled. + Added as an effort to improve error handling and avoid full anyhow backtrace. + """ + + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [ + ".*request was dropped before completing.*", + ".*Cancelled request finished with an error: Cancelled", + ] + ) + + client = env.pageserver.http_client() + failpoint = "find-lsn-for-timestamp-pausable" + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + # Request get_lsn_by_timestamp, hit the pausable failpoint + failing = exec.submit( + client.timeline_get_lsn_by_timestamp, + env.initial_tenant, + env.initial_timeline, + datetime.now(), + timeout=2, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + with pytest.raises(ReadTimeout): + failing.result() + + client.configure_failpoints((failpoint, "off")) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains( + "Cancelled request finished with an error: Cancelled$", offset + ), + ) + + # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): key_not_found_error = r".*could not find data for key.*"