refactor(pageserver): designated api error type for cancelled request (#7949)

Closes #7406.

## Problem

When a `get_lsn_by_timestamp` request is cancelled, an anyhow error is
exposed to handle that case, which verbosely logs the error. However, we
don't benefit from having the full backtrace provided by anyhow in this
case.

## Summary of changes

This PR introduces a new `ApiError` type to handle errors caused by
cancelled request more robustly.
-  A new enum variant `ApiError::Cancelled`
- Currently the cancelled request is mapped to status code 500.
- Need to handle this error in proxy's `http_util` as well.
- Added a failpoint test to simulate cancelled `get_lsn_by_timestamp`
request.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
This commit is contained in:
Yuchen Liang
2024-06-06 10:00:14 -04:00
committed by GitHub
parent 0a65333fff
commit 630cfbe420
6 changed files with 69 additions and 4 deletions

View File

@@ -34,6 +34,9 @@ pub enum ApiError {
#[error("Timeout")]
Timeout(Cow<'static, str>),
#[error("Request cancelled")]
Cancelled,
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -74,6 +77,10 @@ impl ApiError {
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
self.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
@@ -133,6 +140,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
_ => info!("Error processing HTTP request: {api_error:#}"),
}

View File

@@ -181,9 +181,7 @@ impl From<PageReconstructError> for ApiError {
PageReconstructError::MissingKey(e) => {
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
}
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::Cancelled => ApiError::Cancelled,
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}

View File

@@ -36,6 +36,7 @@ use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -409,6 +410,8 @@ impl Timeline {
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<LsnForTimestamp, PageReconstructError> {
pausable_failpoint!("find-lsn-for-timestamp-pausable");
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
// We use this method to figure out the branching LSN for the new branch, but the
// GC cutoff could be before the branching point and we cannot create a new branch
@@ -424,6 +427,7 @@ impl Timeline {
let mut found_smaller = false;
let mut found_larger = false;
while low < high {
if cancel.is_cancelled() {
return Err(PageReconstructError::Cancelled);

View File

@@ -45,6 +45,10 @@ pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
this.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -630,12 +630,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
timestamp: datetime,
**kwargs,
):
log.info(
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
)
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z",
**kwargs,
)
self.verbose_error(res)
res_json = res.json()

View File

@@ -1,12 +1,15 @@
import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta, timezone
import pytest
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import query_scalar
from fixtures.utils import query_scalar, wait_until
from requests.exceptions import ReadTimeout
#
@@ -108,6 +111,52 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
assert Lsn(result["lsn"]) >= last_flush_lsn
def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
"""
Test if cancelled pageserver get_lsn_by_timestamp request is correctly handled.
Added as an effort to improve error handling and avoid full anyhow backtrace.
"""
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[
".*request was dropped before completing.*",
".*Cancelled request finished with an error: Cancelled",
]
)
client = env.pageserver.http_client()
failpoint = "find-lsn-for-timestamp-pausable"
client.configure_failpoints((failpoint, "pause"))
with ThreadPoolExecutor(max_workers=1) as exec:
# Request get_lsn_by_timestamp, hit the pausable failpoint
failing = exec.submit(
client.timeline_get_lsn_by_timestamp,
env.initial_tenant,
env.initial_timeline,
datetime.now(),
timeout=2,
)
_, offset = wait_until(
20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
)
with pytest.raises(ReadTimeout):
failing.result()
client.configure_failpoints((failpoint, "off"))
_, offset = wait_until(
20,
0.5,
lambda: env.pageserver.assert_log_contains(
"Cancelled request finished with an error: Cancelled$", offset
),
)
# Test pageserver get_timestamp_of_lsn API
def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
key_not_found_error = r".*could not find data for key.*"