Compare commits

...

5 Commits

Author SHA1 Message Date
Christian Schwarz
6f07fc933f notes 2024-07-02 17:33:30 +00:00
Christian Schwarz
45035ca950 Revert "toy around with different ways to shut down"
This reverts commit d359d88f51.
2024-07-02 17:33:30 +00:00
Christian Schwarz
d359d88f51 toy around with different ways to shut down 2024-07-02 17:32:19 +00:00
Christian Schwarz
03613b0017 slipped a comment in 2024-07-01 17:04:39 +00:00
Christian Schwarz
4841f93c3e fix: noisy logging when download gets cancelled during shutdown
Before this PR, during timeline shutdown, we'd occasionally see
log lines like this one:

```
2024-06-26T18:28:11.063402Z  INFO initial_size_calculation{tenant_id=$TENANT,shard_id=0000 timeline_id=$TIMELINE}:logical_size_calculation_task:get_or_maybe_download{layer=000000000000000000000000000000000000-000000067F0001A3950001C1630100000000__0000000D88265898}: layer file download failed, and caller has been cancelled: Cancelled, shutting down
Stack backtrace:
   0: <core::result::Result<T,F> as core::ops::try_trait::FromResidual<core::result::Result<core::convert::Infallible,E>>>::from_residual
             at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/result.rs:1964:27
      pageserver::tenant::remote_timeline_client::RemoteTimelineClient::download_layer_file::{{closure}}
             at /home/nonroot/pageserver/src/tenant/remote_timeline_client.rs:531:13
      pageserver::tenant::storage_layer::layer::LayerInner::download_and_init::{{closure}}
             at /home/nonroot/pageserver/src/tenant/storage_layer/layer.rs:1136:14
      pageserver::tenant::storage_layer::layer::LayerInner::download_init_and_wait::{{closure}}::{{closure}}
             at /home/nonroot/pageserver/src/tenant/storage_layer/layer.rs:1082:74
```
2024-07-01 16:44:17 +00:00
5 changed files with 15 additions and 18 deletions

View File

@@ -317,6 +317,9 @@ struct PageServerHandler {
/// to see is the number of shards divided by the number of pageservers (typically < 2),
/// or the ratio used when splitting shards (i.e. how many children created from one)
/// parent shard, where a "large" number might be ~8.
///
/// Hmm, we never remove timelines from this map, even not on remove_tenant_from_memory.
/// Shouldn't matter though.
shard_timelines: HashMap<ShardIndex, HandlerTimeline>,
}
@@ -585,7 +588,7 @@ impl PageServerHandler {
let msg = tokio::select! {
biased;
_ = self.await_connection_cancelled() => {
_ = self.await_connection_cancelled() => { // we're sensitive here, and we see prompt sensitivy in the logs
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
@@ -1176,6 +1179,7 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = match self.get_cached_timeline_for_page(req) {
// on cache hit, we don't check if timeline is active
Ok(tl) => {
set_tracing_field_shard_id(tl);
tl
@@ -1210,6 +1214,7 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
// if `timeline` is not active, but not_modified_since is < last_record_lsn, this doesn't wait
timeline,
req.request_lsn,
req.not_modified_since,
@@ -1219,6 +1224,7 @@ impl PageServerHandler {
.await?;
let page = timeline
// but if tl is cancelled, then we'd quickly return from here
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
.await?;

View File

@@ -519,7 +519,7 @@ impl RemoteTimelineClient {
local_path: &Utf8Path,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<u64> {
) -> Result<u64, DownloadError> {
let downloaded_size = {
let _unfinished_gauge_guard = self.metrics.call_begin(
&RemoteOpFileKind::Layer,

View File

@@ -1096,19 +1096,10 @@ impl LayerInner {
match rx.await {
Ok(Ok(res)) => Ok(res),
Ok(Err(e)) => {
// sleep already happened in the spawned task, if it was not cancelled
match e.downcast_ref::<remote_storage::DownloadError>() {
// If the download failed due to its cancellation token,
// propagate the cancellation error upstream.
Some(remote_storage::DownloadError::Cancelled) => {
Err(DownloadError::DownloadCancelled)
}
// FIXME: this is not embedding the error because historically it would had
// been output to compute, however that is no longer the case.
_ => Err(DownloadError::DownloadFailed),
}
}
Ok(Err(e)) => match e {
remote_storage::DownloadError::Cancelled => Err(DownloadError::DownloadCancelled),
_ => Err(DownloadError::DownloadFailed),
},
Err(_gone) => Err(DownloadError::DownloadCancelled),
}
}
@@ -1118,7 +1109,7 @@ impl LayerInner {
timeline: Arc<Timeline>,
permit: heavier_once_cell::InitPermit,
ctx: &RequestContext,
) -> anyhow::Result<Arc<DownloadedLayer>> {
) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
let result = timeline
.remote_client
.download_layer_file(

View File

@@ -883,7 +883,7 @@ impl Timeline {
// page_service.
debug_assert!(!self.shard_identity.is_key_disposable(&key));
self.timeline_get_throttle.throttle(ctx, 1).await;
self.timeline_get_throttle.throttle(ctx, 1).await; // throttling should prob be sensitive to self cancellation, but really, caller should tokio::select! us
match self.conf.get_impl {
GetImpl::Legacy => {

View File

@@ -121,7 +121,7 @@ pub(super) async fn handle_walreceiver_connection(
// prevent timeline shutdown from finishing until we have exited
let _guard = timeline.gate.enter().map_err(|e| match e {
GateError::GateClosed => WalReceiverError::ClosedGate,
GateError::GateClosed => WalReceiverError::ClosedGate, // this one might be candidate
})?;
// This function spawns a side-car task (WalReceiverConnectionPoller).
// Get its gate guard now as well.