mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
Support cancellations of timelines with hanging ondemand downloads (#12330)
In `test_layer_download_cancelled_by_config_location`, we simulate hung downloads via the `before-downloading-layer-stream-pausable` failpoint. Then, we cancel a timeline via the `location_config` endpoint. With the new default as of https://github.com/neondatabase/neon/pull/11712, we would be creating the timeline on safekeepers regardless if there have been writes or not, and it turns out the test relied on the timeline not existing on safekeepers, due to a cancellation bug: * as established before, the test makes the read path hang * the timeline cancellation function first cancels the walreceiver, and only then cancels the timeline's token * `WalIngest::new` is requesting a checkpoint, which hits the read path * at cancellation time, we'd be hanging inside the read, not seeing the cancellation of the walreceiver * the test would time out due to the hang This is probably also reproducible in the wild when there is S3 unavailabilies or bottlenecks. So we thought that it's worthwhile to fix the hang issue. The approach chosen in the end involves the `tokio::select` macro. In PR 11712, we originally punted on the test due to the hang and opted it out from the new default, but now we can use the new default. Part of https://github.com/neondatabase/neon/issues/12299
This commit is contained in:
@@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let copy_stream = replication_client.copy_both_simple(&query).await?;
|
||||
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx);
|
||||
let walingest_res = select! {
|
||||
walingest_res = walingest_future => walingest_res,
|
||||
_ = cancellation.cancelled() => {
|
||||
// We are doing reads in WalIngest::new, and those can hang as they come from the network.
|
||||
// Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
|
||||
debug!("Connection cancelled");
|
||||
return Err(WalReceiverError::Cancelled);
|
||||
},
|
||||
};
|
||||
let mut walingest = walingest_res.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
|
||||
let (format, compression) = match protocol {
|
||||
PostgresClientProtocol::Interpreted {
|
||||
|
||||
@@ -671,12 +671,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
|
||||
# On the new mode, the test runs into a cancellation issue, i.e. the walproposer can't shut down
|
||||
# as it is hang-waiting on the timeline_checkpoint call in WalIngest::new.
|
||||
neon_env_builder.storage_controller_config = {
|
||||
"timelines_onto_safekeepers": False,
|
||||
}
|
||||
|
||||
# turn off background tasks so that they don't interfere with the downloads
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
|
||||
Reference in New Issue
Block a user