From 8e6b27bf7c5a78bd716c343f1113228d927399fb Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 17 Feb 2023 14:15:29 +0200 Subject: [PATCH] fix: avoid busy loop on replacement failure (#3613) Add an AtomicBool per RemoteLayer, use it to mark together with closed semaphore that RemoteLayer is unusable until restart or ignore+load. https://github.com/neondatabase/neon/issues/3533#issuecomment-1431481554 --- pageserver/src/tenant/layer_map.rs | 6 ++ .../src/tenant/storage_layer/remote_layer.rs | 13 ++++ pageserver/src/tenant/timeline.rs | 43 +++++++++++-- test_runner/fixtures/neon_fixtures.py | 2 + test_runner/regress/test_ondemand_download.py | 62 +++++++++++++++++++ 5 files changed, 121 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index e446e34f4e..8d7d9c6f8f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -154,6 +154,12 @@ where expected: &Arc, new: Arc, ) -> anyhow::Result>> { + fail::fail_point!("layermap-replace-notfound", |_| Ok( + // this is not what happens if an L0 layer was not found (an anyhow error), but perhaps + // that should be changed. this is good enough to show a replacement failure. + Replacement::NotFound + )); + self.layer_map.replace_historic_noflush(expected, new) } diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 51bb4dcc2a..8465a99339 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -49,6 +49,17 @@ pub struct RemoteLayer { access_stats: LayerAccessStats, pub(crate) ongoing_download: Arc, + + /// Has `LayerMap::replace` failed for this (true) or not (false). + /// + /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`. 
+ /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load) + /// unprocessable, because a LayerMap::replace failed. + /// + /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids + /// a possible fast loop between `Timeline::get_reconstruct_data` and + /// `Timeline::download_remote_layer`, which also logs. + pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool, } impl std::fmt::Debug for RemoteLayer { @@ -207,6 +218,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } @@ -228,6 +240,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 66afe2cdce..683c2cd2d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3624,14 +3624,26 @@ impl Timeline { &self, remote_layer: Arc, ) -> anyhow::Result<()> { + use std::sync::atomic::Ordering::Relaxed; + let permit = match Arc::clone(&remote_layer.ongoing_download) .acquire_owned() .await { Ok(permit) => permit, Err(_closed) => { + if remote_layer.download_replacement_failure.load(Relaxed) { + // this path will be hit often, in case there are upper retries. however + // hitting this error will prevent a busy loop between get_reconstruct_data and + // download, so an error is preferred. + // + // TODO: we really should poison the timeline, but panicking is not yet + // supported. 
Related: https://github.com/neondatabase/neon/issues/3621 + anyhow::bail!("an earlier download succeeded but LayerMap::replace failed") + } else { + info!("download of layer has already finished"); + return Ok(()); + } } }; @@ -3667,8 +3679,8 @@ impl Timeline { { use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - match updates.replace_historic(&l, new_layer) { - Ok(Replacement::Replaced { .. }) => { /* expected */ } + let failure = match updates.replace_historic(&l, new_layer) { + Ok(Replacement::Replaced { .. }) => false, Ok(Replacement::NotFound) => { // TODO: the downloaded file should probably be removed, otherwise // it will be added to the layermap on next load? we should @@ -3676,6 +3688,7 @@ impl Timeline { // // See: https://github.com/neondatabase/neon/issues/3533 error!("replacing downloaded layer into layermap failed because layer was not found"); + true } Ok(Replacement::RemovalBuffered) => { unreachable!("current implementation does not remove anything") @@ -3694,12 +3707,32 @@ impl Timeline { ?other, "replacing downloaded layer into layermap failed because another layer was found instead of expected" ); + true } Err(e) => { // this is a precondition failure, the layer filename derived // attributes didn't match up, which doesn't seem likely. - error!("replacing downloaded layer into layermap failed: {e:#?}") + error!("replacing downloaded layer into layermap failed: {e:#?}"); + true } + }; + + if failure { + // mark the remote layer permanently failed; the timeline is most + // likely unusable after this. sadly we cannot just poison the layermap + // lock with panic, because that would create an issue with shutdown. + // + // this does not change the retry semantics on failed downloads. + // + // use of Relaxed is valid because closing of the semaphore gives + // happens-before and wakes up any waiters; we write this value before + // and any waiters (or would be waiters) will load it after closing + // semaphore. 
+ // + // See: https://github.com/neondatabase/neon/issues/3533 + remote_layer + .download_replacement_failure + .store(true, Relaxed); } } updates.flush(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0620ad8a35..ca5288fa0a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1288,6 +1288,7 @@ class PageserverHttpClient(requests.Session): timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, + **kwargs, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: @@ -1298,6 +1299,7 @@ class PageserverHttpClient(requests.Session): res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params=params, + **kwargs, ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index eab9c41c57..09657470b6 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -10,6 +10,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + PageserverApiException, RemoteStorageKind, assert_tenant_status, available_remote_storages, @@ -18,6 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, wait_until, + wait_until_tenant_state, ) from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -623,3 +625,63 @@ def test_compaction_downloads_on_demand_with_image_creation( def stringify(conf: Dict[str, Any]) -> Dict[str, str]: return dict(map(lambda x: (x[0], str(x[1])), conf.items())) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ondemand_download_failure_to_replace( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + 
""" + Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking. + + See: https://github.com/neondatabase/neon/issues/3533 + """ + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_failure_to_replace", + ) + + # disable gc and compaction via default tenant config because config is lost while detaching + # so that compaction will not be the one to download the layer but the http handler is + neon_env_builder.pageserver_config_override = ( + """tenant_config={gc_period = "0s", compaction_period = "0s"}""" + ) + + env = neon_env_builder.init_start() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) + + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + # remove layers so that they will be redownloaded + pageserver_http.tenant_detach(tenant_id) + pageserver_http.tenant_attach(tenant_id) + + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) + pageserver_http.configure_failpoints(("layermap-replace-notfound", "return")) + + # requesting details with non-incremental size should trigger a download of the only layer + # this will need to be adjusted if an index for logical sizes is ever implemented + with pytest.raises(PageserverApiException): + # error message is not useful + pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2) + + actual_message = ( + ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found" + ) + assert env.pageserver.log_contains(actual_message) is not None + env.pageserver.allowed_errors.append(actual_message) + + env.pageserver.allowed_errors.append( + ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info" + ) + # this might get to run and attempt 
on-demand, but not always + env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'") + + # if the above returned, then we didn't have a livelock, and all is well