fix: avoid busy loop on replacement failure (#3613)

Add an AtomicBool per RemoteLayer; use it together with the closed
semaphore to mark that the RemoteLayer is unusable until restart or ignore+load.

https://github.com/neondatabase/neon/issues/3533#issuecomment-1431481554
This commit is contained in:
Joonas Koivunen
2023-02-17 14:15:29 +02:00
committed by GitHub
parent ae3eff1ad2
commit 8e6b27bf7c
5 changed files with 121 additions and 5 deletions

View File

@@ -154,6 +154,12 @@ where
expected: &Arc<L>,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
fail::fail_point!("layermap-replace-notfound", |_| Ok(
// this is not what happens if an L0 layer was not found (that would be an anyhow
// error), but perhaps that should be changed. this is good enough to show a replacement failure.
Replacement::NotFound
));
self.layer_map.replace_historic_noflush(expected, new)
}

View File

@@ -49,6 +49,17 @@ pub struct RemoteLayer {
access_stats: LayerAccessStats,
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
/// Has `LayerMap::replace` failed for this (true) or not (false).
///
/// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
/// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
/// unprocessable, because a LayerMap::replace failed.
///
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
/// a possible fast loop between `Timeline::get_reconstruct_data` and
/// `Timeline::download_remote_layer`, which also logs.
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
}
impl std::fmt::Debug for RemoteLayer {
@@ -207,6 +218,7 @@ impl RemoteLayer {
file_name: fname.to_owned().into(),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
access_stats,
}
}
@@ -228,6 +240,7 @@ impl RemoteLayer {
file_name: fname.to_owned().into(),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
access_stats,
}
}

View File

@@ -3624,14 +3624,26 @@ impl Timeline {
&self,
remote_layer: Arc<RemoteLayer>,
) -> anyhow::Result<()> {
use std::sync::atomic::Ordering::Relaxed;
let permit = match Arc::clone(&remote_layer.ongoing_download)
.acquire_owned()
.await
{
Ok(permit) => permit,
Err(_closed) => {
info!("download of layer has already finished");
return Ok(());
if remote_layer.download_replacement_failure.load(Relaxed) {
// this path will be hit often, in case there are upper-level retries. however,
// hitting this error will prevent a busy loop between get_reconstruct_data and
// download, so an error is preferred.
//
// TODO: we really should poison the timeline, but panicking is not yet
// supported. Related: https://github.com/neondatabase/neon/issues/3621
anyhow::bail!("an earlier download succeeded but LayerMap::replace failed")
} else {
info!("download of layer has already finished");
return Ok(());
}
}
};
@@ -3667,8 +3679,8 @@ impl Timeline {
{
use crate::tenant::layer_map::Replacement;
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
match updates.replace_historic(&l, new_layer) {
Ok(Replacement::Replaced { .. }) => { /* expected */ }
let failure = match updates.replace_historic(&l, new_layer) {
Ok(Replacement::Replaced { .. }) => false,
Ok(Replacement::NotFound) => {
// TODO: the downloaded file should probably be removed, otherwise
// it will be added to the layermap on next load? we should
@@ -3676,6 +3688,7 @@ impl Timeline {
//
// See: https://github.com/neondatabase/neon/issues/3533
error!("replacing downloaded layer into layermap failed because layer was not found");
true
}
Ok(Replacement::RemovalBuffered) => {
unreachable!("current implementation does not remove anything")
@@ -3694,12 +3707,32 @@ impl Timeline {
?other,
"replacing downloaded layer into layermap failed because another layer was found instead of expected"
);
true
}
Err(e) => {
// this is a precondition failure, the layer filename derived
// attributes didn't match up, which doesn't seem likely.
error!("replacing downloaded layer into layermap failed: {e:#?}")
error!("replacing downloaded layer into layermap failed: {e:#?}");
true
}
};
if failure {
// mark the remote layer permanently failed; the timeline is most
// likely unusable after this. sadly we cannot just poison the layermap
// lock with panic, because that would create an issue with shutdown.
//
// this does not change the retry semantics on failed downloads.
//
// use of Relaxed is valid because closing of the semaphore gives
// happens-before and wakes up any waiters; we write this value before
// and any waiters (or would be waiters) will load it after closing
// semaphore.
//
// See: https://github.com/neondatabase/neon/issues/3533
remote_layer
.download_replacement_failure
.store(true, Relaxed);
}
}
updates.flush();

View File

@@ -1288,6 +1288,7 @@ class PageserverHttpClient(requests.Session):
timeline_id: TimelineId,
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
**kwargs,
) -> Dict[Any, Any]:
params = {}
if include_non_incremental_logical_size:
@@ -1298,6 +1299,7 @@ class PageserverHttpClient(requests.Session):
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
params=params,
**kwargs,
)
self.verbose_error(res)
res_json = res.json()

View File

@@ -10,6 +10,7 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverApiException,
RemoteStorageKind,
assert_tenant_status,
available_remote_storages,
@@ -18,6 +19,7 @@ from fixtures.neon_fixtures import (
wait_for_sk_commit_lsn_to_reach_remote_storage,
wait_for_upload,
wait_until,
wait_until_tenant_state,
)
from fixtures.types import Lsn
from fixtures.utils import query_scalar
@@ -623,3 +625,63 @@ def test_compaction_downloads_on_demand_with_image_creation(
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ondemand_download_failure_to_replace(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""
Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
See: https://github.com/neondatabase/neon/issues/3533
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ondemand_download_failure_to_replace",
)
# disable gc and compaction via default tenant config because config is lost while detaching
# so that compaction will not be the one to download the layer but the http handler is
neon_env_builder.pageserver_config_override = (
"""tenant_config={gc_period = "0s", compaction_period = "0s"}"""
)
env = neon_env_builder.init_start()
tenant_id, timeline_id = env.neon_cli.create_tenant()
env.initial_tenant = tenant_id
pageserver_http = env.pageserver.http_client()
lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
# remove layers so that they will be redownloaded
pageserver_http.tenant_detach(tenant_id)
pageserver_http.tenant_attach(tenant_id)
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
# requesting details with non-incremental size should trigger a download of the only layer
# this will need to be adjusted if an index for logical sizes is ever implemented
with pytest.raises(PageserverApiException):
# error message is not useful
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
actual_message = (
".* ERROR .*replacing downloaded layer into layermap failed because layer was not found"
)
assert env.pageserver.log_contains(actual_message) is not None
env.pageserver.allowed_errors.append(actual_message)
env.pageserver.allowed_errors.append(
".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
)
# this might get to run and attempt on-demand, but not always
env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
# if the above returned, then we didn't have a livelock, and all is well