mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 13:32:57 +00:00
fix: avoid busy loop on replacement failure (#3613)
Add an AtomicBool per RemoteLayer, use it to mark together with closed semaphore that remotelayer is unusable until restart or ignore+load. https://github.com/neondatabase/neon/issues/3533#issuecomment-1431481554
This commit is contained in:
@@ -154,6 +154,12 @@ where
|
||||
expected: &Arc<L>,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
fail::fail_point!("layermap-replace-notfound", |_| Ok(
|
||||
// this is not what happens if an L0 layer was not found a anyhow error but perhaps
|
||||
// that should be changed. this is good enough to show a replacement failure.
|
||||
Replacement::NotFound
|
||||
));
|
||||
|
||||
self.layer_map.replace_historic_noflush(expected, new)
|
||||
}
|
||||
|
||||
|
||||
@@ -49,6 +49,17 @@ pub struct RemoteLayer {
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// Has `LayerMap::replace` failed for this (true) or not (false).
|
||||
///
|
||||
/// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
|
||||
/// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
|
||||
/// unprocessable, because a LayerMap::replace failed.
|
||||
///
|
||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
||||
/// `Timeline::download_remote_layer`, which also logs.
|
||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for RemoteLayer {
|
||||
@@ -207,6 +218,7 @@ impl RemoteLayer {
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
@@ -228,6 +240,7 @@ impl RemoteLayer {
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3624,14 +3624,26 @@ impl Timeline {
|
||||
&self,
|
||||
remote_layer: Arc<RemoteLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::sync::atomic::Ordering::Relaxed;
|
||||
|
||||
let permit = match Arc::clone(&remote_layer.ongoing_download)
|
||||
.acquire_owned()
|
||||
.await
|
||||
{
|
||||
Ok(permit) => permit,
|
||||
Err(_closed) => {
|
||||
info!("download of layer has already finished");
|
||||
return Ok(());
|
||||
if remote_layer.download_replacement_failure.load(Relaxed) {
|
||||
// this path will be hit often, in case there are upper retries. however
|
||||
// hitting this error will prevent a busy loop between get_reconstruct_data and
|
||||
// download, so an error is prefered.
|
||||
//
|
||||
// TODO: we really should poison the timeline, but panicking is not yet
|
||||
// supported. Related: https://github.com/neondatabase/neon/issues/3621
|
||||
anyhow::bail!("an earlier download succeeded but LayerMap::replace failed")
|
||||
} else {
|
||||
info!("download of layer has already finished");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3667,8 +3679,8 @@ impl Timeline {
|
||||
{
|
||||
use crate::tenant::layer_map::Replacement;
|
||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||
match updates.replace_historic(&l, new_layer) {
|
||||
Ok(Replacement::Replaced { .. }) => { /* expected */ }
|
||||
let failure = match updates.replace_historic(&l, new_layer) {
|
||||
Ok(Replacement::Replaced { .. }) => false,
|
||||
Ok(Replacement::NotFound) => {
|
||||
// TODO: the downloaded file should probably be removed, otherwise
|
||||
// it will be added to the layermap on next load? we should
|
||||
@@ -3676,6 +3688,7 @@ impl Timeline {
|
||||
//
|
||||
// See: https://github.com/neondatabase/neon/issues/3533
|
||||
error!("replacing downloaded layer into layermap failed because layer was not found");
|
||||
true
|
||||
}
|
||||
Ok(Replacement::RemovalBuffered) => {
|
||||
unreachable!("current implementation does not remove anything")
|
||||
@@ -3694,12 +3707,32 @@ impl Timeline {
|
||||
?other,
|
||||
"replacing downloaded layer into layermap failed because another layer was found instead of expected"
|
||||
);
|
||||
true
|
||||
}
|
||||
Err(e) => {
|
||||
// this is a precondition failure, the layer filename derived
|
||||
// attributes didn't match up, which doesn't seem likely.
|
||||
error!("replacing downloaded layer into layermap failed: {e:#?}")
|
||||
error!("replacing downloaded layer into layermap failed: {e:#?}");
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
if failure {
|
||||
// mark the remote layer permanently failed; the timeline is most
|
||||
// likely unusable after this. sadly we cannot just poison the layermap
|
||||
// lock with panic, because that would create an issue with shutdown.
|
||||
//
|
||||
// this does not change the retry semantics on failed downloads.
|
||||
//
|
||||
// use of Relaxed is valid because closing of the semaphore gives
|
||||
// happens-before and wakes up any waiters; we write this value before
|
||||
// and any waiters (or would be waiters) will load it after closing
|
||||
// semaphore.
|
||||
//
|
||||
// See: https://github.com/neondatabase/neon/issues/3533
|
||||
remote_layer
|
||||
.download_replacement_failure
|
||||
.store(true, Relaxed);
|
||||
}
|
||||
}
|
||||
updates.flush();
|
||||
|
||||
@@ -1288,6 +1288,7 @@ class PageserverHttpClient(requests.Session):
|
||||
timeline_id: TimelineId,
|
||||
include_non_incremental_logical_size: bool = False,
|
||||
include_timeline_dir_layer_file_size_sum: bool = False,
|
||||
**kwargs,
|
||||
) -> Dict[Any, Any]:
|
||||
params = {}
|
||||
if include_non_incremental_logical_size:
|
||||
@@ -1298,6 +1299,7 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
params=params,
|
||||
**kwargs,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
|
||||
@@ -10,6 +10,7 @@ import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PageserverApiException,
|
||||
RemoteStorageKind,
|
||||
assert_tenant_status,
|
||||
available_remote_storages,
|
||||
@@ -18,6 +19,7 @@ from fixtures.neon_fixtures import (
|
||||
wait_for_sk_commit_lsn_to_reach_remote_storage,
|
||||
wait_for_upload,
|
||||
wait_until,
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.types import Lsn
|
||||
from fixtures.utils import query_scalar
|
||||
@@ -623,3 +625,63 @@ def test_compaction_downloads_on_demand_with_image_creation(
|
||||
|
||||
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
|
||||
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_ondemand_download_failure_to_replace(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
"""
|
||||
Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
|
||||
|
||||
See: https://github.com/neondatabase/neon/issues/3533
|
||||
"""
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ondemand_download_failure_to_replace",
|
||||
)
|
||||
|
||||
# disable gc and compaction via default tenant config because config is lost while detaching
|
||||
# so that compaction will not be the one to download the layer but the http handler is
|
||||
neon_env_builder.pageserver_config_override = (
|
||||
"""tenant_config={gc_period = "0s", compaction_period = "0s"}"""
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
|
||||
env.initial_tenant = tenant_id
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
|
||||
|
||||
wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
|
||||
|
||||
# remove layers so that they will be redownloaded
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
|
||||
pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
|
||||
|
||||
# requesting details with non-incremental size should trigger a download of the only layer
|
||||
# this will need to be adjusted if an index for logical sizes is ever implemented
|
||||
with pytest.raises(PageserverApiException):
|
||||
# error message is not useful
|
||||
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
|
||||
|
||||
actual_message = (
|
||||
".* ERROR .*replacing downloaded layer into layermap failed because layer was not found"
|
||||
)
|
||||
assert env.pageserver.log_contains(actual_message) is not None
|
||||
env.pageserver.allowed_errors.append(actual_message)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
|
||||
)
|
||||
# this might get to run and attempt on-demand, but not always
|
||||
env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
|
||||
|
||||
# if the above returned, then we didn't have a livelock, and all is well
|
||||
|
||||
Reference in New Issue
Block a user