mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
fix: long timeline create cancelled by tenant delete (#5917)
Fix the fallible vs. infallible check order with `UninitTimeline::finish_creation` so that the incomplete timeline can be removed. Currently the order of drop guard unwrapping causes uninit files to be left on pageserver, blocking the tenant deletion. Cc: #5914 Cc: #investigation-2023-11-23-stuck-tenant-deletion
This commit is contained in:
@@ -1858,6 +1858,7 @@ impl Tenant {
|
||||
});
|
||||
})
|
||||
};
|
||||
// test_long_timeline_create_then_tenant_delete is leaning on this message
|
||||
tracing::info!("Waiting for timelines...");
|
||||
while let Some(res) = js.join_next().await {
|
||||
match res {
|
||||
|
||||
@@ -2600,6 +2600,8 @@ impl Timeline {
|
||||
)
|
||||
};
|
||||
|
||||
pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
@@ -45,12 +45,20 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
let timeline_id = self.timeline_id;
|
||||
let tenant_id = self.owning_tenant.tenant_id;
|
||||
|
||||
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
|
||||
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
if self.raw_timeline.is_none() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"No timeline for initialization found for {tenant_id}/{timeline_id}"
|
||||
));
|
||||
}
|
||||
|
||||
// Check that the caller initialized disk_consistent_lsn
|
||||
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
|
||||
let new_disk_consistent_lsn = self
|
||||
.raw_timeline
|
||||
.as_ref()
|
||||
.expect("checked above")
|
||||
.0
|
||||
.get_disk_consistent_lsn();
|
||||
|
||||
anyhow::ensure!(
|
||||
new_disk_consistent_lsn.is_valid(),
|
||||
"new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
|
||||
@@ -62,6 +70,13 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
|
||||
),
|
||||
Entry::Vacant(v) => {
|
||||
// after taking here should be no fallible operations, because the drop guard will not
|
||||
// cleanup after and would block for example the tenant deletion
|
||||
let (new_timeline, uninit_mark) =
|
||||
self.raw_timeline.take().expect("already checked");
|
||||
|
||||
// this is the mutual exclusion between different retries to create the timeline;
|
||||
// this should be an assertion.
|
||||
uninit_mark.remove_uninit_mark().with_context(|| {
|
||||
format!(
|
||||
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
|
||||
@@ -70,10 +85,10 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
|
||||
new_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
Ok(new_timeline)
|
||||
}
|
||||
}
|
||||
|
||||
Ok(new_timeline)
|
||||
}
|
||||
|
||||
/// Prepares timeline data by loading it from the basebackup archive.
|
||||
|
||||
@@ -1646,7 +1646,7 @@ class NeonPageserver(PgProtocol):
|
||||
# env.pageserver.allowed_errors.append(".*could not open garage door.*")
|
||||
#
|
||||
# The entries in the list are regular experessions.
|
||||
self.allowed_errors = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
|
||||
self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
|
||||
|
||||
def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
|
||||
"""Get a timeline directory's path based on the repo directory of the test environment"""
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import enum
|
||||
import os
|
||||
import shutil
|
||||
from threading import Thread
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
@@ -27,7 +28,7 @@ from fixtures.remote_storage import (
|
||||
available_s3_storages,
|
||||
)
|
||||
from fixtures.types import TenantId
|
||||
from fixtures.utils import run_pg_bench_small
|
||||
from fixtures.utils import run_pg_bench_small, wait_until
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
|
||||
@@ -399,4 +400,78 @@ def test_tenant_delete_is_resumed_on_attach(
|
||||
)
|
||||
|
||||
|
||||
def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
|
||||
"""Reproduction of 2023-11-23 stuck tenants investigation"""
|
||||
|
||||
# do not use default tenant/timeline creation because it would output the failpoint log message too early
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
# happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*Timeline got dropped without initializing, cleaning its files"
|
||||
)
|
||||
# the response hit_pausable_failpoint_and_later_fail
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn"
|
||||
)
|
||||
|
||||
pageserver_http.tenant_create(env.initial_tenant)
|
||||
|
||||
failpoint = "flush-layer-cancel-after-writing-layer-out-pausable"
|
||||
pageserver_http.configure_failpoints((failpoint, "pause"))
|
||||
|
||||
def hit_pausable_failpoint_and_later_fail():
|
||||
with pytest.raises(
|
||||
PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn"
|
||||
):
|
||||
pageserver_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, env.initial_timeline
|
||||
)
|
||||
|
||||
def start_deletion():
|
||||
pageserver_http.tenant_delete(env.initial_tenant)
|
||||
|
||||
def has_hit_failpoint():
|
||||
assert env.pageserver.log_contains(f"at failpoint {failpoint}") is not None
|
||||
|
||||
def deletion_has_started_waiting_for_timelines():
|
||||
assert env.pageserver.log_contains("Waiting for timelines...") is not None
|
||||
|
||||
def tenant_is_deleted():
|
||||
try:
|
||||
pageserver_http.tenant_status(env.initial_tenant)
|
||||
except PageserverApiException as e:
|
||||
assert e.status_code == 404
|
||||
else:
|
||||
raise RuntimeError("tenant was still accessible")
|
||||
|
||||
creation = Thread(target=hit_pausable_failpoint_and_later_fail)
|
||||
creation.start()
|
||||
|
||||
deletion = None
|
||||
|
||||
try:
|
||||
wait_until(10, 1, has_hit_failpoint)
|
||||
|
||||
# it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated
|
||||
# then deletion should fail and set the tenant broken
|
||||
deletion = Thread(target=start_deletion)
|
||||
deletion.start()
|
||||
|
||||
wait_until(10, 1, deletion_has_started_waiting_for_timelines)
|
||||
|
||||
pageserver_http.configure_failpoints((failpoint, "off"))
|
||||
|
||||
creation.join()
|
||||
deletion.join()
|
||||
|
||||
wait_until(10, 1, tenant_is_deleted)
|
||||
finally:
|
||||
creation.join()
|
||||
if deletion is not None:
|
||||
deletion.join()
|
||||
|
||||
|
||||
# TODO test concurrent deletions with "hang" failpoint
|
||||
|
||||
Reference in New Issue
Block a user