Fix timeline creation and tenant deletion race (#6310)

Fixes the race condition between timeline creation and tenant deletion
outlined in #6255.

Related: #5914, which describes a similar race condition involving the uninit
marker file.

Fixes #6255
Author: Arpad Müller
Date: 2024-01-13 09:15:58 +01:00
Committed by: GitHub
Parent: b76454ae41
Commit: 60ced06586
3 changed files with 127 additions and 25 deletions


@@ -3197,8 +3197,17 @@ impl Tenant {
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
scopeguard::defer! {
if let Err(e) = fs::remove_file(&temp_path) {
error!("Failed to remove temporary initdb archive '{temp_path}': {e}");
}
}
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
pausable_failpoint!("before-initdb-upload");
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
@@ -3219,18 +3228,6 @@ impl Tenant {
)
.await?;
tokio::fs::remove_file(&temp_path)
.await
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// If something else already removed the file, ignore the error
Ok(())
} else {
Err(e)
}
})
.with_context(|| format!("tempfile removal {temp_path}"))?;
Ok(())
}
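
The hunk above swaps the explicit end-of-function removal for a scope guard, so the temporary initdb archive is cleaned up on every exit path, including when the creating task is cancelled and its future is dropped. Below is a minimal, self-contained sketch of that deferred, NotFound-tolerant cleanup pattern; it is not the pageserver code, and `write_temp_file` and the upload step are placeholders for illustration:

use std::io::ErrorKind;
use std::path::Path;

// Hypothetical helper standing in for "produce the temporary artifact".
fn write_temp_file(path: &Path) -> std::io::Result<()> {
    std::fs::write(path, b"initdb archive placeholder")
}

fn upload_with_cleanup(temp_path: &Path) -> std::io::Result<()> {
    write_temp_file(temp_path)?;

    // Deferred cleanup: runs when the enclosing scope is left, whether we
    // return Ok, bail out via `?`, or panic. In the async pageserver code the
    // guard also fires when the future is dropped on cancellation. Tolerate
    // NotFound so a concurrent deletion that removed the file first is fine.
    scopeguard::defer! {
        if let Err(e) = std::fs::remove_file(temp_path) {
            if e.kind() != ErrorKind::NotFound {
                eprintln!("failed to remove temp file '{}': {e}", temp_path.display());
            }
        }
    }

    // ... upload the file here; no explicit removal is needed afterwards ...
    Ok(())
}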
@@ -3283,23 +3280,18 @@ impl Tenant {
)
.await
.context("download initdb tar")?;
scopeguard::defer! {
if let Err(e) = fs::remove_file(&initdb_tar_zst_path) {
error!("Failed to remove temporary initdb archive '{initdb_tar_zst_path}': {e}");
}
}
let buf_read =
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
.await
.context("extract initdb tar")?;
tokio::fs::remove_file(&initdb_tar_zst_path)
.await
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// If something else already removed the file, ignore the error
Ok(())
} else {
Err(e)
}
})
.with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
} else {
// Temporarily init a repo to get bootstrap data; this creates a directory in the `initdb_path` path
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;


@@ -542,6 +542,7 @@ impl DeleteTenantFlow {
)
.await?;
pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-cleanup-remaining-fs-traces"

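The added `pausable_failpoint!` is what lets the new test park tenant deletion right before the remaining filesystem traces are cleaned up, driven from the test via `ps_http.configure_failpoints((..., "pause"))`. As a rough sketch only (this is not the pageserver's actual macro definition), a pause-capable failpoint can be built on the `fail` crate by moving the potentially long blocking wait onto a blocking thread so it does not stall the async executor:

// Sketch only, assuming tokio and the `fail` crate are available.
macro_rules! pausable_failpoint {
    ($name:literal) => {
        tokio::task::spawn_blocking(|| {
            // With the failpoint configured as "pause" (e.g. fail::cfg($name, "pause")),
            // this call blocks until the failpoint is reconfigured, e.g. to "off".
            fail::fail_point!($name);
        })
        .await
        .expect("spawn_blocking for pausable failpoint failed")
    };
}

async fn cleanup_remaining_fs_traces_example() {
    // The test pauses deletion here, lets the racing timeline creation get
    // cancelled, and only then allows the cleanup to proceed.
    pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
    // ... remove remaining on-disk traces of the tenant ...
}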

@@ -24,8 +24,9 @@ from fixtures.pageserver.utils import (
wait_until_tenant_state,
)
from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
from fixtures.types import TenantId
from fixtures.types import TenantId, TimelineId
from fixtures.utils import run_pg_bench_small, wait_until
from requests.exceptions import ReadTimeout
def test_tenant_delete_smoke(
@@ -553,3 +554,111 @@ def test_tenant_delete_concurrent(
# Zero tenants remain (we deleted the default tenant)
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
def test_tenant_delete_races_timeline_creation(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
"""
Validate that timeline creation executed in parallel with tenant deletion is handled correctly:
the creation is cancelled and the deletion still completes cleanly.
This is a reproducer for https://github.com/neondatabase/neon/issues/6255
"""
# The remote storage kind doesn't really matter, but we use it for the iterations calculation below
# (and there is no way to reconstruct the remote storage kind that was used)
remote_storage_kind = RemoteStorageKind.MOCK_S3
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
ps_http = env.pageserver.http_client()
tenant_id = env.initial_tenant
# Sometimes it ends with "InternalServerError(Cancelled", sometimes with "InternalServerError(Operation was cancelled"
CANCELLED_ERROR = (
".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
)
env.pageserver.allowed_errors.extend(
[
# in a lucky race with stopping, we fail to schedule any uploads, so flushing a layer fails
".*layer flush task.+: could not flush frozen layer: update_metadata_file",
# We need the http connection to close for successful reproduction
".*POST.*/timeline.* request was dropped before completing",
# Timeline creation runs into this error
CANCELLED_ERROR,
]
)
BEFORE_INITDB_UPLOAD_FAILPOINT = "before-initdb-upload"
DELETE_BEFORE_CLEANUP_FAILPOINT = "tenant-delete-before-cleanup-remaining-fs-traces-pausable"
# Wait just before the initdb upload
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause"))
def timeline_create():
try:
ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
raise RuntimeError("creation succeeded even though it shouldn't")
except ReadTimeout:
pass
Thread(target=timeline_create).start()
def hit_initdb_upload_failpoint():
assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
wait_until(100, 0.1, hit_initdb_upload_failpoint)
def creation_connection_timed_out():
assert env.pageserver.log_contains(
"POST.*/timeline.* request was dropped before completing"
)
# Wait so that we hit the timeout and the connection is dropped
# (But timeline creation still continues)
wait_until(100, 0.1, creation_connection_timed_out)
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
def tenant_delete():
ps_http.tenant_delete(tenant_id)
Thread(target=tenant_delete).start()
def deletion_arrived():
assert env.pageserver.log_contains(
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
)
wait_until(100, 0.1, deletion_arrived)
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))
# Disable the failpoint and wait for deletion to finish
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
try:
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
except PageserverApiException:
pass
# Physical deletion should have happened
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
# Ensure that creation was cancelled and that deletion didn't end up in a broken state or encounter the leftover temp file
assert env.pageserver.log_contains(CANCELLED_ERROR)
assert not env.pageserver.log_contains(
".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
)
# Zero tenants remain (we deleted the default tenant)
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0