keep track of timeline deletion status in IndexPart to prevent timeline resurrection (#3919)
Before this patch, the following sequence would lead to the resurrection of a deleted timeline:

- create timeline
- wait for its index part to reach s3
- delete timeline
- wait an arbitrary amount of time, including 0 seconds
- detach tenant
- attach tenant
- the timeline is there and Active again

This happens because we only kept track of the deletion in the tenant dir (by deleting the timeline dir) but not in S3.

The solution is to turn the deleted timeline's IndexPart into a tombstone. The deletion status of the timeline is expressed in the `deleted_at: Option<NaiveDateTime>` field of IndexPart. It is `None` while the timeline is alive and `Some(deletion timestamp)` once it has been deleted. We change the timeline deletion handler to upload this tombstoned IndexPart. The handler does not return success if the upload fails.

Coincidentally, this fixes the long-standing TODO about `std::fs::remove_dir_all` not being atomic. It need not be atomic anymore because we set `deleted_at = Some(...)` before starting the `remove_dir_all`.

The tombstone lives in the IndexPart only, not in the `metadata` file. So, we only get the tombstone and the `remove_dir_all` benefit mentioned above if remote storage is configured. This is a conscious trade-off, because there is no good format-evolution story for the current metadata file format.

Introducing this additional step into `delete_timeline` was painful because `delete_timeline` needs to be

1. cancel-safe,
2. idempotent, and
3. safe to call concurrently.

These are mostly self-inflicted limitations that can be avoided by using request coalescing; PR https://github.com/neondatabase/neon/pull/4159 will do that.

fixes https://github.com/neondatabase/neon/issues/3560
refs https://github.com/neondatabase/neon/issues/3889 (part of tenant relocation)

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
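To make the mechanism concrete, here is a minimal Rust sketch of the tombstone field and of the ordering in the deletion handler described above. This is an editor's illustration only, assuming chrono (with its serde feature) and serde; the struct, field set, and helper functions are simplified stand-ins, not the actual pageserver definitions:

use chrono::{NaiveDateTime, Utc};
use serde::{Deserialize, Serialize};
use std::path::Path;

// Simplified stand-in for the remote index_part.json contents; the real
// IndexPart also carries layer file metadata, disk_consistent_lsn, etc.
#[derive(Serialize, Deserialize)]
struct IndexPart {
    // `None` while the timeline is alive, `Some(deletion timestamp)` once it
    // has been deleted. Attach must skip timelines whose uploaded
    // index_part.json carries this tombstone.
    deleted_at: Option<NaiveDateTime>,
}

// Hypothetical outline of the deletion handler's ordering: persist the
// tombstone remotely first, then remove the local timeline directory.
// If the upload fails, the handler reports failure; if the local removal is
// interrupted, the already-uploaded tombstone prevents resurrection on the
// next attach.
fn delete_timeline(index_part: &mut IndexPart, timeline_dir: &Path) -> std::io::Result<()> {
    index_part.deleted_at = Some(Utc::now().naive_utc());
    upload_index_part(index_part)?; // must succeed before we report success
    std::fs::remove_dir_all(timeline_dir)?; // need not be atomic anymore
    Ok(())
}

// Placeholder for the remote-storage upload of index_part.json.
fn upload_index_part(_index_part: &IndexPart) -> std::io::Result<()> {
    Ok(())
}

The Python hunks below adjust the test fixtures and add regression tests that exercise this behaviour: deletion racing uploads, retries after failpoints, and the detach/attach resurrection scenario.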
@@ -314,9 +314,9 @@ class PageserverHttpClient(requests.Session):
         assert isinstance(res_json, dict)
         return res_json

-    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
+    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
         res = self.delete(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs
         )
         self.verbose_error(res)
         res_json = res.json()
@@ -87,7 +87,9 @@ def wait_until_tenant_state(

         time.sleep(period)

-    raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
+    raise Exception(
+        f"Tenant {tenant_id} did not become {expected_state} within {iterations * period} seconds"
+    )


 def wait_until_tenant_active(
@@ -143,6 +143,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     assert env.pageserver.log_contains(
         ".*WARN.*ignored .* unexpected bytes after the tar archive.*"
     )

+    # NOTE: delete can easily come before upload operations are completed
+    client.timeline_delete(tenant, timeline)

     # Importing correct backup works
@@ -1,8 +1,26 @@
+import os
+import queue
+import shutil
+import threading
+from pathlib import Path
+
 import pytest
-from fixtures.neon_fixtures import NeonEnv
+import requests
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    available_remote_storages,
+)
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId, TimelineId
-from fixtures.utils import wait_until
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload,
+    wait_until_tenant_active,
+)
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import query_scalar, wait_until


 def test_timeline_delete(neon_simple_env: NeonEnv):
@@ -39,23 +57,17 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
         "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
     )

     ps_http = env.pageserver.http_client()
+    timeline_path = (
+        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(parent_timeline_id)
+    )
+
     with pytest.raises(
         PageserverApiException, match="Cannot delete timeline which has child timelines"
     ) as exc:
-        timeline_path = (
-            env.repo_dir
-            / "tenants"
-            / str(env.initial_tenant)
-            / "timelines"
-            / str(parent_timeline_id)
-        )
-        assert timeline_path.exists()
-
         ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)

-        assert not timeline_path.exists()

     assert exc.value.status_code == 400

     timeline_path = (
@@ -87,3 +99,350 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     )

     assert exc.value.status_code == 404
+
+    # Check that we didn't pick up the timeline again after restart.
+    # See https://github.com/neondatabase/neon/issues/3560
+    env.pageserver.stop(immediate=True)
+    env.pageserver.start()
+
+    with pytest.raises(
+        PageserverApiException,
+        match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
+    ) as exc:
+        ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
+
+
+# cover the two cases: remote storage configured vs not configured
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_delete_timeline_post_rm_failure(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    If there is a failure after removing the timeline directory, the delete operation
+    should be retryable.
+    """
+
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_delete_timeline_post_rm_failure"
+        )
+
+    env = neon_env_builder.init_start()
+    assert env.initial_timeline
+
+    ps_http = env.pageserver.http_client()
+
+    failpoint_name = "timeline-delete-after-rm"
+    ps_http.configure_failpoints((failpoint_name, "return"))
+
+    with pytest.raises(PageserverApiException, match=f"failpoint: {failpoint_name}"):
+        ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+
+    at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
+    env.pageserver.allowed_errors.append(at_failpoint_log_message)
+    env.pageserver.allowed_errors.append(
+        f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}"
+    )
+
+    # retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))
+
+    # this should succeed
+    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2)
+    # the second call will try to transition the timeline into Stopping state, but it's already in that state
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*Ignoring new state, equal to the existing one: Stopping"
+    )
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
+    )
+
+
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+@pytest.mark.parametrize("fill_branch", [True, False])
+def test_timeline_resurrection_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    fill_branch: bool,
+):
+    """
+    After deleting a timeline it should never appear again.
+    This test ensures that this invariant holds for detach+attach.
+    Original issue: https://github.com/neondatabase/neon/issues/3560
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_timeline_resurrection_on_attach",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    ps_http = env.pageserver.http_client()
+    pg = env.endpoints.create_start("main")
+
+    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+    with pg.cursor() as cur:
+        cur.execute("CREATE TABLE f (i integer);")
+        cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    # wait until pageserver receives that data
+    wait_for_last_record_lsn(ps_http, tenant_id, main_timeline_id, current_lsn)
+
+    # run checkpoint manually to be sure that data landed in remote storage
+    ps_http.timeline_checkpoint(tenant_id, main_timeline_id)
+
+    # wait until pageserver successfully uploaded a checkpoint to remote storage
+    log.info("waiting for checkpoint upload")
+    wait_for_upload(ps_http, tenant_id, main_timeline_id, current_lsn)
+    log.info("upload of checkpoint is done")
+
+    branch_timeline_id = env.neon_cli.create_branch("new", "main")
+
+    # Two variants of this test:
+    # - In fill_branch=True, the deleted branch has layer files.
+    # - In fill_branch=False, it doesn't; it just has the metadata file.
+    # A broken implementation is conceivable that tries to "optimize" handling of empty branches, e.g.,
+    # by skipping IndexPart uploads if the layer file set doesn't change. That would be wrong; catch those cases.
+    if fill_branch:
+        with env.endpoints.create_start("new") as new_pg:
+            with new_pg.cursor() as cur:
+                cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+                current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+            # wait until pageserver receives that data
+            wait_for_last_record_lsn(ps_http, tenant_id, branch_timeline_id, current_lsn)
+
+            # run checkpoint manually to be sure that data landed in remote storage
+            ps_http.timeline_checkpoint(tenant_id, branch_timeline_id)
+
+            # wait until pageserver successfully uploaded a checkpoint to remote storage
+            log.info("waiting for checkpoint upload")
+            wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn)
+            log.info("upload of checkpoint is done")
+    else:
+        pass
+
+    # delete new timeline
+    ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id)
+
+    ##### Stop the pageserver instance, erase all its data
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    ##### Second start, restore the data and ensure that we see only the timeline that wasn't deleted
+    env.pageserver.start()
+
+    ps_http.tenant_attach(tenant_id=tenant_id)
+
+    wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5)
+
+    timelines = ps_http.timeline_list(tenant_id=tenant_id)
+    assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
+        main_timeline_id
+    }, "the deleted timeline should not have been resurrected"
+    assert all([tl["state"] == "Active" for tl in timelines])
+
+
+def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
+    """
+    When deleting a timeline, if we succeed in setting the deleted flag remotely
+    but fail to delete the local state, restarting the pageserver should resume
+    the deletion of the local state.
+    (Deletion of the state in S3 is not implemented yet.)
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_timeline_delete_fail_before_local_delete",
+    )
+
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm")
+    env.pageserver.allowed_errors.append(
+        ".*Ignoring new state, equal to the existing one: Stopping"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited"
+    )
+
+    ps_http = env.pageserver.http_client()
+    ps_http.configure_failpoints(("timeline-delete-before-rm", "return"))
+
+    # construct pair of branches
+    intermediate_timeline_id = env.neon_cli.create_branch(
+        "test_timeline_delete_fail_before_local_delete"
+    )
+
+    leaf_timeline_id = env.neon_cli.create_branch(
+        "test_timeline_delete_fail_before_local_delete1",
+        "test_timeline_delete_fail_before_local_delete",
+    )
+
+    leaf_timeline_path = (
+        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
+    )
+
+    with pytest.raises(
+        PageserverApiException,
+        match="failpoint: timeline-delete-before-rm",
+    ):
+        ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
+
+    assert leaf_timeline_path.exists(), "the failpoint didn't work"
+
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Wait for tenant to finish loading.
+    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=0.5)
+
+    assert (
+        not leaf_timeline_path.exists()
+    ), "timeline load procedure should have resumed the deletion interrupted by the failpoint"
+    timelines = ps_http.timeline_list(env.initial_tenant)
+    assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
+        intermediate_timeline_id,
+        env.initial_timeline,
+    }, "other timelines should not have been affected"
+    assert all([tl["state"] == "Active" for tl in timelines])
+
+
+def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    If we're stuck uploading the index file with the is_delete flag,
+    eventually console will hang up and retry.
+    If we're still stuck at the retry time, ensure that the retry
+    fails with status 500, signalling to console that it should retry
+    later.
+    Ideally, timeline_delete should return 202 Accepted and require
+    console to poll for completion, but that would require changing
+    the API contract.
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_concurrent_timeline_delete_if_first_stuck_at_index_upload",
+    )
+
+    env = neon_env_builder.init_start()
+
+    child_timeline_id = env.neon_cli.create_branch("child", "main")
+
+    ps_http = env.pageserver.http_client()
+
+    # make the first call sleep practically forever
+    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+
+    def first_call(result_queue):
+        try:
+            log.info("first call start")
+            ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10)
+            log.info("first call success")
+            result_queue.put("success")
+        except Exception:
+            log.exception("first call failed")
+            result_queue.put("failure, see log for stack trace")
+
+    first_call_result: queue.Queue[str] = queue.Queue()
+    first_call_thread = threading.Thread(target=first_call, args=(first_call_result,))
+    first_call_thread.start()
+
+    try:
+
+        def first_call_hit_failpoint():
+            assert env.pageserver.log_contains(
+                f".*{child_timeline_id}.*at failpoint {failpoint_name}"
+            )
+
+        wait_until(50, 0.1, first_call_hit_failpoint)
+
+        # make the second call and assert behavior
+        log.info("second call start")
+        error_msg_re = "another task is already setting the deleted_flag, started at"
+        with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
+            ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
+        assert second_call_err.value.status_code == 500
+        env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*")
+        # the second call will try to transition the timeline into Stopping state as well
+        env.pageserver.allowed_errors.append(
+            f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+        )
+        log.info("second call failed as expected")
+
+        # by now we know that the second call failed, let's ensure the first call will finish
+        ps_http.configure_failpoints((failpoint_name, "off"))
+
+        result = first_call_result.get()
+        assert result == "success"
+
+    finally:
+        log.info("joining first call thread")
+        # in any case, make sure the lifetime of the thread is bounded to this test
+        first_call_thread.join()
+
+
+def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
+    """
+    If the client hangs up before we start the index part upload but after we mark it
+    deleted in local memory, a subsequent delete_timeline call should be able to do
+    another delete timeline operation.
+
+    This tests cancel safety up to the given failpoint.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_delete_timeline_client_hangup",
+    )
+
+    env = neon_env_builder.init_start()
+
+    child_timeline_id = env.neon_cli.create_branch("child", "main")
+
+    ps_http = env.pageserver.http_client()
+
+    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+
+    with pytest.raises(requests.exceptions.Timeout):
+        ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
+
+    # make sure the timeout was due to the failpoint
+    at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*"
+
+    def hit_failpoint():
+        assert env.pageserver.log_contains(at_failpoint_log_message)
+
+    wait_until(50, 0.1, hit_failpoint)
+
+    # we log this error if a client hangs up
+    # might as well use it as another indicator that the test works
+    hangup_log_message = f".*DELETE.*{child_timeline_id}.*request was dropped before completing"
+    env.pageserver.allowed_errors.append(hangup_log_message)
+
+    def got_hangup_log_message():
+        assert env.pageserver.log_contains(hangup_log_message)
+
+    wait_until(50, 0.1, got_hangup_log_message)
+
+    # ok, retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))
+
+    # this should succeed
+    ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
+    # the second call will try to transition the timeline into Stopping state, but it's already in that state
+    env.pageserver.allowed_errors.append(
+        f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+    )