"""Integration tests for pageserver tenant deletion.

Covers the happy path, crash-safety via failpoints, resumption after
restart/attach, and races between deletion and concurrent requests or
timeline creation.
"""

import concurrent.futures
import enum
import os
import shutil
from threading import Thread

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
    assert_prefix_not_empty,
    poll_for_remote_storage_iterations,
    tenant_delete_wait_completed,
    wait_tenant_status_404,
    wait_until_tenant_active,
    wait_until_tenant_state,
)
from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
from fixtures.types import TenantId, TimelineId
from fixtures.utils import run_pg_bench_small, wait_until
from requests.exceptions import ReadTimeout


def test_tenant_delete_smoke(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    """Basic tenant deletion: 404 for unknown tenant, then full local+remote cleanup."""
    neon_env_builder.pageserver_config_override = "test_remote_failures=1"

    remote_storage_kind = s3_storage()
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.extend(
        [
            # The deletion queue will complain when it encounters simulated S3 errors
            ".*deletion executor: DeleteObjects request failed.*",
            # lucky race with stopping from flushing a layer we fail to schedule any uploads
            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
        ]
    )

    ps_http = env.pageserver.http_client()

    # first try to delete non existing tenant
    tenant_id = TenantId.generate()
    env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
    with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"):
        ps_http.tenant_delete(tenant_id=tenant_id)

    env.neon_cli.create_tenant(
        tenant_id=tenant_id,
        conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
    )

    # Default tenant and the one we created
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2

    # create two timelines one being the parent of another
    parent = None
    for timeline in ["first", "second"]:
        timeline_id = env.neon_cli.create_branch(
            timeline, tenant_id=tenant_id, ancestor_branch_name=parent
        )
        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
            run_pg_bench_small(pg_bin, endpoint.connstr())
            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)

            assert_prefix_not_empty(
                neon_env_builder.pageserver_remote_storage,
                prefix="/".join(
                    (
                        "tenants",
                        str(tenant_id),
                    )
                ),
            )

        parent = timeline

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()

    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    # Deletion updates the tenant count: the one default tenant remains
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1


class Check(enum.Enum):
    """How a failed deletion is retried in the crash-safety test."""

    RETRY_WITHOUT_RESTART = enum.auto()
    RETRY_WITH_RESTART = enum.auto()


# Failpoints exercised by test_delete_tenant_exercise_crash_safety_failpoints,
# covering each step of the tenant (and nested timeline) deletion sequence.
FAILPOINTS = [
    "tenant-delete-before-shutdown",
    "tenant-delete-before-create-remote-mark",
    "tenant-delete-before-create-local-mark",
    "tenant-delete-before-background",
    "tenant-delete-before-polling-ongoing-deletions",
    "tenant-delete-before-cleanup-remaining-fs-traces",
    "tenant-delete-before-remove-timelines-dir",
    "tenant-delete-before-remove-deleted-mark",
    "tenant-delete-before-remove-tenant-dir",
    # Some failpoints from timeline deletion
    "timeline-delete-before-index-deleted-at",
    "timeline-delete-before-rm",
    "timeline-delete-before-index-delete",
    "timeline-delete-after-rm-dir",
]

# Failpoints that fire before the background deletion task is spawned; hitting
# one of these makes the delete HTTP request itself fail.
FAILPOINTS_BEFORE_BACKGROUND = [
    "timeline-delete-before-schedule",
    "tenant-delete-before-shutdown",
    "tenant-delete-before-create-remote-mark",
    "tenant-delete-before-create-local-mark",
    "tenant-delete-before-background",
]


def combinations():
    """Build (remote_storage_kind, failpoint, simulate_failures) parametrize tuples."""
    result = []

    remotes = available_s3_storages()

    for remote_storage_kind in remotes:
        for delete_failpoint in FAILPOINTS:
            # Simulate failures for only one type of remote storage
            # to avoid log pollution and make tests run faster
            if remote_storage_kind is RemoteStorageKind.MOCK_S3:
                simulate_failures = True
            else:
                simulate_failures = False
            result.append((remote_storage_kind, delete_failpoint, simulate_failures))
    return result


@pytest.mark.parametrize("check", list(Check))
@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
def test_delete_tenant_exercise_crash_safety_failpoints(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
    failpoint: str,
    simulate_failures: bool,
    check: Check,
    pg_bin: PgBin,
):
    """Fail deletion at each failpoint, then verify a retry (with or without
    pageserver restart) completes and leaves no local or remote traces."""
    if simulate_failures:
        neon_env_builder.pageserver_config_override = "test_remote_failures=1"

    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)

    tenant_id = env.initial_tenant

    env.pageserver.allowed_errors.extend(
        [
            # From deletion polling
            f".*NotFound: tenant {env.initial_tenant}.*",
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # We may leave some upload tasks in the queue. They're likely deletes.
            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
            # So by ignoring these instead of waiting for empty upload queue
            # we execute more distinct code paths.
            '.*stopping left-over name="remote upload".*',
        ]
    )

    if simulate_failures:
        env.pageserver.allowed_errors.append(
            # The deletion queue will complain when it encounters simulated S3 errors
            ".*deletion executor: DeleteObjects request failed.*",
        )

    ps_http = env.pageserver.http_client()

    timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
    with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
        # generate enough layers
        run_pg_bench_small(pg_bin, endpoint.connstr())
        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

        assert_prefix_not_empty(
            neon_env_builder.pageserver_remote_storage,
            prefix="/".join(
                (
                    "tenants",
                    str(tenant_id),
                )
            ),
        )

    ps_http.configure_failpoints((failpoint, "return"))

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

    # These failpoints are earlier than background task is spawned.
    # so they result in api request failure.
    if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
        with pytest.raises(PageserverApiException, match=failpoint):
            ps_http.tenant_delete(tenant_id)
    else:
        ps_http.tenant_delete(tenant_id)
        tenant_info = wait_until_tenant_state(
            pageserver_http=ps_http,
            tenant_id=tenant_id,
            expected_state="Broken",
            iterations=iterations,
        )

        reason = tenant_info["state"]["data"]["reason"]
        log.info(f"tenant broken: {reason}")

        # failpoint may not be the only error in the stack
        assert reason.endswith(f"failpoint: {failpoint}"), reason

    if check is Check.RETRY_WITH_RESTART:
        env.pageserver.restart()

        if failpoint in (
            "tenant-delete-before-shutdown",
            "tenant-delete-before-create-remote-mark",
        ):
            # Deletion was not persisted before these failpoints; the tenant
            # comes back Active after restart and we must re-issue the delete.
            wait_until_tenant_active(
                ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
            )
            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
        else:
            # Pageserver should've resumed deletion after restart.
            wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
    elif check is Check.RETRY_WITHOUT_RESTART:
        # this should succeed
        # this also checks that delete can be retried even when tenant is in Broken state
        ps_http.configure_failpoints((failpoint, "off"))

        tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)

    tenant_dir = env.pageserver.tenant_dir(tenant_id)
    # Check local is empty
    assert not tenant_dir.exists()

    # Check remote is empty
    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
        allowed_postfix="initdb.tar.zst",
    )


def test_tenant_delete_is_resumed_on_attach(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    """A deletion interrupted before index removal resumes when the tenant is
    re-attached to a pageserver with wiped local state."""
    remote_storage_kind = s3_storage()
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
    env.pageserver.allowed_errors.append(
        # lucky race with stopping from flushing a layer we fail to schedule any uploads
        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
    )

    tenant_id = env.initial_tenant

    ps_http = env.pageserver.http_client()
    # create two timelines
    for timeline in ["first", "second"]:
        timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
            run_pg_bench_small(pg_bin, endpoint.connstr())
            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)

    # sanity check, data should be there
    assert_prefix_not_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    # failpoint before we remove index_part from s3
    failpoint = "timeline-delete-before-index-delete"
    ps_http.configure_failpoints((failpoint, "return"))

    env.pageserver.allowed_errors.extend(
        (
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # From deletion polling
            f".*NotFound: tenant {env.initial_tenant}.*",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
        )
    )

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

    ps_http.tenant_delete(tenant_id)

    tenant_info = wait_until_tenant_state(
        pageserver_http=ps_http,
        tenant_id=tenant_id,
        expected_state="Broken",
        iterations=iterations,
    )

    assert_prefix_not_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    reason = tenant_info["state"]["data"]["reason"]
    # failpoint may not be the only error in the stack
    assert reason.endswith(f"failpoint: {failpoint}"), reason

    # now we stop pageserver and remove local tenant state
    env.endpoints.stop_all()
    env.pageserver.stop()

    dir_to_clear = env.pageserver.tenant_dir()
    shutil.rmtree(dir_to_clear)
    os.mkdir(dir_to_clear)

    env.pageserver.start()

    # now we call attach
    env.pageserver.tenant_attach(tenant_id=tenant_id)

    # delete should be resumed
    wait_tenant_status_404(ps_http, tenant_id, iterations)

    # we shouldn't have created the tenant dir on disk
    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()

    ps_http.deletion_queue_flush(execute=True)

    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )


def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
    """Reproduction of 2023-11-23 stuck tenants investigation"""

    # do not use default tenant/timeline creation because it would output the failpoint log message too early
    env = neon_env_builder.init_configs()
    env.start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero
            ".*Timeline got dropped without initializing, cleaning its files",
            # the response hit_pausable_failpoint_and_later_fail
            f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn",
        ]
    )

    env.pageserver.tenant_create(env.initial_tenant)

    failpoint = "flush-layer-cancel-after-writing-layer-out-pausable"
    pageserver_http.configure_failpoints((failpoint, "pause"))

    def hit_pausable_failpoint_and_later_fail():
        # Creation hangs on the paused failpoint; once deletion wins the race
        # the tenant is gone and the request fails with NotFound.
        with pytest.raises(PageserverApiException, match="NotFound: tenant"):
            pageserver_http.timeline_create(
                env.pg_version, env.initial_tenant, env.initial_timeline
            )

    def start_deletion():
        pageserver_http.tenant_delete(env.initial_tenant)

    def has_hit_failpoint():
        assert env.pageserver.log_contains(f"at failpoint {failpoint}") is not None

    def deletion_has_started_waiting_for_timelines():
        assert env.pageserver.log_contains("Waiting for timelines...") is not None

    def tenant_is_deleted():
        try:
            pageserver_http.tenant_status(env.initial_tenant)
        except PageserverApiException as e:
            assert e.status_code == 404
        else:
            raise RuntimeError("tenant was still accessible")

    creation = Thread(target=hit_pausable_failpoint_and_later_fail)
    creation.start()

    deletion = None

    try:
        wait_until(10, 1, has_hit_failpoint)

        # it should start ok, sync up with the stuck creation, then hang waiting for the timeline
        # to shut down.
        deletion = Thread(target=start_deletion)
        deletion.start()

        wait_until(10, 1, deletion_has_started_waiting_for_timelines)

        pageserver_http.configure_failpoints((failpoint, "off"))

        creation.join()
        deletion.join()

        wait_until(10, 1, tenant_is_deleted)
    finally:
        # Make sure the threads are reaped even if an assertion above failed.
        creation.join()
        if deletion is not None:
            deletion.join()


def test_tenant_delete_concurrent(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    """
    Validate that concurrent delete requests to the same tenant behave correctly:
    exactly one should succeed.

    This is a reproducer for https://github.com/neondatabase/neon/issues/5936
    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
    ps_http = env.pageserver.http_client()
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    # Populate some data
    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        run_pg_bench_small(pg_bin, endpoint.connstr())
        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

    CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken"

    env.pageserver.allowed_errors.extend(
        [
            # lucky race with stopping from flushing a layer we fail to schedule any uploads
            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
            # Errors logged from our 4xx requests
            f".*{CONFLICT_MESSAGE}.*",
        ]
    )

    BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove"
    BEFORE_RUN_FAILPOINT = "tenant-delete-before-run"

    # We will let the initial delete run until right before it would remove
    # the tenant's TenantSlot.  This pauses it in a state where the tenant
    # is visible in Stopping state, and concurrent requests should fail with 4xx.
    ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause"))

    def delete_tenant():
        return ps_http.tenant_delete(tenant_id)

    def hit_remove_failpoint():
        assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")

    def hit_run_failpoint():
        assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        background_200_req = executor.submit(delete_tenant)
        assert background_200_req.result(timeout=10).status_code == 202

        # Wait until the first request completes its work and is blocked on removing
        # the TenantSlot from tenant manager.
        wait_until(100, 0.1, hit_remove_failpoint)

        # Start another request: this should fail when it sees a tenant in Stopping state
        with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
            ps_http.tenant_delete(tenant_id)

        # Start another background request, which will pause after acquiring a TenantSlotGuard
        # but before completing.
        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause"))
        background_4xx_req = executor.submit(delete_tenant)
        wait_until(100, 0.1, hit_run_failpoint)

        # The TenantSlot is still present while the original request is hung before
        # final removal
        assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

        # Permit the original request to run to success
        ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off"))

        # Permit the duplicate background request to run to completion and fail.
        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
        with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
            background_4xx_req.result(timeout=10)

    # Physical deletion should have happened
    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    # Zero tenants remain (we deleted the default tenant)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0


def test_tenant_delete_races_timeline_creation(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    """
    Validate that timeline creation executed in parallel with deletion works correctly.

    This is a reproducer for https://github.com/neondatabase/neon/issues/6255
    """
    # The remote storage kind doesn't really matter but we use it for iterations calculation below
    # (and there is no way to reconstruct the used remote storage kind)
    remote_storage_kind = RemoteStorageKind.MOCK_S3
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
    ps_http = env.pageserver.http_client()
    tenant_id = env.initial_tenant

    # When timeline creation is cancelled by tenant deletion, it is during Tenant::shutdown(), and
    # acting on a shutdown tenant generates a 503 response (if caller retried they would later) get
    # a 404 after the tenant is fully deleted.
    CANCELLED_ERROR = (
        ".*POST.*Cancelled request finished successfully status=503 Service Unavailable"
    )

    # This can occur sometimes.
    CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"

    env.pageserver.allowed_errors.extend(
        [
            # lucky race with stopping from flushing a layer we fail to schedule any uploads
            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
            # We need the http connection close for successful reproduction
            ".*POST.*/timeline.* request was dropped before completing",
            # Timeline creation runs into this error
            CANCELLED_ERROR,
            # Timeline deletion can run into this error during deletion
            CONFLICT_MESSAGE,
            ".*tenant_delete_handler.*still waiting, taking longer than expected.*",
        ]
    )

    BEFORE_INITDB_UPLOAD_FAILPOINT = "before-initdb-upload"
    DELETE_BEFORE_CLEANUP_FAILPOINT = "tenant-delete-before-cleanup-remaining-fs-traces-pausable"

    # Wait just before the initdb upload
    ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause"))

    def timeline_create():
        try:
            ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
            raise RuntimeError("creation succeeded even though it shouldn't")
        except ReadTimeout:
            # Expected: the client connection times out while creation is
            # still paused server-side.
            pass

    Thread(target=timeline_create).start()

    def hit_initdb_upload_failpoint():
        assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")

    wait_until(100, 0.1, hit_initdb_upload_failpoint)

    def creation_connection_timed_out():
        assert env.pageserver.log_contains(
            "POST.*/timeline.* request was dropped before completing"
        )

    # Wait so that we hit the timeout and the connection is dropped
    # (But timeline creation still continues)
    wait_until(100, 0.1, creation_connection_timed_out)

    ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))

    def tenant_delete():
        def tenant_delete_inner():
            ps_http.tenant_delete(tenant_id)

        wait_until(100, 0.5, tenant_delete_inner)

    Thread(target=tenant_delete).start()

    def deletion_arrived():
        assert env.pageserver.log_contains(
            f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
        )

    wait_until(100, 0.1, deletion_arrived)

    ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))

    # Disable the failpoint and wait for deletion to finish
    ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

    tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)

    # Physical deletion should have happened
    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file
    assert env.pageserver.log_contains(CANCELLED_ERROR)
    assert not env.pageserver.log_contains(
        ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
    )

    # Zero tenants remain (we deleted the default tenant)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0