diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 52dfbc2857..367122362e 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -215,7 +215,7 @@ use std::sync::{Arc, Mutex}; use remote_storage::{DownloadError, GenericRemoteStorage}; use std::ops::DerefMut; use tokio::runtime::Runtime; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, info, warn, Span}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; @@ -650,6 +650,22 @@ impl RemoteTimelineClient { stopped.last_uploaded_index_part.clone() }; + #[cfg(feature = "testing")] + tokio::task::spawn_blocking({ + let current = Span::current(); + move || { + let _entered = current.entered(); + tracing::info!( + "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ); + fail::fail_point!( + "persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ); + } + }) + .await + .expect("spawn_blocking"); + upload::upload_index_part( self.conf, &self.storage_impl, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 69042478c7..fe6da9f774 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -311,9 +311,9 @@ class PageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): + def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs): res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 028e104b16..3098e6cefc 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -2,6 +2,7 @@ # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... import os +import queue import shutil import threading import time @@ -906,4 +907,84 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild assert all([tl["state"] == "Active" for tl in timelines]) +def test_concurrent_timeline_delete_if_first_stuck_at_index_upload( + neon_env_builder: NeonEnvBuilder, +): + """ + If we're stuck uploading the index file with the is_delete flag, + eventually console will hand up and retry. + If we're still stuck at the retry time, ensure that the retry + fails with status 500, signalling to console that it should retry + later. + Ideally, timeline_delete should return 202 Accepted and require + console to poll for completion, but, that would require changing + the API contract. + """ + + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + test_name="test_concurrent_timeline_delete_if_first_stuck_at_index_upload", + ) + + env = neon_env_builder.init_start() + + child_timeline_id = env.neon_cli.create_branch("child", "main") + + ps_http = env.pageserver.http_client() + + # make the first call sleep practically forever + failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ps_http.configure_failpoints((failpoint_name, "pause")) + + def first_call(result_queue): + try: + log.info("first call start") + ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10) + log.info("first call success") + result_queue.put("success") + except Exception: + log.exception("first call failed") + result_queue.put("failure, see log for stack trace") + + first_call_result: queue.Queue[str] = queue.Queue() + first_call_thread = threading.Thread(target=first_call, args=(first_call_result,)) + first_call_thread.start() + + try: + + def first_call_hit_failpoint(): + assert env.pageserver.log_contains( + f".*{child_timeline_id}.*at failpoint {failpoint_name}" + ) + + wait_until(50, 0.1, first_call_hit_failpoint) + + # make the second call and assert behavior + log.info("second call start") + with pytest.raises( + PageserverApiException, match="timeline is deleting, deleted_at" + ) as second_call_err: + ps_http.timeline_delete(env.initial_tenant, child_timeline_id) + assert second_call_err.value.status_code == 500 + env.pageserver.allowed_errors.append( + f".*{child_timeline_id}.*timeline is deleting, deleted_at: .*" + ) + # the second call will try to transition the timeline into Stopping state as well + env.pageserver.allowed_errors.append( + f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" + ) + log.info("second call failed as expected") + + # by now we know that the second call failed, let's ensure the first call will finish + ps_http.configure_failpoints((failpoint_name, "off")) + + result = first_call_result.get() + assert result == "success" + + finally: + log.info("joining first call thread") + # in any case, make sure the lifetime of the thread is bounded to this test + first_call_thread.join() + + # TODO Test that we correctly handle GC of files that are stuck in upload queue.