From b953daa21fce8b41e4c4556da06279223f8f411a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Mar 2025 16:03:51 +0000 Subject: [PATCH] safekeeper: allow remote deletion to proceed after dropped requests (#11042) ## Problem If a caller times out on safekeeper timeline deletion on a large timeline, and waits a while before retrying, the deletion will not progress while the retry is waiting. The net effect is very very slow deletion as it only proceeds in 30 second bursts across 5 minute idle periods. Related: https://github.com/neondatabase/neon/issues/10265 ## Summary of changes - Run remote deletion in a background task - Carry a watch::Receiver on the Timeline for other callers to join the wait - Restart deletion if the API is called again and the previous attempt failed --- safekeeper/src/timeline.rs | 94 ++++- safekeeper/src/wal_backup.rs | 8 +- test_runner/fixtures/remote_storage.py | 11 + test_runner/fixtures/safekeeper/http.py | 3 +- test_runner/fixtures/safekeeper_utils.py | 92 +++++ .../regress/test_safekeeper_deletion.py | 331 ++++++++++++++++++ test_runner/regress/test_wal_acceptor.py | 307 +--------------- 7 files changed, 541 insertions(+), 305 deletions(-) create mode 100644 test_runner/fixtures/safekeeper_utils.py create mode 100644 test_runner/regress/test_safekeeper_deletion.py diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 930f66a207..d3c841ec09 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -415,6 +415,9 @@ impl From for ApiError { } } +/// We run remote deletion in a background task, this is how it sends its results back. +type RemoteDeletionReceiver = tokio::sync::watch::Receiver>>; + /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { @@ -446,6 +449,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + remote_deletion: std::sync::Mutex>, + /// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding /// this gate, you must respect [`Timeline::cancel`] pub(crate) gate: Gate, @@ -494,6 +499,7 @@ impl Timeline { walreceivers, gate: Default::default(), cancel: CancellationToken::default(), + remote_deletion: std::sync::Mutex::new(None), manager_ctl: ManagerCtl::new(), conf, broker_active: AtomicBool::new(false), @@ -598,15 +604,95 @@ impl Timeline { shared_state.sk.close_wal_store(); if !only_local && self.conf.is_wal_backup_enabled() { - // Note: we concurrently delete remote storage data from multiple - // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we - // do some retries anyway. - wal_backup::delete_timeline(&self.ttid).await?; + self.remote_delete().await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } + /// Delete timeline content from remote storage. If the returned future is dropped, + /// deletion will continue in the background. + /// + /// This function ordinarily spawns a task and stashes a result receiver into [`Self::remote_deletion`]. If + /// deletion is already happening, it may simply wait for an existing task's result. + /// + /// Note: we concurrently delete remote storage data from multiple + /// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we + /// do some retries anyway. + async fn remote_delete(&self) -> Result<()> { + // We will start a background task to do the deletion, so that it proceeds even if our + // API request is dropped. Future requests will see the existing deletion task and wait + // for it to complete. + let mut result_rx = { + let mut remote_deletion_state = self.remote_deletion.lock().unwrap(); + let result_rx = if let Some(result_rx) = remote_deletion_state.as_ref() { + if let Some(result) = result_rx.borrow().as_ref() { + if let Err(e) = result { + // A previous remote deletion failed: we will start a new one + tracing::error!("remote deletion failed, will retry ({e})"); + None + } else { + // A previous remote deletion call already succeeded + return Ok(()); + } + } else { + // Remote deletion is still in flight + Some(result_rx.clone()) + } + } else { + // Remote deletion was not attempted yet, start it now. + None + }; + + match result_rx { + Some(result_rx) => result_rx, + None => self.start_remote_delete(&mut remote_deletion_state), + } + }; + + // Wait for a result + let Ok(result) = result_rx.wait_for(|v| v.is_some()).await else { + // Unexpected: sender should always send a result before dropping the channel, even if it has an error + return Err(anyhow::anyhow!( + "remote deletion task future was dropped without sending a result" + )); + }; + + result + .as_ref() + .expect("We did a wait_for on this being Some above") + .as_ref() + .map(|_| ()) + .map_err(|e| anyhow::anyhow!("remote deletion failed: {e}")) + } + + /// Spawn background task to do remote deletion, return a receiver for its outcome + fn start_remote_delete( + &self, + guard: &mut std::sync::MutexGuard>, + ) -> RemoteDeletionReceiver { + tracing::info!("starting remote deletion"); + let (result_tx, result_rx) = tokio::sync::watch::channel(None); + let ttid = self.ttid; + tokio::task::spawn( + async move { + let r = wal_backup::delete_timeline(&ttid).await; + if let Err(e) = &r { + // Log error here in case nobody ever listens for our result (e.g. dropped API request) + tracing::error!("remote deletion failed: {e}"); + } + + // Ignore send results: it's legal for the Timeline to give up waiting for us. + let _ = result_tx.send(Some(r)); + } + .instrument(info_span!("remote_delete", timeline = %self.ttid)), + ); + + **guard = Some(result_rx.clone()); + + result_rx + } + /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { self.cancel.is_cancelled() diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 6176e64698..56f4a2faf9 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -21,9 +21,9 @@ use tokio::sync::{OnceCell, watch}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::backoff; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; +use utils::{backoff, pausable_failpoint}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::WalResidentTimeline; @@ -564,6 +564,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // We don't currently have http requests timeout cancellation, but if/once // we have listing should get streaming interface to make progress. + pausable_failpoint!("sk-delete-timeline-remote-pause"); + + fail::fail_point!("sk-delete-timeline-remote", |_| { + Err(anyhow::anyhow!("failpoint: sk-delete-timeline-remote")) + }); + let cancel = CancellationToken::new(); // not really used backoff::retry( || async { diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4df2b2df2b..cac84c07e7 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,6 +282,17 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def safekeeper_tenants_path(self) -> str: + return f"{self.prefix_in_bucket}" + + def safekeeper_tenant_path(self, tenant_id: TenantShardId | TenantId) -> str: + return f"{self.safekeeper_tenants_path()}/{tenant_id}" + + def safekeeper_timeline_path( + self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId + ) -> str: + return f"{self.safekeeper_tenant_path(tenant_id)}/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: """ Gets the latest generation key from a list of keys. diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 7038d87aba..e409151b76 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -229,13 +229,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): # only_local doesn't remove segments in the remote storage. def timeline_delete( - self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False, **kwargs ) -> dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params={ "only_local": str(only_local).lower(), }, + **kwargs, ) res.raise_for_status() res_json = res.json() diff --git a/test_runner/fixtures/safekeeper_utils.py b/test_runner/fixtures/safekeeper_utils.py new file mode 100644 index 0000000000..158baf7bb6 --- /dev/null +++ b/test_runner/fixtures/safekeeper_utils.py @@ -0,0 +1,92 @@ +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonPageserver, Safekeeper +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import get_dir_size + + +def is_segment_offloaded( + sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn +): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.backup_lsn >= seg_end + + +def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.flush_lsn >= lsn + + +def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") + return sk_wal_size_mb <= target_size_mb + + +def wait_lsn_force_checkpoint( + tenant_id: TenantId, + timeline_id: TimelineId, + endpoint: Endpoint, + ps: NeonPageserver, + pageserver_conn_options=None, +): + pageserver_conn_options = pageserver_conn_options or {} + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at_sk( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. + """ + pageserver_conn_options = pageserver_conn_options or {} + + auth_token = None + if "password" in pageserver_conn_options: + auth_token = pageserver_conn_options["password"] + + # wait for the pageserver to catch up + wait_for_last_record_lsn( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) + + # force checkpoint to advance remote_consistent_lsn + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) + + # ensure that remote_consistent_lsn is advanced + wait_for_upload( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py new file mode 100644 index 0000000000..b46095d583 --- /dev/null +++ b/test_runner/regress/test_safekeeper_deletion.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +import threading +import time +from contextlib import closing +from enum import StrEnum + +import pytest +import requests +from fixtures.common_types import Lsn, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnvBuilder, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.safekeeper_utils import is_segment_offloaded +from fixtures.utils import wait_until + + +@pytest.mark.parametrize("auth_enabled", [False, True]) +def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() + + # FIXME: are these expected? + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + + # Create two tenants: one will be deleted, other should be preserved. + tenant_id = env.initial_tenant + timeline_id_1 = env.create_branch("br1") # Active, delete explicitly + timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly + timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant + timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant + + tenant_id_other, timeline_id_other = env.create_tenant() + + # Populate branches + endpoint_1 = env.endpoints.create_start("br1") + endpoint_2 = env.endpoints.create_start("br2") + endpoint_3 = env.endpoints.create_start("br3") + endpoint_4 = env.endpoints.create_start("br4") + endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) + for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key)") + sk = env.safekeepers[0] + sk_data_dir = sk.data_dir + if not auth_enabled: + sk_http = sk.http_client() + sk_http_other = sk_http + else: + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + sk_http_other = sk.http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) + ) + sk_http_noauth = sk.http_client(gen_sk_wide_token=False) + assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. + endpoint_2.stop_and_destroy() + endpoint_4.stop_and_destroy() + sk.stop() + sk.start() + + # Ensure connections to Safekeeper are established + for endpoint in [endpoint_1, endpoint_3, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (1)") + + # Stop all computes gracefully before safekeepers stop responding to them + endpoint_1.stop_and_destroy() + endpoint_3.stop_and_destroy() + + # Remove initial tenant's br1 (active) + assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure repeated deletion succeeds + assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + if auth_enabled: + # Ensure we cannot delete the other tenant + for sk_h in [sk_http, sk_http_noauth]: + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.tenant_delete_force(tenant_id_other) + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant's br2 (inactive) + assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove non-existing branch, should succeed + assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant fully (two branches are active) + response = sk_http.tenant_delete_force(tenant_id) + assert response[str(timeline_id_3)]["dir_existed"] + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant again. + response = sk_http.tenant_delete_force(tenant_id) + # assert response == {} + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure the other tenant still works + sk_http_other.timeline_status(tenant_id_other, timeline_id_other) + with closing(endpoint_other.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (123)") + + +def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): + """ + Test deleting timelines on a safekeeper while they're under load. + + This should not happen under normal operation, but it can happen if + there is some rogue compute/pageserver that is writing/reading to a + safekeeper that we're migrating a timeline away from, or if the timeline + is being deleted while such a rogue client is running. + """ + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + # Create two endpoints that will generate load + timeline_id_a = env.create_branch("deleteme_a") + timeline_id_b = env.create_branch("deleteme_b") + + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + endpoint_b = env.endpoints.create("deleteme_b") + endpoint_b.start() + + # Get tenant and timeline IDs + tenant_id = env.initial_tenant + + # Start generating load on both timelines + def generate_load(endpoint: Endpoint): + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") + while True: + try: + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") + except: # noqa + # Ignore errors since timeline may be deleted + break + + t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) + t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) + try: + t_a.start() + t_b.start() + + # Let the load run for a bit + log.info("Warming up...") + time.sleep(2) + + # Safekeeper errors will propagate to the pageserver: it is correct that these are + # logged at error severity because they indicate the pageserver is trying to read + # a timeline that it shouldn't. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline.*was cancelled.*", + ".*Timeline.*was not found.*", + ] + ) + + # Try deleting timelines while under load + sk = env.safekeepers[0] + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + + # Delete first timeline + log.info(f"Deleting {timeline_id_a}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] + + # Delete second timeline + log.info(f"Deleting {timeline_id_b}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] + + # Verify timelines are gone from disk + sk_data_dir = sk.data_dir + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() + # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() + + finally: + log.info("Stopping endpoints...") + # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang + endpoint_a.stop(mode="immediate") + endpoint_b.stop(mode="immediate") + log.info("Joining threads...") + t_a.join() + t_b.join() + + +class RemoteDeleteFailpoint(StrEnum): + PAUSE = "sk-delete-timeline-remote-pause" + FAIL = "sk-delete-timeline-remote" + + +@pytest.mark.parametrize("failpoint", [RemoteDeleteFailpoint.PAUSE, RemoteDeleteFailpoint.FAIL]) +def test_safekeeper_delete_remote_errors( + neon_env_builder: NeonEnvBuilder, failpoint: RemoteDeleteFailpoint +): + """ + Test that errors and delays during remote deletion are handled correctly. + """ + + # Configure safekeepers with ultra-fast eviction policy + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--control-file-save-interval", + "1s", + ] + neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + env = neon_env_builder.init_start() + + # FIXME: pageserver is intermittently emitting this + env.pageserver.allowed_errors.extend( + [ + ".*unsupported command START_WAL_PUSH in START_WAL_PUSH.*", + ] + ) + + timeline_id_a = env.create_branch("deleteme_a") + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + with closing(endpoint_a.connect()) as conn: + with conn.cursor() as cur: + # roughly fills one segment + cur.execute("create table t(key int, value text)") + cur.execute("insert into t select generate_series(1,250000), 'payload'") + endpoint_a.stop() + + # Ensure something is uploaded to remote storage + def assert_is_uploaded(): + assert is_segment_offloaded( + env.safekeepers[0], env.initial_tenant, timeline_id_a, Lsn("0/2000000") + ) + + wait_until(assert_is_uploaded) + + def list_timeline_remote(): + assert isinstance(env.safekeepers_remote_storage, S3Storage) + prefix = f"{env.safekeepers_remote_storage.safekeeper_timeline_path(env.initial_tenant, timeline_id_a)}/" + + listing = env.safekeepers_remote_storage.client.list_objects_v2( + Bucket=env.safekeepers_remote_storage.bucket_name, + Prefix=prefix, + ) + return listing.get("Contents", []) + + assert list_timeline_remote() != [] + + sk_http = env.safekeepers[0].http_client() + env.pageserver.http_client().timeline_delete(env.initial_tenant, timeline_id_a) + + # Set up failpoint + if failpoint == RemoteDeleteFailpoint.PAUSE: + sk_http.configure_failpoints((failpoint, "pause")) + elif failpoint == RemoteDeleteFailpoint.FAIL: + sk_http.configure_failpoints((failpoint, "return")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Delete the timeline - this should hit the configured failpoint + if failpoint == RemoteDeleteFailpoint.PAUSE: + # Expect time out + with pytest.raises(requests.exceptions.ReadTimeout, match="timed out"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a, timeout=5) + + # Assert deletion didn't happy yet + assert list_timeline_remote() != [] + + # Unblock the background task that should still be running + sk_http.configure_failpoints((failpoint, "off")) + + # Expect that after unblocking, remote deletion proceeds + def assert_remote_deleted(): + assert list_timeline_remote() == [] + + wait_until(assert_remote_deleted) + + elif failpoint == RemoteDeleteFailpoint.FAIL: + # Expect immediate failure + with pytest.raises(sk_http.HTTPError, match="Internal Server Error"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + sk_http.configure_failpoints((failpoint, "off")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Retry should succeed + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + # Remote storage should be empty + assert list_timeline_remote() == [] diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8f70b460c6..0366e88389 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -27,7 +27,6 @@ from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( Endpoint, NeonEnvBuilder, - NeonPageserver, PgBin, PgProtocol, Safekeeper, @@ -38,8 +37,6 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, assert_prefix_not_empty, timeline_delete_wait_completed, - wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -55,9 +52,16 @@ from fixtures.safekeeper.http import ( TimelineCreateRequest, ) from fixtures.safekeeper.utils import wait_walreceivers_absent +from fixtures.safekeeper_utils import ( + is_flush_lsn_caught_up, + is_segment_offloaded, + is_wal_trimmed, + wait_lsn_force_checkpoint, + wait_lsn_force_checkpoint_at, + wait_lsn_force_checkpoint_at_sk, +) from fixtures.utils import ( PropagatingThread, - get_dir_size, query_scalar, run_only_on_default_postgres, skip_in_debug_build, @@ -69,68 +73,6 @@ if TYPE_CHECKING: from typing import Any, Self -def wait_lsn_force_checkpoint( - tenant_id: TenantId, - timeline_id: TimelineId, - endpoint: Endpoint, - ps: NeonPageserver, - pageserver_conn_options=None, -): - pageserver_conn_options = pageserver_conn_options or {} - lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") - - wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at_sk( - safekeeper: Safekeeper, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) - wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at( - lsn: Lsn, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - """ - Wait until pageserver receives given lsn, force checkpoint and wait for - upload, i.e. remote_consistent_lsn advancement. - """ - pageserver_conn_options = pageserver_conn_options or {} - - auth_token = None - if "password" in pageserver_conn_options: - auth_token = pageserver_conn_options["password"] - - # wait for the pageserver to catch up - wait_for_last_record_lsn( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - # force checkpoint to advance remote_consistent_lsn - ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) - - # ensure that remote_consistent_lsn is advanced - wait_for_upload( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - @dataclass class TimelineMetrics: timeline_id: TimelineId @@ -475,31 +417,6 @@ def wait(f, desc, timeout=30, wait_f=None): wait_f() -def is_segment_offloaded( - sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn -): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.backup_lsn >= seg_end - - -def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.flush_lsn >= lsn - - -def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) - sk_wal_size_mb = sk_wal_size / 1024 / 1024 - log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") - return sk_wal_size_mb <= target_size_mb - - def test_wal_backup(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 remote_storage_kind = s3_storage() @@ -1685,214 +1602,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -@pytest.mark.parametrize("auth_enabled", [False, True]) -def test_delete(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): - neon_env_builder.auth_enabled = auth_enabled - env = neon_env_builder.init_start() - - # FIXME: are these expected? - env.pageserver.allowed_errors.extend( - [ - ".*Timeline .* was not found in global map.*", - ".*Timeline .* was cancelled and cannot be used anymore.*", - ] - ) - - # Create two tenants: one will be deleted, other should be preserved. - tenant_id = env.initial_tenant - timeline_id_1 = env.create_branch("br1") # Active, delete explicitly - timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly - timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant - timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant - - tenant_id_other, timeline_id_other = env.create_tenant() - - # Populate branches - endpoint_1 = env.endpoints.create_start("br1") - endpoint_2 = env.endpoints.create_start("br2") - endpoint_3 = env.endpoints.create_start("br3") - endpoint_4 = env.endpoints.create_start("br4") - endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) - for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key)") - sk = env.safekeepers[0] - sk_data_dir = sk.data_dir - if not auth_enabled: - sk_http = sk.http_client() - sk_http_other = sk_http - else: - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - sk_http_other = sk.http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) - ) - sk_http_noauth = sk.http_client(gen_sk_wide_token=False) - assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. - endpoint_2.stop_and_destroy() - endpoint_4.stop_and_destroy() - sk.stop() - sk.start() - - # Ensure connections to Safekeeper are established - for endpoint in [endpoint_1, endpoint_3, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (1)") - - # Stop all computes gracefully before safekeepers stop responding to them - endpoint_1.stop_and_destroy() - endpoint_3.stop_and_destroy() - - # Remove initial tenant's br1 (active) - assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure repeated deletion succeeds - assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - if auth_enabled: - # Ensure we cannot delete the other tenant - for sk_h in [sk_http, sk_http_noauth]: - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.tenant_delete_force(tenant_id_other) - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant's br2 (inactive) - assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove non-existing branch, should succeed - assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant fully (two branches are active) - response = sk_http.tenant_delete_force(tenant_id) - assert response[str(timeline_id_3)]["dir_existed"] - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant again. - response = sk_http.tenant_delete_force(tenant_id) - # assert response == {} - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure the other tenant still works - sk_http_other.timeline_status(tenant_id_other, timeline_id_other) - with closing(endpoint_other.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (123)") - - -def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): - """ - Test deleting timelines on a safekeeper while they're under load. - - This should not happen under normal operation, but it can happen if - there is some rogue compute/pageserver that is writing/reading to a - safekeeper that we're migrating a timeline away from, or if the timeline - is being deleted while such a rogue client is running. - """ - neon_env_builder.auth_enabled = True - env = neon_env_builder.init_start() - - # Create two endpoints that will generate load - timeline_id_a = env.create_branch("deleteme_a") - timeline_id_b = env.create_branch("deleteme_b") - - endpoint_a = env.endpoints.create("deleteme_a") - endpoint_a.start() - endpoint_b = env.endpoints.create("deleteme_b") - endpoint_b.start() - - # Get tenant and timeline IDs - tenant_id = env.initial_tenant - - # Start generating load on both timelines - def generate_load(endpoint: Endpoint): - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") - while True: - try: - cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") - except: # noqa - # Ignore errors since timeline may be deleted - break - - t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) - t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) - try: - t_a.start() - t_b.start() - - # Let the load run for a bit - log.info("Warming up...") - time.sleep(2) - - # Safekeeper errors will propagate to the pageserver: it is correct that these are - # logged at error severity because they indicate the pageserver is trying to read - # a timeline that it shouldn't. - env.pageserver.allowed_errors.extend( - [ - ".*Timeline.*was cancelled.*", - ".*Timeline.*was not found.*", - ] - ) - - # Try deleting timelines while under load - sk = env.safekeepers[0] - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - - # Delete first timeline - log.info(f"Deleting {timeline_id_a}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] - - # Delete second timeline - log.info(f"Deleting {timeline_id_b}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] - - # Verify timelines are gone from disk - sk_data_dir = sk.data_dir - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() - # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() - - finally: - log.info("Stopping endpoints...") - # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang - endpoint_a.stop(mode="immediate") - endpoint_b.stop(mode="immediate") - log.info("Joining threads...") - t_a.join() - t_b.join() - - # Basic pull_timeline test. # When live_sk_change is False, compute is restarted to change set of # safekeepers; otherwise it is live reload.