From 55b246085ea30341f2479ecfadff374a5487e74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:47:17 +0200 Subject: [PATCH] Activate timelines during unoffload (#9399) The current code has forgotten to activate timelines during unoffload, leading to inability to receive the basebackup, due to the timeline still being in loading state. ``` stderr: command failed: compute startup failed: failed to get basebackup@0/0 from pageserver postgresql://no_user@localhost:15014 Caused by: 0: db error: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading 1: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading ``` Therefore, also activate the timeline during unoffloading. Part of #8088 --- pageserver/src/http/routes.rs | 7 +++- pageserver/src/tenant.rs | 40 +++++++++++++------- test_runner/regress/test_timeline_archive.py | 17 +++++++++ 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dd403c1cef..36a6ed427b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -720,7 +720,12 @@ async fn timeline_archival_config_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant - .apply_timeline_archival_config(timeline_id, request_data.state, ctx) + .apply_timeline_archival_config( + timeline_id, + request_data.state, + state.broker_client.clone(), + ctx, + ) .await?; Ok::<_, ApiError>(()) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 20925c7fd6..689982ddd4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1554,6 +1554,7 @@ impl Tenant { async fn unoffload_timeline( self: &Arc, timeline_id: TimelineId, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); @@ -1605,25 +1606,37 @@ impl Tenant { }) .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); - if let Some(timeline) = timelines.get(&timeline_id) { - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } - info!("timeline unoffloading complete"); - Ok(Arc::clone(timeline)) - } else { + let Some(timeline) = timelines.get(&timeline_id) else { warn!("timeline not available directly after attach"); - Err(TimelineArchivalError::Other(anyhow::anyhow!( + return Err(TimelineArchivalError::Other(anyhow::anyhow!( "timeline not available directly after attach" - ))) + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); } + + // Activate the timeline (if it makes sense) + if !(timeline.is_broken() || timeline.is_stopping()) { + let background_jobs_can_start = None; + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + &ctx, + ); + } + + info!("timeline unoffloading complete"); + Ok(Arc::clone(timeline)) } pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, new_state: TimelineArchivalState, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); @@ -1664,12 +1677,13 @@ impl Tenant { Some(Arc::clone(timeline)) }; - // Second part: unarchive timeline (if needed) + // Second part: unoffload timeline (if needed) let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { // Turn offloaded timeline into a non-offloaded one - self.unoffload_timeline(timeline_id, ctx).await? + self.unoffload_timeline(timeline_id, broker_client, ctx) + .await? }; // Third part: upload new timeline archival state and block until it is present in S3 @@ -3354,7 +3368,7 @@ impl Tenant { /// Populate all Timelines' `GcInfo` with information about their children. We do not set the /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] /// - /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + /// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`]. fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 971cc57a1c..ffaed5e130 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -136,6 +136,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" ) + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -197,4 +208,10 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ) assert leaf_detail["is_archived"] is False + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + assert sum == sum_again + assert not timeline_offloaded(initial_timeline_id)