Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-13 08:22:55 +00:00
pageserver: run all Rust tests with remote storage enabled (#5164)
For [#5086](https://github.com/neondatabase/neon/pull/5086#issuecomment-1701331777) we will require remote storage to be configured in the pageserver. This PR enables `localfs`-based remote storage for all Rust unit tests.

Changes:
- In `TenantHarness`, set up localfs remote storage for the tenant.
- `create_test_timeline` should mimic what real timeline creation does, and real timeline creation waits for the timeline to reach remote storage. With this PR, `create_test_timeline` now does that as well.
- All the places that create the harness tenant twice need to shut down the tenant before re-creating it through a second call to `try_load` or `load`. Without shutting down, upload tasks initiated by the first incarnation of the harness tenant might still be ongoing when the second incarnation is `try_load`/`load`ed. That doesn't make sense in the tests that do this; they generally try to set up a scenario similar to a pageserver stop & start.
- One test recreates a timeline rather than the tenant. For that case, I needed a `Timeline::shutdown` method; it is a refactoring of the existing `Tenant::shutdown` method.
- The remote_timeline_client tests previously set up their own `GenericRemoteStorage` and `RemoteTimelineClient`. Now they reuse the ones pre-created by the `TenantHarness`. Some assertions needed adjusting because they must now account for the initial image layer that `create_test_timeline` creates.
Committed by GitHub
parent aa22000e67
commit cfc0fb573d
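The recurring pattern in the test changes above — drain the first tenant incarnation's upload queue before loading a second one — can be illustrated outside the pageserver codebase. The following is a minimal sketch, not pageserver code: `FakeUploadQueue`, `schedule_upload`, and `wait_completion` are hypothetical stand-ins that only mimic the drain-before-reload idea (it assumes a `tokio` dependency with the `rt`, `macros`, and `time` features).

```rust
use std::sync::Arc;
use std::time::Duration;

use tokio::sync::Mutex;
use tokio::task::JoinSet;

/// Stand-in for an upload queue whose tasks keep running in the background.
#[derive(Default)]
struct FakeUploadQueue {
    tasks: Mutex<JoinSet<()>>,
}

impl FakeUploadQueue {
    /// Simulate scheduling an async "upload" to localfs-backed remote storage.
    async fn schedule_upload(&self) {
        self.tasks.lock().await.spawn(async {
            tokio::time::sleep(Duration::from_millis(10)).await;
        });
    }

    /// Analogous in spirit to waiting for the upload queue to drain:
    /// block until every scheduled task has finished.
    async fn wait_completion(&self) {
        let mut tasks = self.tasks.lock().await;
        while tasks.join_next().await.is_some() {}
    }
}

#[tokio::main]
async fn main() {
    // "First incarnation" of the harness tenant schedules some uploads.
    let first = Arc::new(FakeUploadQueue::default());
    first.schedule_upload().await;

    // Shut down / drain before re-creating: otherwise the second incarnation
    // could race with uploads still in flight from the first one.
    first.wait_completion().await;

    // Only now is it safe to "load" a second incarnation.
    let _second = Arc::new(FakeUploadQueue::default());
    println!("first incarnation drained; safe to load again");
}
```

In the actual tests below, the same effect is achieved by calling `tenant.shutdown(...)` (or the new `Timeline::shutdown`) before the second `harness.load()` / `harness.try_load()`.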
@@ -195,7 +195,7 @@ pub struct Tenant {
    walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,

    // provides access to timeline data sitting in the remote storage
-    remote_storage: Option<GenericRemoteStorage>,
+    pub(crate) remote_storage: Option<GenericRemoteStorage>,

    /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -1517,6 +1517,15 @@ impl Tenant {
        tline.maybe_spawn_flush_loop();
        tline.freeze_and_flush().await.context("freeze_and_flush")?;

+        // Make sure the freeze_and_flush reaches remote storage.
+        tline
+            .remote_client
+            .as_ref()
+            .unwrap()
+            .wait_completion()
+            .await
+            .unwrap();
+
        let tl = uninit_tl.finish_creation()?;
        // The non-test code would call tl.activate() here.
        tl.set_state(TimelineState::Active);
@@ -1693,65 +1702,6 @@ impl Tenant {
        Ok(())
    }

-    /// Flush all in-memory data to disk and remote storage, if any.
-    ///
-    /// Used at graceful shutdown.
-    async fn freeze_and_flush_on_shutdown(&self) {
-        let mut js = tokio::task::JoinSet::new();
-
-        // execute on each timeline on the JoinSet, join after.
-        let per_timeline = |timeline_id: TimelineId, timeline: Arc<Timeline>| {
-            async move {
-                debug_assert_current_span_has_tenant_and_timeline_id();
-
-                match timeline.freeze_and_flush().await {
-                    Ok(()) => {}
-                    Err(e) => {
-                        warn!("failed to freeze and flush: {e:#}");
-                        return;
-                    }
-                }
-
-                let res = if let Some(client) = timeline.remote_client.as_ref() {
-                    // if we did not wait for completion here, it might be our shutdown process
-                    // didn't wait for remote uploads to complete at all, as new tasks can forever
-                    // be spawned.
-                    //
-                    // what is problematic is the shutting down of RemoteTimelineClient, because
-                    // obviously it does not make sense to stop while we wait for it, but what
-                    // about corner cases like s3 suddenly hanging up?
-                    client.wait_completion().await
-                } else {
-                    Ok(())
-                };
-
-                if let Err(e) = res {
-                    warn!("failed to await for frozen and flushed uploads: {e:#}");
-                }
-            }
-            .instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id))
-        };
-
-        {
-            let timelines = self.timelines.lock().unwrap();
-            timelines
-                .iter()
-                .map(|(id, tl)| (*id, Arc::clone(tl)))
-                .for_each(|(timeline_id, timeline)| {
-                    js.spawn(per_timeline(timeline_id, timeline));
-                })
-        };
-
-        while let Some(res) = js.join_next().await {
-            match res {
-                Ok(()) => {}
-                Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
-                Err(je) if je.is_panic() => { /* logged already */ }
-                Err(je) => warn!("unexpected JoinError: {je:?}"),
-            }
-        }
-    }
-
    pub fn current_state(&self) -> TenantState {
        self.state.borrow().clone()
    }
@@ -1882,19 +1832,22 @@ impl Tenant {
            }
        };

-        if freeze_and_flush {
-            // walreceiver has already began to shutdown with TenantState::Stopping, but we need to
-            // await for them to stop.
-            task_mgr::shutdown_tasks(
-                Some(TaskKind::WalReceiverManager),
-                Some(self.tenant_id),
-                None,
-            )
-            .await;
-
-            // this will wait for uploads to complete; in the past, it was done outside tenant
-            // shutdown in pageserver::shutdown_pageserver.
-            self.freeze_and_flush_on_shutdown().await;
-        }
+        let mut js = tokio::task::JoinSet::new();
+        {
+            let timelines = self.timelines.lock().unwrap();
+            timelines.values().for_each(|timeline| {
+                let timeline = Arc::clone(timeline);
+                let span = Span::current();
+                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
+            })
+        };
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok(()) => {}
+                Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
+                Err(je) if je.is_panic() => { /* logged already */ }
+                Err(je) => warn!("unexpected JoinError: {je:?}"),
+            }
+        }

        // shutdown all tenant and timeline tasks: gc, compaction, page service
@@ -3467,6 +3420,8 @@ pub mod harness {
        pub tenant_conf: TenantConf,
        pub tenant_id: TenantId,
        pub generation: Generation,
+        remote_storage: GenericRemoteStorage,
+        pub remote_fs_dir: PathBuf,
    }

    static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3504,29 +3459,39 @@ pub mod harness {
            fs::create_dir_all(conf.tenant_path(&tenant_id))?;
            fs::create_dir_all(conf.timelines_path(&tenant_id))?;
+
+            use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
+            let remote_fs_dir = conf.workdir.join("localfs");
+            std::fs::create_dir_all(&remote_fs_dir).unwrap();
+            let config = RemoteStorageConfig {
+                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
+                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
+                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            };
+            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();

            Ok(Self {
                conf,
                tenant_conf,
                tenant_id,
                generation: Generation::new(0xdeadbeef),
+                remote_storage,
+                remote_fs_dir,
            })
        }

        pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
            (
-                self.try_load(&ctx, None)
+                self.try_load(&ctx)
                    .await
                    .expect("failed to load test tenant"),
                ctx,
            )
        }

-        pub async fn try_load(
-            &self,
-            ctx: &RequestContext,
-            remote_storage: Option<remote_storage::GenericRemoteStorage>,
-        ) -> anyhow::Result<Arc<Tenant>> {
+        pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
            let walredo_mgr = Arc::new(TestRedoManager);

            let tenant = Arc::new(Tenant::new(
@@ -3536,7 +3501,7 @@ pub mod harness {
                walredo_mgr,
                self.tenant_id,
                self.generation,
-                remote_storage,
+                Some(self.remote_storage.clone()),
            ));
            tenant
                .load(None, ctx)
@@ -4004,6 +3969,13 @@ mod tests {
                .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
                .await?;
            make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
+            // so that all uploads finish & we can call harness.load() below again
+            tenant
+                .shutdown(Default::default(), true)
+                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
+                .await
+                .ok()
+                .unwrap();
        }

        let (tenant, _ctx) = harness.load().await;
@@ -4037,6 +4009,14 @@ mod tests {
                .expect("Should have a local timeline");

            make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+
+            // so that all uploads finish & we can call harness.load() below again
+            tenant
+                .shutdown(Default::default(), true)
+                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
+                .await
+                .ok()
+                .unwrap();
        }

        // check that both of them are initially unloaded
@@ -4089,6 +4069,13 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        drop(tline);
+        // so that all uploads finish & we can call harness.try_load() below again
+        tenant
+            .shutdown(Default::default(), true)
+            .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
+            .await
+            .ok()
+            .unwrap();
        drop(tenant);

        let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
@@ -4100,11 +4087,7 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness
-            .try_load(&ctx, None)
-            .await
-            .err()
-            .expect("should fail");
+        let err = harness.try_load(&ctx).await.err().expect("should fail");
        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
        let expected = "failed to load metadata";
@@ -4558,6 +4541,11 @@ mod tests {
            let tline =
                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
            // Keeps uninit mark in place
+            let raw_tline = tline.raw_timeline().unwrap();
+            raw_tline
+                .shutdown(false)
+                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
+                .await;
            std::mem::forget(tline);
        }

@@ -1468,11 +1468,8 @@ mod tests {
        },
        DEFAULT_PG_VERSION,
    };
-    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
-    use std::{
-        collections::HashSet,
-        path::{Path, PathBuf},
-    };
+    use std::{collections::HashSet, path::Path};
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1529,8 +1526,6 @@ mod tests {
        tenant: Arc<Tenant>,
        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
-        remote_fs_dir: PathBuf,
-        client: Arc<RemoteTimelineClient>,
    }

    impl TestSetup {
@@ -1540,52 +1535,15 @@ mod tests {
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = harness.load().await;

            // create an empty timeline directory
            let timeline = tenant
                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
                .await?;

-            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
-            std::fs::create_dir_all(remote_fs_dir)?;
-            let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
-
-            let storage_config = RemoteStorageConfig {
-                max_concurrent_syncs: std::num::NonZeroUsize::new(
-                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-                )
-                .unwrap(),
-                max_sync_errors: std::num::NonZeroU32::new(
-                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-                )
-                .unwrap(),
-                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
-            };
-
-            let generation = Generation::new(0xdeadbeef);
-
-            let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
-
-            let client = Arc::new(RemoteTimelineClient {
-                conf: harness.conf,
-                runtime: tokio::runtime::Handle::current(),
-                tenant_id: harness.tenant_id,
-                timeline_id: TIMELINE_ID,
-                generation,
-                storage_impl: storage,
-                upload_queue: Mutex::new(UploadQueue::Uninitialized),
-                metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                    &harness.tenant_id,
-                    &TIMELINE_ID,
-                )),
-            });
-
            Ok(Self {
                harness,
                tenant,
                timeline,
                tenant_ctx: ctx,
-                remote_fs_dir,
-                client,
            })
        }
    }
@@ -1610,26 +1568,37 @@ mod tests {
        let TestSetup {
            harness,
            tenant: _tenant,
-            timeline: _timeline,
+            timeline,
            tenant_ctx: _tenant_ctx,
-            remote_fs_dir,
-            client,
        } = TestSetup::new("upload_scheduling").await.unwrap();

+        let client = timeline.remote_client.as_ref().unwrap();
+
+        // Download back the index.json, and check that the list of files is correct
+        let initial_index_part = match client.download_index_file().await.unwrap() {
+            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
+        };
+        let initial_layers = initial_index_part
+            .layer_metadata
+            .keys()
+            .map(|f| f.to_owned())
+            .collect::<HashSet<LayerFileName>>();
+        let initial_layer = {
+            assert!(initial_layers.len() == 1);
+            initial_layers.into_iter().next().unwrap()
+        };
+
        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        println!("workdir: {}", harness.conf.workdir.display());

-        let remote_timeline_dir =
-            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
+        let remote_timeline_dir = harness
+            .remote_fs_dir
+            .join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
        println!("remote_timeline_dir: {}", remote_timeline_dir.display());

-        let metadata = dummy_metadata(Lsn(0x10));
-        client
-            .init_upload_queue_for_empty_remote(&metadata)
-            .unwrap();
-
-        let generation = Generation::new(0xdeadbeef);
+        let generation = harness.generation;

        // Create a couple of dummy files, schedule upload for them
        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1710,6 +1679,7 @@ mod tests {
                .map(|f| f.to_owned())
                .collect(),
            &[
+                &initial_layer.file_name(),
                &layer_file_name_1.file_name(),
                &layer_file_name_2.file_name(),
            ],
@@ -1739,6 +1709,7 @@ mod tests {
        }
        assert_remote_files(
            &[
+                &initial_layer.file_name(),
                &layer_file_name_1.file_name(),
                &layer_file_name_2.file_name(),
                "index_part.json",
@@ -1752,6 +1723,7 @@ mod tests {
|
||||
|
||||
assert_remote_files(
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
&layer_file_name_3.file_name(),
|
||||
"index_part.json",
|
||||
@@ -1768,16 +1740,10 @@ mod tests {
|
||||
let TestSetup {
|
||||
harness,
|
||||
tenant: _tenant,
|
||||
timeline: _timeline,
|
||||
client,
|
||||
timeline,
|
||||
..
|
||||
} = TestSetup::new("metrics").await.unwrap();
|
||||
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client
|
||||
.init_upload_queue_for_empty_remote(&metadata)
|
||||
.unwrap();
|
||||
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
@@ -1788,11 +1754,20 @@ mod tests {
            )
            .unwrap();

-        #[derive(Debug, PartialEq)]
+        #[derive(Debug, PartialEq, Clone, Copy)]
        struct BytesStartedFinished {
            started: Option<usize>,
            finished: Option<usize>,
        }
+        impl std::ops::Add for BytesStartedFinished {
+            type Output = Self;
+            fn add(self, rhs: Self) -> Self::Output {
+                Self {
+                    started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
+                    finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
+                }
+            }
+        }
        let get_bytes_started_stopped = || {
            let started = client
                .metrics
@@ -1809,47 +1784,38 @@ mod tests {
        };

-        // Test
+        tracing::info!("now doing actual test");

-        let generation = Generation::new(0xdeadbeef);
-
-        let init = get_bytes_started_stopped();
+        let actual_a = get_bytes_started_stopped();

        client
            .schedule_layer_file_upload(
                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, generation),
+                &LayerFileMetadata::new(content_1.len() as u64, harness.generation),
            )
            .unwrap();

-        let pre = get_bytes_started_stopped();
+        let actual_b = get_bytes_started_stopped();

        client.wait_completion().await.unwrap();

-        let post = get_bytes_started_stopped();
+        let actual_c = get_bytes_started_stopped();

        // Validate

-        assert_eq!(
-            init,
-            BytesStartedFinished {
-                started: None,
-                finished: None
-            }
-        );
-        assert_eq!(
-            pre,
-            BytesStartedFinished {
+        let expected_b = actual_a
+            + BytesStartedFinished {
                started: Some(content_1.len()),
+                // assert that the _finished metric is created eagerly so that subtractions work on first sample
                finished: Some(0),
-            }
-        );
-        assert_eq!(
-            post,
-            BytesStartedFinished {
+            };
+        assert_eq!(actual_b, expected_b);
+
+        let expected_c = actual_a
+            + BytesStartedFinished {
                started: Some(content_1.len()),
-                finished: Some(content_1.len())
-            }
-        );
+                finished: Some(content_1.len()),
+            };
+        assert_eq!(actual_c, expected_c);
    }
}

@@ -90,6 +90,7 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};

use super::config::TenantConf;
+use super::debug_assert_current_span_has_tenant_and_timeline_id;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{
@@ -933,6 +934,48 @@ impl Timeline {
        self.launch_eviction_task(background_jobs_can_start);
    }

+    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
+    pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        // prevent writes to the InMemoryLayer
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::WalReceiverManager),
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+        )
+        .await;
+
+        // now all writers to InMemory layer are gone, do the final flush if requested
+        if freeze_and_flush {
+            match self.freeze_and_flush().await {
+                Ok(()) => {}
+                Err(e) => {
+                    warn!("failed to freeze and flush: {e:#}");
+                    return; // TODO: should probably drain remote timeline client anyways?
+                }
+            }
+
+            // drain the upload queue
+            let res = if let Some(client) = self.remote_client.as_ref() {
+                // if we did not wait for completion here, it might be our shutdown process
+                // didn't wait for remote uploads to complete at all, as new tasks can forever
+                // be spawned.
+                //
+                // what is problematic is the shutting down of RemoteTimelineClient, because
+                // obviously it does not make sense to stop while we wait for it, but what
+                // about corner cases like s3 suddenly hanging up?
+                client.wait_completion().await
+            } else {
+                Ok(())
+            };
+
+            if let Err(e) = res {
+                warn!("failed to await for frozen and flushed uploads: {e:#}");
+            }
+        }
+    }
+
    pub fn set_state(&self, new_state: TimelineState) {
        match (self.current_state(), new_state) {
            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
@@ -4742,22 +4785,8 @@ mod tests {
        let harness =
            TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();

-        let remote_storage = {
-            // this is never used for anything, because of how the create_test_timeline works, but
-            // it is with us in spirit and a Some.
-            use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
-            let path = harness.conf.workdir.join("localfs");
-            std::fs::create_dir_all(&path).unwrap();
-            let config = RemoteStorageConfig {
-                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
-                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
-                storage: RemoteStorageKind::LocalFs(path),
-            };
-            GenericRemoteStorage::from_config(&config).unwrap()
-        };
-
        let ctx = any_context();
-        let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
+        let tenant = harness.try_load(&ctx).await.unwrap();
        let timeline = tenant
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
@@ -4807,22 +4836,8 @@ mod tests {
    async fn layer_eviction_aba_fails() {
        let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();

-        let remote_storage = {
-            // this is never used for anything, because of how the create_test_timeline works, but
-            // it is with us in spirit and a Some.
-            use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
-            let path = harness.conf.workdir.join("localfs");
-            std::fs::create_dir_all(&path).unwrap();
-            let config = RemoteStorageConfig {
-                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
-                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
-                storage: RemoteStorageKind::LocalFs(path),
-            };
-            GenericRemoteStorage::from_config(&config).unwrap()
-        };
-
        let ctx = any_context();
-        let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
+        let tenant = harness.try_load(&ctx).await.unwrap();
        let timeline = tenant
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await

@@ -192,7 +192,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # We may leave some upload tasks in the queue. They're likely deletes.
            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
            # So by ignoring these instead of waiting for empty upload queue
@@ -338,7 +338,7 @@ def test_tenant_delete_is_resumed_on_attach(
            # From deletion polling
            f".*NotFound: tenant {env.initial_tenant}.*",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',

@@ -231,7 +231,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
    # It appears when we stopped flush loop during deletion and then pageserver is stopped
    env.pageserver.allowed_errors.append(
-        ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
+        ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
    )
    # This happens when we fail before scheduling background operation.
    # Timeline is left in stopping state and retry tries to stop it again.
@@ -449,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    )
    # this happens, because the stuck timeline is visible to shutdown
    env.pageserver.allowed_errors.append(
-        ".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
+        ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
    )

    ps_http = env.pageserver.http_client()
@@ -881,7 +881,7 @@ def test_timeline_delete_resumed_on_attach(
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            # Polling after attach may fail with this