Clean up timeline initialization code.

Clarify who's responsible for initializing the layer map. There were
previously two different ways to do it:

- create_empty_timeline and bootstrap_timeline let prepare_timeline()
  initialize an empty layer map.

- branch_timeline passed a flag to initialize_with_lock() to tell
  initialize_with_lock to call load_layer_map(). Because it was a
  newly created timeline, load_layer_map() never found any layer
  files, so it just initialized an empty layer map.

With this commit, prepare_new_timeline() always does it. The LSN to
initialize it with is passed as argument.

Other changes per function:

prepare_timeline:
- rename to 'prepare_new_timeline' to make it clear that it's only used
  when creating a new timeline, not when loading an existing timeline
- always initialize an empty layer map. The caller can pass the LSN to
  initialize it with. (Previously, prepare_timeline would optionally
  load the layer map at 'initdb_lsn'. Some caller used that, while others
  let initialize_with_lock do it

initialize_with_lock:
- As mentioned above, remove the option to load the layer map
- Acquire the 'timelines' lock in the function itself. None of the callers
  did any other work while holding the lock.
- Rename it to finish_creation() to make its intent more clear. It's only
  used when creating a new timeline now.

create_timeline_data:
- Rename to create_timeline_struct() for clarity. It just initializes
  the Timeline struct, not any other "data"

create_timeline_files:
- use create_dir rather than create_dir_all, to be a little more strict.
  We know that the parent directory should already exist, and the timeline
  directory should not exist.
- Move the call to create_timeline_struct() to the caller. It was just
  being "passed through"

Part of https://github.com/neondatabase/neon/pull/4364
This commit is contained in:
Heikki Linnakangas
2023-05-28 02:40:58 +03:00
committed by Christian Schwarz
parent f450369b20
commit 8d106708d7
2 changed files with 112 additions and 117 deletions

View File

@@ -186,18 +186,13 @@ struct TimelineUninitMark {
}
impl UninitializedTimeline<'_> {
/// Ensures timeline data is valid, loads it into pageserver's memory and removes
/// uninit mark file on success.
/// Finish timeline creation: insert it into the Tenant's timelines map and remove the
/// uninit mark file.
///
/// This function launches the flush loop if not already done.
///
/// The caller is responsible for activating the timeline (function `.activate()`).
fn initialize_with_lock(
mut self,
_ctx: &RequestContext,
timelines: &mut HashMap<TimelineId, Arc<Timeline>>,
load_layer_map: bool,
) -> anyhow::Result<Arc<Timeline>> {
fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
let timeline_id = self.timeline_id;
let tenant_id = self.owning_tenant.tenant_id;
@@ -205,25 +200,19 @@ impl UninitializedTimeline<'_> {
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
})?;
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
// TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least
// ensure!(new_disk_consistent_lsn.is_valid(),
// "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn and cannot be initialized");
// Check that the caller initialized disk_consistent_lsn
//
// TODO: many our unit tests violate this.
//let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
//anyhow::ensure!(new_disk_consistent_lsn.is_valid(),
// "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn");
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
match timelines.entry(timeline_id) {
Entry::Occupied(_) => anyhow::bail!(
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
),
Entry::Vacant(v) => {
if load_layer_map {
new_timeline
.load_layer_map(new_disk_consistent_lsn)
.with_context(|| {
format!(
"Failed to load layermap for timeline {tenant_id}/{timeline_id}"
)
})?;
}
uninit_mark.remove_uninit_mark().with_context(|| {
format!(
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
@@ -252,9 +241,10 @@ impl UninitializedTimeline<'_> {
.await
.context("Failed to import basebackup")?;
// Flush the new layer files to disk, before we make the timeline as available to
// the outside world.
//
// Flush loop needs to be spawned in order to be able to flush.
// We want to run proper checkpoint before we mark timeline as available to outside world
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
raw_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
@@ -266,10 +256,9 @@ impl UninitializedTimeline<'_> {
.await
.context("Failed to flush after basebackup import")?;
// Initialize without loading the layer map. We started with an empty layer map, and already
// updated it for the layers that we created during the import.
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
let tl = self.initialize_with_lock(ctx, &mut timelines, false)?;
// All the data has been imported. Insert the Timeline into the tenant's timelines
// map and remove the uninit mark file.
let tl = self.finish_creation()?;
tl.activate(broker_client, None, ctx);
Ok(tl)
}
@@ -516,24 +505,25 @@ impl Tenant {
.context("merge_local_remote_metadata")?
.to_owned();
let timeline = {
let timeline = self.create_timeline_data(
timeline_id,
up_to_date_metadata,
ancestor.clone(),
remote_client,
init_order,
)?;
let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
// TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least
// ensure!(new_disk_consistent_lsn.is_valid(),
// "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn and cannot be initialized");
timeline
.load_layer_map(new_disk_consistent_lsn)
.with_context(|| {
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
})?;
let timeline = self.create_timeline_struct(
timeline_id,
up_to_date_metadata,
ancestor.clone(),
remote_client,
init_order,
)?;
let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
new_disk_consistent_lsn.is_valid(),
"Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
);
timeline
.load_layer_map(new_disk_consistent_lsn)
.with_context(|| {
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
})?;
{
// avoiding holding it across awaits
let mut timelines_accessor = self.timelines.lock().unwrap();
match timelines_accessor.entry(timeline_id) {
@@ -547,7 +537,6 @@ impl Tenant {
Entry::Vacant(v) => {
v.insert(Arc::clone(&timeline));
timeline.maybe_spawn_flush_loop();
timeline
}
}
};
@@ -1139,14 +1128,14 @@ impl Tenant {
.init_upload_queue_stopped_to_continue_deletion(&index_part)?;
let timeline = self
.create_timeline_data(
.create_timeline_struct(
timeline_id,
&local_metadata,
ancestor,
Some(remote_client),
init_order,
)
.context("create_timeline_data")?;
.context("create_timeline_struct")?;
let guard = Arc::clone(&timeline.delete_lock).lock_owned().await;
@@ -1272,6 +1261,8 @@ impl Tenant {
drop(timelines);
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent LSN to 0, The caller must import some data to
// make it valid, before calling finish_creation()
Lsn(0),
None,
None,
@@ -1280,11 +1271,11 @@ impl Tenant {
initdb_lsn,
pg_version,
);
self.prepare_timeline(
self.prepare_new_timeline(
new_timeline_id,
&new_metadata,
timeline_uninit_mark,
true,
initdb_lsn,
None,
)
}
@@ -1315,10 +1306,7 @@ impl Tenant {
.commit()
.context("commit init_empty_test_timeline modification")?;
let mut timelines = self.timelines.lock().unwrap();
// load_layers=false because create_empty_timeline already did that what's necessary (set next_open_layer)
// and modification.init_empty() already created layers.
let tl = uninit_tl.initialize_with_lock(ctx, &mut timelines, false)?;
let tl = uninit_tl.finish_creation()?;
// The non-test code would call tl.activate() here.
tl.set_state(TimelineState::Active);
Ok(tl)
@@ -2250,7 +2238,12 @@ impl Tenant {
}
}
fn create_timeline_data(
/// Helper function to create a new Timeline struct.
///
/// The returned Timeline is in Loading state. The caller is responsible for
/// initializing any on-disk state, and for inserting the Timeline to the 'timelines'
/// map.
fn create_timeline_struct(
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
@@ -2670,7 +2663,7 @@ impl Tenant {
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
ctx: &RequestContext,
_ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let src_id = src_timeline.timeline_id;
@@ -2755,17 +2748,15 @@ impl Tenant {
src_timeline.pg_version,
);
let new_timeline = {
let mut timelines = self.timelines.lock().unwrap();
self.prepare_timeline(
dst_id,
&metadata,
timeline_uninit_mark,
false,
Some(Arc::clone(src_timeline)),
)?
.initialize_with_lock(ctx, &mut timelines, true)?
};
let uninitialized_timeline = self.prepare_new_timeline(
dst_id,
&metadata,
timeline_uninit_mark,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
)?;
let new_timeline = uninitialized_timeline.finish_creation()?;
// Root timeline gets its layers during creation and uploads them along with the metadata.
// A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
@@ -2841,8 +2832,13 @@ impl Tenant {
pgdata_lsn,
pg_version,
);
let raw_timeline =
self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?;
let raw_timeline = self.prepare_new_timeline(
timeline_id,
&new_metadata,
timeline_uninit_mark,
pgdata_lsn,
None,
)?;
let tenant_id = raw_timeline.owning_tenant.tenant_id;
let unfinished_timeline = raw_timeline.raw_timeline()?;
@@ -2858,10 +2854,10 @@ impl Tenant {
format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
})?;
// Flush the new layer files to disk, before we mark the timeline as available to
// Flush the new layer files to disk, before we make the timeline as available to
// the outside world.
//
// Thus spawn flush loop manually and skip flush_loop setup in initialize_with_lock
// Flush loop needs to be spawned in order to be able to flush.
unfinished_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
@@ -2877,12 +2873,8 @@ impl Tenant {
)
})?;
// Initialize the timeline without loading the layer map, because we already updated the layer
// map above, when we imported the datadir.
let timeline = {
let mut timelines = self.timelines.lock().unwrap();
raw_timeline.initialize_with_lock(ctx, &mut timelines, false)?
};
// All done!
let timeline = raw_timeline.finish_creation()?;
info!(
"created root timeline {} timeline.lsn {}",
@@ -2893,14 +2885,18 @@ impl Tenant {
Ok(timeline)
}
/// Creates intermediate timeline structure and its files, without loading it into memory.
/// It's up to the caller to import the necesary data and import the timeline into memory.
fn prepare_timeline(
/// Creates intermediate timeline structure and its files.
///
/// An empty layer map is initialized, and new data and WAL can be imported starting
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
/// uninit mark file.
fn prepare_new_timeline(
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
uninit_mark: TimelineUninitMark,
init_layers: bool,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
) -> anyhow::Result<UninitializedTimeline> {
let tenant_id = self.tenant_id;
@@ -2918,33 +2914,27 @@ impl Tenant {
None
};
match self.create_timeline_files(
&uninit_mark.timeline_path,
new_timeline_id,
new_metadata,
ancestor,
remote_client,
) {
Ok(new_timeline) => {
if init_layers {
new_timeline.layers.write().unwrap().next_open_layer_at =
Some(new_timeline.initdb_lsn);
}
debug!(
"Successfully created initial files for timeline {tenant_id}/{new_timeline_id}"
);
Ok(UninitializedTimeline {
owning_tenant: self,
timeline_id: new_timeline_id,
raw_timeline: Some((new_timeline, uninit_mark)),
})
}
Err(e) => {
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(uninit_mark);
Err(e)
}
let timeline_struct = self
.create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None)
.context("Failed to create timeline data structure")?;
timeline_struct.init_empty_layer_map(start_lsn);
if let Err(e) =
self.create_timeline_files(&uninit_mark.timeline_path, new_timeline_id, new_metadata)
{
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(uninit_mark);
return Err(e);
}
debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}");
Ok(UninitializedTimeline {
owning_tenant: self,
timeline_id: new_timeline_id,
raw_timeline: Some((timeline_struct, uninit_mark)),
})
}
fn create_timeline_files(
@@ -2952,13 +2942,8 @@ impl Tenant {
timeline_path: &Path,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
remote_client: Option<RemoteTimelineClient>,
) -> anyhow::Result<Arc<Timeline>> {
let timeline_data = self
.create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client, None)
.context("Failed to create timeline data structure")?;
crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
) -> anyhow::Result<()> {
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
@@ -2972,8 +2957,7 @@ impl Tenant {
true,
)
.context("Failed to create timeline metadata")?;
Ok(timeline_data)
Ok(())
}
/// Attempts to create an uninit mark file for the timeline initialization.

View File

@@ -1592,9 +1592,16 @@ impl Timeline {
));
}
///
/// Initialize with an empty layer map. Used when creating a new timeline.
///
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
let mut layers = self.layers.write().unwrap();
layers.next_open_layer_at = Some(Lsn(start_lsn.0));
}
///
/// Scan the timeline directory to populate the layer map.
/// Returns all timeline-related files that were found and loaded.
///
pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
let mut layers = self.layers.write().unwrap();
@@ -2670,7 +2677,11 @@ impl Timeline {
let layer;
if let Some(open_layer) = &layers.open_layer {
if open_layer.get_lsn_range().start > lsn {
bail!("unexpected open layer in the future");
bail!(
"unexpected open layer in the future: open layers starts at {}, write lsn {}",
open_layer.get_lsn_range().start,
lsn
);
}
layer = Arc::clone(open_layer);