Don't upload index file in compaction, if there was nothing to do. (#3149)

This splits the storage_sync2::schedule_index_file into two (public)
functions:
1. `schedule_index_upload_for_metadata_update`, for when the metadata
(e.g. disk_consistent_lsn or last_gc_cutoff) has changed, and

2. `schedule_index_upload_for_file_changes`, for when layer file uploads
or deletions have been scheduled.

We now keep track of whether there have been any uploads or deletions
since the last index-file upload, and skip the upload in
`schedule_index_upload_for_file_changes` if there haven't been any
changes. That allows us to call the function liberally in timeline.rs,
whenever layer file uploads or deletions might've been scheduled,
without starting a lot of unnecessary index file uploads.

GC was covered earlier by commit c262390214, but that missed that we
have the same problem with compaction.
This commit is contained in:
Heikki Linnakangas
2022-12-19 23:58:24 +02:00
committed by GitHub
parent 3735aece56
commit 39f58038d1
4 changed files with 114 additions and 37 deletions

View File

@@ -32,7 +32,8 @@
//! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]:
//!
//! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file.
//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file.
//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file.
//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads
//! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files.
//!
//! Internally, these functions create [`UploadOp`]s and put them in a queue.
@@ -290,6 +291,10 @@ struct UploadQueueInitialized {
/// in-progress and queued operations
latest_files: HashMap<LayerFileName, LayerFileMetadata>,
/// How many file uploads or deletions been scheduled, since the
/// last (scheduling of) metadata index upload?
latest_files_changes_since_metadata_upload_scheduled: u64,
/// Metadata stored in the remote storage, taking into account all
/// in-progress and queued operations.
/// DANGER: do not return to outside world, e.g., safekeepers.
@@ -339,6 +344,7 @@ impl UploadQueue {
let state = UploadQueueInitialized {
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
// safekeepers from garbage-collecting anything.
@@ -385,6 +391,7 @@ impl UploadQueue {
let state = UploadQueueInitialized {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part_metadata.clone(),
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
// what follows are boring default initializations
@@ -558,7 +565,9 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
upgraded.merge(&new_metadata);
if upgraded.merge(&new_metadata) {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
}
// If we don't do an index file upload inbetween here and restart,
// the value will go back down after pageserver restart, since we will
// have lost this data point.
@@ -583,14 +592,20 @@ impl RemoteTimelineClient {
//
///
/// Launch an index-file upload operation in the background.
/// Launch an index-file upload operation in the background, with
/// updated metadata.
///
/// The upload will be added to the queue immediately, but it
/// won't be performed until all previosuly scheduled layer file
/// upload operations have completed successfully. This is to
/// ensure that when the index file claims that layers X, Y and Z
/// exist in remote storage, they really do.
pub fn schedule_index_upload(
/// exist in remote storage, they really do. To wait for the upload
/// to complete, use `wait_completion`.
///
/// If there were any changes to the list of files, i.e. if any
/// layer file uploads were scheduled, since the last index file
/// upload, those will be included too.
pub fn schedule_index_upload_for_metadata_update(
self: &Arc<Self>,
metadata: &TimelineMetadata,
) -> anyhow::Result<()> {
@@ -601,26 +616,60 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
/// Use this function to schedule the update of the index file after
/// scheduling file uploads or deletions. If no file uploads or deletions
/// have been scheduled since the last index file upload, this does
/// nothing.
///
/// Like schedule_index_upload_for_metadata_update(), this merely adds
/// the upload to the upload queue and returns quickly.
pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
}
Ok(())
}
/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata_bytes: Vec<u8>,
) {
info!(
"scheduling metadata upload with {} files ({} changed)",
upload_queue.latest_files.len(),
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
upload_queue.latest_metadata.to_bytes()?,
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.update_upload_queue_unfinished_metric(1, &op);
upload_queue.queued_operations.push_back(op);
info!(
"scheduled metadata upload with {} files",
upload_queue.latest_files.len()
);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
}
///
@@ -644,6 +693,7 @@ impl RemoteTimelineClient {
upload_queue
.latest_files
.insert(layer_file_name.clone(), layer_metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
self.update_upload_queue_unfinished_metric(1, &op);
@@ -662,8 +712,11 @@ impl RemoteTimelineClient {
///
/// Launch a delete operation in the background.
///
/// The deletion won't actually be performed, until all preceding
/// upload operations have completed succesfully.
/// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed
/// succesfully.
///
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
@@ -674,7 +727,6 @@ impl RemoteTimelineClient {
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
// Update the remote index file, removing the to-be-deleted files from the index,
// before deleting the actual files.
@@ -686,16 +738,12 @@ impl RemoteTimelineClient {
let no_bail_here = || {
for name in names {
upload_queue.latest_files.remove(name);
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
}
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.update_upload_queue_unfinished_metric(1, &op);
upload_queue.queued_operations.push_back(op);
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata_bytes);
}
// schedule the actual deletions
for name in names {
@@ -1244,15 +1292,19 @@ mod tests {
assert!(upload_queue.queued_operations.is_empty());
assert!(upload_queue.inprogress_tasks.len() == 2);
assert!(upload_queue.num_inprogress_layer_uploads == 2);
// also check that `latest_file_changes` was updated
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
}
// Schedule upload of index. Check that it is queued
let metadata = dummy_metadata(Lsn(0x20));
client.schedule_index_upload(&metadata)?;
client.schedule_index_upload_for_metadata_update(&metadata)?;
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
assert!(upload_queue.queued_operations.len() == 1);
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
}
// Wait for the uploads to finish
@@ -1288,6 +1340,7 @@ mod tests {
assert!(upload_queue.inprogress_tasks.len() == 1);
assert!(upload_queue.num_inprogress_layer_uploads == 1);
assert!(upload_queue.num_inprogress_deletions == 0);
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
}
assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);

View File

@@ -48,9 +48,17 @@ impl LayerFileMetadata {
/// Metadata has holes due to version upgrades. This method is called to upgrade self with the
/// other value.
///
/// This is called on the possibly outdated version.
pub fn merge(&mut self, other: &Self) {
self.file_size = other.file_size.or(self.file_size);
/// This is called on the possibly outdated version. Returns true if any changes
/// were made.
pub fn merge(&mut self, other: &Self) -> bool {
let mut changed = false;
if self.file_size != other.file_size {
self.file_size = other.file_size.or(self.file_size);
changed = true;
}
changed
}
}

View File

@@ -589,6 +589,18 @@ impl Timeline {
let timer = self.metrics.compact_time_histo.start_timer();
self.compact_level0(target_file_size).await?;
timer.stop_and_record();
// If `create_image_layers' or `compact_level0` scheduled any
// uploads or deletions, but didn't update the index file yet,
// do it now.
//
// This isn't necessary for correctness, the remote state is
// consistent without the uploads and deletions, and we would
// update the index file on next flush iteration too. But it
// could take a while until that happens.
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_index_upload_for_file_changes()?;
}
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -1215,9 +1227,7 @@ impl Timeline {
remote_client
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
}
if !local_only_layers.is_empty() {
remote_client.schedule_index_upload(up_to_date_metadata)?;
}
remote_client.schedule_index_upload_for_file_changes()?;
info!("Done");
@@ -1923,13 +1933,9 @@ impl Timeline {
if let Some(remote_client) = &self.remote_client {
for (path, layer_metadata) in layer_paths_to_upload {
remote_client
.schedule_layer_file_upload(&path, &layer_metadata)
.context("schedule_layer_file_upload")?;
remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
}
remote_client
.schedule_index_upload(&metadata)
.context("schedule_layer_file_upload")?;
remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
}
Ok(())
@@ -2398,6 +2404,11 @@ impl Timeline {
deltas_to_compact,
} = self.compact_level0_phase1(target_file_size).await?;
if new_layers.is_empty() && deltas_to_compact.is_empty() {
// nothing to do
return Ok(());
}
// Before deleting any layers, we need to wait for their upload ops to finish.
// See storage_sync module level comment on consistency.
// Do it here because we don't want to hold self.layers.write() while waiting.

View File

@@ -165,6 +165,11 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')")
pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32)
num_index_uploads = get_num_remote_ops("index", "upload")
# Also make sure that a no-op compaction doesn't upload the index
# file unnecessarily.
pageserver_http.timeline_compact(tenant_id, timeline_id)
log.info(f"{num_index_uploads} index uploads after GC iteration {i}")
after = num_index_uploads