diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 89bbc34227..14763985ab 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -32,7 +32,8 @@ //! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]: //! //! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file. -//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file. +//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file. +//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads. //! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files. //! //! Internally, these functions create [`UploadOp`]s and put them in a queue. @@ -290,6 +291,10 @@ struct UploadQueueInitialized { /// in-progress and queued operations latest_files: HashMap, + /// How many file uploads or deletions have been scheduled, since the + /// last (scheduling of) metadata index upload? + latest_files_changes_since_metadata_upload_scheduled: u64, + /// Metadata stored in the remote storage, taking into account all /// in-progress and queued operations. /// DANGER: do not return to outside world, e.g., safekeepers. @@ -339,6 +344,7 @@ impl UploadQueue { let state = UploadQueueInitialized { // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. latest_files: HashMap::new(), + latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent // safekeepers from garbage-collecting anything.
@@ -385,6 +391,7 @@ impl UploadQueue { let state = UploadQueueInitialized { latest_files: files, + latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part_metadata.clone(), last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(), // what follows are boring default initializations @@ -558,7 +565,9 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { - upgraded.merge(&new_metadata); + if upgraded.merge(&new_metadata) { + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; + } // If we don't do an index file upload inbetween here and restart, // the value will go back down after pageserver restart, since we will // have lost this data point. @@ -583,14 +592,20 @@ impl RemoteTimelineClient { // /// - /// Launch an index-file upload operation in the background. + /// Launch an index-file upload operation in the background, with + /// updated metadata. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previosuly scheduled layer file /// upload operations have completed successfully. This is to /// ensure that when the index file claims that layers X, Y and Z - /// exist in remote storage, they really do. - pub fn schedule_index_upload( + /// exist in remote storage, they really do. To wait for the upload + /// to complete, use `wait_completion`. + /// + /// If there were any changes to the list of files, i.e. if any + /// layer file uploads were scheduled, since the last index file + /// upload, those will be included too. + pub fn schedule_index_upload_for_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -601,26 +616,60 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. 
upload_queue.latest_metadata = metadata.clone(); + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + self.schedule_index_upload(upload_queue, metadata_bytes); + + Ok(()) + } + + /// + /// Launch an index-file upload operation in the background, if necessary. + /// + /// Use this function to schedule the update of the index file after + /// scheduling file uploads or deletions. If no file uploads or deletions + /// have been scheduled since the last index file upload, this does + /// nothing. + /// + /// Like schedule_index_upload_for_metadata_update(), this merely adds + /// the upload to the upload queue and returns quickly. + pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + self.schedule_index_upload(upload_queue, metadata_bytes); + } + + Ok(()) + } + + /// Launch an index-file upload operation in the background (internal function) + fn schedule_index_upload( + self: &Arc, + upload_queue: &mut UploadQueueInitialized, + metadata_bytes: Vec, + ) { + info!( + "scheduling metadata upload with {} files ({} changed)", + upload_queue.latest_files.len(), + upload_queue.latest_files_changes_since_metadata_upload_scheduled, + ); + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); let index_part = IndexPart::new( upload_queue.latest_files.clone(), disk_consistent_lsn, - upload_queue.latest_metadata.to_bytes()?, + metadata_bytes, ); let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); self.update_upload_queue_unfinished_metric(1, &op); upload_queue.queued_operations.push_back(op); - - info!( - "scheduled metadata upload with {} files", - upload_queue.latest_files.len() - ); + upload_queue.latest_files_changes_since_metadata_upload_scheduled = 
0; // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); - - Ok(()) } /// @@ -644,6 +693,7 @@ impl RemoteTimelineClient { upload_queue .latest_files .insert(layer_file_name.clone(), layer_metadata.clone()); + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone()); self.update_upload_queue_unfinished_metric(1, &op); @@ -662,8 +712,11 @@ impl RemoteTimelineClient { /// /// Launch a delete operation in the background. /// - /// The deletion won't actually be performed, until all preceding - /// upload operations have completed succesfully. + /// Note: This schedules an index file upload before the deletions. The + /// deletion won't actually be performed, until any previously scheduled + /// upload operations, and the index file upload, have completed + /// successfully. + /// pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], @@ -674,7 +727,6 @@ impl RemoteTimelineClient { // Deleting layers doesn't affect the values stored in TimelineMetadata, // so we don't need update it. Just serialize it. let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); // Update the remote index file, removing the to-be-deleted files from the index, // before deleting the actual files.
@@ -686,16 +738,12 @@ impl RemoteTimelineClient { let no_bail_here = || { for name in names { upload_queue.latest_files.remove(name); + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; } - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata_bytes, - ); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - self.update_upload_queue_unfinished_metric(1, &op); - upload_queue.queued_operations.push_back(op); + if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { + self.schedule_index_upload(upload_queue, metadata_bytes); + } // schedule the actual deletions for name in names { @@ -1244,15 +1292,19 @@ mod tests { assert!(upload_queue.queued_operations.is_empty()); assert!(upload_queue.inprogress_tasks.len() == 2); assert!(upload_queue.num_inprogress_layer_uploads == 2); + + // also check that `latest_files_changes_since_metadata_upload_scheduled` was updated + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); } // Schedule upload of index.
Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); - client.schedule_index_upload(&metadata)?; + client.schedule_index_upload_for_metadata_update(&metadata)?; { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.len() == 1); + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0); } // Wait for the uploads to finish @@ -1288,6 +1340,7 @@ mod tests { assert!(upload_queue.inprogress_tasks.len() == 1); assert!(upload_queue.num_inprogress_layer_uploads == 1); assert!(upload_queue.num_inprogress_deletions == 0); + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0); } assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir); diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index ed4ed10189..bb58a34969 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -48,9 +48,17 @@ impl LayerFileMetadata { /// Metadata has holes due to version upgrades. This method is called to upgrade self with the /// other value. /// - /// This is called on the possibly outdated version. - pub fn merge(&mut self, other: &Self) { - self.file_size = other.file_size.or(self.file_size); + /// This is called on the possibly outdated version. Returns true if any changes + /// were made. 
+ pub fn merge(&mut self, other: &Self) -> bool { + let mut changed = false; + + if other.file_size.is_some() && self.file_size != other.file_size { + self.file_size = other.file_size; + changed = true; + } + + changed } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3373c52231..b1f580c32f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -589,6 +589,18 @@ impl Timeline { let timer = self.metrics.compact_time_histo.start_timer(); self.compact_level0(target_file_size).await?; timer.stop_and_record(); + + // If `create_image_layers` or `compact_level0` scheduled any + // uploads or deletions, but didn't update the index file yet, + // do it now. + // + // This isn't necessary for correctness, the remote state is + // consistent without the uploads and deletions, and we would + // update the index file on next flush iteration too. But it + // could take a while until that happens. + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_index_upload_for_file_changes()?; + } } Err(err) => { // no partitioning?
This is normal, if the timeline was just created @@ -1215,9 +1227,7 @@ impl Timeline { remote_client .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; } - if !local_only_layers.is_empty() { - remote_client.schedule_index_upload(up_to_date_metadata)?; - } + remote_client.schedule_index_upload_for_file_changes()?; info!("Done"); @@ -1923,13 +1933,9 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { - remote_client - .schedule_layer_file_upload(&path, &layer_metadata) - .context("schedule_layer_file_upload")?; + remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; } - remote_client - .schedule_index_upload(&metadata) - .context("schedule_layer_file_upload")?; + remote_client.schedule_index_upload_for_metadata_update(&metadata)?; } Ok(()) @@ -2398,6 +2404,11 @@ impl Timeline { deltas_to_compact, } = self.compact_level0_phase1(target_file_size).await?; + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + // Before deleting any layers, we need to wait for their upload ops to finish. // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index b9d012fa36..5f052bf81a 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -165,6 +165,11 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')") pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32) num_index_uploads = get_num_remote_ops("index", "upload") + + # Also make sure that a no-op compaction doesn't upload the index + # file unnecessarily. 
+ pageserver_http.timeline_compact(tenant_id, timeline_id) + log.info(f"{num_index_uploads} index uploads after GC iteration {i}") after = num_index_uploads