From 4c70322d2c8ef1a98a652df7794b458e248b5073 Mon Sep 17 00:00:00 2001 From: discord9 Date: Wed, 24 Dec 2025 17:28:23 +0800 Subject: [PATCH] cr Signed-off-by: discord9 --- src/mito2/src/manifest/action.rs | 37 ++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/src/mito2/src/manifest/action.rs b/src/mito2/src/manifest/action.rs index 32d876f6f8..61b073a208 100644 --- a/src/mito2/src/manifest/action.rs +++ b/src/mito2/src/manifest/action.rs @@ -351,6 +351,9 @@ pub enum RemovedFile { Index(FileId, IndexVersion), } +/// Support deserialize from old format(just FileId as string) for backward compatibility +/// into current format(RemovedFile enum). +/// This is needed just in case there are old manifests with removed files recorded. impl<'de> Deserialize<'de> for RemovedFile { fn deserialize(deserializer: D) -> std::result::Result where @@ -359,8 +362,8 @@ impl<'de> Deserialize<'de> for RemovedFile { #[derive(Deserialize)] #[serde(untagged)] enum CompatRemovedFile { - FileId(FileId), Enum(RemovedFileEnum), + FileId(FileId), } #[derive(Deserialize)] @@ -1061,7 +1064,13 @@ mod tests { let deserialized_index: RemovedFile = serde_json::from_str(&json_index).unwrap(); assert_eq!(removed_index, deserialized_index); - // Case 4: Deserialize mixed set in RemovedFilesRecord + // Case 4: Round-trip serialization/deserialization of new enum format with None as index version + let removed_file = RemovedFile::File(file_id, None); + let json = serde_json::to_string(&removed_file).unwrap(); + let deserialized: RemovedFile = serde_json::from_str(&json).unwrap(); + assert_eq!(removed_file, deserialized); + + // Case 5: Deserialize mixed set in RemovedFilesRecord // This simulates a Set which might contain old strings or new objects if manually constructed or from old versions. // Actually, if it was HashSet, the JSON is ["id1", "id2"]. // If it is HashSet, the JSON is [{"File":...}, "id2"] if mixed (which shouldn't happen usually but good to test). @@ -1071,8 +1080,28 @@ mod tests { assert!(removed_files_set.contains(&RemovedFile::File(file_id, None))); } - /// It's acceptable to ignore the old field `file_ids` when deserializing RemovedFiles. - /// (As gc worker can scan dir to find files to delete.) + /// It is intentionally acceptable to ignore the legacy `file_ids` field when + /// deserializing [`RemovedFiles`]. + /// + /// In older manifests, `file_ids` recorded the set of SSTable files that were + /// candidates for garbage collection at a given `removed_at` timestamp. The + /// newer format stores this information in the `files` field instead. When we + /// deserialize an old manifest entry into the new struct, we *drop* the + /// `file_ids` field instead of trying to recover or merge it. + /// + /// Dropping `file_ids` does **not** risk deleting live data: a file is only + /// physically removed when it is both (a) no longer referenced by any region + /// metadata and (b) selected by the GC worker as safe to delete. Losing the + /// historical list of candidate `file_ids` merely means some obsolete files + /// may stay on disk longer than strictly necessary. + /// + /// The GC worker periodically scans storage (e.g. by walking the data + /// directories and/or consulting the latest manifest) to discover files that + /// are no longer referenced anywhere. Any files that were only referenced via + /// the dropped `file_ids` field will be rediscovered during these scans and + /// eventually deleted. Thus the system converges to a correct, fully-collected + /// state without relying on `file_ids`, and the only potential impact of + /// ignoring it is temporary disk space overhead, not data loss. #[test] fn test_removed_files_backward_compatibility() { // Define the old version struct with file_ids field