cr

Signed-off-by: discord9 <discord9@163.com>
2026-01-04 20:32:56 +00:00 · 2025-12-24 17:28:23 +08:00
parent 577d38575e
commit 4c70322d2c
1 changed files with 33 additions and 4 deletions
--- a/src/mito2/src/manifest/action.rs
+++ b/src/mito2/src/manifest/action.rs
@@ -351,6 +351,9 @@ pub enum RemovedFile {
    Index(FileId, IndexVersion),
 }

+/// Support deserialize from old format(just FileId as string) for backward compatibility
+/// into current format(RemovedFile enum).
+/// This is needed just in case there are old manifests with removed files recorded.
 impl<'de> Deserialize<'de> for RemovedFile {
    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
    where
@@ -359,8 +362,8 @@ impl<'de> Deserialize<'de> for RemovedFile {
        #[derive(Deserialize)]
        #[serde(untagged)]
        enum CompatRemovedFile {
-            FileId(FileId),
            Enum(RemovedFileEnum),
+            FileId(FileId),
        }

        #[derive(Deserialize)]
@@ -1061,7 +1064,13 @@ mod tests {
        let deserialized_index: RemovedFile = serde_json::from_str(&json_index).unwrap();
        assert_eq!(removed_index, deserialized_index);

-        // Case 4: Deserialize mixed set in RemovedFilesRecord
+        // Case 4: Round-trip serialization/deserialization of new enum format with None as index version
+        let removed_file = RemovedFile::File(file_id, None);
+        let json = serde_json::to_string(&removed_file).unwrap();
+        let deserialized: RemovedFile = serde_json::from_str(&json).unwrap();
+        assert_eq!(removed_file, deserialized);
+
+        // Case 5: Deserialize mixed set in RemovedFilesRecord
        // This simulates a Set<RemovedFile> which might contain old strings or new objects if manually constructed or from old versions.
        // Actually, if it was HashSet<FileId>, the JSON is ["id1", "id2"].
        // If it is HashSet<RemovedFile>, the JSON is [{"File":...}, "id2"] if mixed (which shouldn't happen usually but good to test).
@@ -1071,8 +1080,28 @@ mod tests {
        assert!(removed_files_set.contains(&RemovedFile::File(file_id, None)));
    }

-    /// It's acceptable to ignore the old field `file_ids` when deserializing RemovedFiles.
-    /// (As gc worker can scan dir to find files to delete.)
+    /// It is intentionally acceptable to ignore the legacy `file_ids` field when
+    /// deserializing [`RemovedFiles`].
+    ///
+    /// In older manifests, `file_ids` recorded the set of SSTable files that were
+    /// candidates for garbage collection at a given `removed_at` timestamp. The
+    /// newer format stores this information in the `files` field instead. When we
+    /// deserialize an old manifest entry into the new struct, we *drop* the
+    /// `file_ids` field instead of trying to recover or merge it.
+    ///
+    /// Dropping `file_ids` does **not** risk deleting live data: a file is only
+    /// physically removed when it is both (a) no longer referenced by any region
+    /// metadata and (b) selected by the GC worker as safe to delete. Losing the
+    /// historical list of candidate `file_ids` merely means some obsolete files
+    /// may stay on disk longer than strictly necessary.
+    ///
+    /// The GC worker periodically scans storage (e.g. by walking the data
+    /// directories and/or consulting the latest manifest) to discover files that
+    /// are no longer referenced anywhere. Any files that were only referenced via
+    /// the dropped `file_ids` field will be rediscovered during these scans and
+    /// eventually deleted. Thus the system converges to a correct, fully-collected
+    /// state without relying on `file_ids`, and the only potential impact of
+    /// ignoring it is temporary disk space overhead, not data loss.
    #[test]
    fn test_removed_files_backward_compatibility() {
        // Define the old version struct with file_ids field