From 4c70322d2c8ef1a98a652df7794b458e248b5073 Mon Sep 17 00:00:00 2001
From: discord9 <discord9@163.com>
Date: Wed, 24 Dec 2025 17:28:23 +0800
Subject: [PATCH] cr

Signed-off-by: discord9 <discord9@163.com>
---
 src/mito2/src/manifest/action.rs | 37 ++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)
diff --git a/src/mito2/src/manifest/action.rs b/src/mito2/src/manifest/action.rs
index 32d876f6f8..61b073a208 100644
--- a/src/mito2/src/manifest/action.rs
+++ b/src/mito2/src/manifest/action.rs
@@ -351,6 +351,9 @@ pub enum RemovedFile {
     Index(FileId, IndexVersion),
 }
 
+/// Support deserialize from old format(just FileId as string) for backward compatibility
+/// into current format(RemovedFile enum).
+/// This is needed just in case there are old manifests with removed files recorded.
 impl<'de> Deserialize<'de> for RemovedFile {
     fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
     where
@@ -359,8 +362,8 @@ impl<'de> Deserialize<'de> for RemovedFile {
         #[derive(Deserialize)]
         #[serde(untagged)]
         enum CompatRemovedFile {
-            FileId(FileId),
             Enum(RemovedFileEnum),
+            FileId(FileId),
         }
 
         #[derive(Deserialize)]
@@ -1061,7 +1064,13 @@ mod tests {
         let deserialized_index: RemovedFile = serde_json::from_str(&json_index).unwrap();
         assert_eq!(removed_index, deserialized_index);
 
-        // Case 4: Deserialize mixed set in RemovedFilesRecord
+        // Case 4: Round-trip serialization/deserialization of new enum format with None as index version
+        let removed_file = RemovedFile::File(file_id, None);
+        let json = serde_json::to_string(&removed_file).unwrap();
+        let deserialized: RemovedFile = serde_json::from_str(&json).unwrap();
+        assert_eq!(removed_file, deserialized);
+
+        // Case 5: Deserialize mixed set in RemovedFilesRecord
         // This simulates a Set<RemovedFile> which might contain old strings or new objects if manually constructed or from old versions.
         // Actually, if it was HashSet<FileId>, the JSON is ["id1", "id2"].
         // If it is HashSet<RemovedFile>, the JSON is [{"File":...}, "id2"] if mixed (which shouldn't happen usually but good to test).
@@ -1071,8 +1080,28 @@ mod tests {
         assert!(removed_files_set.contains(&RemovedFile::File(file_id, None)));
     }
 
-    /// It's acceptable to ignore the old field `file_ids` when deserializing RemovedFiles.
-    /// (As gc worker can scan dir to find files to delete.)
+    /// It is intentionally acceptable to ignore the legacy `file_ids` field when
+    /// deserializing [`RemovedFiles`].
+    ///
+    /// In older manifests, `file_ids` recorded the set of SSTable files that were
+    /// candidates for garbage collection at a given `removed_at` timestamp. The
+    /// newer format stores this information in the `files` field instead. When we
+    /// deserialize an old manifest entry into the new struct, we *drop* the
+    /// `file_ids` field instead of trying to recover or merge it.
+    ///
+    /// Dropping `file_ids` does **not** risk deleting live data: a file is only
+    /// physically removed when it is both (a) no longer referenced by any region
+    /// metadata and (b) selected by the GC worker as safe to delete. Losing the
+    /// historical list of candidate `file_ids` merely means some obsolete files
+    /// may stay on disk longer than strictly necessary.
+    ///
+    /// The GC worker periodically scans storage (e.g. by walking the data
+    /// directories and/or consulting the latest manifest) to discover files that
+    /// are no longer referenced anywhere. Any files that were only referenced via
+    /// the dropped `file_ids` field will be rediscovered during these scans and
+    /// eventually deleted. Thus the system converges to a correct, fully-collected
+    /// state without relying on `file_ids`, and the only potential impact of
+    /// ignoring it is temporary disk space overhead, not data loss.
     #[test]
     fn test_removed_files_backward_compatibility() {
         // Define the old version struct with file_ids field