Signed-off-by: discord9 <discord9@163.com>
This commit is contained in:
discord9
2025-12-24 17:28:23 +08:00
parent 577d38575e
commit 4c70322d2c

View File

@@ -351,6 +351,9 @@ pub enum RemovedFile {
Index(FileId, IndexVersion),
}
/// Support deserialize from old format(just FileId as string) for backward compatibility
/// into current format(RemovedFile enum).
/// This is needed just in case there are old manifests with removed files recorded.
impl<'de> Deserialize<'de> for RemovedFile {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
@@ -359,8 +362,8 @@ impl<'de> Deserialize<'de> for RemovedFile {
#[derive(Deserialize)]
#[serde(untagged)]
enum CompatRemovedFile {
FileId(FileId),
Enum(RemovedFileEnum),
FileId(FileId),
}
#[derive(Deserialize)]
@@ -1061,7 +1064,13 @@ mod tests {
let deserialized_index: RemovedFile = serde_json::from_str(&json_index).unwrap();
assert_eq!(removed_index, deserialized_index);
// Case 4: Deserialize mixed set in RemovedFilesRecord
// Case 4: Round-trip serialization/deserialization of new enum format with None as index version
let removed_file = RemovedFile::File(file_id, None);
let json = serde_json::to_string(&removed_file).unwrap();
let deserialized: RemovedFile = serde_json::from_str(&json).unwrap();
assert_eq!(removed_file, deserialized);
// Case 5: Deserialize mixed set in RemovedFilesRecord
// This simulates a Set<RemovedFile> which might contain old strings or new objects if manually constructed or from old versions.
// Actually, if it was HashSet<FileId>, the JSON is ["id1", "id2"].
// If it is HashSet<RemovedFile>, the JSON is [{"File":...}, "id2"] if mixed (which shouldn't happen usually but good to test).
@@ -1071,8 +1080,28 @@ mod tests {
assert!(removed_files_set.contains(&RemovedFile::File(file_id, None)));
}
/// It's acceptable to ignore the old field `file_ids` when deserializing RemovedFiles.
/// (As gc worker can scan dir to find files to delete.)
/// It is intentionally acceptable to ignore the legacy `file_ids` field when
/// deserializing [`RemovedFiles`].
///
/// In older manifests, `file_ids` recorded the set of SSTable files that were
/// candidates for garbage collection at a given `removed_at` timestamp. The
/// newer format stores this information in the `files` field instead. When we
/// deserialize an old manifest entry into the new struct, we *drop* the
/// `file_ids` field instead of trying to recover or merge it.
///
/// Dropping `file_ids` does **not** risk deleting live data: a file is only
/// physically removed when it is both (a) no longer referenced by any region
/// metadata and (b) selected by the GC worker as safe to delete. Losing the
/// historical list of candidate `file_ids` merely means some obsolete files
/// may stay on disk longer than strictly necessary.
///
/// The GC worker periodically scans storage (e.g. by walking the data
/// directories and/or consulting the latest manifest) to discover files that
/// are no longer referenced anywhere. Any files that were only referenced via
/// the dropped `file_ids` field will be rediscovered during these scans and
/// eventually deleted. Thus the system converges to a correct, fully-collected
/// state without relying on `file_ids`, and the only potential impact of
/// ignoring it is temporary disk space overhead, not data loss.
#[test]
fn test_removed_files_backward_compatibility() {
// Define the old version struct with file_ids field