mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-01-04 20:32:56 +00:00
@@ -351,6 +351,9 @@ pub enum RemovedFile {
|
||||
Index(FileId, IndexVersion),
|
||||
}
|
||||
|
||||
/// Support deserialize from old format(just FileId as string) for backward compatibility
|
||||
/// into current format(RemovedFile enum).
|
||||
/// This is needed just in case there are old manifests with removed files recorded.
|
||||
impl<'de> Deserialize<'de> for RemovedFile {
|
||||
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
|
||||
where
|
||||
@@ -359,8 +362,8 @@ impl<'de> Deserialize<'de> for RemovedFile {
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum CompatRemovedFile {
|
||||
FileId(FileId),
|
||||
Enum(RemovedFileEnum),
|
||||
FileId(FileId),
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
@@ -1061,7 +1064,13 @@ mod tests {
|
||||
let deserialized_index: RemovedFile = serde_json::from_str(&json_index).unwrap();
|
||||
assert_eq!(removed_index, deserialized_index);
|
||||
|
||||
// Case 4: Deserialize mixed set in RemovedFilesRecord
|
||||
// Case 4: Round-trip serialization/deserialization of new enum format with None as index version
|
||||
let removed_file = RemovedFile::File(file_id, None);
|
||||
let json = serde_json::to_string(&removed_file).unwrap();
|
||||
let deserialized: RemovedFile = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(removed_file, deserialized);
|
||||
|
||||
// Case 5: Deserialize mixed set in RemovedFilesRecord
|
||||
// This simulates a Set<RemovedFile> which might contain old strings or new objects if manually constructed or from old versions.
|
||||
// Actually, if it was HashSet<FileId>, the JSON is ["id1", "id2"].
|
||||
// If it is HashSet<RemovedFile>, the JSON is [{"File":...}, "id2"] if mixed (which shouldn't happen usually but good to test).
|
||||
@@ -1071,8 +1080,28 @@ mod tests {
|
||||
assert!(removed_files_set.contains(&RemovedFile::File(file_id, None)));
|
||||
}
|
||||
|
||||
/// It's acceptable to ignore the old field `file_ids` when deserializing RemovedFiles.
|
||||
/// (As gc worker can scan dir to find files to delete.)
|
||||
/// It is intentionally acceptable to ignore the legacy `file_ids` field when
|
||||
/// deserializing [`RemovedFiles`].
|
||||
///
|
||||
/// In older manifests, `file_ids` recorded the set of SSTable files that were
|
||||
/// candidates for garbage collection at a given `removed_at` timestamp. The
|
||||
/// newer format stores this information in the `files` field instead. When we
|
||||
/// deserialize an old manifest entry into the new struct, we *drop* the
|
||||
/// `file_ids` field instead of trying to recover or merge it.
|
||||
///
|
||||
/// Dropping `file_ids` does **not** risk deleting live data: a file is only
|
||||
/// physically removed when it is both (a) no longer referenced by any region
|
||||
/// metadata and (b) selected by the GC worker as safe to delete. Losing the
|
||||
/// historical list of candidate `file_ids` merely means some obsolete files
|
||||
/// may stay on disk longer than strictly necessary.
|
||||
///
|
||||
/// The GC worker periodically scans storage (e.g. by walking the data
|
||||
/// directories and/or consulting the latest manifest) to discover files that
|
||||
/// are no longer referenced anywhere. Any files that were only referenced via
|
||||
/// the dropped `file_ids` field will be rediscovered during these scans and
|
||||
/// eventually deleted. Thus the system converges to a correct, fully-collected
|
||||
/// state without relying on `file_ids`, and the only potential impact of
|
||||
/// ignoring it is temporary disk space overhead, not data loss.
|
||||
#[test]
|
||||
fn test_removed_files_backward_compatibility() {
|
||||
// Define the old version struct with file_ids field
|
||||
|
||||
Reference in New Issue
Block a user