cli/data/export_v2/
manifest.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Manifest data structures for Export/Import V2.
16
17use std::{fmt, str};
18
19use chrono::{DateTime, Utc};
20use serde::{Deserialize, Serialize};
21use uuid::Uuid;
22
/// Manifest format version stamped into the `version` field of newly created manifests.
pub const MANIFEST_VERSION: u32 = 1;

/// Name of the manifest file inside a snapshot directory.
pub const MANIFEST_FILE: &str = "manifest.json";
28
29/// Time range for data export (half-open interval: [start, end)).
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
31pub struct TimeRange {
32    /// Start time (inclusive). None means earliest available data.
33    #[serde(skip_serializing_if = "Option::is_none")]
34    pub start: Option<DateTime<Utc>>,
35    /// End time (exclusive). None means current time.
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub end: Option<DateTime<Utc>>,
38}
39
40impl TimeRange {
41    /// Creates a new time range with specified bounds.
42    pub fn new(start: Option<DateTime<Utc>>, end: Option<DateTime<Utc>>) -> Self {
43        Self { start, end }
44    }
45
46    /// Creates an unbounded time range (all data).
47    pub fn unbounded() -> Self {
48        Self {
49            start: None,
50            end: None,
51        }
52    }
53
54    /// Returns true if this time range is unbounded.
55    pub fn is_unbounded(&self) -> bool {
56        self.start.is_none() && self.end.is_none()
57    }
58}
59
60impl Default for TimeRange {
61    fn default() -> Self {
62        Self::unbounded()
63    }
64}
65
66/// Status of a chunk during export/import.
67#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
68#[serde(rename_all = "snake_case")]
69pub enum ChunkStatus {
70    /// Chunk is pending export.
71    #[default]
72    Pending,
73    /// Chunk export is in progress.
74    InProgress,
75    /// Chunk export completed successfully.
76    Completed,
77    /// Chunk export failed.
78    Failed,
79}
80
81/// Metadata for a single chunk of exported data.
82#[derive(Debug, Clone, Serialize, Deserialize)]
83pub struct ChunkMeta {
84    /// Chunk identifier (sequential number starting from 1).
85    pub id: u32,
86    /// Time range covered by this chunk.
87    pub time_range: TimeRange,
88    /// Export status.
89    pub status: ChunkStatus,
90    /// List of data files in this chunk (relative paths from snapshot root).
91    #[serde(default)]
92    pub files: Vec<String>,
93    /// SHA256 checksum of all files in this chunk (aggregated).
94    #[serde(skip_serializing_if = "Option::is_none")]
95    pub checksum: Option<String>,
96    /// Error message if status is Failed.
97    #[serde(skip_serializing_if = "Option::is_none")]
98    pub error: Option<String>,
99}
100
101impl ChunkMeta {
102    /// Creates a new pending chunk with the given id and time range.
103    pub fn new(id: u32, time_range: TimeRange) -> Self {
104        Self {
105            id,
106            time_range,
107            status: ChunkStatus::Pending,
108            files: vec![],
109            checksum: None,
110            error: None,
111        }
112    }
113
114    /// Marks this chunk as in progress.
115    pub fn mark_in_progress(&mut self) {
116        self.status = ChunkStatus::InProgress;
117        self.error = None;
118    }
119
120    /// Marks this chunk as completed with the given files and checksum.
121    pub fn mark_completed(&mut self, files: Vec<String>, checksum: Option<String>) {
122        self.status = ChunkStatus::Completed;
123        self.files = files;
124        self.checksum = checksum;
125        self.error = None;
126    }
127
128    /// Marks this chunk as failed with the given error message.
129    pub fn mark_failed(&mut self, error: String) {
130        self.status = ChunkStatus::Failed;
131        self.error = Some(error);
132    }
133}
134
135/// Supported data formats for export.
136#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default, clap::ValueEnum)]
137#[serde(rename_all = "lowercase")]
138#[value(rename_all = "lowercase")]
139pub enum DataFormat {
140    /// Apache Parquet format (default, recommended for production).
141    #[default]
142    Parquet,
143    /// CSV format (human-readable).
144    Csv,
145    /// JSON format (structured text).
146    Json,
147}
148
149impl fmt::Display for DataFormat {
150    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
151        match self {
152            DataFormat::Parquet => write!(f, "parquet"),
153            DataFormat::Csv => write!(f, "csv"),
154            DataFormat::Json => write!(f, "json"),
155        }
156    }
157}
158
159impl str::FromStr for DataFormat {
160    type Err = String;
161
162    fn from_str(s: &str) -> Result<Self, Self::Err> {
163        match s.to_lowercase().as_str() {
164            "parquet" => Ok(DataFormat::Parquet),
165            "csv" => Ok(DataFormat::Csv),
166            "json" => Ok(DataFormat::Json),
167            _ => Err(format!(
168                "invalid format '{}': expected one of parquet, csv, json",
169                s
170            )),
171        }
172    }
173}
174
175/// Snapshot manifest containing all metadata.
176///
177/// The manifest is stored as `manifest.json` in the snapshot root directory.
178/// It contains:
179/// - Snapshot identification (UUID, timestamps)
180/// - Scope (catalog, schemas, time range)
181/// - Export configuration (format, schema_only)
182/// - Chunk metadata for resume support
183/// - Integrity checksums
184#[derive(Debug, Clone, Serialize, Deserialize)]
185pub struct Manifest {
186    /// Manifest format version for compatibility checking.
187    pub version: u32,
188    /// Unique snapshot identifier.
189    pub snapshot_id: Uuid,
190    /// Catalog name.
191    pub catalog: String,
192    /// List of schemas included in this snapshot.
193    pub schemas: Vec<String>,
194    /// Overall time range covered by this snapshot.
195    pub time_range: TimeRange,
196    /// Whether this is a schema-only snapshot (no data).
197    pub schema_only: bool,
198    /// Data format used for export.
199    pub format: DataFormat,
200    /// Chunk metadata (empty for schema-only snapshots).
201    #[serde(default)]
202    pub chunks: Vec<ChunkMeta>,
203    /// Snapshot-level SHA256 checksum (aggregated from all chunks).
204    #[serde(skip_serializing_if = "Option::is_none")]
205    pub checksum: Option<String>,
206    /// Creation timestamp.
207    pub created_at: DateTime<Utc>,
208    /// Last updated timestamp.
209    pub updated_at: DateTime<Utc>,
210}
211
212impl Manifest {
213    /// Creates a new manifest for schema-only export.
214    pub fn new_schema_only(catalog: String, schemas: Vec<String>) -> Self {
215        let now = Utc::now();
216        Self {
217            version: MANIFEST_VERSION,
218            snapshot_id: Uuid::new_v4(),
219            catalog,
220            schemas,
221            time_range: TimeRange::unbounded(),
222            schema_only: true,
223            format: DataFormat::Parquet,
224            chunks: vec![],
225            checksum: None,
226            created_at: now,
227            updated_at: now,
228        }
229    }
230
231    /// Creates a new manifest for full export with time range and format.
232    pub fn new_full(
233        catalog: String,
234        schemas: Vec<String>,
235        time_range: TimeRange,
236        format: DataFormat,
237    ) -> Self {
238        let now = Utc::now();
239        Self {
240            version: MANIFEST_VERSION,
241            snapshot_id: Uuid::new_v4(),
242            catalog,
243            schemas,
244            time_range,
245            schema_only: false,
246            format,
247            chunks: vec![],
248            checksum: None,
249            created_at: now,
250            updated_at: now,
251        }
252    }
253
254    /// Returns true if all chunks are completed (or if schema-only).
255    pub fn is_complete(&self) -> bool {
256        self.schema_only
257            || (!self.chunks.is_empty()
258                && self
259                    .chunks
260                    .iter()
261                    .all(|c| c.status == ChunkStatus::Completed))
262    }
263
264    /// Returns the number of pending chunks.
265    pub fn pending_count(&self) -> usize {
266        self.chunks
267            .iter()
268            .filter(|c| c.status == ChunkStatus::Pending)
269            .count()
270    }
271
272    /// Returns the number of in-progress chunks.
273    pub fn in_progress_count(&self) -> usize {
274        self.chunks
275            .iter()
276            .filter(|c| c.status == ChunkStatus::InProgress)
277            .count()
278    }
279
280    /// Returns the number of completed chunks.
281    pub fn completed_count(&self) -> usize {
282        self.chunks
283            .iter()
284            .filter(|c| c.status == ChunkStatus::Completed)
285            .count()
286    }
287
288    /// Returns the number of failed chunks.
289    pub fn failed_count(&self) -> usize {
290        self.chunks
291            .iter()
292            .filter(|c| c.status == ChunkStatus::Failed)
293            .count()
294    }
295
296    /// Updates the `updated_at` timestamp to now.
297    pub fn touch(&mut self) {
298        self.updated_at = Utc::now();
299    }
300
301    /// Adds a chunk to the manifest.
302    pub fn add_chunk(&mut self, chunk: ChunkMeta) {
303        self.chunks.push(chunk);
304        self.touch();
305    }
306
307    /// Updates a chunk by id.
308    pub fn update_chunk(&mut self, id: u32, updater: impl FnOnce(&mut ChunkMeta)) {
309        if let Some(chunk) = self.chunks.iter_mut().find(|c| c.id == id) {
310            updater(chunk);
311            self.touch();
312        }
313    }
314}
315
#[cfg(test)]
mod tests {
    use super::*;

    /// An unbounded range serializes to an empty object and parses back.
    #[test]
    fn test_time_range_serialization() {
        let range = TimeRange::unbounded();
        let json = serde_json::to_string(&range).unwrap();
        assert_eq!(json, "{}");

        let range: TimeRange = serde_json::from_str("{}").unwrap();
        assert!(range.is_unbounded());
    }

    /// A schema-only manifest has no chunks and counts as complete.
    #[test]
    fn test_manifest_schema_only() {
        let manifest =
            Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]);

        assert_eq!(manifest.version, MANIFEST_VERSION);
        assert!(manifest.schema_only);
        assert!(manifest.chunks.is_empty());
        assert!(manifest.is_complete());
    }

    /// A full manifest without any chunk is not complete yet.
    #[test]
    fn test_manifest_full() {
        let manifest = Manifest::new_full(
            "greptime".to_string(),
            vec!["public".to_string()],
            TimeRange::unbounded(),
            DataFormat::Parquet,
        );

        assert!(!manifest.schema_only);
        assert!(manifest.chunks.is_empty());
        assert!(!manifest.is_complete());
    }

    /// Parsing ignores case and rejects unknown names.
    #[test]
    fn test_data_format_parsing() {
        assert_eq!(
            "parquet".parse::<DataFormat>().unwrap(),
            DataFormat::Parquet
        );
        assert_eq!("CSV".parse::<DataFormat>().unwrap(), DataFormat::Csv);
        assert_eq!("JSON".parse::<DataFormat>().unwrap(), DataFormat::Json);
        assert!("invalid".parse::<DataFormat>().is_err());
    }

    /// Whatever `Display` prints must be accepted by `FromStr`.
    #[test]
    fn test_data_format_display_round_trip() {
        for data_format in [DataFormat::Parquet, DataFormat::Csv, DataFormat::Json] {
            let parsed = data_format.to_string().parse::<DataFormat>().unwrap();
            assert_eq!(parsed, data_format);
        }
    }

    /// Happy path: pending -> in progress -> completed.
    #[test]
    fn test_chunk_status_transitions() {
        let mut chunk = ChunkMeta::new(1, TimeRange::unbounded());
        assert_eq!(chunk.status, ChunkStatus::Pending);

        chunk.mark_in_progress();
        assert_eq!(chunk.status, ChunkStatus::InProgress);

        chunk.mark_completed(
            vec!["file1.parquet".to_string()],
            Some("abc123".to_string()),
        );
        assert_eq!(chunk.status, ChunkStatus::Completed);
        assert_eq!(chunk.files.len(), 1);
        assert_eq!(chunk.checksum.as_deref(), Some("abc123"));
        assert!(chunk.error.is_none());
    }

    /// A failure records the message; retrying clears it again.
    #[test]
    fn test_chunk_failure_and_retry() {
        let mut chunk = ChunkMeta::new(1, TimeRange::unbounded());

        chunk.mark_failed("disk full".to_string());
        assert_eq!(chunk.status, ChunkStatus::Failed);
        assert_eq!(chunk.error.as_deref(), Some("disk full"));

        chunk.mark_in_progress();
        assert_eq!(chunk.status, ChunkStatus::InProgress);
        assert!(chunk.error.is_none());
    }

    /// Counters and completeness follow chunk updates; unknown ids are ignored.
    #[test]
    fn test_manifest_chunk_tracking() {
        let mut manifest = Manifest::new_full(
            "greptime".to_string(),
            vec!["public".to_string()],
            TimeRange::unbounded(),
            DataFormat::Parquet,
        );
        manifest.add_chunk(ChunkMeta::new(1, TimeRange::unbounded()));
        manifest.add_chunk(ChunkMeta::new(2, TimeRange::unbounded()));
        assert_eq!(manifest.pending_count(), 2);
        assert!(!manifest.is_complete());

        manifest.update_chunk(1, |chunk| chunk.mark_in_progress());
        manifest.update_chunk(2, |chunk| chunk.mark_failed("boom".to_string()));
        assert_eq!(manifest.pending_count(), 0);
        assert_eq!(manifest.in_progress_count(), 1);
        assert_eq!(manifest.failed_count(), 1);
        assert_eq!(manifest.completed_count(), 0);
        assert!(!manifest.is_complete());

        manifest.update_chunk(1, |chunk| chunk.mark_completed(vec![], None));
        manifest.update_chunk(2, |chunk| chunk.mark_completed(vec![], None));
        assert_eq!(manifest.completed_count(), 2);
        assert!(manifest.is_complete());

        // An id that matches no chunk must leave every chunk untouched.
        manifest.update_chunk(99, |chunk| chunk.mark_failed("unused".to_string()));
        assert_eq!(manifest.failed_count(), 0);
        assert_eq!(manifest.completed_count(), 2);
    }
}