Skip to main content

cli/data/export_v2/
command.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Export V2 CLI commands.
16
17use std::collections::HashSet;
18use std::time::Duration;
19
20use async_trait::async_trait;
21use clap::{Parser, Subcommand};
22use common_error::ext::BoxedError;
23use common_telemetry::info;
24use serde_json::Value;
25use snafu::{OptionExt, ResultExt};
26
27use crate::Tool;
28use crate::common::ObjectStoreConfig;
29use crate::data::export_v2::coordinator::export_data;
30use crate::data::export_v2::error::{
31    ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu,
32    ManifestVersionMismatchSnafu, Result, ResumeConfigMismatchSnafu, SchemaOnlyArgsNotAllowedSnafu,
33    SchemaOnlyModeMismatchSnafu, UnexpectedValueTypeSnafu,
34};
35use crate::data::export_v2::extractor::SchemaExtractor;
36use crate::data::export_v2::manifest::{
37    ChunkMeta, DataFormat, MANIFEST_FILE, MANIFEST_VERSION, Manifest, TimeRange,
38};
39use crate::data::path::ddl_path_for_schema;
40use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
41use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
42use crate::database::{DatabaseClient, parse_proxy_opts};
43
/// Export V2 commands.
///
/// Each variant wraps the argument struct of one subcommand; `build`
/// turns the parsed arguments into a runnable [`Tool`].
#[derive(Debug, Subcommand)]
pub enum ExportV2Command {
    /// Create a new snapshot.
    Create(ExportCreateCommand),
    /// List snapshots under a parent location.
    List(ExportListCommand),
}
52
impl ExportV2Command {
    /// Dispatch to the selected subcommand's `build`, producing the boxed
    /// [`Tool`] that does the actual work.
    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
        match self {
            ExportV2Command::Create(cmd) => cmd.build().await,
            ExportV2Command::List(cmd) => cmd.build().await,
        }
    }
}
61
/// List snapshots under a parent location.
///
/// The location's direct subdirectories are treated as snapshot candidates
/// and probed for a readable manifest.
#[derive(Debug, Parser)]
pub struct ExportListCommand {
    /// Parent storage location whose direct subdirectories are snapshots.
    #[clap(long)]
    location: String,

    /// Object store configuration for remote storage backends.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
73
impl ExportListCommand {
    /// Validate the parent location URI and build the [`ExportList`] tool,
    /// backed by an OpenDAL storage rooted at that location.
    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
        validate_uri(&self.location).map_err(BoxedError::new)?;
        let storage = OpenDalStorage::from_parent_uri(&self.location, &self.storage)
            .map_err(BoxedError::new)?;

        Ok(Box::new(ExportList {
            location: self.location.clone(),
            storage,
        }))
    }
}
86
/// Export list tool implementation.
pub struct ExportList {
    /// Parent location as given on the command line; echoed in the output.
    location: String,
    /// Storage handle rooted at `location`.
    storage: OpenDalStorage,
}
92
#[async_trait]
impl Tool for ExportList {
    /// Entry point invoked by the CLI runner; wraps any error into `BoxedError`.
    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
        self.run().await.map_err(BoxedError::new)
    }
}
99
impl ExportList {
    /// Scan the parent location and print the snapshot table, followed by
    /// warnings for any directories with unreadable manifests.
    async fn run(&self) -> Result<()> {
        let result = scan_snapshots(&self.storage).await?;

        println!("Scanning: {}", self.location);
        if result.snapshots.is_empty() {
            println!("No snapshots found.");
        } else {
            print_snapshot_list(&result.snapshots, result.unreadable.len());
        }
        print_unreadable_warnings(&result.unreadable);

        Ok(())
    }
}
115
/// Create a new snapshot.
///
/// Connects to a GreptimeDB server at `--addr` and writes schema artifacts
/// (and, unless `--schema-only`, table data) to the `--to` location.
#[derive(Debug, Parser)]
pub struct ExportCreateCommand {
    /// Server address to connect (e.g., 127.0.0.1:4000).
    #[clap(long)]
    addr: String,

    /// Target storage location (e.g., s3://bucket/path, file:///tmp/backup).
    #[clap(long)]
    to: String,

    /// Catalog name.
    #[clap(long, default_value = "greptime")]
    catalog: String,

    /// Schema list to export (default: all non-system schemas).
    /// Can be specified multiple times or comma-separated.
    #[clap(long, value_delimiter = ',')]
    schemas: Vec<String>,

    /// Export schema only, no data.
    #[clap(long)]
    schema_only: bool,

    /// Time range start (ISO 8601 format, e.g., 2024-01-01T00:00:00Z).
    #[clap(long)]
    start_time: Option<String>,

    /// Time range end (ISO 8601 format, e.g., 2024-12-31T23:59:59Z).
    #[clap(long)]
    end_time: Option<String>,

    /// Chunk time window (e.g., 1h, 6h, 1d, 7d).
    /// Requires both --start-time and --end-time when specified.
    #[clap(long, value_parser = humantime::parse_duration)]
    chunk_time_window: Option<Duration>,

    /// Data format: parquet, csv, json.
    #[clap(long, value_enum, default_value = "parquet")]
    format: DataFormat,

    /// Delete existing snapshot and recreate.
    #[clap(long)]
    force: bool,

    /// Parallelism for COPY DATABASE execution (server-side, per schema per chunk).
    #[clap(long, default_value = "1")]
    parallelism: usize,

    /// Basic authentication (user:password).
    #[clap(long)]
    auth_basic: Option<String>,

    /// Request timeout.
    #[clap(long, value_parser = humantime::parse_duration)]
    timeout: Option<Duration>,

    /// Proxy server address.
    ///
    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
    #[clap(long)]
    proxy: Option<String>,

    /// Disable all proxy usage (ignores `--proxy` and system proxy).
    ///
    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
    #[clap(long)]
    no_proxy: bool,

    /// Object store configuration for remote storage backends.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
190
impl ExportCreateCommand {
    /// Validate the CLI arguments and assemble the [`ExportCreate`] tool.
    ///
    /// Fails when `--chunk-time-window` is given without a bounded
    /// `--start-time`/`--end-time` range, or when `--schema-only` is
    /// combined with any data-export argument.
    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
        // Validate URI format
        validate_uri(&self.to).map_err(BoxedError::new)?;

        let time_range = TimeRange::parse(self.start_time.as_deref(), self.end_time.as_deref())
            .map_err(BoxedError::new)?;
        // Chunking splits the export by time, which is only well-defined when
        // both ends of the range are known.
        if self.chunk_time_window.is_some() && !time_range.is_bounded() {
            return ChunkTimeWindowRequiresBoundsSnafu
                .fail()
                .map_err(BoxedError::new);
        }
        // Schema-only exports copy no data, so every data-oriented flag is
        // rejected explicitly (rather than silently ignored); all offending
        // arguments are collected into a single error.
        if self.schema_only {
            let mut invalid_args = Vec::new();
            if self.start_time.is_some() {
                invalid_args.push("--start-time");
            }
            if self.end_time.is_some() {
                invalid_args.push("--end-time");
            }
            if self.chunk_time_window.is_some() {
                invalid_args.push("--chunk-time-window");
            }
            if self.format != DataFormat::Parquet {
                invalid_args.push("--format");
            }
            if self.parallelism != 1 {
                invalid_args.push("--parallelism");
            }
            if !invalid_args.is_empty() {
                return SchemaOnlyArgsNotAllowedSnafu {
                    args: invalid_args.join(", "),
                }
                .fail()
                .map_err(BoxedError::new);
            }
        }

        // Parse schemas (empty vec means all schemas)
        let schemas = if self.schemas.is_empty() {
            None
        } else {
            Some(self.schemas.clone())
        };

        // Build storage
        let storage = OpenDalStorage::from_uri(&self.to, &self.storage).map_err(BoxedError::new)?;

        // Build database client
        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
        let database_client = DatabaseClient::new(
            self.addr.clone(),
            self.catalog.clone(),
            self.auth_basic.clone(),
            // Default request timeout when --timeout is not given.
            self.timeout.unwrap_or(Duration::from_secs(60)),
            proxy,
            self.no_proxy,
        );

        Ok(Box::new(ExportCreate {
            config: ExportConfig {
                catalog: self.catalog.clone(),
                schemas,
                schema_only: self.schema_only,
                format: self.format,
                force: self.force,
                time_range,
                chunk_time_window: self.chunk_time_window,
                parallelism: self.parallelism,
                snapshot_uri: self.to.clone(),
                storage_config: self.storage.clone(),
            },
            storage: Box::new(storage),
            database_client,
        }))
    }
}
268
/// Export tool implementation.
pub struct ExportCreate {
    /// Validated export settings derived from the CLI arguments.
    config: ExportConfig,
    /// Destination storage for the manifest, schema index, DDL and data files.
    storage: Box<dyn SnapshotStorage>,
    /// Client used to run SQL against the target server.
    database_client: DatabaseClient,
}
275
/// Validated settings for a single export run, captured from the CLI.
struct ExportConfig {
    catalog: String,
    /// `None` means "all non-system schemas" (see `ExportCreateCommand::schemas`).
    schemas: Option<Vec<String>>,
    schema_only: bool,
    format: DataFormat,
    /// Delete and recreate an existing snapshot instead of resuming it.
    force: bool,
    time_range: TimeRange,
    /// When set, data is exported in windows of this size; requires a bounded
    /// `time_range` (enforced in `ExportCreateCommand::build`).
    chunk_time_window: Option<Duration>,
    parallelism: usize,
    /// Original destination URI, forwarded to the export coordinator.
    snapshot_uri: String,
    storage_config: ObjectStoreConfig,
}
288
#[async_trait]
impl Tool for ExportCreate {
    /// Entry point invoked by the CLI runner; wraps any error into `BoxedError`.
    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
        self.run().await.map_err(BoxedError::new)
    }
}
295
impl ExportCreate {
    /// Execute the export: resume or replace an existing snapshot, or create
    /// a fresh one (schema artifacts first, manifest as commit point, then
    /// data unless `schema_only`).
    async fn run(&self) -> Result<()> {
        // 1. Check if snapshot exists
        let exists = self.storage.exists().await?;

        if exists {
            if self.config.force {
                info!("Deleting existing snapshot (--force)");
                self.storage.delete_snapshot().await?;
            } else {
                // Resume mode - read existing manifest
                let mut manifest = self.storage.read_manifest().await?;

                // Check version compatibility
                if manifest.version != MANIFEST_VERSION {
                    return ManifestVersionMismatchSnafu {
                        expected: MANIFEST_VERSION,
                        found: manifest.version,
                    }
                    .fail();
                }

                // Reject resume attempts whose CLI arguments disagree with the
                // stored manifest (mode, catalog, schemas, time range, format,
                // chunk plan).
                validate_resume_config(&manifest, &self.config)?;

                info!(
                    "Resuming existing snapshot: {} (completed: {}/{} chunks)",
                    manifest.snapshot_id,
                    manifest.completed_count(),
                    manifest.chunks.len()
                );

                if manifest.is_complete() {
                    info!("Snapshot is already complete");
                    return Ok(());
                }

                // A schema-only snapshot carries no data chunks: once its
                // manifest exists there is nothing left to resume.
                if manifest.schema_only {
                    return Ok(());
                }

                export_data(
                    self.storage.as_ref(),
                    &self.database_client,
                    &self.config.snapshot_uri,
                    &self.config.storage_config,
                    &mut manifest,
                    self.config.parallelism,
                )
                .await?;
                return Ok(());
            }
        }

        // 2. Get schema list
        let extractor = SchemaExtractor::new(&self.database_client, &self.config.catalog);
        let schema_snapshot = extractor.extract(self.config.schemas.as_deref()).await?;

        let schema_names: Vec<String> = schema_snapshot
            .schemas
            .iter()
            .map(|s| s.name.clone())
            .collect();
        info!("Exporting schemas: {:?}", schema_names);

        // 3. Create manifest
        let mut manifest = Manifest::new_for_export(
            self.config.catalog.clone(),
            schema_names.clone(),
            self.config.schema_only,
            self.config.time_range.clone(),
            self.config.format,
            self.config.chunk_time_window,
        )?;

        // 4. Write schema files
        self.storage.write_schema(&schema_snapshot).await?;
        info!("Exported {} schemas", schema_snapshot.schemas.len());

        // 5. Export DDL files for import recovery.
        let ddl_by_schema = self.build_ddl_by_schema(&schema_names).await?;
        for (schema, ddl) in ddl_by_schema {
            let ddl_path = ddl_path_for_schema(&schema);
            self.storage.write_text(&ddl_path, &ddl).await?;
            info!("Exported DDL for schema {} to {}", schema, ddl_path);
        }

        // 6. Write manifest after schema artifacts and before any data export.
        //
        // The manifest is the snapshot commit point: only write it after the schema
        // index and all DDL files are durable, so a crash cannot leave a "valid"
        // snapshot that is missing required schema artifacts. For full exports we
        // still need the manifest before data copy starts, because chunk resume is
        // tracked by updating this manifest in place.
        self.storage.write_manifest(&manifest).await?;
        info!("Snapshot created: {}", manifest.snapshot_id);

        if !self.config.schema_only {
            export_data(
                self.storage.as_ref(),
                &self.database_client,
                &self.config.snapshot_uri,
                &self.config.storage_config,
                &mut manifest,
                self.config.parallelism,
            )
            .await?;
        }

        Ok(())
    }

    /// Build one DDL script per schema, iterating schemas in sorted order for
    /// deterministic output. Returns `(schema, ddl_text)` pairs.
    async fn build_ddl_by_schema(&self, schema_names: &[String]) -> Result<Vec<(String, String)>> {
        let mut schemas = schema_names.to_vec();
        schemas.sort();

        let mut ddl_by_schema = Vec::with_capacity(schemas.len());
        for schema in schemas {
            let create_database = self.show_create("DATABASE", &schema, None).await?;

            // Objects are grouped as physical tables, plain tables, views;
            // each group is sorted so the script is stable across runs.
            let (mut physical_tables, mut tables, mut views) =
                self.get_schema_objects(&schema).await?;
            physical_tables.sort();
            let mut physical_ddls = Vec::with_capacity(physical_tables.len());
            for table in physical_tables {
                physical_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?);
            }

            tables.sort();
            let mut table_ddls = Vec::with_capacity(tables.len());
            for table in tables {
                table_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?);
            }

            views.sort();
            let mut view_ddls = Vec::with_capacity(views.len());
            for view in views {
                view_ddls.push(self.show_create("VIEW", &schema, Some(&view)).await?);
            }

            let ddl = build_schema_ddl(
                &schema,
                create_database,
                physical_ddls,
                table_ddls,
                view_ddls,
            );
            ddl_by_schema.push((schema, ddl));
        }

        Ok(ddl_by_schema)
    }

    /// Classify `schema`'s objects into `(physical_tables, tables, views)`
    /// using `information_schema.tables`; metric physical tables (detected
    /// separately) are excluded from the plain table/view lists.
    async fn get_schema_objects(
        &self,
        schema: &str,
    ) -> Result<(Vec<String>, Vec<String>, Vec<String>)> {
        let physical_tables = self.get_metric_physical_tables(schema).await?;
        let physical_set: HashSet<&str> = physical_tables.iter().map(String::as_str).collect();
        let sql = format!(
            "SELECT table_name, table_type FROM information_schema.tables \
             WHERE table_catalog = '{}' AND table_schema = '{}' \
             AND (table_type = 'BASE TABLE' OR table_type = 'VIEW')",
            escape_sql_literal(&self.config.catalog),
            escape_sql_literal(schema)
        );
        let records: Option<Vec<Vec<Value>>> = self
            .database_client
            .sql_in_public(&sql)
            .await
            .context(DatabaseSnafu)?;

        let mut tables = Vec::new();
        let mut views = Vec::new();
        if let Some(rows) = records {
            for row in rows {
                // Expect each row as [table_name, table_type] string columns.
                let name = match row.first() {
                    Some(Value::String(name)) => name.clone(),
                    _ => return UnexpectedValueTypeSnafu.fail(),
                };
                let table_type = match row.get(1) {
                    Some(Value::String(table_type)) => table_type.as_str(),
                    _ => return UnexpectedValueTypeSnafu.fail(),
                };
                if !physical_set.contains(name.as_str()) {
                    if table_type == "VIEW" {
                        views.push(name);
                    } else {
                        tables.push(name);
                    }
                }
            }
        }

        Ok((physical_tables, tables, views))
    }

    /// List tables that own a `__tsid` column, i.e. metric-engine physical
    /// tables. Order of the returned names is unspecified (callers sort).
    async fn get_metric_physical_tables(&self, schema: &str) -> Result<Vec<String>> {
        let sql = format!(
            "SELECT DISTINCT table_name FROM information_schema.columns \
             WHERE table_catalog = '{}' AND table_schema = '{}' AND column_name = '__tsid'",
            escape_sql_literal(&self.config.catalog),
            escape_sql_literal(schema)
        );
        let records: Option<Vec<Vec<Value>>> = self
            .database_client
            .sql_in_public(&sql)
            .await
            .context(DatabaseSnafu)?;

        let mut tables = HashSet::new();
        if let Some(rows) = records {
            for row in rows {
                let name = match row.first() {
                    Some(Value::String(name)) => name.clone(),
                    _ => return UnexpectedValueTypeSnafu.fail(),
                };
                tables.insert(name);
            }
        }

        Ok(tables.into_iter().collect())
    }

    /// Run `SHOW CREATE <type> "<catalog>"."<schema>"[."<table>"]` and
    /// return the statement from column 1 of the first row, with `;\n`
    /// appended so statements can be concatenated into a script.
    async fn show_create(
        &self,
        show_type: &str,
        schema: &str,
        table: Option<&str>,
    ) -> Result<String> {
        let sql = match table {
            Some(table) => format!(
                r#"SHOW CREATE {} "{}"."{}"."{}""#,
                show_type,
                escape_sql_identifier(&self.config.catalog),
                escape_sql_identifier(schema),
                escape_sql_identifier(table)
            ),
            None => format!(
                r#"SHOW CREATE {} "{}"."{}""#,
                show_type,
                escape_sql_identifier(&self.config.catalog),
                escape_sql_identifier(schema)
            ),
        };

        let records: Option<Vec<Vec<Value>>> = self
            .database_client
            .sql_in_public(&sql)
            .await
            .context(DatabaseSnafu)?;
        let rows = records.context(EmptyResultSnafu)?;
        let row = rows.first().context(EmptyResultSnafu)?;
        let Some(Value::String(create)) = row.get(1) else {
            return UnexpectedValueTypeSnafu.fail();
        };

        Ok(format!("{};\n", create))
    }
}
555
/// Assemble the DDL script for one schema: a `-- Schema:` header comment,
/// the CREATE DATABASE statement, then physical tables, plain tables, and
/// views — in that order — followed by a trailing blank line.
fn build_schema_ddl(
    schema: &str,
    create_database: String,
    physical_tables: Vec<String>,
    tables: Vec<String>,
    views: Vec<String>,
) -> String {
    let mut script = String::new();
    script.push_str("-- Schema: ");
    script.push_str(schema);
    script.push('\n');
    script.push_str(&create_database);
    for section in [physical_tables, tables, views] {
        for stmt in section {
            script.push_str(&stmt);
        }
    }
    script.push('\n');
    script
}
578
/// Ensure a resume request is compatible with the existing snapshot's
/// manifest: schema-only mode, catalog, schema selection, time range, data
/// format, and the derived chunk plan must all match. Checks run in a fixed
/// order, so the first mismatch determines which error is reported.
fn validate_resume_config(manifest: &Manifest, config: &ExportConfig) -> Result<()> {
    if manifest.schema_only != config.schema_only {
        return SchemaOnlyModeMismatchSnafu {
            existing_schema_only: manifest.schema_only,
            requested_schema_only: config.schema_only,
        }
        .fail();
    }

    if manifest.catalog != config.catalog {
        return ResumeConfigMismatchSnafu {
            field: "catalog",
            existing: manifest.catalog.clone(),
            requested: config.catalog.clone(),
        }
        .fail();
    }

    // If no schema filter is provided on resume, inherit the existing snapshot
    // selection instead of reinterpreting the request as "all schemas".
    if let Some(requested_schemas) = &config.schemas
        && !schema_selection_matches(&manifest.schemas, requested_schemas)
    {
        return ResumeConfigMismatchSnafu {
            field: "schemas",
            existing: format_schema_selection(&manifest.schemas),
            requested: format_schema_selection(requested_schemas),
        }
        .fail();
    }

    if manifest.time_range != config.time_range {
        return ResumeConfigMismatchSnafu {
            field: "time_range",
            existing: format!("{:?}", manifest.time_range),
            requested: format!("{:?}", config.time_range),
        }
        .fail();
    }

    if manifest.format != config.format {
        return ResumeConfigMismatchSnafu {
            field: "format",
            existing: manifest.format.to_string(),
            requested: config.format.to_string(),
        }
        .fail();
    }

    // Re-derive the chunk plan from the requested settings (with the
    // manifest's catalog/schemas) and require it to match the recorded plan
    // chunk by chunk.
    let expected_plan = Manifest::new_for_export(
        manifest.catalog.clone(),
        manifest.schemas.clone(),
        config.schema_only,
        config.time_range.clone(),
        config.format,
        config.chunk_time_window,
    )?;
    if !chunk_plan_matches(manifest, &expected_plan) {
        return ResumeConfigMismatchSnafu {
            field: "chunk plan",
            existing: format_chunk_plan(&manifest.chunks),
            requested: format_chunk_plan(&expected_plan.chunks),
        }
        .fail();
    }

    Ok(())
}
647
/// True when two schema selections are equal after ASCII case-folding,
/// deduplication, and ordering are normalized away.
fn schema_selection_matches(existing: &[String], requested: &[String]) -> bool {
    canonical_schema_selection(existing) == canonical_schema_selection(requested)
}
651
/// Normalize a schema selection to a canonical form — ASCII-lowercased,
/// deduplicated, and sorted — so two selections compare equal regardless of
/// case, ordering, or repeated entries.
fn canonical_schema_selection(schemas: &[String]) -> Vec<String> {
    let mut normalized: Vec<String> = schemas
        .iter()
        .map(|schema| schema.to_ascii_lowercase())
        .collect();
    // Sorting first makes duplicates adjacent, so one dedup pass yields
    // exactly the sorted set of distinct names.
    normalized.sort();
    normalized.dedup();
    normalized
}
666
/// Render a schema selection as `[a, b, c]` for mismatch error messages.
fn format_schema_selection(schemas: &[String]) -> String {
    let joined = schemas.join(", ");
    format!("[{joined}]")
}
670
671fn chunk_plan_matches(existing: &Manifest, expected: &Manifest) -> bool {
672    existing.chunks.len() == expected.chunks.len()
673        && existing
674            .chunks
675            .iter()
676            .zip(&expected.chunks)
677            .all(|(left, right)| left.id == right.id && left.time_range == right.time_range)
678}
679
680fn format_chunk_plan(chunks: &[ChunkMeta]) -> String {
681    let items = chunks
682        .iter()
683        .map(|chunk| format!("#{}:{:?}", chunk.id, chunk.time_range))
684        .collect::<Vec<_>>();
685    format!("[{}]", items.join(", "))
686}
687
/// One discovered snapshot: its directory path (relative to the parent
/// location, with a trailing `/`) and the parsed manifest.
#[derive(Debug)]
struct SnapshotListEntry {
    path: String,
    manifest: Manifest,
}
693
/// Result of scanning a parent location: parseable snapshots plus the paths
/// whose `manifest.json` existed but could not be deserialized.
#[derive(Debug, Default)]
struct SnapshotScanResult {
    snapshots: Vec<SnapshotListEntry>,
    unreadable: Vec<String>,
}
699
700async fn scan_snapshots(storage: &OpenDalStorage) -> Result<SnapshotScanResult> {
701    let mut result = SnapshotScanResult::default();
702    for dir in storage.list_direct_child_dirs().await? {
703        let manifest_path = format!("{}/{}", dir.trim_matches('/'), MANIFEST_FILE);
704        let Some(data) = storage.read_file_if_exists(&manifest_path).await? else {
705            continue;
706        };
707
708        match serde_json::from_slice::<Manifest>(&data) {
709            Ok(manifest) => result.snapshots.push(SnapshotListEntry {
710                path: format!("{}/", dir.trim_matches('/')),
711                manifest,
712            }),
713            Err(_) => result
714                .unreadable
715                .push(format!("{}/", dir.trim_matches('/'))),
716        }
717    }
718
719    result
720        .snapshots
721        .sort_by_key(|entry| std::cmp::Reverse(entry.manifest.created_at));
722    result.unreadable.sort();
723    Ok(result)
724}
725
/// Print the snapshot table with fixed-width columns, preceded by a count
/// line that notes how many directories were skipped because their manifest
/// was unreadable (when any were).
fn print_snapshot_list(snapshots: &[SnapshotListEntry], unreadable_count: usize) {
    if unreadable_count == 0 {
        println!("Found {} snapshots:", snapshots.len());
    } else {
        println!(
            "Found {} snapshots ({} {} skipped: unreadable manifest):",
            snapshots.len(),
            unreadable_count,
            directory_word(unreadable_count)
        );
    }
    println!();
    // Column widths here must match the row format below.
    println!(
        "  {:<24}  {:<36}  {:<19}  {:<9}  {:<7}  {:<6}  Status",
        "Path", "ID", "Created", "Catalog", "Schemas", "Chunks"
    );
    println!(
        "  {:<24}  {:<36}  {:<19}  {:<9}  {:<7}  {:<6}  {:<10}",
        "-".repeat(24),
        "-".repeat(36),
        "-".repeat(19),
        "-".repeat(9),
        "-".repeat(7),
        "-".repeat(6),
        "-".repeat(10)
    );
    for entry in snapshots {
        let manifest = &entry.manifest;
        println!(
            "  {:<24}  {:<36}  {:<19}  {:<9}  {:<7}  {:<6}  {}",
            entry.path,
            manifest.snapshot_id,
            manifest.created_at.format("%Y-%m-%d %H:%M:%S"),
            manifest.catalog,
            manifest.schemas.len(),
            format_list_chunks(manifest),
            snapshot_status(manifest)
        );
    }
}
766
767fn print_unreadable_warnings(unreadable: &[String]) {
768    if unreadable.is_empty() {
769        return;
770    }
771
772    println!();
773    println!(
774        "Warning: {} {} had corrupt/unreadable manifest.json:",
775        unreadable.len(),
776        directory_word(unreadable.len())
777    );
778    for path in unreadable {
779        println!("  - {}", path);
780    }
781}
782
/// Pluralize "directory" for user-facing counts.
fn directory_word(count: usize) -> &'static str {
    match count {
        1 => "directory",
        _ => "directories",
    }
}
790
791fn snapshot_status(manifest: &Manifest) -> &'static str {
792    if manifest.schema_only {
793        "schema-only"
794    } else if manifest.is_complete() {
795        "complete"
796    } else {
797        "incomplete"
798    }
799}
800
801fn format_list_chunks(manifest: &Manifest) -> String {
802    let total = manifest.chunks.len();
803    if total == 0 {
804        return "0".to_string();
805    }
806
807    format!(
808        "{}/{}",
809        manifest.completed_count() + manifest.skipped_count(),
810        total
811    )
812}
813
814#[cfg(test)]
815mod tests {
816    use chrono::TimeZone;
817    use clap::Parser;
818    use tempfile::tempdir;
819    use url::Url;
820
821    use super::*;
822    use crate::data::path::ddl_path_for_schema;
823
    // DDL files land under schema/ddl/, with path-unsafe characters
    // percent-encoded (e.g. the "../evil" traversal attempt below).
    #[test]
    fn test_ddl_path_for_schema() {
        assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql");
        assert_eq!(
            ddl_path_for_schema("../evil"),
            "schema/ddl/%2E%2E%2Fevil.sql"
        );
    }
832
    // Statement order in the script: database first, then physical tables,
    // then plain tables, then views.
    #[test]
    fn test_build_schema_ddl_order() {
        let ddl = build_schema_ddl(
            "public",
            "CREATE DATABASE public;\n".to_string(),
            vec!["PHYSICAL;\n".to_string()],
            vec!["TABLE;\n".to_string()],
            vec!["VIEW;\n".to_string()],
        );

        let db_pos = ddl.find("CREATE DATABASE").unwrap();
        let physical_pos = ddl.find("PHYSICAL;").unwrap();
        let table_pos = ddl.find("TABLE;").unwrap();
        let view_pos = ddl.find("VIEW;").unwrap();
        assert!(db_pos < physical_pos);
        assert!(physical_pos < table_pos);
        assert!(table_pos < view_pos);
    }
851
    // --chunk-time-window without --start-time/--end-time must be rejected
    // at build time, before any connection is attempted.
    #[tokio::test]
    async fn test_build_rejects_chunk_window_without_bounds() {
        let cmd = ExportCreateCommand::parse_from([
            "export-v2-create",
            "--addr",
            "127.0.0.1:4000",
            "--to",
            "file:///tmp/export-v2-test",
            "--chunk-time-window",
            "1h",
        ]);

        let result = cmd.build().await;
        assert!(result.is_err());
        let error = result.err().unwrap().to_string();

        assert!(error.contains("chunk_time_window requires both --start-time and --end-time"));
    }
870
    // --schema-only must reject every data-export flag and name each
    // offending argument in a single error message.
    #[tokio::test]
    async fn test_build_rejects_data_export_args_in_schema_only_mode() {
        let cmd = ExportCreateCommand::parse_from([
            "export-v2-create",
            "--addr",
            "127.0.0.1:4000",
            "--to",
            "file:///tmp/export-v2-test",
            "--schema-only",
            "--start-time",
            "2024-01-01T00:00:00Z",
            "--end-time",
            "2024-01-02T00:00:00Z",
            "--chunk-time-window",
            "1h",
            "--format",
            "csv",
            "--parallelism",
            "2",
        ]);

        let error = cmd.build().await.err().unwrap().to_string();

        assert!(error.contains("--schema-only cannot be used with data export arguments"));
        assert!(error.contains("--start-time"));
        assert!(error.contains("--end-time"));
        assert!(error.contains("--chunk-time-window"));
        assert!(error.contains("--format"));
        assert!(error.contains("--parallelism"));
    }
901
    // The mode-mismatch error spells out both the existing and the
    // requested schema-only flags.
    #[test]
    fn test_schema_only_mode_mismatch_error_message() {
        let error = crate::data::export_v2::error::SchemaOnlyModeMismatchSnafu {
            existing_schema_only: false,
            requested_schema_only: true,
        }
        .build()
        .to_string();

        assert!(error.contains("existing: false"));
        assert!(error.contains("requested: true"));
    }
914
    // Resuming with a catalog different from the manifest's must fail with
    // an error that names the mismatching field.
    #[test]
    fn test_validate_resume_config_rejects_catalog_mismatch() {
        let manifest = Manifest::new_for_export(
            "greptime".to_string(),
            vec!["public".to_string()],
            false,
            TimeRange::unbounded(),
            DataFormat::Parquet,
            None,
        )
        .unwrap();
        let config = ExportConfig {
            catalog: "other".to_string(),
            schemas: None,
            schema_only: false,
            format: DataFormat::Parquet,
            force: false,
            time_range: TimeRange::unbounded(),
            chunk_time_window: None,
            parallelism: 1,
            snapshot_uri: "file:///tmp/snapshot".to_string(),
            storage_config: ObjectStoreConfig::default(),
        };

        let error = validate_resume_config(&manifest, &config)
            .err()
            .unwrap()
            .to_string();
        assert!(error.contains("catalog"));
    }
945
946    #[test]
947    fn test_validate_resume_config_accepts_schema_selection_with_different_case_and_order() {
948        let manifest = Manifest::new_for_export(
949            "greptime".to_string(),
950            vec!["public".to_string(), "analytics".to_string()],
951            false,
952            TimeRange::unbounded(),
953            DataFormat::Parquet,
954            None,
955        )
956        .unwrap();
957        let config = ExportConfig {
958            catalog: "greptime".to_string(),
959            schemas: Some(vec![
960                "ANALYTICS".to_string(),
961                "PUBLIC".to_string(),
962                "public".to_string(),
963            ]),
964            schema_only: false,
965            format: DataFormat::Parquet,
966            force: false,
967            time_range: TimeRange::unbounded(),
968            chunk_time_window: None,
969            parallelism: 1,
970            snapshot_uri: "file:///tmp/snapshot".to_string(),
971            storage_config: ObjectStoreConfig::default(),
972        };
973
974        assert!(validate_resume_config(&manifest, &config).is_ok());
975    }
976
977    #[test]
978    fn test_validate_resume_config_rejects_chunk_plan_mismatch() {
979        let start = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
980        let end = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 2, 0, 0).unwrap();
981        let time_range = TimeRange::new(Some(start), Some(end));
982        let manifest = Manifest::new_for_export(
983            "greptime".to_string(),
984            vec!["public".to_string()],
985            false,
986            time_range.clone(),
987            DataFormat::Parquet,
988            None,
989        )
990        .unwrap();
991        let config = ExportConfig {
992            catalog: "greptime".to_string(),
993            schemas: None,
994            schema_only: false,
995            format: DataFormat::Parquet,
996            force: false,
997            time_range,
998            chunk_time_window: Some(Duration::from_secs(3600)),
999            parallelism: 1,
1000            snapshot_uri: "file:///tmp/snapshot".to_string(),
1001            storage_config: ObjectStoreConfig::default(),
1002        };
1003
1004        let error = validate_resume_config(&manifest, &config)
1005            .err()
1006            .unwrap()
1007            .to_string();
1008        assert!(error.contains("chunk plan"));
1009    }
1010
1011    #[test]
1012    fn test_validate_resume_config_rejects_format_mismatch() {
1013        let manifest = Manifest::new_for_export(
1014            "greptime".to_string(),
1015            vec!["public".to_string()],
1016            false,
1017            TimeRange::unbounded(),
1018            DataFormat::Parquet,
1019            None,
1020        )
1021        .unwrap();
1022        let config = ExportConfig {
1023            catalog: "greptime".to_string(),
1024            schemas: None,
1025            schema_only: false,
1026            format: DataFormat::Csv,
1027            force: false,
1028            time_range: TimeRange::unbounded(),
1029            chunk_time_window: None,
1030            parallelism: 1,
1031            snapshot_uri: "file:///tmp/snapshot".to_string(),
1032            storage_config: ObjectStoreConfig::default(),
1033        };
1034
1035        let error = validate_resume_config(&manifest, &config)
1036            .err()
1037            .unwrap()
1038            .to_string();
1039        assert!(error.contains("format"));
1040    }
1041
1042    #[test]
1043    fn test_validate_resume_config_rejects_time_range_mismatch() {
1044        let start = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
1045        let end = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 1, 0, 0).unwrap();
1046        let manifest = Manifest::new_for_export(
1047            "greptime".to_string(),
1048            vec!["public".to_string()],
1049            false,
1050            TimeRange::new(Some(start), Some(end)),
1051            DataFormat::Parquet,
1052            None,
1053        )
1054        .unwrap();
1055        let config = ExportConfig {
1056            catalog: "greptime".to_string(),
1057            schemas: None,
1058            schema_only: false,
1059            format: DataFormat::Parquet,
1060            force: false,
1061            time_range: TimeRange::new(Some(start), Some(start)),
1062            chunk_time_window: None,
1063            parallelism: 1,
1064            snapshot_uri: "file:///tmp/snapshot".to_string(),
1065            storage_config: ObjectStoreConfig::default(),
1066        };
1067
1068        let error = validate_resume_config(&manifest, &config)
1069            .err()
1070            .unwrap()
1071            .to_string();
1072        assert!(error.contains("time_range"));
1073    }
1074
1075    #[tokio::test]
1076    async fn test_scan_snapshots_sorts_and_tracks_unreadable_manifests() {
1077        let dir = tempdir().unwrap();
1078        write_test_manifest(
1079            dir.path(),
1080            "older",
1081            test_manifest(
1082                chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1083                false,
1084                true,
1085            ),
1086        );
1087        write_test_manifest(
1088            dir.path(),
1089            "newer",
1090            test_manifest(
1091                chrono::Utc.with_ymd_and_hms(2026, 2, 1, 0, 0, 0).unwrap(),
1092                false,
1093                true,
1094            ),
1095        );
1096
1097        std::fs::create_dir_all(dir.path().join("empty-dir")).unwrap();
1098        std::fs::create_dir_all(dir.path().join("not-snapshot")).unwrap();
1099        std::fs::write(dir.path().join("not-snapshot").join("data.txt"), "x").unwrap();
1100        std::fs::create_dir_all(dir.path().join("broken")).unwrap();
1101        std::fs::write(dir.path().join("broken").join(MANIFEST_FILE), "{not-json").unwrap();
1102
1103        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
1104        let storage = OpenDalStorage::from_file_uri(&uri).unwrap();
1105        let result = scan_snapshots(&storage).await.unwrap();
1106
1107        assert_eq!(result.snapshots.len(), 2);
1108        assert_eq!(
1109            result.snapshots[0].manifest.created_at,
1110            chrono::Utc.with_ymd_and_hms(2026, 2, 1, 0, 0, 0).unwrap()
1111        );
1112        assert_eq!(
1113            result.snapshots[1].manifest.created_at,
1114            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap()
1115        );
1116        assert_eq!(result.unreadable, vec!["broken/".to_string()]);
1117        assert_eq!(result.snapshots[0].path, "newer/");
1118        assert_eq!(result.snapshots[1].path, "older/");
1119    }
1120
1121    #[test]
1122    fn test_snapshot_list_status_and_chunk_summary() {
1123        let schema_only = test_manifest(
1124            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1125            true,
1126            true,
1127        );
1128        assert_eq!(snapshot_status(&schema_only), "schema-only");
1129        assert_eq!(format_list_chunks(&schema_only), "0");
1130
1131        let complete = test_manifest(
1132            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1133            false,
1134            true,
1135        );
1136        assert_eq!(snapshot_status(&complete), "complete");
1137        assert_eq!(format_list_chunks(&complete), "2/2");
1138
1139        let incomplete = test_manifest(
1140            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1141            false,
1142            false,
1143        );
1144        assert_eq!(snapshot_status(&incomplete), "incomplete");
1145        assert_eq!(format_list_chunks(&incomplete), "1/2");
1146    }
1147
1148    fn write_test_manifest(root: &std::path::Path, dir: &str, manifest: Manifest) {
1149        let snapshot_dir = root.join(dir);
1150        std::fs::create_dir_all(&snapshot_dir).unwrap();
1151        std::fs::write(
1152            snapshot_dir.join(MANIFEST_FILE),
1153            serde_json::to_vec_pretty(&manifest).unwrap(),
1154        )
1155        .unwrap();
1156    }
1157
1158    fn test_manifest(
1159        created_at: chrono::DateTime<chrono::Utc>,
1160        schema_only: bool,
1161        complete: bool,
1162    ) -> Manifest {
1163        let mut manifest = Manifest::new_for_export(
1164            "greptime".to_string(),
1165            vec!["public".to_string(), "analytics".to_string()],
1166            schema_only,
1167            TimeRange::unbounded(),
1168            DataFormat::Parquet,
1169            None,
1170        )
1171        .unwrap();
1172        manifest.created_at = created_at;
1173        manifest.updated_at = created_at;
1174
1175        if !schema_only {
1176            manifest.chunks.clear();
1177            let mut first = ChunkMeta::new(1, TimeRange::unbounded());
1178            first.mark_completed(vec!["data/public/chunk_1/file.parquet".to_string()], None);
1179            manifest.chunks.push(first);
1180
1181            if complete {
1182                manifest
1183                    .chunks
1184                    .push(ChunkMeta::skipped(2, TimeRange::unbounded()));
1185            } else {
1186                manifest
1187                    .chunks
1188                    .push(ChunkMeta::new(2, TimeRange::unbounded()));
1189            }
1190        }
1191
1192        manifest
1193    }
1194}