Skip to main content

cli/data/export_v2/
command.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Export V2 CLI commands.
16
17use std::collections::HashSet;
18use std::io::{self, Write};
19use std::time::Duration;
20
21use async_trait::async_trait;
22use clap::{Parser, Subcommand};
23use common_error::ext::BoxedError;
24use common_telemetry::info;
25use serde_json::Value;
26use snafu::{OptionExt, ResultExt};
27
28use crate::Tool;
29use crate::common::ObjectStoreConfig;
30use crate::data::export_v2::coordinator::export_data;
31use crate::data::export_v2::error::{
32    ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu, IoSnafu,
33    ManifestVersionMismatchSnafu, Result, ResumeConfigMismatchSnafu, SchemaOnlyArgsNotAllowedSnafu,
34    SchemaOnlyModeMismatchSnafu, SnapshotVerifyFailedSnafu, UnexpectedValueTypeSnafu,
35};
36use crate::data::export_v2::extractor::SchemaExtractor;
37use crate::data::export_v2::manifest::{
38    ChunkMeta, ChunkStatus, DataFormat, MANIFEST_FILE, MANIFEST_VERSION, Manifest, TimeRange,
39};
40use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR, SCHEMAS_FILE};
41use crate::data::path::{data_dir_for_schema_chunk, ddl_path_for_schema};
42use crate::data::snapshot_storage::{
43    OpenDalStorage, SnapshotStorage, validate_snapshot_uri, validate_uri,
44};
45use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
46use crate::database::{DatabaseClient, parse_proxy_opts};
47
// NOTE: this enum derives clap's `Subcommand`, so the `///` comments on the
// enum and on each variant are part of the CLI surface (clap derive uses doc
// comments as help text). Keep maintainer-only notes in `//` comments.
// Each variant wraps the argument struct of one subcommand; dispatch to the
// matching builder happens in `ExportV2Command::build`.
/// Export V2 commands.
#[derive(Debug, Subcommand)]
pub enum ExportV2Command {
    /// Create a new snapshot.
    Create(ExportCreateCommand),
    /// List snapshots under a parent location.
    List(ExportListCommand),
    /// Verify snapshot integrity.
    Verify(ExportVerifyCommand),
    /// Delete a snapshot and all data under it.
    Delete(ExportDeleteCommand),
}
60
61impl ExportV2Command {
62    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
63        match self {
64            ExportV2Command::Create(cmd) => cmd.build().await,
65            ExportV2Command::List(cmd) => cmd.build().await,
66            ExportV2Command::Verify(cmd) => cmd.build().await,
67            ExportV2Command::Delete(cmd) => cmd.build().await,
68        }
69    }
70}
71
// CLI arguments of the `list` subcommand. The `///` comments below are
// consumed by clap derive as help text, so maintainer notes use `//`.
/// List snapshots under a parent location.
#[derive(Debug, Parser)]
pub struct ExportListCommand {
    // Validated with `validate_uri` and opened via
    // `OpenDalStorage::from_parent_uri` in `build`.
    /// Parent storage location whose direct subdirectories are snapshots.
    #[clap(long)]
    location: String,

    // Flattened: the fields of `ObjectStoreConfig` become flags of this command.
    /// Object store configuration for remote storage backends.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
83
84impl ExportListCommand {
85    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
86        validate_uri(&self.location).map_err(BoxedError::new)?;
87        let storage = OpenDalStorage::from_parent_uri(&self.location, &self.storage)
88            .map_err(BoxedError::new)?;
89
90        Ok(Box::new(ExportList {
91            location: self.location.clone(),
92            storage,
93        }))
94    }
95}
96
/// Export list tool implementation.
///
/// Built by `ExportListCommand::build`; running it scans the storage for
/// snapshot manifests and prints a table of what it found.
pub struct ExportList {
    /// The `--location` value as given on the command line; only used for the
    /// "Scanning: ..." output line.
    location: String,
    /// Storage opened with `OpenDalStorage::from_parent_uri` for `location`.
    storage: OpenDalStorage,
}
102
103#[async_trait]
104impl Tool for ExportList {
105    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
106        self.run().await.map_err(BoxedError::new)
107    }
108}
109
110impl ExportList {
111    async fn run(&self) -> Result<()> {
112        let result = scan_snapshots(&self.storage).await?;
113
114        println!("Scanning: {}", self.location);
115        if result.snapshots.is_empty() {
116            println!("No snapshots found.");
117        } else {
118            print_snapshot_list(&result.snapshots, result.unreadable.len());
119        }
120        print_unreadable_warnings(&result.unreadable);
121
122        Ok(())
123    }
124}
125
// CLI arguments of the `verify` subcommand. The `///` comments below are
// consumed by clap derive as help text, so maintainer notes use `//`.
/// Verify snapshot integrity.
#[derive(Debug, Parser)]
pub struct ExportVerifyCommand {
    // Validated with `validate_uri` and opened via `OpenDalStorage::from_uri`
    // in `build`.
    /// Snapshot storage location (e.g., s3://bucket/path, file:///tmp/backup).
    #[clap(long)]
    snapshot: String,

    // Flattened: the fields of `ObjectStoreConfig` become flags of this command.
    /// Object store configuration for remote storage backends.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
137
138impl ExportVerifyCommand {
139    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
140        validate_uri(&self.snapshot).map_err(BoxedError::new)?;
141        let storage =
142            OpenDalStorage::from_uri(&self.snapshot, &self.storage).map_err(BoxedError::new)?;
143
144        Ok(Box::new(ExportVerify {
145            snapshot: self.snapshot.clone(),
146            storage,
147        }))
148    }
149}
150
/// Export verify tool implementation.
///
/// Built by `ExportVerifyCommand::build`; running it verifies the snapshot,
/// prints the report and fails when the report contains any problem.
pub struct ExportVerify {
    /// The `--snapshot` value as given on the command line; passed to the
    /// report printer.
    snapshot: String,
    /// Storage opened with `OpenDalStorage::from_uri` for `snapshot`.
    storage: OpenDalStorage,
}
156
157#[async_trait]
158impl Tool for ExportVerify {
159    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
160        self.run().await.map_err(BoxedError::new)
161    }
162}
163
164impl ExportVerify {
165    async fn run(&self) -> Result<()> {
166        let report = verify_snapshot(&self.storage).await?;
167        print_verify_report(&self.snapshot, &report);
168
169        if report.has_problems() {
170            return SnapshotVerifyFailedSnafu {
171                errors: report.error_count(),
172                warnings: report.warning_count(),
173            }
174            .fail();
175        }
176
177        Ok(())
178    }
179}
180
// CLI arguments of the `delete` subcommand. The `///` comments below are
// consumed by clap derive as help text, so maintainer notes use `//`.
/// Delete a snapshot and all data under it.
#[derive(Debug, Parser)]
pub struct ExportDeleteCommand {
    // Checked with `validate_snapshot_uri` (not plain `validate_uri`) in
    // `build`, then opened via `OpenDalStorage::from_uri`.
    /// Snapshot storage location (e.g., s3://bucket/path, file:///tmp/backup).
    #[clap(long)]
    snapshot: String,

    // Exposed as `--no-confirm`, with `yes` registered as an alias.
    /// Skip interactive confirmation.
    #[clap(long = "no-confirm", alias = "yes")]
    skip_confirmation: bool,

    // Flattened: the fields of `ObjectStoreConfig` become flags of this command.
    /// Object store configuration for remote storage backends.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
196
197impl ExportDeleteCommand {
198    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
199        validate_snapshot_uri(&self.snapshot).map_err(BoxedError::new)?;
200        let storage =
201            OpenDalStorage::from_uri(&self.snapshot, &self.storage).map_err(BoxedError::new)?;
202
203        Ok(Box::new(ExportDelete {
204            snapshot: self.snapshot.clone(),
205            skip_confirmation: self.skip_confirmation,
206            storage,
207        }))
208    }
209}
210
/// Export delete tool implementation.
///
/// Built by `ExportDeleteCommand::build`; running it prints a summary of the
/// snapshot, optionally asks for confirmation, and then deletes the snapshot.
pub struct ExportDelete {
    /// The `--snapshot` value as given on the command line; shown in the
    /// summary and handed to the confirmation callback.
    snapshot: String,
    /// When `true`, the confirmation callback is never invoked.
    skip_confirmation: bool,
    /// Storage opened with `OpenDalStorage::from_uri` for `snapshot`.
    storage: OpenDalStorage,
}
217
218#[async_trait]
219impl Tool for ExportDelete {
220    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
221        self.run().await.map_err(BoxedError::new)
222    }
223}
224
225impl ExportDelete {
226    async fn run(&self) -> Result<()> {
227        self.run_with_confirmation(confirm_delete).await
228    }
229
230    async fn run_with_confirmation<F>(&self, confirm: F) -> Result<()>
231    where
232        F: FnOnce(&str) -> Result<bool>,
233    {
234        let manifest = self.storage.read_manifest().await?;
235        print_delete_summary(&self.snapshot, &manifest);
236
237        if !self.skip_confirmation && !confirm(&self.snapshot)? {
238            println!("Deletion cancelled.");
239            return Ok(());
240        }
241
242        println!("Deleting snapshot...");
243        self.storage.delete_snapshot().await?;
244        println!("Snapshot deleted successfully.");
245
246        Ok(())
247    }
248}
249
// CLI arguments of the `create` subcommand. The `///` comments below are
// consumed by clap derive as help text, so maintainer notes use `//`.
// Cross-field validation (chunk window vs. time bounds, schema-only vs. data
// flags) is not expressed in the attributes; it lives in `build`.
/// Create a new snapshot.
#[derive(Debug, Parser)]
pub struct ExportCreateCommand {
    // Passed to `DatabaseClient::new` as the server address.
    /// Server address to connect (e.g., 127.0.0.1:4000).
    #[clap(long)]
    addr: String,

    // Validated with `validate_uri`, used to open the storage, and recorded as
    // the snapshot URI in the export config.
    /// Target storage location (e.g., s3://bucket/path, file:///tmp/backup).
    #[clap(long)]
    to: String,

    /// Catalog name.
    #[clap(long, default_value = "greptime")]
    catalog: String,

    // An empty list is turned into `None` by `build`.
    /// Schema list to export (default: all non-system schemas).
    /// Can be specified multiple times or comma-separated.
    #[clap(long, value_delimiter = ',')]
    schemas: Vec<String>,

    // `build` rejects this flag together with --start-time, --end-time,
    // --chunk-time-window, a non-parquet --format, or a --parallelism other than 1.
    /// Export schema only, no data.
    #[clap(long)]
    schema_only: bool,

    /// Time range start (ISO 8601 format, e.g., 2024-01-01T00:00:00Z).
    #[clap(long)]
    start_time: Option<String>,

    /// Time range end (ISO 8601 format, e.g., 2024-12-31T23:59:59Z).
    #[clap(long)]
    end_time: Option<String>,

    // `build` fails with `ChunkTimeWindowRequiresBounds` when this is set and
    // the parsed time range is not bounded.
    /// Chunk time window (e.g., 1h, 6h, 1d, 7d).
    /// Requires both --start-time and --end-time when specified.
    #[clap(long, value_parser = humantime::parse_duration)]
    chunk_time_window: Option<Duration>,

    // The schema-only check in `build` compares against `DataFormat::Parquet`,
    // so an explicit `--format parquet` is indistinguishable from the default.
    /// Data format: parquet, csv, json.
    #[clap(long, value_enum, default_value = "parquet")]
    format: DataFormat,

    /// Delete existing snapshot and recreate.
    #[clap(long)]
    force: bool,

    // The schema-only check in `build` compares against 1, so an explicit
    // `--parallelism 1` is indistinguishable from the default.
    /// Parallelism for COPY DATABASE execution (server-side, per schema per chunk).
    #[clap(long, default_value = "1")]
    parallelism: usize,

    /// Basic authentication (user:password).
    #[clap(long)]
    auth_basic: Option<String>,

    // `build` falls back to 60 seconds when this is not given.
    /// Request timeout.
    #[clap(long, value_parser = humantime::parse_duration)]
    timeout: Option<Duration>,

    /// Proxy server address.
    ///
    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
    #[clap(long)]
    proxy: Option<String>,

    /// Disable all proxy usage (ignores `--proxy` and system proxy).
    ///
    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
    #[clap(long)]
    no_proxy: bool,

    // Flattened: the fields of `ObjectStoreConfig` become flags of this command.
    /// Object store configuration for remote storage backends.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
324
325impl ExportCreateCommand {
326    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
327        // Validate URI format
328        validate_uri(&self.to).map_err(BoxedError::new)?;
329
330        let time_range = TimeRange::parse(self.start_time.as_deref(), self.end_time.as_deref())
331            .map_err(BoxedError::new)?;
332        if self.chunk_time_window.is_some() && !time_range.is_bounded() {
333            return ChunkTimeWindowRequiresBoundsSnafu
334                .fail()
335                .map_err(BoxedError::new);
336        }
337        if self.schema_only {
338            let mut invalid_args = Vec::new();
339            if self.start_time.is_some() {
340                invalid_args.push("--start-time");
341            }
342            if self.end_time.is_some() {
343                invalid_args.push("--end-time");
344            }
345            if self.chunk_time_window.is_some() {
346                invalid_args.push("--chunk-time-window");
347            }
348            if self.format != DataFormat::Parquet {
349                invalid_args.push("--format");
350            }
351            if self.parallelism != 1 {
352                invalid_args.push("--parallelism");
353            }
354            if !invalid_args.is_empty() {
355                return SchemaOnlyArgsNotAllowedSnafu {
356                    args: invalid_args.join(", "),
357                }
358                .fail()
359                .map_err(BoxedError::new);
360            }
361        }
362
363        // Parse schemas (empty vec means all schemas)
364        let schemas = if self.schemas.is_empty() {
365            None
366        } else {
367            Some(self.schemas.clone())
368        };
369
370        // Build storage
371        let storage = OpenDalStorage::from_uri(&self.to, &self.storage).map_err(BoxedError::new)?;
372
373        // Build database client
374        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
375        let database_client = DatabaseClient::new(
376            self.addr.clone(),
377            self.catalog.clone(),
378            self.auth_basic.clone(),
379            self.timeout.unwrap_or(Duration::from_secs(60)),
380            proxy,
381            self.no_proxy,
382        );
383
384        Ok(Box::new(ExportCreate {
385            config: ExportConfig {
386                catalog: self.catalog.clone(),
387                schemas,
388                schema_only: self.schema_only,
389                format: self.format,
390                force: self.force,
391                time_range,
392                chunk_time_window: self.chunk_time_window,
393                parallelism: self.parallelism,
394                snapshot_uri: self.to.clone(),
395                storage_config: self.storage.clone(),
396            },
397            storage: Box::new(storage),
398            database_client,
399        }))
400    }
401}
402
/// Export tool implementation.
///
/// Built by `ExportCreateCommand::build`; running it creates a new snapshot,
/// or resumes or replaces an existing one.
pub struct ExportCreate {
    /// Validated export settings derived from the CLI arguments.
    config: ExportConfig,
    /// Snapshot storage, held behind the [`SnapshotStorage`] trait object.
    storage: Box<dyn SnapshotStorage>,
    /// Client used for every SQL statement issued during the export.
    database_client: DatabaseClient,
}
409
/// Export settings captured from `ExportCreateCommand` after validation.
struct ExportConfig {
    /// Catalog to export from.
    catalog: String,
    /// Requested schema filter; `None` when no `--schemas` value was given.
    schemas: Option<Vec<String>>,
    /// When `true`, only schema artifacts are written and no data is exported.
    schema_only: bool,
    /// Data file format passed to the manifest.
    format: DataFormat,
    /// When `true`, an existing snapshot is deleted instead of resumed.
    force: bool,
    /// Time range parsed from `--start-time` / `--end-time`.
    time_range: TimeRange,
    /// Optional chunk window passed to the manifest when planning chunks.
    chunk_time_window: Option<Duration>,
    /// Parallelism value forwarded to `export_data`.
    parallelism: usize,
    /// The `--to` location, forwarded to `export_data`.
    snapshot_uri: String,
    /// Object store settings, forwarded to `export_data`.
    storage_config: ObjectStoreConfig,
}
422
423#[async_trait]
424impl Tool for ExportCreate {
425    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
426        self.run().await.map_err(BoxedError::new)
427    }
428}
429
430impl ExportCreate {
431    async fn run(&self) -> Result<()> {
432        // 1. Check if snapshot exists
433        let exists = self.storage.exists().await?;
434
435        if exists {
436            if self.config.force {
437                info!("Deleting existing snapshot (--force)");
438                self.storage.delete_snapshot().await?;
439            } else {
440                // Resume mode - read existing manifest
441                let mut manifest = self.storage.read_manifest().await?;
442
443                // Check version compatibility
444                if manifest.version != MANIFEST_VERSION {
445                    return ManifestVersionMismatchSnafu {
446                        expected: MANIFEST_VERSION,
447                        found: manifest.version,
448                    }
449                    .fail();
450                }
451
452                validate_resume_config(&manifest, &self.config)?;
453
454                info!(
455                    "Resuming existing snapshot: {} (completed: {}/{} chunks)",
456                    manifest.snapshot_id,
457                    manifest.completed_count(),
458                    manifest.chunks.len()
459                );
460
461                if manifest.is_complete() {
462                    info!("Snapshot is already complete");
463                    return Ok(());
464                }
465
466                if manifest.schema_only {
467                    return Ok(());
468                }
469
470                export_data(
471                    self.storage.as_ref(),
472                    &self.database_client,
473                    &self.config.snapshot_uri,
474                    &self.config.storage_config,
475                    &mut manifest,
476                    self.config.parallelism,
477                )
478                .await?;
479                return Ok(());
480            }
481        }
482
483        // 2. Get schema list
484        let extractor = SchemaExtractor::new(&self.database_client, &self.config.catalog);
485        let schema_snapshot = extractor.extract(self.config.schemas.as_deref()).await?;
486
487        let schema_names: Vec<String> = schema_snapshot
488            .schemas
489            .iter()
490            .map(|s| s.name.clone())
491            .collect();
492        info!("Exporting schemas: {:?}", schema_names);
493
494        // 3. Create manifest
495        let mut manifest = Manifest::new_for_export(
496            self.config.catalog.clone(),
497            schema_names.clone(),
498            self.config.schema_only,
499            self.config.time_range.clone(),
500            self.config.format,
501            self.config.chunk_time_window,
502        )?;
503
504        // 4. Write schema files
505        self.storage.write_schema(&schema_snapshot).await?;
506        info!("Exported {} schemas", schema_snapshot.schemas.len());
507
508        // 5. Export DDL files for import recovery.
509        let ddl_by_schema = self.build_ddl_by_schema(&schema_names).await?;
510        for (schema, ddl) in ddl_by_schema {
511            let ddl_path = ddl_path_for_schema(&schema);
512            self.storage.write_text(&ddl_path, &ddl).await?;
513            info!("Exported DDL for schema {} to {}", schema, ddl_path);
514        }
515
516        // 6. Write manifest after schema artifacts and before any data export.
517        //
518        // The manifest is the snapshot commit point: only write it after the schema
519        // index and all DDL files are durable, so a crash cannot leave a "valid"
520        // snapshot that is missing required schema artifacts. For full exports we
521        // still need the manifest before data copy starts, because chunk resume is
522        // tracked by updating this manifest in place.
523        self.storage.write_manifest(&manifest).await?;
524        info!("Snapshot created: {}", manifest.snapshot_id);
525
526        if !self.config.schema_only {
527            export_data(
528                self.storage.as_ref(),
529                &self.database_client,
530                &self.config.snapshot_uri,
531                &self.config.storage_config,
532                &mut manifest,
533                self.config.parallelism,
534            )
535            .await?;
536        }
537
538        Ok(())
539    }
540
541    async fn build_ddl_by_schema(&self, schema_names: &[String]) -> Result<Vec<(String, String)>> {
542        let mut schemas = schema_names.to_vec();
543        schemas.sort();
544
545        let mut ddl_by_schema = Vec::with_capacity(schemas.len());
546        for schema in schemas {
547            let create_database = self.show_create("DATABASE", &schema, None).await?;
548
549            let (mut physical_tables, mut tables, mut views) =
550                self.get_schema_objects(&schema).await?;
551            physical_tables.sort();
552            let mut physical_ddls = Vec::with_capacity(physical_tables.len());
553            for table in physical_tables {
554                physical_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?);
555            }
556
557            tables.sort();
558            let mut table_ddls = Vec::with_capacity(tables.len());
559            for table in tables {
560                table_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?);
561            }
562
563            views.sort();
564            let mut view_ddls = Vec::with_capacity(views.len());
565            for view in views {
566                view_ddls.push(self.show_create("VIEW", &schema, Some(&view)).await?);
567            }
568
569            let ddl = build_schema_ddl(
570                &schema,
571                create_database,
572                physical_ddls,
573                table_ddls,
574                view_ddls,
575            );
576            ddl_by_schema.push((schema, ddl));
577        }
578
579        Ok(ddl_by_schema)
580    }
581
582    async fn get_schema_objects(
583        &self,
584        schema: &str,
585    ) -> Result<(Vec<String>, Vec<String>, Vec<String>)> {
586        let physical_tables = self.get_metric_physical_tables(schema).await?;
587        let physical_set: HashSet<&str> = physical_tables.iter().map(String::as_str).collect();
588        let sql = format!(
589            "SELECT table_name, table_type FROM information_schema.tables \
590             WHERE table_catalog = '{}' AND table_schema = '{}' \
591             AND (table_type = 'BASE TABLE' OR table_type = 'VIEW')",
592            escape_sql_literal(&self.config.catalog),
593            escape_sql_literal(schema)
594        );
595        let records: Option<Vec<Vec<Value>>> = self
596            .database_client
597            .sql_in_public(&sql)
598            .await
599            .context(DatabaseSnafu)?;
600
601        let mut tables = Vec::new();
602        let mut views = Vec::new();
603        if let Some(rows) = records {
604            for row in rows {
605                let name = match row.first() {
606                    Some(Value::String(name)) => name.clone(),
607                    _ => return UnexpectedValueTypeSnafu.fail(),
608                };
609                let table_type = match row.get(1) {
610                    Some(Value::String(table_type)) => table_type.as_str(),
611                    _ => return UnexpectedValueTypeSnafu.fail(),
612                };
613                if !physical_set.contains(name.as_str()) {
614                    if table_type == "VIEW" {
615                        views.push(name);
616                    } else {
617                        tables.push(name);
618                    }
619                }
620            }
621        }
622
623        Ok((physical_tables, tables, views))
624    }
625
626    async fn get_metric_physical_tables(&self, schema: &str) -> Result<Vec<String>> {
627        let sql = format!(
628            "SELECT DISTINCT table_name FROM information_schema.columns \
629             WHERE table_catalog = '{}' AND table_schema = '{}' AND column_name = '__tsid'",
630            escape_sql_literal(&self.config.catalog),
631            escape_sql_literal(schema)
632        );
633        let records: Option<Vec<Vec<Value>>> = self
634            .database_client
635            .sql_in_public(&sql)
636            .await
637            .context(DatabaseSnafu)?;
638
639        let mut tables = HashSet::new();
640        if let Some(rows) = records {
641            for row in rows {
642                let name = match row.first() {
643                    Some(Value::String(name)) => name.clone(),
644                    _ => return UnexpectedValueTypeSnafu.fail(),
645                };
646                tables.insert(name);
647            }
648        }
649
650        Ok(tables.into_iter().collect())
651    }
652
653    async fn show_create(
654        &self,
655        show_type: &str,
656        schema: &str,
657        table: Option<&str>,
658    ) -> Result<String> {
659        let sql = match table {
660            Some(table) => format!(
661                r#"SHOW CREATE {} "{}"."{}"."{}""#,
662                show_type,
663                escape_sql_identifier(&self.config.catalog),
664                escape_sql_identifier(schema),
665                escape_sql_identifier(table)
666            ),
667            None => format!(
668                r#"SHOW CREATE {} "{}"."{}""#,
669                show_type,
670                escape_sql_identifier(&self.config.catalog),
671                escape_sql_identifier(schema)
672            ),
673        };
674
675        let records: Option<Vec<Vec<Value>>> = self
676            .database_client
677            .sql_in_public(&sql)
678            .await
679            .context(DatabaseSnafu)?;
680        let rows = records.context(EmptyResultSnafu)?;
681        let row = rows.first().context(EmptyResultSnafu)?;
682        let Some(Value::String(create)) = row.get(1) else {
683            return UnexpectedValueTypeSnafu.fail();
684        };
685
686        Ok(format!("{};\n", create))
687    }
688}
689
/// Assembles the DDL script of one schema.
///
/// The script is a `-- Schema: <name>` comment line, followed by the database
/// statement, then the physical-table, table and view statements in that
/// order, and a final blank line. Statements are appended verbatim, so each
/// one is expected to bring its own terminator and newline.
fn build_schema_ddl(
    schema: &str,
    create_database: String,
    physical_tables: Vec<String>,
    tables: Vec<String>,
    views: Vec<String>,
) -> String {
    let mut script = format!("-- Schema: {}\n", schema);
    script += &create_database;

    let statements = physical_tables.iter().chain(&tables).chain(&views);
    script.extend(statements.map(String::as_str));

    script.push('\n');
    script
}
712
713fn validate_resume_config(manifest: &Manifest, config: &ExportConfig) -> Result<()> {
714    if manifest.schema_only != config.schema_only {
715        return SchemaOnlyModeMismatchSnafu {
716            existing_schema_only: manifest.schema_only,
717            requested_schema_only: config.schema_only,
718        }
719        .fail();
720    }
721
722    if manifest.catalog != config.catalog {
723        return ResumeConfigMismatchSnafu {
724            field: "catalog",
725            existing: manifest.catalog.clone(),
726            requested: config.catalog.clone(),
727        }
728        .fail();
729    }
730
731    // If no schema filter is provided on resume, inherit the existing snapshot
732    // selection instead of reinterpreting the request as "all schemas".
733    if let Some(requested_schemas) = &config.schemas
734        && !schema_selection_matches(&manifest.schemas, requested_schemas)
735    {
736        return ResumeConfigMismatchSnafu {
737            field: "schemas",
738            existing: format_schema_selection(&manifest.schemas),
739            requested: format_schema_selection(requested_schemas),
740        }
741        .fail();
742    }
743
744    if manifest.time_range != config.time_range {
745        return ResumeConfigMismatchSnafu {
746            field: "time_range",
747            existing: format!("{:?}", manifest.time_range),
748            requested: format!("{:?}", config.time_range),
749        }
750        .fail();
751    }
752
753    if manifest.format != config.format {
754        return ResumeConfigMismatchSnafu {
755            field: "format",
756            existing: manifest.format.to_string(),
757            requested: config.format.to_string(),
758        }
759        .fail();
760    }
761
762    let expected_plan = Manifest::new_for_export(
763        manifest.catalog.clone(),
764        manifest.schemas.clone(),
765        config.schema_only,
766        config.time_range.clone(),
767        config.format,
768        config.chunk_time_window,
769    )?;
770    if !chunk_plan_matches(manifest, &expected_plan) {
771        return ResumeConfigMismatchSnafu {
772            field: "chunk plan",
773            existing: format_chunk_plan(&manifest.chunks),
774            requested: format_chunk_plan(&expected_plan.chunks),
775        }
776        .fail();
777    }
778
779    Ok(())
780}
781
782fn schema_selection_matches(existing: &[String], requested: &[String]) -> bool {
783    canonical_schema_selection(existing) == canonical_schema_selection(requested)
784}
785
/// Normalizes a schema list for comparison: names are ASCII-lowercased,
/// sorted, and de-duplicated.
fn canonical_schema_selection(schemas: &[String]) -> Vec<String> {
    let mut canonical: Vec<String> = schemas
        .iter()
        .map(|schema| schema.to_ascii_lowercase())
        .collect();

    // Sorting first makes equal names adjacent, so `dedup` removes them all.
    canonical.sort();
    canonical.dedup();
    canonical
}
800
/// Renders a schema list for error messages as `[a, b, c]`.
fn format_schema_selection(schemas: &[String]) -> String {
    let mut rendered = String::from("[");
    rendered.push_str(&schemas.join(", "));
    rendered.push(']');
    rendered
}
804
805fn chunk_plan_matches(existing: &Manifest, expected: &Manifest) -> bool {
806    existing.chunks.len() == expected.chunks.len()
807        && existing
808            .chunks
809            .iter()
810            .zip(&expected.chunks)
811            .all(|(left, right)| left.id == right.id && left.time_range == right.time_range)
812}
813
814fn format_chunk_plan(chunks: &[ChunkMeta]) -> String {
815    let items = chunks
816        .iter()
817        .map(|chunk| format!("#{}:{:?}", chunk.id, chunk.time_range))
818        .collect::<Vec<_>>();
819    format!("[{}]", items.join(", "))
820}
821
/// One snapshot found by `scan_snapshots`.
#[derive(Debug)]
struct SnapshotListEntry {
    /// Directory of the snapshot relative to the scanned location, with
    /// surrounding slashes trimmed and a single trailing `/` appended.
    path: String,
    /// The manifest parsed from that directory's manifest file.
    manifest: Manifest,
}
827
/// Outcome of scanning a parent location for snapshots.
#[derive(Debug, Default)]
struct SnapshotScanResult {
    /// Snapshots whose manifest parsed, sorted newest first by `created_at`.
    snapshots: Vec<SnapshotListEntry>,
    /// Sorted paths of directories whose manifest file exists but failed to
    /// parse.
    unreadable: Vec<String>,
}
833
834async fn scan_snapshots(storage: &OpenDalStorage) -> Result<SnapshotScanResult> {
835    let mut result = SnapshotScanResult::default();
836    for dir in storage.list_direct_child_dirs().await? {
837        let manifest_path = format!("{}/{}", dir.trim_matches('/'), MANIFEST_FILE);
838        let Some(data) = storage.read_file_if_exists(&manifest_path).await? else {
839            continue;
840        };
841
842        match serde_json::from_slice::<Manifest>(&data) {
843            Ok(manifest) => result.snapshots.push(SnapshotListEntry {
844                path: format!("{}/", dir.trim_matches('/')),
845                manifest,
846            }),
847            Err(_) => result
848                .unreadable
849                .push(format!("{}/", dir.trim_matches('/'))),
850        }
851    }
852
853    result
854        .snapshots
855        .sort_by_key(|entry| std::cmp::Reverse(entry.manifest.created_at));
856    result.unreadable.sort();
857    Ok(result)
858}
859
860fn print_snapshot_list(snapshots: &[SnapshotListEntry], unreadable_count: usize) {
861    if unreadable_count == 0 {
862        println!("Found {} snapshots:", snapshots.len());
863    } else {
864        println!(
865            "Found {} snapshots ({} {} skipped: unreadable manifest):",
866            snapshots.len(),
867            unreadable_count,
868            directory_word(unreadable_count)
869        );
870    }
871    println!();
872    println!(
873        "  {:<24}  {:<36}  {:<19}  {:<9}  {:<7}  {:<6}  Status",
874        "Path", "ID", "Created", "Catalog", "Schemas", "Chunks"
875    );
876    println!(
877        "  {:<24}  {:<36}  {:<19}  {:<9}  {:<7}  {:<6}  {:<10}",
878        "-".repeat(24),
879        "-".repeat(36),
880        "-".repeat(19),
881        "-".repeat(9),
882        "-".repeat(7),
883        "-".repeat(6),
884        "-".repeat(10)
885    );
886    for entry in snapshots {
887        let manifest = &entry.manifest;
888        println!(
889            "  {:<24}  {:<36}  {:<19}  {:<9}  {:<7}  {:<6}  {}",
890            entry.path,
891            manifest.snapshot_id,
892            manifest.created_at.format("%Y-%m-%d %H:%M:%S"),
893            manifest.catalog,
894            manifest.schemas.len(),
895            format_list_chunks(manifest),
896            snapshot_status(manifest)
897        );
898    }
899}
900
901fn print_unreadable_warnings(unreadable: &[String]) {
902    if unreadable.is_empty() {
903        return;
904    }
905
906    println!();
907    println!(
908        "Warning: {} {} had corrupt/unreadable manifest.json:",
909        unreadable.len(),
910        directory_word(unreadable.len())
911    );
912    for path in unreadable {
913        println!("  - {}", path);
914    }
915}
916
/// Picks the singular or plural noun for `count` directories; only a count
/// of exactly one is singular.
fn directory_word(count: usize) -> &'static str {
    match count {
        1 => "directory",
        _ => "directories",
    }
}
924
925fn snapshot_status(manifest: &Manifest) -> &'static str {
926    if manifest.schema_only {
927        "schema-only"
928    } else if manifest.is_complete() {
929        "complete"
930    } else {
931        "incomplete"
932    }
933}
934
935fn format_list_chunks(manifest: &Manifest) -> String {
936    let total = manifest.chunks.len();
937    if total == 0 {
938        return "0".to_string();
939    }
940
941    format!(
942        "{}/{}",
943        manifest.completed_count() + manifest.skipped_count(),
944        total
945    )
946}
947
/// Severity of a problem found while verifying a snapshot.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum VerifySeverity {
    /// Rendered as `ERROR` in the verification report.
    Error,
    /// Rendered as `WARN` in the verification report.
    Warn,
}

impl VerifySeverity {
    /// Returns the upper-case tag printed in front of a problem message.
    fn as_str(self) -> &'static str {
        match self {
            Self::Warn => "WARN",
            Self::Error => "ERROR",
        }
    }
}
962
/// A single finding produced by snapshot verification.
#[derive(Debug)]
struct VerifyProblem {
    /// Whether the finding counts as an error or a warning.
    severity: VerifySeverity,
    /// Human-readable description printed in the verification report.
    message: String,
}
968
/// Per-status chunk counts taken from a manifest (see `summarize_chunks`).
#[derive(Debug, Default)]
struct VerifyChunkSummary {
    /// Number of chunks listed in the manifest.
    total: usize,
    /// Value of `Manifest::completed_count`.
    completed: usize,
    /// Value of `Manifest::skipped_count`.
    skipped: usize,
    /// Value of `Manifest::pending_count`.
    pending: usize,
    /// Value of `Manifest::in_progress_count`.
    in_progress: usize,
    /// Value of `Manifest::failed_count`.
    failed: usize,
}
978
/// Everything `verify_snapshot` learned about a snapshot; rendered by
/// `print_verify_report`.
#[derive(Debug)]
struct VerifyReport {
    /// The manifest read from the snapshot being verified.
    manifest: Manifest,
    /// Whether the schema index file (`SCHEMA_DIR/SCHEMAS_FILE`) exists.
    schema_index_exists: bool,
    /// Number of `.sql` files found under the DDL directory.
    ddl_file_count: usize,
    /// Chunk counts by status, derived from the manifest.
    chunk_summary: VerifyChunkSummary,
    /// Data-file references listed by completed chunks (valid and invalid).
    data_files_total: usize,
    /// Expected data files that were actually found in storage.
    data_files_verified: usize,
    /// Problems in the order they were detected.
    problems: Vec<VerifyProblem>,
}
989
990impl VerifyReport {
991    fn error_count(&self) -> usize {
992        self.problems
993            .iter()
994            .filter(|problem| problem.severity == VerifySeverity::Error)
995            .count()
996    }
997
998    fn warning_count(&self) -> usize {
999        self.problems
1000            .iter()
1001            .filter(|problem| problem.severity == VerifySeverity::Warn)
1002            .count()
1003    }
1004
1005    fn has_problems(&self) -> bool {
1006        !self.problems.is_empty()
1007    }
1008
1009    fn push_error(&mut self, message: impl Into<String>) {
1010        self.problems.push(VerifyProblem {
1011            severity: VerifySeverity::Error,
1012            message: message.into(),
1013        });
1014    }
1015
1016    fn push_warn(&mut self, message: impl Into<String>) {
1017        self.problems.push(VerifyProblem {
1018            severity: VerifySeverity::Warn,
1019            message: message.into(),
1020        });
1021    }
1022}
1023
1024async fn verify_snapshot(storage: &OpenDalStorage) -> Result<VerifyReport> {
1025    let manifest = storage.read_manifest().await?;
1026    let schema_index_path = format!("{}/{}", SCHEMA_DIR, SCHEMAS_FILE);
1027    let ddl_prefix = format!("{}/{}/", SCHEMA_DIR, DDL_DIR);
1028    let schema_index_exists = storage.file_exists(&schema_index_path).await?;
1029    let ddl_files: HashSet<_> = storage
1030        .list_files_recursive(&ddl_prefix)
1031        .await?
1032        .into_iter()
1033        .collect();
1034    let ddl_file_count = ddl_files
1035        .iter()
1036        .filter(|path| path.ends_with(".sql"))
1037        .count();
1038
1039    let mut report = VerifyReport {
1040        manifest,
1041        schema_index_exists,
1042        ddl_file_count,
1043        chunk_summary: VerifyChunkSummary::default(),
1044        data_files_total: 0,
1045        data_files_verified: 0,
1046        problems: Vec::new(),
1047    };
1048
1049    if report.manifest.version != MANIFEST_VERSION {
1050        report.push_error(format!(
1051            "Manifest version mismatch: expected {}, found {}",
1052            MANIFEST_VERSION, report.manifest.version
1053        ));
1054    }
1055
1056    if !report.schema_index_exists {
1057        report.push_warn(format!("Missing schema index '{}'", schema_index_path));
1058    }
1059
1060    for schema in &report.manifest.schemas {
1061        let ddl_path = ddl_path_for_schema(schema);
1062        if !ddl_files.contains(ddl_path.as_str()) {
1063            report.problems.push(VerifyProblem {
1064                severity: VerifySeverity::Error,
1065                message: format!("Schema '{}': missing DDL file '{}'", schema, ddl_path),
1066            });
1067        }
1068    }
1069
1070    report.chunk_summary = summarize_chunks(&report.manifest);
1071    if report.manifest.schema_only {
1072        let chunk_count = report.manifest.chunks.len();
1073        if chunk_count > 0 {
1074            report.push_error(format!(
1075                "Schema-only snapshot should not contain data chunks (found {})",
1076                chunk_count
1077            ));
1078        }
1079        let data_files = storage.list_files_recursive("data/").await?;
1080        // Report the lexicographically smallest path so the message is stable
1081        // regardless of listing order across backends.
1082        if let Some(path) = data_files.iter().min() {
1083            report.push_error(format!(
1084                "Schema-only snapshot should not contain data files (found '{}')",
1085                path
1086            ));
1087        }
1088    } else if report.manifest.chunks.is_empty() {
1089        report.push_error("Full snapshot should contain at least one data chunk");
1090    } else {
1091        verify_chunks_and_data_files(storage, &mut report).await?;
1092    }
1093
1094    Ok(report)
1095}
1096
1097fn summarize_chunks(manifest: &Manifest) -> VerifyChunkSummary {
1098    VerifyChunkSummary {
1099        total: manifest.chunks.len(),
1100        completed: manifest.completed_count(),
1101        skipped: manifest.skipped_count(),
1102        pending: manifest.pending_count(),
1103        in_progress: manifest.in_progress_count(),
1104        failed: manifest.failed_count(),
1105    }
1106}
1107
/// A data file declared by a completed chunk that is expected to exist in storage.
#[derive(Debug)]
struct ChunkFile {
    /// Id of the chunk whose file list contains this path.
    chunk_id: u32,
    /// Normalized path (leading `/` removed) that passed manifest path validation.
    path: String,
}
1114
/// Expected snapshot contents derived purely from the manifest (no object-store IO).
///
/// Separating planning from scanning makes it obvious which problems come from
/// the manifest alone and which require comparing against actual storage.
#[derive(Debug, Default)]
struct VerifyPlan {
    /// Valid data files declared by completed chunks; each must exist in storage.
    files_to_check: Vec<ChunkFile>,
    /// All syntactically-safe data paths declared by any chunk, regardless of
    /// status. Used as the orphan-detection baseline, so a file that is listed
    /// but fails the stricter per-chunk validation is not also reported as
    /// unexpected.
    claimed_data_files: HashSet<String>,
    /// Total data-file references in completed chunks (valid + invalid).
    data_files_total: usize,
    /// Problems detectable from the manifest alone, in detection order.
    problems: Vec<VerifyProblem>,
}
1132
/// Actual data files discovered under `data/` (the only object-store IO in
/// chunk/data-file verification).
#[derive(Debug)]
struct VerifyDataScan {
    /// Paths returned by the recursive listing of `data/`, deduplicated.
    existing_data_files: HashSet<String>,
}
1139
/// Result of reconciling the manifest plan against the storage scan.
#[derive(Debug, Default)]
struct VerifyOutcome {
    /// Carried over unchanged from [`VerifyPlan::data_files_total`].
    data_files_total: usize,
    /// Number of planned files that were present in the storage scan.
    data_files_verified: usize,
    /// Manifest-only problems first, then missing-file problems, then
    /// unexpected-file problems.
    problems: Vec<VerifyProblem>,
}
1147
1148async fn verify_chunks_and_data_files(
1149    storage: &OpenDalStorage,
1150    report: &mut VerifyReport,
1151) -> Result<()> {
1152    let plan = build_verify_plan(&report.manifest);
1153    let scan = scan_data_files(storage).await?;
1154    let outcome = reconcile_plan_with_scan(plan, &scan);
1155
1156    report.data_files_total = outcome.data_files_total;
1157    report.data_files_verified = outcome.data_files_verified;
1158    report.problems.extend(outcome.problems);
1159
1160    Ok(())
1161}
1162
1163/// Builds the expected-state plan from the manifest. Pure; performs no IO.
1164fn build_verify_plan(manifest: &Manifest) -> VerifyPlan {
1165    let mut plan = VerifyPlan::default();
1166    let mut seen_chunk_ids = HashSet::new();
1167
1168    for chunk in &manifest.chunks {
1169        if !seen_chunk_ids.insert(chunk.id) {
1170            plan.problems.push(VerifyProblem {
1171                severity: VerifySeverity::Error,
1172                message: format!("Chunk {}: duplicate chunk id", chunk.id),
1173            });
1174        }
1175        for file in &chunk.files {
1176            if let Some(path) = safe_manifest_data_file_path(file) {
1177                plan.claimed_data_files.insert(path.to_string());
1178            }
1179        }
1180
1181        match chunk.status {
1182            ChunkStatus::Completed => {
1183                if chunk.files.is_empty() {
1184                    plan.problems.push(VerifyProblem {
1185                        severity: VerifySeverity::Error,
1186                        message: format!("Chunk {}: completed chunk has no data files", chunk.id),
1187                    });
1188                    continue;
1189                }
1190                let allowed_prefixes = manifest
1191                    .schemas
1192                    .iter()
1193                    .map(|schema| data_dir_for_schema_chunk(schema, chunk.id))
1194                    .collect::<Vec<_>>();
1195                for file in &chunk.files {
1196                    plan.data_files_total += 1;
1197                    match valid_manifest_data_file_path(file, &allowed_prefixes) {
1198                        Some(path) => plan.files_to_check.push(ChunkFile {
1199                            chunk_id: chunk.id,
1200                            path: path.to_string(),
1201                        }),
1202                        None => plan.problems.push(VerifyProblem {
1203                            severity: VerifySeverity::Error,
1204                            message: format!(
1205                                "Chunk {}: invalid data file path '{}'",
1206                                chunk.id, file
1207                            ),
1208                        }),
1209                    }
1210                }
1211            }
1212            ChunkStatus::Skipped => {
1213                if !chunk.files.is_empty() {
1214                    plan.problems.push(VerifyProblem {
1215                        severity: VerifySeverity::Error,
1216                        message: format!(
1217                            "Chunk {}: skipped chunk should not list data files",
1218                            chunk.id
1219                        ),
1220                    });
1221                }
1222            }
1223            ChunkStatus::Pending => {
1224                plan.problems.push(VerifyProblem {
1225                    severity: VerifySeverity::Error,
1226                    message: format!("Chunk {}: status is 'pending'", chunk.id),
1227                });
1228            }
1229            ChunkStatus::InProgress => {
1230                plan.problems.push(VerifyProblem {
1231                    severity: VerifySeverity::Error,
1232                    message: format!("Chunk {}: status is 'in_progress'", chunk.id),
1233                });
1234            }
1235            ChunkStatus::Failed => {
1236                let reason = chunk.error.as_deref().unwrap_or("unknown error");
1237                plan.problems.push(VerifyProblem {
1238                    severity: VerifySeverity::Error,
1239                    message: format!("Chunk {}: status is 'failed' (error: {})", chunk.id, reason),
1240                });
1241            }
1242        }
1243    }
1244
1245    plan
1246}
1247
1248/// Lists all data files under `data/`. This is the only object-store IO in
1249/// chunk/data-file verification.
1250async fn scan_data_files(storage: &OpenDalStorage) -> Result<VerifyDataScan> {
1251    let existing_data_files = storage
1252        .list_files_recursive("data/")
1253        .await?
1254        .into_iter()
1255        .collect();
1256    Ok(VerifyDataScan {
1257        existing_data_files,
1258    })
1259}
1260
1261/// Reconciles the manifest plan against the storage scan. Pure; performs no IO.
1262///
1263/// Emits missing-file problems for expected files absent from storage and
1264/// unexpected-file problems for storage files no chunk claims. Unexpected files
1265/// are sorted by path so output is deterministic regardless of listing order.
1266fn reconcile_plan_with_scan(plan: VerifyPlan, scan: &VerifyDataScan) -> VerifyOutcome {
1267    let mut problems = plan.problems;
1268    let mut data_files_verified = 0;
1269
1270    for file in &plan.files_to_check {
1271        if scan.existing_data_files.contains(&file.path) {
1272            data_files_verified += 1;
1273        } else {
1274            problems.push(VerifyProblem {
1275                severity: VerifySeverity::Error,
1276                message: format!("Chunk {}: missing file '{}'", file.chunk_id, file.path),
1277            });
1278        }
1279    }
1280
1281    let mut orphans: Vec<&String> = scan
1282        .existing_data_files
1283        .iter()
1284        .filter(|path| !plan.claimed_data_files.contains(*path))
1285        .collect();
1286    orphans.sort();
1287    for path in orphans {
1288        problems.push(VerifyProblem {
1289            severity: VerifySeverity::Error,
1290            message: format!("Unexpected data file '{}' is not listed in manifest", path),
1291        });
1292    }
1293
1294    VerifyOutcome {
1295        data_files_total: plan.data_files_total,
1296        data_files_verified,
1297        problems,
1298    }
1299}
1300
/// Validates a data-file path from the manifest against the directories a
/// chunk is allowed to write to.
///
/// Returns the normalized path (leading `/` removed) when it is syntactically
/// safe (see [`safe_manifest_data_file_path`]) and lies strictly inside one of
/// `allowed_prefixes`; returns `None` otherwise.
///
/// A prefix matches only on a path-segment boundary, with or without a
/// trailing `/` on the prefix. This keeps `data/public/1` from accepting files
/// that belong to `data/public/10/`.
fn valid_manifest_data_file_path<'a>(
    path: &'a str,
    allowed_prefixes: &[String],
) -> Option<&'a str> {
    let normalized = safe_manifest_data_file_path(path)?;

    allowed_prefixes
        .iter()
        .any(|prefix| path_is_inside_dir(normalized, prefix))
        .then_some(normalized)
}

/// Returns `true` when `path` names something strictly below directory `dir`,
/// where `dir` may be written with or without a trailing `/`.
fn path_is_inside_dir(path: &str, dir: &str) -> bool {
    match path.strip_prefix(dir) {
        // `dir` already ends on a separator: anything non-empty after it is inside.
        Some(rest) if dir.ends_with('/') => !rest.is_empty(),
        // Otherwise the remainder must start a new segment (`/name...`), which
        // rules out sibling directories that merely share a textual prefix.
        Some(rest) => rest.len() > 1 && rest.starts_with('/'),
        None => false,
    }
}

/// Normalizes a manifest data-file path and rejects unsafe ones.
///
/// Leading `/` characters are stripped. The result must start with `data/` and
/// must not contain empty, `.` or `..` segments; otherwise `None` is returned.
fn safe_manifest_data_file_path(path: &str) -> Option<&str> {
    let normalized = path.trim_start_matches('/');
    if !normalized.starts_with("data/") {
        return None;
    }

    let has_unsafe_segment = normalized
        .split('/')
        .any(|segment| matches!(segment, "" | "." | ".."));

    (!has_unsafe_segment).then_some(normalized)
}
1332
1333fn print_verify_report(snapshot: &str, report: &VerifyReport) {
1334    println!("Verifying snapshot: {}", report.manifest.snapshot_id);
1335    println!("  Location:     {}", snapshot);
1336    if report.manifest.version == MANIFEST_VERSION {
1337        println!("  Manifest:     OK (version {})", report.manifest.version);
1338    } else {
1339        println!(
1340            "  Manifest:     ERROR (version {}, expected {})",
1341            report.manifest.version, MANIFEST_VERSION
1342        );
1343    }
1344    println!(
1345        "  Schema files: {}",
1346        if report.schema_index_exists {
1347            format!("OK ({})", SCHEMAS_FILE)
1348        } else {
1349            format!("WARN (missing {})", SCHEMAS_FILE)
1350        }
1351    );
1352    if report.ddl_file_count > 0 {
1353        println!("  DDL files:    {} file(s) found", report.ddl_file_count);
1354    } else {
1355        println!("  DDL files:    not present");
1356    }
1357
1358    let chunks = &report.chunk_summary;
1359    println!(
1360        "  Chunks:       {} total ({} completed, {} skipped, {} pending, {} in_progress, {} failed)",
1361        chunks.total,
1362        chunks.completed,
1363        chunks.skipped,
1364        chunks.pending,
1365        chunks.in_progress,
1366        chunks.failed
1367    );
1368
1369    if report.manifest.schema_only {
1370        println!("  Data files:   skipped (schema-only)");
1371    } else {
1372        println!(
1373            "  Data files:   {}/{} files verified",
1374            report.data_files_verified, report.data_files_total
1375        );
1376    }
1377
1378    if report.problems.is_empty() {
1379        println!();
1380        println!("Snapshot is valid.");
1381        return;
1382    }
1383
1384    println!();
1385    println!("Problems found:");
1386    for problem in &report.problems {
1387        println!("  [{}] {}", problem.severity.as_str(), problem.message);
1388    }
1389    println!();
1390    println!(
1391        "Snapshot has {} error(s), {} warning(s).",
1392        report.error_count(),
1393        report.warning_count()
1394    );
1395}
1396
1397fn print_delete_summary(snapshot: &str, manifest: &Manifest) {
1398    println!("Snapshot: {}", manifest.snapshot_id);
1399    println!("  Location: {}", snapshot);
1400    println!(
1401        "  Created:  {} UTC",
1402        manifest.created_at.format("%Y-%m-%d %H:%M:%S")
1403    );
1404    println!("  Catalog:  {}", manifest.catalog);
1405    println!("  Schemas:  {}", manifest.schemas.join(", "));
1406    println!("  Chunks:   {}", format_delete_chunks(manifest));
1407}
1408
1409fn format_delete_chunks(manifest: &Manifest) -> String {
1410    if manifest.schema_only {
1411        return "0 (schema-only)".to_string();
1412    }
1413
1414    let summary = summarize_chunks(manifest);
1415    if manifest.is_complete() {
1416        format!("{} (all processed)", summary.total)
1417    } else {
1418        format!(
1419            "{} ({} completed, {} skipped, {} pending, {} in_progress, {} failed)",
1420            summary.total,
1421            summary.completed,
1422            summary.skipped,
1423            summary.pending,
1424            summary.in_progress,
1425            summary.failed
1426        )
1427    }
1428}
1429
1430fn confirm_delete(snapshot: &str) -> Result<bool> {
1431    println!();
1432    println!(
1433        "Warning: this removes the entire snapshot directory/prefix, not only files listed in manifest."
1434    );
1435    println!("This will permanently delete all data under:");
1436    println!("  {}", display_snapshot_prefix(snapshot));
1437    print!("Type 'yes' to confirm deletion: ");
1438    io::stdout().flush().map_err(|error| {
1439        IoSnafu {
1440            operation: "flushing delete confirmation prompt",
1441            error,
1442        }
1443        .build()
1444    })?;
1445
1446    let mut input = String::new();
1447    io::stdin().read_line(&mut input).map_err(|error| {
1448        IoSnafu {
1449            operation: "reading delete confirmation",
1450            error,
1451        }
1452        .build()
1453    })?;
1454
1455    Ok(delete_confirmation_matches(&input))
1456}
1457
/// Returns `true` only when `input`, ignoring surrounding whitespace, is
/// exactly the lower-case word `yes`.
fn delete_confirmation_matches(input: &str) -> bool {
    matches!(input.trim(), "yes")
}
1461
/// Returns `snapshot` with exactly one guaranteed trailing `/`, so it reads as
/// a directory/prefix in user-facing output. Input that already ends in `/`
/// is returned unchanged.
fn display_snapshot_prefix(snapshot: &str) -> String {
    let mut prefix = snapshot.to_string();
    if !prefix.ends_with('/') {
        prefix.push('/');
    }
    prefix
}
1469
1470#[cfg(test)]
1471mod tests {
1472    use chrono::TimeZone;
1473    use clap::Parser;
1474    use tempfile::tempdir;
1475    use url::Url;
1476
1477    use super::*;
1478    use crate::data::path::ddl_path_for_schema;
1479
1480    #[test]
1481    fn test_ddl_path_for_schema() {
1482        assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql");
1483        assert_eq!(
1484            ddl_path_for_schema("../evil"),
1485            "schema/ddl/%2E%2E%2Fevil.sql"
1486        );
1487    }
1488
1489    #[test]
1490    fn test_build_schema_ddl_order() {
1491        let ddl = build_schema_ddl(
1492            "public",
1493            "CREATE DATABASE public;\n".to_string(),
1494            vec!["PHYSICAL;\n".to_string()],
1495            vec!["TABLE;\n".to_string()],
1496            vec!["VIEW;\n".to_string()],
1497        );
1498
1499        let db_pos = ddl.find("CREATE DATABASE").unwrap();
1500        let physical_pos = ddl.find("PHYSICAL;").unwrap();
1501        let table_pos = ddl.find("TABLE;").unwrap();
1502        let view_pos = ddl.find("VIEW;").unwrap();
1503        assert!(db_pos < physical_pos);
1504        assert!(physical_pos < table_pos);
1505        assert!(table_pos < view_pos);
1506    }
1507
1508    #[tokio::test]
1509    async fn test_build_rejects_chunk_window_without_bounds() {
1510        let cmd = ExportCreateCommand::parse_from([
1511            "export-v2-create",
1512            "--addr",
1513            "127.0.0.1:4000",
1514            "--to",
1515            "file:///tmp/export-v2-test",
1516            "--chunk-time-window",
1517            "1h",
1518        ]);
1519
1520        let result = cmd.build().await;
1521        assert!(result.is_err());
1522        let error = result.err().unwrap().to_string();
1523
1524        assert!(error.contains("chunk_time_window requires both --start-time and --end-time"));
1525    }
1526
1527    #[tokio::test]
1528    async fn test_build_rejects_data_export_args_in_schema_only_mode() {
1529        let cmd = ExportCreateCommand::parse_from([
1530            "export-v2-create",
1531            "--addr",
1532            "127.0.0.1:4000",
1533            "--to",
1534            "file:///tmp/export-v2-test",
1535            "--schema-only",
1536            "--start-time",
1537            "2024-01-01T00:00:00Z",
1538            "--end-time",
1539            "2024-01-02T00:00:00Z",
1540            "--chunk-time-window",
1541            "1h",
1542            "--format",
1543            "csv",
1544            "--parallelism",
1545            "2",
1546        ]);
1547
1548        let error = cmd.build().await.err().unwrap().to_string();
1549
1550        assert!(error.contains("--schema-only cannot be used with data export arguments"));
1551        assert!(error.contains("--start-time"));
1552        assert!(error.contains("--end-time"));
1553        assert!(error.contains("--chunk-time-window"));
1554        assert!(error.contains("--format"));
1555        assert!(error.contains("--parallelism"));
1556    }
1557
1558    #[test]
1559    fn test_schema_only_mode_mismatch_error_message() {
1560        let error = crate::data::export_v2::error::SchemaOnlyModeMismatchSnafu {
1561            existing_schema_only: false,
1562            requested_schema_only: true,
1563        }
1564        .build()
1565        .to_string();
1566
1567        assert!(error.contains("existing: false"));
1568        assert!(error.contains("requested: true"));
1569    }
1570
1571    #[test]
1572    fn test_validate_resume_config_rejects_catalog_mismatch() {
1573        let manifest = Manifest::new_for_export(
1574            "greptime".to_string(),
1575            vec!["public".to_string()],
1576            false,
1577            TimeRange::unbounded(),
1578            DataFormat::Parquet,
1579            None,
1580        )
1581        .unwrap();
1582        let config = ExportConfig {
1583            catalog: "other".to_string(),
1584            schemas: None,
1585            schema_only: false,
1586            format: DataFormat::Parquet,
1587            force: false,
1588            time_range: TimeRange::unbounded(),
1589            chunk_time_window: None,
1590            parallelism: 1,
1591            snapshot_uri: "file:///tmp/snapshot".to_string(),
1592            storage_config: ObjectStoreConfig::default(),
1593        };
1594
1595        let error = validate_resume_config(&manifest, &config)
1596            .err()
1597            .unwrap()
1598            .to_string();
1599        assert!(error.contains("catalog"));
1600    }
1601
1602    #[test]
1603    fn test_validate_resume_config_accepts_schema_selection_with_different_case_and_order() {
1604        let manifest = Manifest::new_for_export(
1605            "greptime".to_string(),
1606            vec!["public".to_string(), "analytics".to_string()],
1607            false,
1608            TimeRange::unbounded(),
1609            DataFormat::Parquet,
1610            None,
1611        )
1612        .unwrap();
1613        let config = ExportConfig {
1614            catalog: "greptime".to_string(),
1615            schemas: Some(vec![
1616                "ANALYTICS".to_string(),
1617                "PUBLIC".to_string(),
1618                "public".to_string(),
1619            ]),
1620            schema_only: false,
1621            format: DataFormat::Parquet,
1622            force: false,
1623            time_range: TimeRange::unbounded(),
1624            chunk_time_window: None,
1625            parallelism: 1,
1626            snapshot_uri: "file:///tmp/snapshot".to_string(),
1627            storage_config: ObjectStoreConfig::default(),
1628        };
1629
1630        assert!(validate_resume_config(&manifest, &config).is_ok());
1631    }
1632
1633    #[test]
1634    fn test_validate_resume_config_rejects_chunk_plan_mismatch() {
1635        let start = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
1636        let end = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 2, 0, 0).unwrap();
1637        let time_range = TimeRange::new(Some(start), Some(end));
1638        let manifest = Manifest::new_for_export(
1639            "greptime".to_string(),
1640            vec!["public".to_string()],
1641            false,
1642            time_range.clone(),
1643            DataFormat::Parquet,
1644            None,
1645        )
1646        .unwrap();
1647        let config = ExportConfig {
1648            catalog: "greptime".to_string(),
1649            schemas: None,
1650            schema_only: false,
1651            format: DataFormat::Parquet,
1652            force: false,
1653            time_range,
1654            chunk_time_window: Some(Duration::from_secs(3600)),
1655            parallelism: 1,
1656            snapshot_uri: "file:///tmp/snapshot".to_string(),
1657            storage_config: ObjectStoreConfig::default(),
1658        };
1659
1660        let error = validate_resume_config(&manifest, &config)
1661            .err()
1662            .unwrap()
1663            .to_string();
1664        assert!(error.contains("chunk plan"));
1665    }
1666
1667    #[test]
1668    fn test_validate_resume_config_rejects_format_mismatch() {
1669        let manifest = Manifest::new_for_export(
1670            "greptime".to_string(),
1671            vec!["public".to_string()],
1672            false,
1673            TimeRange::unbounded(),
1674            DataFormat::Parquet,
1675            None,
1676        )
1677        .unwrap();
1678        let config = ExportConfig {
1679            catalog: "greptime".to_string(),
1680            schemas: None,
1681            schema_only: false,
1682            format: DataFormat::Csv,
1683            force: false,
1684            time_range: TimeRange::unbounded(),
1685            chunk_time_window: None,
1686            parallelism: 1,
1687            snapshot_uri: "file:///tmp/snapshot".to_string(),
1688            storage_config: ObjectStoreConfig::default(),
1689        };
1690
1691        let error = validate_resume_config(&manifest, &config)
1692            .err()
1693            .unwrap()
1694            .to_string();
1695        assert!(error.contains("format"));
1696    }
1697
1698    #[test]
1699    fn test_validate_resume_config_rejects_time_range_mismatch() {
1700        let start = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
1701        let end = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 1, 0, 0).unwrap();
1702        let manifest = Manifest::new_for_export(
1703            "greptime".to_string(),
1704            vec!["public".to_string()],
1705            false,
1706            TimeRange::new(Some(start), Some(end)),
1707            DataFormat::Parquet,
1708            None,
1709        )
1710        .unwrap();
1711        let config = ExportConfig {
1712            catalog: "greptime".to_string(),
1713            schemas: None,
1714            schema_only: false,
1715            format: DataFormat::Parquet,
1716            force: false,
1717            time_range: TimeRange::new(Some(start), Some(start)),
1718            chunk_time_window: None,
1719            parallelism: 1,
1720            snapshot_uri: "file:///tmp/snapshot".to_string(),
1721            storage_config: ObjectStoreConfig::default(),
1722        };
1723
1724        let error = validate_resume_config(&manifest, &config)
1725            .err()
1726            .unwrap()
1727            .to_string();
1728        assert!(error.contains("time_range"));
1729    }
1730
1731    #[tokio::test]
1732    async fn test_scan_snapshots_sorts_and_tracks_unreadable_manifests() {
1733        let dir = tempdir().unwrap();
1734        write_test_manifest(
1735            dir.path(),
1736            "older",
1737            test_manifest(
1738                chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1739                false,
1740                true,
1741            ),
1742        );
1743        write_test_manifest(
1744            dir.path(),
1745            "newer",
1746            test_manifest(
1747                chrono::Utc.with_ymd_and_hms(2026, 2, 1, 0, 0, 0).unwrap(),
1748                false,
1749                true,
1750            ),
1751        );
1752
1753        std::fs::create_dir_all(dir.path().join("empty-dir")).unwrap();
1754        std::fs::create_dir_all(dir.path().join("not-snapshot")).unwrap();
1755        std::fs::write(dir.path().join("not-snapshot").join("data.txt"), "x").unwrap();
1756        std::fs::create_dir_all(dir.path().join("broken")).unwrap();
1757        std::fs::write(dir.path().join("broken").join(MANIFEST_FILE), "{not-json").unwrap();
1758
1759        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
1760        let storage = OpenDalStorage::from_file_uri(&uri).unwrap();
1761        let result = scan_snapshots(&storage).await.unwrap();
1762
1763        assert_eq!(result.snapshots.len(), 2);
1764        assert_eq!(
1765            result.snapshots[0].manifest.created_at,
1766            chrono::Utc.with_ymd_and_hms(2026, 2, 1, 0, 0, 0).unwrap()
1767        );
1768        assert_eq!(
1769            result.snapshots[1].manifest.created_at,
1770            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap()
1771        );
1772        assert_eq!(result.unreadable, vec!["broken/".to_string()]);
1773        assert_eq!(result.snapshots[0].path, "newer/");
1774        assert_eq!(result.snapshots[1].path, "older/");
1775    }
1776
1777    #[test]
1778    fn test_snapshot_list_status_and_chunk_summary() {
1779        let schema_only = test_manifest(
1780            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1781            true,
1782            true,
1783        );
1784        assert_eq!(snapshot_status(&schema_only), "schema-only");
1785        assert_eq!(format_list_chunks(&schema_only), "0");
1786
1787        let complete = test_manifest(
1788            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1789            false,
1790            true,
1791        );
1792        assert_eq!(snapshot_status(&complete), "complete");
1793        assert_eq!(format_list_chunks(&complete), "2/2");
1794        assert_eq!(format_delete_chunks(&complete), "2 (all processed)");
1795
1796        let incomplete = test_manifest(
1797            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1798            false,
1799            false,
1800        );
1801        assert_eq!(snapshot_status(&incomplete), "incomplete");
1802        assert_eq!(format_list_chunks(&incomplete), "1/2");
1803        assert_eq!(
1804            format_delete_chunks(&incomplete),
1805            "2 (1 completed, 0 skipped, 1 pending, 0 in_progress, 0 failed)"
1806        );
1807    }
1808
1809    #[tokio::test]
1810    async fn test_delete_build_rejects_bucket_root_uri() {
1811        let cmd = ExportDeleteCommand::parse_from([
1812            "export-v2-delete",
1813            "--snapshot",
1814            "s3://bucket",
1815            "--no-confirm",
1816        ]);
1817
1818        let error = cmd.build().await.err().unwrap().to_string();
1819        assert!(error.contains("non-empty path"));
1820    }
1821
1822    #[test]
1823    fn test_delete_skip_confirmation_aliases() {
1824        let no_confirm = ExportDeleteCommand::parse_from([
1825            "export-v2-delete",
1826            "--snapshot",
1827            "s3://bucket/snapshot",
1828            "--no-confirm",
1829        ]);
1830        assert!(no_confirm.skip_confirmation);
1831
1832        let yes = ExportDeleteCommand::parse_from([
1833            "export-v2-delete",
1834            "--snapshot",
1835            "s3://bucket/snapshot",
1836            "--yes",
1837        ]);
1838        assert!(yes.skip_confirmation);
1839    }
1840
1841    #[tokio::test]
1842    async fn test_delete_snapshot_with_no_confirm_removes_snapshot_contents() {
1843        let parent = tempdir().unwrap();
1844        let snapshot = parent.path().join("snapshot");
1845        let sibling = parent.path().join("sibling");
1846        std::fs::create_dir_all(&snapshot).unwrap();
1847        std::fs::create_dir_all(&sibling).unwrap();
1848        std::fs::write(sibling.join("keep.txt"), b"keep").unwrap();
1849        write_root_manifest(
1850            &snapshot,
1851            test_manifest(
1852                chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1853                true,
1854                true,
1855            ),
1856        );
1857        write_snapshot_file(&snapshot, "schema/schemas.json", b"[]");
1858
1859        let uri = Url::from_directory_path(&snapshot).unwrap().to_string();
1860        let delete = ExportDelete {
1861            snapshot: uri,
1862            skip_confirmation: true,
1863            storage: file_storage_for_dir(&snapshot),
1864        };
1865
1866        delete
1867            .run_with_confirmation(|_| unreachable!())
1868            .await
1869            .unwrap();
1870
1871        assert!(!snapshot.join(MANIFEST_FILE).exists());
1872        assert!(!snapshot.join("schema/schemas.json").exists());
1873        assert!(sibling.join("keep.txt").exists());
1874    }
1875
1876    #[tokio::test]
1877    async fn test_delete_snapshot_requires_manifest() {
1878        let dir = tempdir().unwrap();
1879        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
1880        let delete = ExportDelete {
1881            snapshot: uri,
1882            skip_confirmation: true,
1883            storage: file_storage_for_dir(dir.path()),
1884        };
1885
1886        let error = delete
1887            .run_with_confirmation(|_| unreachable!())
1888            .await
1889            .err()
1890            .unwrap()
1891            .to_string();
1892
1893        assert!(error.contains("Snapshot not found"));
1894        assert!(dir.path().exists());
1895    }
1896
1897    #[tokio::test]
1898    async fn test_delete_snapshot_cancels_without_exact_confirmation() {
1899        let dir = tempdir().unwrap();
1900        write_root_manifest(
1901            dir.path(),
1902            test_manifest(
1903                chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1904                true,
1905                true,
1906            ),
1907        );
1908        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
1909        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
1910        let delete = ExportDelete {
1911            snapshot: uri.clone(),
1912            skip_confirmation: false,
1913            storage: file_storage_for_dir(dir.path()),
1914        };
1915
1916        delete
1917            .run_with_confirmation(|snapshot| {
1918                assert_eq!(snapshot, uri);
1919                Ok(false)
1920            })
1921            .await
1922            .unwrap();
1923
1924        assert!(dir.path().join(MANIFEST_FILE).exists());
1925        assert!(dir.path().join("schema/schemas.json").exists());
1926    }
1927
1928    #[test]
1929    fn test_delete_confirmation_requires_exact_yes() {
1930        assert!(delete_confirmation_matches("yes"));
1931        assert!(delete_confirmation_matches(" yes\n"));
1932        assert!(!delete_confirmation_matches("YES"));
1933        assert!(!delete_confirmation_matches("y"));
1934        assert!(!delete_confirmation_matches("yes please"));
1935    }
1936
1937    #[test]
1938    fn test_display_snapshot_prefix_adds_trailing_slash() {
1939        assert_eq!(
1940            display_snapshot_prefix("s3://bucket/snapshot"),
1941            "s3://bucket/snapshot/"
1942        );
1943        assert_eq!(
1944            display_snapshot_prefix("s3://bucket/snapshot/"),
1945            "s3://bucket/snapshot/"
1946        );
1947    }
1948
1949    #[tokio::test]
1950    async fn test_verify_snapshot_accepts_valid_full_snapshot() {
1951        let dir = tempdir().unwrap();
1952        let manifest = test_manifest(
1953            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1954            false,
1955            true,
1956        );
1957        write_root_manifest(dir.path(), manifest);
1958        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
1959        write_default_ddl_files(dir.path());
1960        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
1961
1962        let storage = file_storage_for_dir(dir.path());
1963        let report = verify_snapshot(&storage).await.unwrap();
1964
1965        assert_eq!(report.error_count(), 0);
1966        assert_eq!(report.warning_count(), 0);
1967        assert_eq!(report.data_files_total, 1);
1968        assert_eq!(report.data_files_verified, 1);
1969    }
1970
1971    #[tokio::test]
1972    async fn test_verify_snapshot_reports_missing_data_file_and_failed_chunk() {
1973        let dir = tempdir().unwrap();
1974        let mut manifest = test_manifest(
1975            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1976            false,
1977            true,
1978        );
1979        manifest.chunks[1].mark_failed("copy failed".to_string());
1980        write_root_manifest(dir.path(), manifest);
1981        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
1982        write_default_ddl_files(dir.path());
1983
1984        let storage = file_storage_for_dir(dir.path());
1985        let report = verify_snapshot(&storage).await.unwrap();
1986
1987        assert_eq!(report.error_count(), 2);
1988        assert!(
1989            report
1990                .problems
1991                .iter()
1992                .any(|problem| problem.message.contains("missing file"))
1993        );
1994        assert!(
1995            report
1996                .problems
1997                .iter()
1998                .any(|problem| problem.message.contains("status is 'failed'"))
1999        );
2000    }
2001
2002    #[tokio::test]
2003    async fn test_verify_snapshot_reports_missing_schema_index_as_warning() {
2004        let dir = tempdir().unwrap();
2005        let manifest = test_manifest(
2006            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2007            false,
2008            true,
2009        );
2010        write_root_manifest(dir.path(), manifest);
2011        write_default_ddl_files(dir.path());
2012        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2013
2014        let storage = file_storage_for_dir(dir.path());
2015        let report = verify_snapshot(&storage).await.unwrap();
2016
2017        assert_eq!(report.error_count(), 0);
2018        assert_eq!(report.warning_count(), 1);
2019        assert!(
2020            report
2021                .problems
2022                .iter()
2023                .any(|problem| problem.message.contains("Missing schema index"))
2024        );
2025    }
2026
2027    #[tokio::test]
2028    async fn test_verify_snapshot_rejects_schema_only_snapshot_with_chunks() {
2029        let dir = tempdir().unwrap();
2030        let mut manifest = test_manifest(
2031            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2032            true,
2033            true,
2034        );
2035        let mut chunk = ChunkMeta::new(1, TimeRange::unbounded());
2036        chunk.mark_completed(vec!["data/public/1/file.parquet".to_string()], None);
2037        manifest.chunks.push(chunk);
2038        write_root_manifest(dir.path(), manifest);
2039        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2040        write_default_ddl_files(dir.path());
2041
2042        let storage = file_storage_for_dir(dir.path());
2043        let report = verify_snapshot(&storage).await.unwrap();
2044
2045        assert_eq!(report.error_count(), 1);
2046        assert_eq!(report.data_files_total, 0);
2047        assert!(
2048            report
2049                .problems
2050                .iter()
2051                .any(|problem| problem.message.contains("should not contain data chunks"))
2052        );
2053    }
2054
2055    #[tokio::test]
2056    async fn test_verify_snapshot_rejects_schema_only_snapshot_with_data_files() {
2057        let dir = tempdir().unwrap();
2058        let manifest = test_manifest(
2059            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2060            true,
2061            true,
2062        );
2063        write_root_manifest(dir.path(), manifest);
2064        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2065        write_default_ddl_files(dir.path());
2066        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2067
2068        let storage = file_storage_for_dir(dir.path());
2069        let report = verify_snapshot(&storage).await.unwrap();
2070
2071        assert_eq!(report.error_count(), 1);
2072        assert_eq!(report.data_files_total, 0);
2073        assert!(
2074            report
2075                .problems
2076                .iter()
2077                .any(|problem| problem.message.contains("should not contain data files"))
2078        );
2079    }
2080
2081    #[tokio::test]
2082    async fn test_verify_snapshot_rejects_full_snapshot_without_chunks() {
2083        let dir = tempdir().unwrap();
2084        let mut manifest = test_manifest(
2085            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2086            false,
2087            true,
2088        );
2089        manifest.chunks.clear();
2090        write_root_manifest(dir.path(), manifest);
2091        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2092        write_default_ddl_files(dir.path());
2093
2094        let storage = file_storage_for_dir(dir.path());
2095        let report = verify_snapshot(&storage).await.unwrap();
2096
2097        assert_eq!(report.error_count(), 1);
2098        assert_eq!(report.data_files_total, 0);
2099        assert!(
2100            report
2101                .problems
2102                .iter()
2103                .any(|problem| problem.message.contains("at least one data chunk"))
2104        );
2105    }
2106
2107    #[tokio::test]
2108    async fn test_verify_snapshot_rejects_skipped_chunk_data_files() {
2109        let dir = tempdir().unwrap();
2110        let manifest = test_manifest(
2111            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2112            false,
2113            true,
2114        );
2115        write_root_manifest(dir.path(), manifest);
2116        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2117        write_default_ddl_files(dir.path());
2118        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2119        write_snapshot_file(dir.path(), "data/public/2/file.parquet", b"data");
2120
2121        let storage = file_storage_for_dir(dir.path());
2122        let report = verify_snapshot(&storage).await.unwrap();
2123
2124        assert_eq!(report.error_count(), 1);
2125        assert!(
2126            report
2127                .problems
2128                .iter()
2129                .any(|problem| { problem.message.contains("Unexpected data file") })
2130        );
2131    }
2132
2133    #[tokio::test]
2134    async fn test_verify_snapshot_rejects_duplicate_chunk_ids() {
2135        let dir = tempdir().unwrap();
2136        let mut manifest = test_manifest(
2137            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2138            false,
2139            true,
2140        );
2141        let mut duplicate = ChunkMeta::new(1, TimeRange::unbounded());
2142        duplicate.mark_completed(vec!["data/public/1/file.parquet".to_string()], None);
2143        manifest.chunks.push(duplicate);
2144        write_root_manifest(dir.path(), manifest);
2145        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2146        write_default_ddl_files(dir.path());
2147        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2148
2149        let storage = file_storage_for_dir(dir.path());
2150        let report = verify_snapshot(&storage).await.unwrap();
2151
2152        assert_eq!(report.error_count(), 1);
2153        assert!(
2154            report
2155                .problems
2156                .iter()
2157                .any(|problem| problem.message.contains("duplicate chunk id"))
2158        );
2159    }
2160
2161    #[tokio::test]
2162    async fn test_verify_snapshot_requires_all_schema_ddl() {
2163        let dir = tempdir().unwrap();
2164        let manifest = test_manifest(
2165            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2166            true,
2167            true,
2168        );
2169        write_root_manifest(dir.path(), manifest);
2170        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2171        write_snapshot_file(
2172            dir.path(),
2173            "schema/ddl/public.sql",
2174            b"CREATE DATABASE public;",
2175        );
2176
2177        let storage = file_storage_for_dir(dir.path());
2178        let report = verify_snapshot(&storage).await.unwrap();
2179
2180        assert_eq!(report.error_count(), 1);
2181        assert!(
2182            report
2183                .problems
2184                .iter()
2185                .any(|problem| problem.message.contains("analytics"))
2186        );
2187    }
2188
2189    #[tokio::test]
2190    async fn test_verify_snapshot_reports_missing_ddl_dir() {
2191        let dir = tempdir().unwrap();
2192        let manifest = test_manifest(
2193            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2194            false,
2195            true,
2196        );
2197        write_root_manifest(dir.path(), manifest);
2198        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2199        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2200
2201        let storage = file_storage_for_dir(dir.path());
2202        let report = verify_snapshot(&storage).await.unwrap();
2203
2204        assert_eq!(report.error_count(), 2);
2205        assert!(
2206            report
2207                .problems
2208                .iter()
2209                .any(|problem| problem.message.contains("schema/ddl/public.sql"))
2210        );
2211        assert!(
2212            report
2213                .problems
2214                .iter()
2215                .any(|problem| problem.message.contains("schema/ddl/analytics.sql"))
2216        );
2217    }
2218
2219    #[tokio::test]
2220    async fn test_verify_snapshot_reports_manifest_version_mismatch() {
2221        let dir = tempdir().unwrap();
2222        let mut manifest = test_manifest(
2223            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2224            false,
2225            true,
2226        );
2227        manifest.version = MANIFEST_VERSION + 1;
2228        write_root_manifest(dir.path(), manifest);
2229        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2230        write_default_ddl_files(dir.path());
2231        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2232
2233        let storage = file_storage_for_dir(dir.path());
2234        let report = verify_snapshot(&storage).await.unwrap();
2235
2236        assert_eq!(report.error_count(), 1);
2237        assert!(
2238            report
2239                .problems
2240                .iter()
2241                .any(|problem| problem.message.contains("Manifest version mismatch"))
2242        );
2243    }
2244
2245    #[tokio::test]
2246    async fn test_verify_snapshot_rejects_invalid_data_file_paths() {
2247        let dir = tempdir().unwrap();
2248        let mut manifest = test_manifest(
2249            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2250            false,
2251            true,
2252        );
2253        manifest.chunks[0].files = vec!["data/public/1/../file.parquet".to_string()];
2254        write_root_manifest(dir.path(), manifest);
2255        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2256        write_default_ddl_files(dir.path());
2257
2258        let storage = file_storage_for_dir(dir.path());
2259        let report = verify_snapshot(&storage).await.unwrap();
2260
2261        assert_eq!(report.error_count(), 1);
2262        assert!(
2263            report
2264                .problems
2265                .iter()
2266                .any(|problem| problem.message.contains("invalid data file path"))
2267        );
2268        assert_eq!(report.data_files_verified, 0);
2269    }
2270
2271    #[tokio::test]
2272    async fn test_verify_snapshot_accepts_leading_slash_manifest_data_paths() {
2273        let dir = tempdir().unwrap();
2274        let mut manifest = test_manifest(
2275            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2276            false,
2277            true,
2278        );
2279        manifest.chunks[0].files = vec!["/data/public/1/file.parquet".to_string()];
2280        write_root_manifest(dir.path(), manifest);
2281        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2282        write_default_ddl_files(dir.path());
2283        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2284
2285        let storage = file_storage_for_dir(dir.path());
2286        let report = verify_snapshot(&storage).await.unwrap();
2287
2288        assert_eq!(report.error_count(), 0);
2289        assert_eq!(report.data_files_verified, 1);
2290    }
2291
2292    #[tokio::test]
2293    async fn test_verify_snapshot_rejects_unlisted_files_under_completed_chunk_prefix() {
2294        let dir = tempdir().unwrap();
2295        let manifest = test_manifest(
2296            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2297            false,
2298            true,
2299        );
2300        write_root_manifest(dir.path(), manifest);
2301        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2302        write_default_ddl_files(dir.path());
2303        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2304        write_snapshot_file(dir.path(), "data/public/1/extra.parquet", b"data");
2305
2306        let storage = file_storage_for_dir(dir.path());
2307        let report = verify_snapshot(&storage).await.unwrap();
2308
2309        assert_eq!(report.error_count(), 1);
2310        assert!(
2311            report
2312                .problems
2313                .iter()
2314                .any(|problem| problem.message.contains("Unexpected data file"))
2315        );
2316        assert_eq!(report.data_files_verified, 1);
2317    }
2318
2319    #[tokio::test]
2320    async fn test_verify_snapshot_rejects_orphan_data_files_outside_known_chunk_prefixes() {
2321        let dir = tempdir().unwrap();
2322        let manifest = test_manifest(
2323            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2324            false,
2325            true,
2326        );
2327        write_root_manifest(dir.path(), manifest);
2328        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2329        write_default_ddl_files(dir.path());
2330        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2331        write_snapshot_file(dir.path(), "data/public/99/file.parquet", b"data");
2332
2333        let storage = file_storage_for_dir(dir.path());
2334        let report = verify_snapshot(&storage).await.unwrap();
2335
2336        assert_eq!(report.error_count(), 1);
2337        assert!(
2338            report
2339                .problems
2340                .iter()
2341                .any(|problem| problem.message.contains("Unexpected data file"))
2342        );
2343        assert_eq!(report.data_files_verified, 1);
2344    }
2345
2346    #[tokio::test]
2347    async fn test_verify_snapshot_rejects_data_files_under_wrong_chunk_or_schema() {
2348        let dir = tempdir().unwrap();
2349        let mut manifest = test_manifest(
2350            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2351            false,
2352            true,
2353        );
2354        manifest.chunks[0].files = vec![
2355            "data/public/99/file.parquet".to_string(),
2356            "data/metrics/1/file.parquet".to_string(),
2357        ];
2358        write_root_manifest(dir.path(), manifest);
2359        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2360        write_default_ddl_files(dir.path());
2361        write_snapshot_file(dir.path(), "data/public/99/file.parquet", b"data");
2362        write_snapshot_file(dir.path(), "data/metrics/1/file.parquet", b"data");
2363
2364        let storage = file_storage_for_dir(dir.path());
2365        let report = verify_snapshot(&storage).await.unwrap();
2366
2367        assert_eq!(report.error_count(), 2);
2368        assert_eq!(report.data_files_verified, 0);
2369        assert!(
2370            report
2371                .problems
2372                .iter()
2373                .all(|problem| problem.message.contains("invalid data file path"))
2374        );
2375    }
2376
2377    #[test]
2378    fn test_build_verify_plan_classifies_chunks_without_io() {
2379        let mut manifest = test_manifest(
2380            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2381            false,
2382            true,
2383        );
2384        // test_manifest(complete) gives: chunk 1 completed (1 file), chunk 2 skipped.
2385        let mut failed = ChunkMeta::new(3, TimeRange::unbounded());
2386        failed.mark_failed("boom".to_string());
2387        manifest.chunks.push(failed);
2388        manifest
2389            .chunks
2390            .push(ChunkMeta::new(4, TimeRange::unbounded()));
2391
2392        let plan = build_verify_plan(&manifest);
2393
2394        assert_eq!(plan.files_to_check.len(), 1);
2395        assert_eq!(plan.files_to_check[0].chunk_id, 1);
2396        assert_eq!(plan.files_to_check[0].path, "data/public/1/file.parquet");
2397        assert_eq!(plan.data_files_total, 1);
2398        assert!(
2399            plan.claimed_data_files
2400                .contains("data/public/1/file.parquet")
2401        );
2402        assert_eq!(plan.problems.len(), 2);
2403        assert!(
2404            plan.problems
2405                .iter()
2406                .any(|problem| problem.message.contains("status is 'failed'"))
2407        );
2408        assert!(
2409            plan.problems
2410                .iter()
2411                .any(|problem| problem.message.contains("status is 'pending'"))
2412        );
2413    }
2414
2415    #[tokio::test]
2416    async fn test_verify_snapshot_produces_deterministic_problem_output() {
2417        let dir = tempdir().unwrap();
2418        let manifest = test_manifest(
2419            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2420            false,
2421            true,
2422        );
2423        write_root_manifest(dir.path(), manifest);
2424        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2425        write_default_ddl_files(dir.path());
2426        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2427        // Many orphan files under a known chunk prefix to stress ordering.
2428        for i in 0..50 {
2429            write_snapshot_file(
2430                dir.path(),
2431                &format!("data/public/1/orphan_{:02}.parquet", i),
2432                b"x",
2433            );
2434        }
2435
2436        let storage = file_storage_for_dir(dir.path());
2437        let messages = |report: &VerifyReport| {
2438            report
2439                .problems
2440                .iter()
2441                .map(|problem| problem.message.clone())
2442                .collect::<Vec<_>>()
2443        };
2444        let first = messages(&verify_snapshot(&storage).await.unwrap());
2445        let second = messages(&verify_snapshot(&storage).await.unwrap());
2446
2447        // Output is identical across runs despite HashSet-based scanning.
2448        assert_eq!(first, second);
2449
2450        let orphans = first
2451            .iter()
2452            .filter(|message| message.contains("Unexpected data file"))
2453            .cloned()
2454            .collect::<Vec<_>>();
2455        assert_eq!(orphans.len(), 50);
2456        let mut sorted = orphans.clone();
2457        sorted.sort();
2458        assert_eq!(orphans, sorted);
2459    }
2460
2461    fn write_test_manifest(root: &std::path::Path, dir: &str, manifest: Manifest) {
2462        let snapshot_dir = root.join(dir);
2463        std::fs::create_dir_all(&snapshot_dir).unwrap();
2464        std::fs::write(
2465            snapshot_dir.join(MANIFEST_FILE),
2466            serde_json::to_vec_pretty(&manifest).unwrap(),
2467        )
2468        .unwrap();
2469    }
2470
2471    fn write_root_manifest(root: &std::path::Path, manifest: Manifest) {
2472        std::fs::write(
2473            root.join(MANIFEST_FILE),
2474            serde_json::to_vec_pretty(&manifest).unwrap(),
2475        )
2476        .unwrap();
2477    }
2478
2479    fn write_snapshot_file(root: &std::path::Path, relative_path: &str, content: &[u8]) {
2480        let mut path = root.to_path_buf();
2481        for segment in relative_path.split('/') {
2482            path.push(segment);
2483        }
2484        std::fs::create_dir_all(path.parent().unwrap()).unwrap();
2485        std::fs::write(path, content).unwrap();
2486    }
2487
2488    fn write_default_ddl_files(root: &std::path::Path) {
2489        write_snapshot_file(root, "schema/ddl/public.sql", b"CREATE DATABASE public;");
2490        write_snapshot_file(
2491            root,
2492            "schema/ddl/analytics.sql",
2493            b"CREATE DATABASE analytics;",
2494        );
2495    }
2496
2497    fn file_storage_for_dir(root: &std::path::Path) -> OpenDalStorage {
2498        let uri = Url::from_directory_path(root).unwrap().to_string();
2499        OpenDalStorage::from_file_uri(&uri).unwrap()
2500    }
2501
2502    fn test_manifest(
2503        created_at: chrono::DateTime<chrono::Utc>,
2504        schema_only: bool,
2505        complete: bool,
2506    ) -> Manifest {
2507        let mut manifest = Manifest::new_for_export(
2508            "greptime".to_string(),
2509            vec!["public".to_string(), "analytics".to_string()],
2510            schema_only,
2511            TimeRange::unbounded(),
2512            DataFormat::Parquet,
2513            None,
2514        )
2515        .unwrap();
2516        manifest.created_at = created_at;
2517        manifest.updated_at = created_at;
2518
2519        if !schema_only {
2520            manifest.chunks.clear();
2521            let mut first = ChunkMeta::new(1, TimeRange::unbounded());
2522            first.mark_completed(vec!["data/public/1/file.parquet".to_string()], None);
2523            manifest.chunks.push(first);
2524
2525            if complete {
2526                manifest
2527                    .chunks
2528                    .push(ChunkMeta::skipped(2, TimeRange::unbounded()));
2529            } else {
2530                manifest
2531                    .chunks
2532                    .push(ChunkMeta::new(2, TimeRange::unbounded()));
2533            }
2534        }
2535
2536        manifest
2537    }
2538}