1use std::collections::HashSet;
18use std::io::{self, Write};
19use std::time::Duration;
20
21use async_trait::async_trait;
22use clap::{Parser, Subcommand};
23use common_error::ext::BoxedError;
24use common_telemetry::info;
25use serde_json::Value;
26use snafu::{OptionExt, ResultExt};
27
28use crate::Tool;
29use crate::common::ObjectStoreConfig;
30use crate::data::export_v2::coordinator::export_data;
31use crate::data::export_v2::error::{
32 ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu, IoSnafu,
33 ManifestVersionMismatchSnafu, Result, ResumeConfigMismatchSnafu, SchemaOnlyArgsNotAllowedSnafu,
34 SchemaOnlyModeMismatchSnafu, SnapshotVerifyFailedSnafu, UnexpectedValueTypeSnafu,
35};
36use crate::data::export_v2::extractor::SchemaExtractor;
37use crate::data::export_v2::manifest::{
38 ChunkMeta, ChunkStatus, DataFormat, MANIFEST_FILE, MANIFEST_VERSION, Manifest, TimeRange,
39};
40use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR, SCHEMAS_FILE};
41use crate::data::path::{data_dir_for_schema_chunk, ddl_path_for_schema};
42use crate::data::snapshot_storage::{
43 OpenDalStorage, SnapshotStorage, validate_snapshot_uri, validate_uri,
44};
45use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
46use crate::database::{DatabaseClient, parse_proxy_opts};
47
/// Subcommands of the export v2 tool, one per snapshot operation.
#[derive(Debug, Subcommand)]
pub enum ExportV2Command {
    /// Create a snapshot, or resume the one already present at the destination.
    Create(ExportCreateCommand),
    /// List the snapshots found under a parent location.
    List(ExportListCommand),
    /// Verify a snapshot and report the problems found.
    Verify(ExportVerifyCommand),
    /// Delete a snapshot.
    Delete(ExportDeleteCommand),
}
60
61impl ExportV2Command {
62 pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
63 match self {
64 ExportV2Command::Create(cmd) => cmd.build().await,
65 ExportV2Command::List(cmd) => cmd.build().await,
66 ExportV2Command::Verify(cmd) => cmd.build().await,
67 ExportV2Command::Delete(cmd) => cmd.build().await,
68 }
69 }
70}
71
/// Arguments for listing the snapshots stored under a parent location.
#[derive(Debug, Parser)]
pub struct ExportListCommand {
    /// Parent location (URI) whose direct child directories are scanned for snapshots.
    #[clap(long)]
    location: String,

    /// Object store configuration used to open the location.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
83
84impl ExportListCommand {
85 pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
86 validate_uri(&self.location).map_err(BoxedError::new)?;
87 let storage = OpenDalStorage::from_parent_uri(&self.location, &self.storage)
88 .map_err(BoxedError::new)?;
89
90 Ok(Box::new(ExportList {
91 location: self.location.clone(),
92 storage,
93 }))
94 }
95}
96
/// Tool that lists the snapshots found under a parent location.
pub struct ExportList {
    /// Location as given on the command line; printed in the listing output.
    location: String,
    /// Storage opened on the parent location.
    storage: OpenDalStorage,
}
102
103#[async_trait]
104impl Tool for ExportList {
105 async fn do_work(&self) -> std::result::Result<(), BoxedError> {
106 self.run().await.map_err(BoxedError::new)
107 }
108}
109
110impl ExportList {
111 async fn run(&self) -> Result<()> {
112 let result = scan_snapshots(&self.storage).await?;
113
114 println!("Scanning: {}", self.location);
115 if result.snapshots.is_empty() {
116 println!("No snapshots found.");
117 } else {
118 print_snapshot_list(&result.snapshots, result.unreadable.len());
119 }
120 print_unreadable_warnings(&result.unreadable);
121
122 Ok(())
123 }
124}
125
/// Arguments for verifying a single snapshot.
#[derive(Debug, Parser)]
pub struct ExportVerifyCommand {
    /// URI of the snapshot to verify.
    #[clap(long)]
    snapshot: String,

    /// Object store configuration used to open the snapshot.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
137
138impl ExportVerifyCommand {
139 pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
140 validate_uri(&self.snapshot).map_err(BoxedError::new)?;
141 let storage =
142 OpenDalStorage::from_uri(&self.snapshot, &self.storage).map_err(BoxedError::new)?;
143
144 Ok(Box::new(ExportVerify {
145 snapshot: self.snapshot.clone(),
146 storage,
147 }))
148 }
149}
150
/// Tool that verifies one snapshot and prints a report.
pub struct ExportVerify {
    /// Snapshot URI as given on the command line; printed in the report.
    snapshot: String,
    /// Storage opened on the snapshot.
    storage: OpenDalStorage,
}
156
157#[async_trait]
158impl Tool for ExportVerify {
159 async fn do_work(&self) -> std::result::Result<(), BoxedError> {
160 self.run().await.map_err(BoxedError::new)
161 }
162}
163
164impl ExportVerify {
165 async fn run(&self) -> Result<()> {
166 let report = verify_snapshot(&self.storage).await?;
167 print_verify_report(&self.snapshot, &report);
168
169 if report.has_problems() {
170 return SnapshotVerifyFailedSnafu {
171 errors: report.error_count(),
172 warnings: report.warning_count(),
173 }
174 .fail();
175 }
176
177 Ok(())
178 }
179}
180
/// Arguments for deleting a single snapshot.
#[derive(Debug, Parser)]
pub struct ExportDeleteCommand {
    /// URI of the snapshot to delete.
    #[clap(long)]
    snapshot: String,

    /// Delete without asking for confirmation first.
    #[clap(long = "no-confirm", alias = "yes")]
    skip_confirmation: bool,

    /// Object store configuration used to open the snapshot.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
196
197impl ExportDeleteCommand {
198 pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
199 validate_snapshot_uri(&self.snapshot).map_err(BoxedError::new)?;
200 let storage =
201 OpenDalStorage::from_uri(&self.snapshot, &self.storage).map_err(BoxedError::new)?;
202
203 Ok(Box::new(ExportDelete {
204 snapshot: self.snapshot.clone(),
205 skip_confirmation: self.skip_confirmation,
206 storage,
207 }))
208 }
209}
210
/// Tool that deletes one snapshot, optionally after a confirmation step.
pub struct ExportDelete {
    /// Snapshot URI as given on the command line; shown in the summary and
    /// passed to the confirmation callback.
    snapshot: String,
    /// When true, the confirmation callback is not invoked.
    skip_confirmation: bool,
    /// Storage opened on the snapshot.
    storage: OpenDalStorage,
}
217
218#[async_trait]
219impl Tool for ExportDelete {
220 async fn do_work(&self) -> std::result::Result<(), BoxedError> {
221 self.run().await.map_err(BoxedError::new)
222 }
223}
224
225impl ExportDelete {
226 async fn run(&self) -> Result<()> {
227 self.run_with_confirmation(confirm_delete).await
228 }
229
230 async fn run_with_confirmation<F>(&self, confirm: F) -> Result<()>
231 where
232 F: FnOnce(&str) -> Result<bool>,
233 {
234 let manifest = self.storage.read_manifest().await?;
235 print_delete_summary(&self.snapshot, &manifest);
236
237 if !self.skip_confirmation && !confirm(&self.snapshot)? {
238 println!("Deletion cancelled.");
239 return Ok(());
240 }
241
242 println!("Deleting snapshot...");
243 self.storage.delete_snapshot().await?;
244 println!("Snapshot deleted successfully.");
245
246 Ok(())
247 }
248}
249
/// Arguments for creating a snapshot, or resuming one that already exists at
/// the destination.
#[derive(Debug, Parser)]
pub struct ExportCreateCommand {
    /// Server address handed to the database client.
    #[clap(long)]
    addr: String,

    /// Destination snapshot URI.
    #[clap(long)]
    to: String,

    /// Catalog to export.
    #[clap(long, default_value = "greptime")]
    catalog: String,

    /// Schemas to export, comma-separated. When omitted, no schema filter is
    /// passed to the schema extractor.
    #[clap(long, value_delimiter = ',')]
    schemas: Vec<String>,

    /// Export schema only, no data. Cannot be combined with --start-time,
    /// --end-time, --chunk-time-window, a non-default --format or a
    /// non-default --parallelism.
    #[clap(long)]
    schema_only: bool,

    /// Start of the exported time range.
    #[clap(long)]
    start_time: Option<String>,

    /// End of the exported time range.
    #[clap(long)]
    end_time: Option<String>,

    /// Time window per chunk, as a human-readable duration. Requires a bounded
    /// time range.
    #[clap(long, value_parser = humantime::parse_duration)]
    chunk_time_window: Option<Duration>,

    /// Data file format.
    #[clap(long, value_enum, default_value = "parquet")]
    format: DataFormat,

    /// Delete an existing snapshot at the destination instead of resuming it.
    #[clap(long)]
    force: bool,

    /// Parallelism passed to the data export.
    #[clap(long, default_value = "1")]
    parallelism: usize,

    /// Basic authentication value handed to the database client.
    #[clap(long)]
    auth_basic: Option<String>,

    /// Database client timeout, as a human-readable duration. Defaults to 60
    /// seconds when omitted.
    #[clap(long, value_parser = humantime::parse_duration)]
    timeout: Option<Duration>,

    /// Proxy setting for the database client.
    #[clap(long)]
    proxy: Option<String>,

    /// Proxy opt-out flag for the database client.
    #[clap(long)]
    no_proxy: bool,

    /// Object store configuration used to open the destination.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
324
325impl ExportCreateCommand {
326 pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
327 validate_uri(&self.to).map_err(BoxedError::new)?;
329
330 let time_range = TimeRange::parse(self.start_time.as_deref(), self.end_time.as_deref())
331 .map_err(BoxedError::new)?;
332 if self.chunk_time_window.is_some() && !time_range.is_bounded() {
333 return ChunkTimeWindowRequiresBoundsSnafu
334 .fail()
335 .map_err(BoxedError::new);
336 }
337 if self.schema_only {
338 let mut invalid_args = Vec::new();
339 if self.start_time.is_some() {
340 invalid_args.push("--start-time");
341 }
342 if self.end_time.is_some() {
343 invalid_args.push("--end-time");
344 }
345 if self.chunk_time_window.is_some() {
346 invalid_args.push("--chunk-time-window");
347 }
348 if self.format != DataFormat::Parquet {
349 invalid_args.push("--format");
350 }
351 if self.parallelism != 1 {
352 invalid_args.push("--parallelism");
353 }
354 if !invalid_args.is_empty() {
355 return SchemaOnlyArgsNotAllowedSnafu {
356 args: invalid_args.join(", "),
357 }
358 .fail()
359 .map_err(BoxedError::new);
360 }
361 }
362
363 let schemas = if self.schemas.is_empty() {
365 None
366 } else {
367 Some(self.schemas.clone())
368 };
369
370 let storage = OpenDalStorage::from_uri(&self.to, &self.storage).map_err(BoxedError::new)?;
372
373 let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
375 let database_client = DatabaseClient::new(
376 self.addr.clone(),
377 self.catalog.clone(),
378 self.auth_basic.clone(),
379 self.timeout.unwrap_or(Duration::from_secs(60)),
380 proxy,
381 self.no_proxy,
382 );
383
384 Ok(Box::new(ExportCreate {
385 config: ExportConfig {
386 catalog: self.catalog.clone(),
387 schemas,
388 schema_only: self.schema_only,
389 format: self.format,
390 force: self.force,
391 time_range,
392 chunk_time_window: self.chunk_time_window,
393 parallelism: self.parallelism,
394 snapshot_uri: self.to.clone(),
395 storage_config: self.storage.clone(),
396 },
397 storage: Box::new(storage),
398 database_client,
399 }))
400 }
401}
402
/// Tool that creates a snapshot or resumes an existing one.
pub struct ExportCreate {
    /// Export settings derived from the command-line arguments.
    config: ExportConfig,
    /// Storage the snapshot is written to.
    storage: Box<dyn SnapshotStorage>,
    /// Client used to query the source database.
    database_client: DatabaseClient,
}
409
/// Export settings derived from the `create` command-line arguments.
struct ExportConfig {
    /// Catalog to export.
    catalog: String,
    /// Requested schemas; `None` when no `--schemas` were given.
    schemas: Option<Vec<String>>,
    /// Export schema only, without data.
    schema_only: bool,
    /// Data file format.
    format: DataFormat,
    /// Delete an existing snapshot instead of resuming it.
    force: bool,
    /// Time range parsed from `--start-time` / `--end-time`.
    time_range: TimeRange,
    /// Optional time window per chunk.
    chunk_time_window: Option<Duration>,
    /// Parallelism passed to the data export.
    parallelism: usize,
    /// Destination snapshot URI, passed on to the data export.
    snapshot_uri: String,
    /// Object store configuration, passed on to the data export.
    storage_config: ObjectStoreConfig,
}
422
423#[async_trait]
424impl Tool for ExportCreate {
425 async fn do_work(&self) -> std::result::Result<(), BoxedError> {
426 self.run().await.map_err(BoxedError::new)
427 }
428}
429
430impl ExportCreate {
431 async fn run(&self) -> Result<()> {
432 let exists = self.storage.exists().await?;
434
435 if exists {
436 if self.config.force {
437 info!("Deleting existing snapshot (--force)");
438 self.storage.delete_snapshot().await?;
439 } else {
440 let mut manifest = self.storage.read_manifest().await?;
442
443 if manifest.version != MANIFEST_VERSION {
445 return ManifestVersionMismatchSnafu {
446 expected: MANIFEST_VERSION,
447 found: manifest.version,
448 }
449 .fail();
450 }
451
452 validate_resume_config(&manifest, &self.config)?;
453
454 info!(
455 "Resuming existing snapshot: {} (completed: {}/{} chunks)",
456 manifest.snapshot_id,
457 manifest.completed_count(),
458 manifest.chunks.len()
459 );
460
461 if manifest.is_complete() {
462 info!("Snapshot is already complete");
463 return Ok(());
464 }
465
466 if manifest.schema_only {
467 return Ok(());
468 }
469
470 export_data(
471 self.storage.as_ref(),
472 &self.database_client,
473 &self.config.snapshot_uri,
474 &self.config.storage_config,
475 &mut manifest,
476 self.config.parallelism,
477 )
478 .await?;
479 return Ok(());
480 }
481 }
482
483 let extractor = SchemaExtractor::new(&self.database_client, &self.config.catalog);
485 let schema_snapshot = extractor.extract(self.config.schemas.as_deref()).await?;
486
487 let schema_names: Vec<String> = schema_snapshot
488 .schemas
489 .iter()
490 .map(|s| s.name.clone())
491 .collect();
492 info!("Exporting schemas: {:?}", schema_names);
493
494 let mut manifest = Manifest::new_for_export(
496 self.config.catalog.clone(),
497 schema_names.clone(),
498 self.config.schema_only,
499 self.config.time_range.clone(),
500 self.config.format,
501 self.config.chunk_time_window,
502 )?;
503
504 self.storage.write_schema(&schema_snapshot).await?;
506 info!("Exported {} schemas", schema_snapshot.schemas.len());
507
508 let ddl_by_schema = self.build_ddl_by_schema(&schema_names).await?;
510 for (schema, ddl) in ddl_by_schema {
511 let ddl_path = ddl_path_for_schema(&schema);
512 self.storage.write_text(&ddl_path, &ddl).await?;
513 info!("Exported DDL for schema {} to {}", schema, ddl_path);
514 }
515
516 self.storage.write_manifest(&manifest).await?;
524 info!("Snapshot created: {}", manifest.snapshot_id);
525
526 if !self.config.schema_only {
527 export_data(
528 self.storage.as_ref(),
529 &self.database_client,
530 &self.config.snapshot_uri,
531 &self.config.storage_config,
532 &mut manifest,
533 self.config.parallelism,
534 )
535 .await?;
536 }
537
538 Ok(())
539 }
540
541 async fn build_ddl_by_schema(&self, schema_names: &[String]) -> Result<Vec<(String, String)>> {
542 let mut schemas = schema_names.to_vec();
543 schemas.sort();
544
545 let mut ddl_by_schema = Vec::with_capacity(schemas.len());
546 for schema in schemas {
547 let create_database = self.show_create("DATABASE", &schema, None).await?;
548
549 let (mut physical_tables, mut tables, mut views) =
550 self.get_schema_objects(&schema).await?;
551 physical_tables.sort();
552 let mut physical_ddls = Vec::with_capacity(physical_tables.len());
553 for table in physical_tables {
554 physical_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?);
555 }
556
557 tables.sort();
558 let mut table_ddls = Vec::with_capacity(tables.len());
559 for table in tables {
560 table_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?);
561 }
562
563 views.sort();
564 let mut view_ddls = Vec::with_capacity(views.len());
565 for view in views {
566 view_ddls.push(self.show_create("VIEW", &schema, Some(&view)).await?);
567 }
568
569 let ddl = build_schema_ddl(
570 &schema,
571 create_database,
572 physical_ddls,
573 table_ddls,
574 view_ddls,
575 );
576 ddl_by_schema.push((schema, ddl));
577 }
578
579 Ok(ddl_by_schema)
580 }
581
582 async fn get_schema_objects(
583 &self,
584 schema: &str,
585 ) -> Result<(Vec<String>, Vec<String>, Vec<String>)> {
586 let physical_tables = self.get_metric_physical_tables(schema).await?;
587 let physical_set: HashSet<&str> = physical_tables.iter().map(String::as_str).collect();
588 let sql = format!(
589 "SELECT table_name, table_type FROM information_schema.tables \
590 WHERE table_catalog = '{}' AND table_schema = '{}' \
591 AND (table_type = 'BASE TABLE' OR table_type = 'VIEW')",
592 escape_sql_literal(&self.config.catalog),
593 escape_sql_literal(schema)
594 );
595 let records: Option<Vec<Vec<Value>>> = self
596 .database_client
597 .sql_in_public(&sql)
598 .await
599 .context(DatabaseSnafu)?;
600
601 let mut tables = Vec::new();
602 let mut views = Vec::new();
603 if let Some(rows) = records {
604 for row in rows {
605 let name = match row.first() {
606 Some(Value::String(name)) => name.clone(),
607 _ => return UnexpectedValueTypeSnafu.fail(),
608 };
609 let table_type = match row.get(1) {
610 Some(Value::String(table_type)) => table_type.as_str(),
611 _ => return UnexpectedValueTypeSnafu.fail(),
612 };
613 if !physical_set.contains(name.as_str()) {
614 if table_type == "VIEW" {
615 views.push(name);
616 } else {
617 tables.push(name);
618 }
619 }
620 }
621 }
622
623 Ok((physical_tables, tables, views))
624 }
625
626 async fn get_metric_physical_tables(&self, schema: &str) -> Result<Vec<String>> {
627 let sql = format!(
628 "SELECT DISTINCT table_name FROM information_schema.columns \
629 WHERE table_catalog = '{}' AND table_schema = '{}' AND column_name = '__tsid'",
630 escape_sql_literal(&self.config.catalog),
631 escape_sql_literal(schema)
632 );
633 let records: Option<Vec<Vec<Value>>> = self
634 .database_client
635 .sql_in_public(&sql)
636 .await
637 .context(DatabaseSnafu)?;
638
639 let mut tables = HashSet::new();
640 if let Some(rows) = records {
641 for row in rows {
642 let name = match row.first() {
643 Some(Value::String(name)) => name.clone(),
644 _ => return UnexpectedValueTypeSnafu.fail(),
645 };
646 tables.insert(name);
647 }
648 }
649
650 Ok(tables.into_iter().collect())
651 }
652
653 async fn show_create(
654 &self,
655 show_type: &str,
656 schema: &str,
657 table: Option<&str>,
658 ) -> Result<String> {
659 let sql = match table {
660 Some(table) => format!(
661 r#"SHOW CREATE {} "{}"."{}"."{}""#,
662 show_type,
663 escape_sql_identifier(&self.config.catalog),
664 escape_sql_identifier(schema),
665 escape_sql_identifier(table)
666 ),
667 None => format!(
668 r#"SHOW CREATE {} "{}"."{}""#,
669 show_type,
670 escape_sql_identifier(&self.config.catalog),
671 escape_sql_identifier(schema)
672 ),
673 };
674
675 let records: Option<Vec<Vec<Value>>> = self
676 .database_client
677 .sql_in_public(&sql)
678 .await
679 .context(DatabaseSnafu)?;
680 let rows = records.context(EmptyResultSnafu)?;
681 let row = rows.first().context(EmptyResultSnafu)?;
682 let Some(Value::String(create)) = row.get(1) else {
683 return UnexpectedValueTypeSnafu.fail();
684 };
685
686 Ok(format!("{};\n", create))
687 }
688}
689
/// Assembles the DDL script for one schema.
///
/// The script starts with a `-- Schema: <name>` comment line, followed by the
/// database statement, the physical table statements, the table statements
/// and the view statements, and ends with one extra newline. Statements are
/// appended as given, without separators.
fn build_schema_ddl(
    schema: &str,
    create_database: String,
    physical_tables: Vec<String>,
    tables: Vec<String>,
    views: Vec<String>,
) -> String {
    let mut script = format!("-- Schema: {}\n", schema);
    script.push_str(&create_database);
    script.extend(physical_tables.into_iter().chain(tables).chain(views));
    script.push('\n');
    script
}
712
713fn validate_resume_config(manifest: &Manifest, config: &ExportConfig) -> Result<()> {
714 if manifest.schema_only != config.schema_only {
715 return SchemaOnlyModeMismatchSnafu {
716 existing_schema_only: manifest.schema_only,
717 requested_schema_only: config.schema_only,
718 }
719 .fail();
720 }
721
722 if manifest.catalog != config.catalog {
723 return ResumeConfigMismatchSnafu {
724 field: "catalog",
725 existing: manifest.catalog.clone(),
726 requested: config.catalog.clone(),
727 }
728 .fail();
729 }
730
731 if let Some(requested_schemas) = &config.schemas
734 && !schema_selection_matches(&manifest.schemas, requested_schemas)
735 {
736 return ResumeConfigMismatchSnafu {
737 field: "schemas",
738 existing: format_schema_selection(&manifest.schemas),
739 requested: format_schema_selection(requested_schemas),
740 }
741 .fail();
742 }
743
744 if manifest.time_range != config.time_range {
745 return ResumeConfigMismatchSnafu {
746 field: "time_range",
747 existing: format!("{:?}", manifest.time_range),
748 requested: format!("{:?}", config.time_range),
749 }
750 .fail();
751 }
752
753 if manifest.format != config.format {
754 return ResumeConfigMismatchSnafu {
755 field: "format",
756 existing: manifest.format.to_string(),
757 requested: config.format.to_string(),
758 }
759 .fail();
760 }
761
762 let expected_plan = Manifest::new_for_export(
763 manifest.catalog.clone(),
764 manifest.schemas.clone(),
765 config.schema_only,
766 config.time_range.clone(),
767 config.format,
768 config.chunk_time_window,
769 )?;
770 if !chunk_plan_matches(manifest, &expected_plan) {
771 return ResumeConfigMismatchSnafu {
772 field: "chunk plan",
773 existing: format_chunk_plan(&manifest.chunks),
774 requested: format_chunk_plan(&expected_plan.chunks),
775 }
776 .fail();
777 }
778
779 Ok(())
780}
781
782fn schema_selection_matches(existing: &[String], requested: &[String]) -> bool {
783 canonical_schema_selection(existing) == canonical_schema_selection(requested)
784}
785
/// Returns the schema names ASCII-lowercased, de-duplicated and sorted.
fn canonical_schema_selection(schemas: &[String]) -> Vec<String> {
    let mut names: Vec<String> = schemas
        .iter()
        .map(|schema| schema.to_ascii_lowercase())
        .collect();
    // After sorting, equal names are adjacent, so `dedup` removes every duplicate.
    names.sort();
    names.dedup();
    names
}
800
/// Renders the schema names as `[a, b, c]`.
fn format_schema_selection(schemas: &[String]) -> String {
    let mut rendered = String::from("[");
    rendered.push_str(&schemas.join(", "));
    rendered.push(']');
    rendered
}
804
805fn chunk_plan_matches(existing: &Manifest, expected: &Manifest) -> bool {
806 existing.chunks.len() == expected.chunks.len()
807 && existing
808 .chunks
809 .iter()
810 .zip(&expected.chunks)
811 .all(|(left, right)| left.id == right.id && left.time_range == right.time_range)
812}
813
814fn format_chunk_plan(chunks: &[ChunkMeta]) -> String {
815 let items = chunks
816 .iter()
817 .map(|chunk| format!("#{}:{:?}", chunk.id, chunk.time_range))
818 .collect::<Vec<_>>();
819 format!("[{}]", items.join(", "))
820}
821
/// One snapshot found while scanning a parent location.
#[derive(Debug)]
struct SnapshotListEntry {
    /// Child directory name with surrounding slashes trimmed and one trailing `/`.
    path: String,
    /// Manifest parsed from that directory.
    manifest: Manifest,
}
827
/// Result of scanning a parent location for snapshots.
#[derive(Debug, Default)]
struct SnapshotScanResult {
    /// Snapshots whose manifest parsed successfully, newest first.
    snapshots: Vec<SnapshotListEntry>,
    /// Directories whose manifest file exists but could not be parsed, sorted.
    unreadable: Vec<String>,
}
833
834async fn scan_snapshots(storage: &OpenDalStorage) -> Result<SnapshotScanResult> {
835 let mut result = SnapshotScanResult::default();
836 for dir in storage.list_direct_child_dirs().await? {
837 let manifest_path = format!("{}/{}", dir.trim_matches('/'), MANIFEST_FILE);
838 let Some(data) = storage.read_file_if_exists(&manifest_path).await? else {
839 continue;
840 };
841
842 match serde_json::from_slice::<Manifest>(&data) {
843 Ok(manifest) => result.snapshots.push(SnapshotListEntry {
844 path: format!("{}/", dir.trim_matches('/')),
845 manifest,
846 }),
847 Err(_) => result
848 .unreadable
849 .push(format!("{}/", dir.trim_matches('/'))),
850 }
851 }
852
853 result
854 .snapshots
855 .sort_by_key(|entry| std::cmp::Reverse(entry.manifest.created_at));
856 result.unreadable.sort();
857 Ok(result)
858}
859
860fn print_snapshot_list(snapshots: &[SnapshotListEntry], unreadable_count: usize) {
861 if unreadable_count == 0 {
862 println!("Found {} snapshots:", snapshots.len());
863 } else {
864 println!(
865 "Found {} snapshots ({} {} skipped: unreadable manifest):",
866 snapshots.len(),
867 unreadable_count,
868 directory_word(unreadable_count)
869 );
870 }
871 println!();
872 println!(
873 " {:<24} {:<36} {:<19} {:<9} {:<7} {:<6} Status",
874 "Path", "ID", "Created", "Catalog", "Schemas", "Chunks"
875 );
876 println!(
877 " {:<24} {:<36} {:<19} {:<9} {:<7} {:<6} {:<10}",
878 "-".repeat(24),
879 "-".repeat(36),
880 "-".repeat(19),
881 "-".repeat(9),
882 "-".repeat(7),
883 "-".repeat(6),
884 "-".repeat(10)
885 );
886 for entry in snapshots {
887 let manifest = &entry.manifest;
888 println!(
889 " {:<24} {:<36} {:<19} {:<9} {:<7} {:<6} {}",
890 entry.path,
891 manifest.snapshot_id,
892 manifest.created_at.format("%Y-%m-%d %H:%M:%S"),
893 manifest.catalog,
894 manifest.schemas.len(),
895 format_list_chunks(manifest),
896 snapshot_status(manifest)
897 );
898 }
899}
900
901fn print_unreadable_warnings(unreadable: &[String]) {
902 if unreadable.is_empty() {
903 return;
904 }
905
906 println!();
907 println!(
908 "Warning: {} {} had corrupt/unreadable manifest.json:",
909 unreadable.len(),
910 directory_word(unreadable.len())
911 );
912 for path in unreadable {
913 println!(" - {}", path);
914 }
915}
916
/// Returns "directory" for a count of one and "directories" otherwise.
fn directory_word(count: usize) -> &'static str {
    match count {
        1 => "directory",
        _ => "directories",
    }
}
924
925fn snapshot_status(manifest: &Manifest) -> &'static str {
926 if manifest.schema_only {
927 "schema-only"
928 } else if manifest.is_complete() {
929 "complete"
930 } else {
931 "incomplete"
932 }
933}
934
935fn format_list_chunks(manifest: &Manifest) -> String {
936 let total = manifest.chunks.len();
937 if total == 0 {
938 return "0".to_string();
939 }
940
941 format!(
942 "{}/{}",
943 manifest.completed_count() + manifest.skipped_count(),
944 total
945 )
946}
947
/// Severity of a problem found while verifying a snapshot.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum VerifySeverity {
    Error,
    Warn,
}

impl VerifySeverity {
    /// Returns the upper-case label for this severity.
    fn as_str(self) -> &'static str {
        match self {
            Self::Error => "ERROR",
            Self::Warn => "WARN",
        }
    }
}
962
/// One problem found while verifying a snapshot.
#[derive(Debug)]
struct VerifyProblem {
    /// Whether the problem is an error or a warning.
    severity: VerifySeverity,
    /// Human-readable description of the problem.
    message: String,
}
968
/// Chunk counts taken from a manifest, by status.
#[derive(Debug, Default)]
struct VerifyChunkSummary {
    /// Number of chunks in the manifest.
    total: usize,
    /// Chunks the manifest counts as completed.
    completed: usize,
    /// Chunks the manifest counts as skipped.
    skipped: usize,
    /// Chunks the manifest counts as pending.
    pending: usize,
    /// Chunks the manifest counts as in progress.
    in_progress: usize,
    /// Chunks the manifest counts as failed.
    failed: usize,
}
978
/// Everything gathered while verifying one snapshot.
#[derive(Debug)]
struct VerifyReport {
    /// Manifest read from the snapshot.
    manifest: Manifest,
    /// Whether the schema index file exists.
    schema_index_exists: bool,
    /// Number of `.sql` files found under the DDL directory.
    ddl_file_count: usize,
    /// Chunk counts by status.
    chunk_summary: VerifyChunkSummary,
    /// Number of data files listed by completed chunks.
    data_files_total: usize,
    /// Number of those files that were found in storage.
    data_files_verified: usize,
    /// Problems found, in the order they were detected.
    problems: Vec<VerifyProblem>,
}
989
990impl VerifyReport {
991 fn error_count(&self) -> usize {
992 self.problems
993 .iter()
994 .filter(|problem| problem.severity == VerifySeverity::Error)
995 .count()
996 }
997
998 fn warning_count(&self) -> usize {
999 self.problems
1000 .iter()
1001 .filter(|problem| problem.severity == VerifySeverity::Warn)
1002 .count()
1003 }
1004
1005 fn has_problems(&self) -> bool {
1006 !self.problems.is_empty()
1007 }
1008
1009 fn push_error(&mut self, message: impl Into<String>) {
1010 self.problems.push(VerifyProblem {
1011 severity: VerifySeverity::Error,
1012 message: message.into(),
1013 });
1014 }
1015
1016 fn push_warn(&mut self, message: impl Into<String>) {
1017 self.problems.push(VerifyProblem {
1018 severity: VerifySeverity::Warn,
1019 message: message.into(),
1020 });
1021 }
1022}
1023
1024async fn verify_snapshot(storage: &OpenDalStorage) -> Result<VerifyReport> {
1025 let manifest = storage.read_manifest().await?;
1026 let schema_index_path = format!("{}/{}", SCHEMA_DIR, SCHEMAS_FILE);
1027 let ddl_prefix = format!("{}/{}/", SCHEMA_DIR, DDL_DIR);
1028 let schema_index_exists = storage.file_exists(&schema_index_path).await?;
1029 let ddl_files: HashSet<_> = storage
1030 .list_files_recursive(&ddl_prefix)
1031 .await?
1032 .into_iter()
1033 .collect();
1034 let ddl_file_count = ddl_files
1035 .iter()
1036 .filter(|path| path.ends_with(".sql"))
1037 .count();
1038
1039 let mut report = VerifyReport {
1040 manifest,
1041 schema_index_exists,
1042 ddl_file_count,
1043 chunk_summary: VerifyChunkSummary::default(),
1044 data_files_total: 0,
1045 data_files_verified: 0,
1046 problems: Vec::new(),
1047 };
1048
1049 if report.manifest.version != MANIFEST_VERSION {
1050 report.push_error(format!(
1051 "Manifest version mismatch: expected {}, found {}",
1052 MANIFEST_VERSION, report.manifest.version
1053 ));
1054 }
1055
1056 if !report.schema_index_exists {
1057 report.push_warn(format!("Missing schema index '{}'", schema_index_path));
1058 }
1059
1060 for schema in &report.manifest.schemas {
1061 let ddl_path = ddl_path_for_schema(schema);
1062 if !ddl_files.contains(ddl_path.as_str()) {
1063 report.problems.push(VerifyProblem {
1064 severity: VerifySeverity::Error,
1065 message: format!("Schema '{}': missing DDL file '{}'", schema, ddl_path),
1066 });
1067 }
1068 }
1069
1070 report.chunk_summary = summarize_chunks(&report.manifest);
1071 if report.manifest.schema_only {
1072 let chunk_count = report.manifest.chunks.len();
1073 if chunk_count > 0 {
1074 report.push_error(format!(
1075 "Schema-only snapshot should not contain data chunks (found {})",
1076 chunk_count
1077 ));
1078 }
1079 let data_files = storage.list_files_recursive("data/").await?;
1080 if let Some(path) = data_files.iter().min() {
1083 report.push_error(format!(
1084 "Schema-only snapshot should not contain data files (found '{}')",
1085 path
1086 ));
1087 }
1088 } else if report.manifest.chunks.is_empty() {
1089 report.push_error("Full snapshot should contain at least one data chunk");
1090 } else {
1091 verify_chunks_and_data_files(storage, &mut report).await?;
1092 }
1093
1094 Ok(report)
1095}
1096
1097fn summarize_chunks(manifest: &Manifest) -> VerifyChunkSummary {
1098 VerifyChunkSummary {
1099 total: manifest.chunks.len(),
1100 completed: manifest.completed_count(),
1101 skipped: manifest.skipped_count(),
1102 pending: manifest.pending_count(),
1103 in_progress: manifest.in_progress_count(),
1104 failed: manifest.failed_count(),
1105 }
1106}
1107
/// A data file that a completed chunk lists and that must exist in storage.
#[derive(Debug)]
struct ChunkFile {
    /// Id of the chunk that lists the file.
    chunk_id: u32,
    /// Validated file path with leading slashes removed.
    path: String,
}
1114
/// What the manifest alone says about the data files, before storage is read.
#[derive(Debug, Default)]
struct VerifyPlan {
    /// Files of completed chunks whose paths passed validation; each must be
    /// present in storage.
    files_to_check: Vec<ChunkFile>,
    /// Every safely-formed data path listed by any chunk, whatever its status.
    /// Files in storage outside this set are reported as unexpected.
    claimed_data_files: HashSet<String>,
    /// Number of files listed by completed chunks, including paths that
    /// failed validation.
    data_files_total: usize,
    /// Problems detectable from the manifest alone.
    problems: Vec<VerifyProblem>,
}
1132
/// Data files actually present in storage.
#[derive(Debug)]
struct VerifyDataScan {
    /// Paths returned by the recursive listing of `data/`.
    existing_data_files: HashSet<String>,
}
1139
/// Result of comparing a [`VerifyPlan`] with a [`VerifyDataScan`].
#[derive(Debug, Default)]
struct VerifyOutcome {
    /// Number of files listed by completed chunks, carried over from the plan.
    data_files_total: usize,
    /// Number of planned files that were found in storage.
    data_files_verified: usize,
    /// Plan problems followed by missing-file and unexpected-file problems.
    problems: Vec<VerifyProblem>,
}
1147
1148async fn verify_chunks_and_data_files(
1149 storage: &OpenDalStorage,
1150 report: &mut VerifyReport,
1151) -> Result<()> {
1152 let plan = build_verify_plan(&report.manifest);
1153 let scan = scan_data_files(storage).await?;
1154 let outcome = reconcile_plan_with_scan(plan, &scan);
1155
1156 report.data_files_total = outcome.data_files_total;
1157 report.data_files_verified = outcome.data_files_verified;
1158 report.problems.extend(outcome.problems);
1159
1160 Ok(())
1161}
1162
1163fn build_verify_plan(manifest: &Manifest) -> VerifyPlan {
1165 let mut plan = VerifyPlan::default();
1166 let mut seen_chunk_ids = HashSet::new();
1167
1168 for chunk in &manifest.chunks {
1169 if !seen_chunk_ids.insert(chunk.id) {
1170 plan.problems.push(VerifyProblem {
1171 severity: VerifySeverity::Error,
1172 message: format!("Chunk {}: duplicate chunk id", chunk.id),
1173 });
1174 }
1175 for file in &chunk.files {
1176 if let Some(path) = safe_manifest_data_file_path(file) {
1177 plan.claimed_data_files.insert(path.to_string());
1178 }
1179 }
1180
1181 match chunk.status {
1182 ChunkStatus::Completed => {
1183 if chunk.files.is_empty() {
1184 plan.problems.push(VerifyProblem {
1185 severity: VerifySeverity::Error,
1186 message: format!("Chunk {}: completed chunk has no data files", chunk.id),
1187 });
1188 continue;
1189 }
1190 let allowed_prefixes = manifest
1191 .schemas
1192 .iter()
1193 .map(|schema| data_dir_for_schema_chunk(schema, chunk.id))
1194 .collect::<Vec<_>>();
1195 for file in &chunk.files {
1196 plan.data_files_total += 1;
1197 match valid_manifest_data_file_path(file, &allowed_prefixes) {
1198 Some(path) => plan.files_to_check.push(ChunkFile {
1199 chunk_id: chunk.id,
1200 path: path.to_string(),
1201 }),
1202 None => plan.problems.push(VerifyProblem {
1203 severity: VerifySeverity::Error,
1204 message: format!(
1205 "Chunk {}: invalid data file path '{}'",
1206 chunk.id, file
1207 ),
1208 }),
1209 }
1210 }
1211 }
1212 ChunkStatus::Skipped => {
1213 if !chunk.files.is_empty() {
1214 plan.problems.push(VerifyProblem {
1215 severity: VerifySeverity::Error,
1216 message: format!(
1217 "Chunk {}: skipped chunk should not list data files",
1218 chunk.id
1219 ),
1220 });
1221 }
1222 }
1223 ChunkStatus::Pending => {
1224 plan.problems.push(VerifyProblem {
1225 severity: VerifySeverity::Error,
1226 message: format!("Chunk {}: status is 'pending'", chunk.id),
1227 });
1228 }
1229 ChunkStatus::InProgress => {
1230 plan.problems.push(VerifyProblem {
1231 severity: VerifySeverity::Error,
1232 message: format!("Chunk {}: status is 'in_progress'", chunk.id),
1233 });
1234 }
1235 ChunkStatus::Failed => {
1236 let reason = chunk.error.as_deref().unwrap_or("unknown error");
1237 plan.problems.push(VerifyProblem {
1238 severity: VerifySeverity::Error,
1239 message: format!("Chunk {}: status is 'failed' (error: {})", chunk.id, reason),
1240 });
1241 }
1242 }
1243 }
1244
1245 plan
1246}
1247
1248async fn scan_data_files(storage: &OpenDalStorage) -> Result<VerifyDataScan> {
1251 let existing_data_files = storage
1252 .list_files_recursive("data/")
1253 .await?
1254 .into_iter()
1255 .collect();
1256 Ok(VerifyDataScan {
1257 existing_data_files,
1258 })
1259}
1260
1261fn reconcile_plan_with_scan(plan: VerifyPlan, scan: &VerifyDataScan) -> VerifyOutcome {
1267 let mut problems = plan.problems;
1268 let mut data_files_verified = 0;
1269
1270 for file in &plan.files_to_check {
1271 if scan.existing_data_files.contains(&file.path) {
1272 data_files_verified += 1;
1273 } else {
1274 problems.push(VerifyProblem {
1275 severity: VerifySeverity::Error,
1276 message: format!("Chunk {}: missing file '{}'", file.chunk_id, file.path),
1277 });
1278 }
1279 }
1280
1281 let mut orphans: Vec<&String> = scan
1282 .existing_data_files
1283 .iter()
1284 .filter(|path| !plan.claimed_data_files.contains(*path))
1285 .collect();
1286 orphans.sort();
1287 for path in orphans {
1288 problems.push(VerifyProblem {
1289 severity: VerifySeverity::Error,
1290 message: format!("Unexpected data file '{}' is not listed in manifest", path),
1291 });
1292 }
1293
1294 VerifyOutcome {
1295 data_files_total: plan.data_files_total,
1296 data_files_verified,
1297 problems,
1298 }
1299}
1300
1301fn valid_manifest_data_file_path<'a>(
1302 path: &'a str,
1303 allowed_prefixes: &[String],
1304) -> Option<&'a str> {
1305 let normalized = safe_manifest_data_file_path(path)?;
1306
1307 if !allowed_prefixes
1308 .iter()
1309 .any(|prefix| normalized.starts_with(prefix))
1310 {
1311 return None;
1312 }
1313
1314 Some(normalized)
1315}
1316
/// Normalizes a manifest data-file path and rejects anything unsafe.
///
/// Leading `/` characters are stripped. The remainder must start with
/// `data/` and must not contain an empty, `.` or `..` segment (so doubled or
/// trailing slashes and directory traversal are refused).
///
/// Returns the stripped path on success, `None` otherwise.
fn safe_manifest_data_file_path(path: &str) -> Option<&str> {
    let relative = path.trim_start_matches('/');
    // An empty string fails this check too, so no separate emptiness test.
    if !relative.starts_with("data/") {
        return None;
    }

    let has_unsafe_segment = relative
        .split('/')
        .any(|segment| matches!(segment, "" | "." | ".."));

    (!has_unsafe_segment).then_some(relative)
}
1332
1333fn print_verify_report(snapshot: &str, report: &VerifyReport) {
1334 println!("Verifying snapshot: {}", report.manifest.snapshot_id);
1335 println!(" Location: {}", snapshot);
1336 if report.manifest.version == MANIFEST_VERSION {
1337 println!(" Manifest: OK (version {})", report.manifest.version);
1338 } else {
1339 println!(
1340 " Manifest: ERROR (version {}, expected {})",
1341 report.manifest.version, MANIFEST_VERSION
1342 );
1343 }
1344 println!(
1345 " Schema files: {}",
1346 if report.schema_index_exists {
1347 format!("OK ({})", SCHEMAS_FILE)
1348 } else {
1349 format!("WARN (missing {})", SCHEMAS_FILE)
1350 }
1351 );
1352 if report.ddl_file_count > 0 {
1353 println!(" DDL files: {} file(s) found", report.ddl_file_count);
1354 } else {
1355 println!(" DDL files: not present");
1356 }
1357
1358 let chunks = &report.chunk_summary;
1359 println!(
1360 " Chunks: {} total ({} completed, {} skipped, {} pending, {} in_progress, {} failed)",
1361 chunks.total,
1362 chunks.completed,
1363 chunks.skipped,
1364 chunks.pending,
1365 chunks.in_progress,
1366 chunks.failed
1367 );
1368
1369 if report.manifest.schema_only {
1370 println!(" Data files: skipped (schema-only)");
1371 } else {
1372 println!(
1373 " Data files: {}/{} files verified",
1374 report.data_files_verified, report.data_files_total
1375 );
1376 }
1377
1378 if report.problems.is_empty() {
1379 println!();
1380 println!("Snapshot is valid.");
1381 return;
1382 }
1383
1384 println!();
1385 println!("Problems found:");
1386 for problem in &report.problems {
1387 println!(" [{}] {}", problem.severity.as_str(), problem.message);
1388 }
1389 println!();
1390 println!(
1391 "Snapshot has {} error(s), {} warning(s).",
1392 report.error_count(),
1393 report.warning_count()
1394 );
1395}
1396
1397fn print_delete_summary(snapshot: &str, manifest: &Manifest) {
1398 println!("Snapshot: {}", manifest.snapshot_id);
1399 println!(" Location: {}", snapshot);
1400 println!(
1401 " Created: {} UTC",
1402 manifest.created_at.format("%Y-%m-%d %H:%M:%S")
1403 );
1404 println!(" Catalog: {}", manifest.catalog);
1405 println!(" Schemas: {}", manifest.schemas.join(", "));
1406 println!(" Chunks: {}", format_delete_chunks(manifest));
1407}
1408
1409fn format_delete_chunks(manifest: &Manifest) -> String {
1410 if manifest.schema_only {
1411 return "0 (schema-only)".to_string();
1412 }
1413
1414 let summary = summarize_chunks(manifest);
1415 if manifest.is_complete() {
1416 format!("{} (all processed)", summary.total)
1417 } else {
1418 format!(
1419 "{} ({} completed, {} skipped, {} pending, {} in_progress, {} failed)",
1420 summary.total,
1421 summary.completed,
1422 summary.skipped,
1423 summary.pending,
1424 summary.in_progress,
1425 summary.failed
1426 )
1427 }
1428}
1429
1430fn confirm_delete(snapshot: &str) -> Result<bool> {
1431 println!();
1432 println!(
1433 "Warning: this removes the entire snapshot directory/prefix, not only files listed in manifest."
1434 );
1435 println!("This will permanently delete all data under:");
1436 println!(" {}", display_snapshot_prefix(snapshot));
1437 print!("Type 'yes' to confirm deletion: ");
1438 io::stdout().flush().map_err(|error| {
1439 IoSnafu {
1440 operation: "flushing delete confirmation prompt",
1441 error,
1442 }
1443 .build()
1444 })?;
1445
1446 let mut input = String::new();
1447 io::stdin().read_line(&mut input).map_err(|error| {
1448 IoSnafu {
1449 operation: "reading delete confirmation",
1450 error,
1451 }
1452 .build()
1453 })?;
1454
1455 Ok(delete_confirmation_matches(&input))
1456}
1457
/// Returns `true` only when `input`, ignoring surrounding whitespace, is
/// exactly the lowercase word `yes`.
fn delete_confirmation_matches(input: &str) -> bool {
    matches!(input.trim(), "yes")
}
1461
/// Returns `snapshot` as an owned string that ends with `/`, appending the
/// slash only when it is not already there.
fn display_snapshot_prefix(snapshot: &str) -> String {
    let mut prefix = String::from(snapshot);
    if !prefix.ends_with('/') {
        prefix.push('/');
    }
    prefix
}
1469
1470#[cfg(test)]
1471mod tests {
1472 use chrono::TimeZone;
1473 use clap::Parser;
1474 use tempfile::tempdir;
1475 use url::Url;
1476
1477 use super::*;
1478 use crate::data::path::ddl_path_for_schema;
1479
#[test]
fn test_ddl_path_for_schema() {
    // (schema name, expected DDL path). The second case expects the
    // traversal characters to come back percent-encoded.
    let cases = [
        ("public", "schema/ddl/public.sql"),
        ("../evil", "schema/ddl/%2E%2E%2Fevil.sql"),
    ];

    for (schema, expected) in cases {
        assert_eq!(ddl_path_for_schema(schema), expected);
    }
}
1488
#[test]
fn test_build_schema_ddl_order() {
    let ddl = build_schema_ddl(
        "public",
        "CREATE DATABASE public;\n".to_string(),
        vec!["PHYSICAL;\n".to_string()],
        vec!["TABLE;\n".to_string()],
        vec!["VIEW;\n".to_string()],
    );

    // Each marker must be present, and they must appear in this order.
    let offset_of = |marker: &str| ddl.find(marker).unwrap();
    let offsets = [
        offset_of("CREATE DATABASE"),
        offset_of("PHYSICAL;"),
        offset_of("TABLE;"),
        offset_of("VIEW;"),
    ];
    assert!(offsets[0] < offsets[1]);
    assert!(offsets[1] < offsets[2]);
    assert!(offsets[2] < offsets[3]);
}
1507
1508 #[tokio::test]
1509 async fn test_build_rejects_chunk_window_without_bounds() {
1510 let cmd = ExportCreateCommand::parse_from([
1511 "export-v2-create",
1512 "--addr",
1513 "127.0.0.1:4000",
1514 "--to",
1515 "file:///tmp/export-v2-test",
1516 "--chunk-time-window",
1517 "1h",
1518 ]);
1519
1520 let result = cmd.build().await;
1521 assert!(result.is_err());
1522 let error = result.err().unwrap().to_string();
1523
1524 assert!(error.contains("chunk_time_window requires both --start-time and --end-time"));
1525 }
1526
1527 #[tokio::test]
1528 async fn test_build_rejects_data_export_args_in_schema_only_mode() {
1529 let cmd = ExportCreateCommand::parse_from([
1530 "export-v2-create",
1531 "--addr",
1532 "127.0.0.1:4000",
1533 "--to",
1534 "file:///tmp/export-v2-test",
1535 "--schema-only",
1536 "--start-time",
1537 "2024-01-01T00:00:00Z",
1538 "--end-time",
1539 "2024-01-02T00:00:00Z",
1540 "--chunk-time-window",
1541 "1h",
1542 "--format",
1543 "csv",
1544 "--parallelism",
1545 "2",
1546 ]);
1547
1548 let error = cmd.build().await.err().unwrap().to_string();
1549
1550 assert!(error.contains("--schema-only cannot be used with data export arguments"));
1551 assert!(error.contains("--start-time"));
1552 assert!(error.contains("--end-time"));
1553 assert!(error.contains("--chunk-time-window"));
1554 assert!(error.contains("--format"));
1555 assert!(error.contains("--parallelism"));
1556 }
1557
#[test]
fn test_schema_only_mode_mismatch_error_message() {
    let mismatch = crate::data::export_v2::error::SchemaOnlyModeMismatchSnafu {
        existing_schema_only: false,
        requested_schema_only: true,
    }
    .build();
    let message = mismatch.to_string();

    // Both sides of the mismatch must be visible in the rendered message.
    for fragment in ["existing: false", "requested: true"] {
        assert!(message.contains(fragment));
    }
}
1570
#[test]
fn test_validate_resume_config_rejects_catalog_mismatch() {
    let existing = Manifest::new_for_export(
        "greptime".to_string(),
        vec!["public".to_string()],
        false,
        TimeRange::unbounded(),
        DataFormat::Parquet,
        None,
    )
    .unwrap();

    // Identical to the manifest except for the catalog name.
    let requested = ExportConfig {
        catalog: "other".to_string(),
        schemas: None,
        schema_only: false,
        format: DataFormat::Parquet,
        force: false,
        time_range: TimeRange::unbounded(),
        chunk_time_window: None,
        parallelism: 1,
        snapshot_uri: "file:///tmp/snapshot".to_string(),
        storage_config: ObjectStoreConfig::default(),
    };

    let rejection = validate_resume_config(&existing, &requested).err();
    let message = rejection.unwrap().to_string();
    assert!(message.contains("catalog"));
}
1601
#[test]
fn test_validate_resume_config_accepts_schema_selection_with_different_case_and_order() {
    let existing = Manifest::new_for_export(
        "greptime".to_string(),
        vec!["public".to_string(), "analytics".to_string()],
        false,
        TimeRange::unbounded(),
        DataFormat::Parquet,
        None,
    )
    .unwrap();

    // Same schemas as the manifest, but reordered, upper-cased and with a
    // duplicate entry.
    let requested_schemas = vec![
        "ANALYTICS".to_string(),
        "PUBLIC".to_string(),
        "public".to_string(),
    ];
    let requested = ExportConfig {
        catalog: "greptime".to_string(),
        schemas: Some(requested_schemas),
        schema_only: false,
        format: DataFormat::Parquet,
        force: false,
        time_range: TimeRange::unbounded(),
        chunk_time_window: None,
        parallelism: 1,
        snapshot_uri: "file:///tmp/snapshot".to_string(),
        storage_config: ObjectStoreConfig::default(),
    };

    let verdict = validate_resume_config(&existing, &requested);
    assert!(verdict.is_ok());
}
1632
#[test]
fn test_validate_resume_config_rejects_chunk_plan_mismatch() {
    let window_start = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
    let window_end = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 2, 0, 0).unwrap();
    let time_range = TimeRange::new(Some(window_start), Some(window_end));

    // The manifest is created without a chunk window ...
    let existing = Manifest::new_for_export(
        "greptime".to_string(),
        vec!["public".to_string()],
        false,
        time_range.clone(),
        DataFormat::Parquet,
        None,
    )
    .unwrap();

    // ... while the resume request asks for one-hour chunks over the same range.
    let requested = ExportConfig {
        catalog: "greptime".to_string(),
        schemas: None,
        schema_only: false,
        format: DataFormat::Parquet,
        force: false,
        time_range,
        chunk_time_window: Some(Duration::from_secs(3600)),
        parallelism: 1,
        snapshot_uri: "file:///tmp/snapshot".to_string(),
        storage_config: ObjectStoreConfig::default(),
    };

    let rejection = validate_resume_config(&existing, &requested).err();
    let message = rejection.unwrap().to_string();
    assert!(message.contains("chunk plan"));
}
1666
#[test]
fn test_validate_resume_config_rejects_format_mismatch() {
    let existing = Manifest::new_for_export(
        "greptime".to_string(),
        vec!["public".to_string()],
        false,
        TimeRange::unbounded(),
        DataFormat::Parquet,
        None,
    )
    .unwrap();

    // Parquet in the manifest, CSV in the resume request.
    let requested = ExportConfig {
        catalog: "greptime".to_string(),
        schemas: None,
        schema_only: false,
        format: DataFormat::Csv,
        force: false,
        time_range: TimeRange::unbounded(),
        chunk_time_window: None,
        parallelism: 1,
        snapshot_uri: "file:///tmp/snapshot".to_string(),
        storage_config: ObjectStoreConfig::default(),
    };

    let rejection = validate_resume_config(&existing, &requested).err();
    let message = rejection.unwrap().to_string();
    assert!(message.contains("format"));
}
1697
#[test]
fn test_validate_resume_config_rejects_time_range_mismatch() {
    let range_start = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
    let range_end = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 1, 0, 0).unwrap();

    let existing = Manifest::new_for_export(
        "greptime".to_string(),
        vec!["public".to_string()],
        false,
        TimeRange::new(Some(range_start), Some(range_end)),
        DataFormat::Parquet,
        None,
    )
    .unwrap();

    // Same start, but the requested range ends at the start instant.
    let requested = ExportConfig {
        catalog: "greptime".to_string(),
        schemas: None,
        schema_only: false,
        format: DataFormat::Parquet,
        force: false,
        time_range: TimeRange::new(Some(range_start), Some(range_start)),
        chunk_time_window: None,
        parallelism: 1,
        snapshot_uri: "file:///tmp/snapshot".to_string(),
        storage_config: ObjectStoreConfig::default(),
    };

    let rejection = validate_resume_config(&existing, &requested).err();
    let message = rejection.unwrap().to_string();
    assert!(message.contains("time_range"));
}
1730
1731 #[tokio::test]
1732 async fn test_scan_snapshots_sorts_and_tracks_unreadable_manifests() {
1733 let dir = tempdir().unwrap();
1734 write_test_manifest(
1735 dir.path(),
1736 "older",
1737 test_manifest(
1738 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1739 false,
1740 true,
1741 ),
1742 );
1743 write_test_manifest(
1744 dir.path(),
1745 "newer",
1746 test_manifest(
1747 chrono::Utc.with_ymd_and_hms(2026, 2, 1, 0, 0, 0).unwrap(),
1748 false,
1749 true,
1750 ),
1751 );
1752
1753 std::fs::create_dir_all(dir.path().join("empty-dir")).unwrap();
1754 std::fs::create_dir_all(dir.path().join("not-snapshot")).unwrap();
1755 std::fs::write(dir.path().join("not-snapshot").join("data.txt"), "x").unwrap();
1756 std::fs::create_dir_all(dir.path().join("broken")).unwrap();
1757 std::fs::write(dir.path().join("broken").join(MANIFEST_FILE), "{not-json").unwrap();
1758
1759 let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
1760 let storage = OpenDalStorage::from_file_uri(&uri).unwrap();
1761 let result = scan_snapshots(&storage).await.unwrap();
1762
1763 assert_eq!(result.snapshots.len(), 2);
1764 assert_eq!(
1765 result.snapshots[0].manifest.created_at,
1766 chrono::Utc.with_ymd_and_hms(2026, 2, 1, 0, 0, 0).unwrap()
1767 );
1768 assert_eq!(
1769 result.snapshots[1].manifest.created_at,
1770 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap()
1771 );
1772 assert_eq!(result.unreadable, vec!["broken/".to_string()]);
1773 assert_eq!(result.snapshots[0].path, "newer/");
1774 assert_eq!(result.snapshots[1].path, "older/");
1775 }
1776
#[test]
fn test_snapshot_list_status_and_chunk_summary() {
    let created_at = chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap();

    let schema_only_manifest = test_manifest(created_at, true, true);
    assert_eq!(snapshot_status(&schema_only_manifest), "schema-only");
    assert_eq!(format_list_chunks(&schema_only_manifest), "0");

    let complete_manifest = test_manifest(created_at, false, true);
    assert_eq!(snapshot_status(&complete_manifest), "complete");
    assert_eq!(format_list_chunks(&complete_manifest), "2/2");
    assert_eq!(
        format_delete_chunks(&complete_manifest),
        "2 (all processed)"
    );

    let incomplete_manifest = test_manifest(created_at, false, false);
    assert_eq!(snapshot_status(&incomplete_manifest), "incomplete");
    assert_eq!(format_list_chunks(&incomplete_manifest), "1/2");
    assert_eq!(
        format_delete_chunks(&incomplete_manifest),
        "2 (1 completed, 0 skipped, 1 pending, 0 in_progress, 0 failed)"
    );
}
1808
1809 #[tokio::test]
1810 async fn test_delete_build_rejects_bucket_root_uri() {
1811 let cmd = ExportDeleteCommand::parse_from([
1812 "export-v2-delete",
1813 "--snapshot",
1814 "s3://bucket",
1815 "--no-confirm",
1816 ]);
1817
1818 let error = cmd.build().await.err().unwrap().to_string();
1819 assert!(error.contains("non-empty path"));
1820 }
1821
#[test]
fn test_delete_skip_confirmation_aliases() {
    // Both spellings of the flag must set `skip_confirmation`.
    for flag in ["--no-confirm", "--yes"] {
        let parsed = ExportDeleteCommand::parse_from([
            "export-v2-delete",
            "--snapshot",
            "s3://bucket/snapshot",
            flag,
        ]);
        assert!(parsed.skip_confirmation);
    }
}
1840
1841 #[tokio::test]
1842 async fn test_delete_snapshot_with_no_confirm_removes_snapshot_contents() {
1843 let parent = tempdir().unwrap();
1844 let snapshot = parent.path().join("snapshot");
1845 let sibling = parent.path().join("sibling");
1846 std::fs::create_dir_all(&snapshot).unwrap();
1847 std::fs::create_dir_all(&sibling).unwrap();
1848 std::fs::write(sibling.join("keep.txt"), b"keep").unwrap();
1849 write_root_manifest(
1850 &snapshot,
1851 test_manifest(
1852 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1853 true,
1854 true,
1855 ),
1856 );
1857 write_snapshot_file(&snapshot, "schema/schemas.json", b"[]");
1858
1859 let uri = Url::from_directory_path(&snapshot).unwrap().to_string();
1860 let delete = ExportDelete {
1861 snapshot: uri,
1862 skip_confirmation: true,
1863 storage: file_storage_for_dir(&snapshot),
1864 };
1865
1866 delete
1867 .run_with_confirmation(|_| unreachable!())
1868 .await
1869 .unwrap();
1870
1871 assert!(!snapshot.join(MANIFEST_FILE).exists());
1872 assert!(!snapshot.join("schema/schemas.json").exists());
1873 assert!(sibling.join("keep.txt").exists());
1874 }
1875
1876 #[tokio::test]
1877 async fn test_delete_snapshot_requires_manifest() {
1878 let dir = tempdir().unwrap();
1879 let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
1880 let delete = ExportDelete {
1881 snapshot: uri,
1882 skip_confirmation: true,
1883 storage: file_storage_for_dir(dir.path()),
1884 };
1885
1886 let error = delete
1887 .run_with_confirmation(|_| unreachable!())
1888 .await
1889 .err()
1890 .unwrap()
1891 .to_string();
1892
1893 assert!(error.contains("Snapshot not found"));
1894 assert!(dir.path().exists());
1895 }
1896
1897 #[tokio::test]
1898 async fn test_delete_snapshot_cancels_without_exact_confirmation() {
1899 let dir = tempdir().unwrap();
1900 write_root_manifest(
1901 dir.path(),
1902 test_manifest(
1903 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1904 true,
1905 true,
1906 ),
1907 );
1908 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
1909 let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
1910 let delete = ExportDelete {
1911 snapshot: uri.clone(),
1912 skip_confirmation: false,
1913 storage: file_storage_for_dir(dir.path()),
1914 };
1915
1916 delete
1917 .run_with_confirmation(|snapshot| {
1918 assert_eq!(snapshot, uri);
1919 Ok(false)
1920 })
1921 .await
1922 .unwrap();
1923
1924 assert!(dir.path().join(MANIFEST_FILE).exists());
1925 assert!(dir.path().join("schema/schemas.json").exists());
1926 }
1927
#[test]
fn test_delete_confirmation_requires_exact_yes() {
    // Surrounding whitespace is tolerated ...
    for accepted in ["yes", " yes\n"] {
        assert!(delete_confirmation_matches(accepted));
    }
    // ... but case changes, abbreviations and extra words are not.
    for rejected in ["YES", "y", "yes please"] {
        assert!(!delete_confirmation_matches(rejected));
    }
}
1936
#[test]
fn test_display_snapshot_prefix_adds_trailing_slash() {
    // With or without a trailing slash, the result ends in exactly one.
    let expected = "s3://bucket/snapshot/";
    for input in ["s3://bucket/snapshot", "s3://bucket/snapshot/"] {
        assert_eq!(display_snapshot_prefix(input), expected);
    }
}
1948
1949 #[tokio::test]
1950 async fn test_verify_snapshot_accepts_valid_full_snapshot() {
1951 let dir = tempdir().unwrap();
1952 let manifest = test_manifest(
1953 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1954 false,
1955 true,
1956 );
1957 write_root_manifest(dir.path(), manifest);
1958 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
1959 write_default_ddl_files(dir.path());
1960 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
1961
1962 let storage = file_storage_for_dir(dir.path());
1963 let report = verify_snapshot(&storage).await.unwrap();
1964
1965 assert_eq!(report.error_count(), 0);
1966 assert_eq!(report.warning_count(), 0);
1967 assert_eq!(report.data_files_total, 1);
1968 assert_eq!(report.data_files_verified, 1);
1969 }
1970
1971 #[tokio::test]
1972 async fn test_verify_snapshot_reports_missing_data_file_and_failed_chunk() {
1973 let dir = tempdir().unwrap();
1974 let mut manifest = test_manifest(
1975 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
1976 false,
1977 true,
1978 );
1979 manifest.chunks[1].mark_failed("copy failed".to_string());
1980 write_root_manifest(dir.path(), manifest);
1981 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
1982 write_default_ddl_files(dir.path());
1983
1984 let storage = file_storage_for_dir(dir.path());
1985 let report = verify_snapshot(&storage).await.unwrap();
1986
1987 assert_eq!(report.error_count(), 2);
1988 assert!(
1989 report
1990 .problems
1991 .iter()
1992 .any(|problem| problem.message.contains("missing file"))
1993 );
1994 assert!(
1995 report
1996 .problems
1997 .iter()
1998 .any(|problem| problem.message.contains("status is 'failed'"))
1999 );
2000 }
2001
2002 #[tokio::test]
2003 async fn test_verify_snapshot_reports_missing_schema_index_as_warning() {
2004 let dir = tempdir().unwrap();
2005 let manifest = test_manifest(
2006 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2007 false,
2008 true,
2009 );
2010 write_root_manifest(dir.path(), manifest);
2011 write_default_ddl_files(dir.path());
2012 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2013
2014 let storage = file_storage_for_dir(dir.path());
2015 let report = verify_snapshot(&storage).await.unwrap();
2016
2017 assert_eq!(report.error_count(), 0);
2018 assert_eq!(report.warning_count(), 1);
2019 assert!(
2020 report
2021 .problems
2022 .iter()
2023 .any(|problem| problem.message.contains("Missing schema index"))
2024 );
2025 }
2026
2027 #[tokio::test]
2028 async fn test_verify_snapshot_rejects_schema_only_snapshot_with_chunks() {
2029 let dir = tempdir().unwrap();
2030 let mut manifest = test_manifest(
2031 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2032 true,
2033 true,
2034 );
2035 let mut chunk = ChunkMeta::new(1, TimeRange::unbounded());
2036 chunk.mark_completed(vec!["data/public/1/file.parquet".to_string()], None);
2037 manifest.chunks.push(chunk);
2038 write_root_manifest(dir.path(), manifest);
2039 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2040 write_default_ddl_files(dir.path());
2041
2042 let storage = file_storage_for_dir(dir.path());
2043 let report = verify_snapshot(&storage).await.unwrap();
2044
2045 assert_eq!(report.error_count(), 1);
2046 assert_eq!(report.data_files_total, 0);
2047 assert!(
2048 report
2049 .problems
2050 .iter()
2051 .any(|problem| problem.message.contains("should not contain data chunks"))
2052 );
2053 }
2054
2055 #[tokio::test]
2056 async fn test_verify_snapshot_rejects_schema_only_snapshot_with_data_files() {
2057 let dir = tempdir().unwrap();
2058 let manifest = test_manifest(
2059 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2060 true,
2061 true,
2062 );
2063 write_root_manifest(dir.path(), manifest);
2064 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2065 write_default_ddl_files(dir.path());
2066 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2067
2068 let storage = file_storage_for_dir(dir.path());
2069 let report = verify_snapshot(&storage).await.unwrap();
2070
2071 assert_eq!(report.error_count(), 1);
2072 assert_eq!(report.data_files_total, 0);
2073 assert!(
2074 report
2075 .problems
2076 .iter()
2077 .any(|problem| problem.message.contains("should not contain data files"))
2078 );
2079 }
2080
2081 #[tokio::test]
2082 async fn test_verify_snapshot_rejects_full_snapshot_without_chunks() {
2083 let dir = tempdir().unwrap();
2084 let mut manifest = test_manifest(
2085 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2086 false,
2087 true,
2088 );
2089 manifest.chunks.clear();
2090 write_root_manifest(dir.path(), manifest);
2091 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2092 write_default_ddl_files(dir.path());
2093
2094 let storage = file_storage_for_dir(dir.path());
2095 let report = verify_snapshot(&storage).await.unwrap();
2096
2097 assert_eq!(report.error_count(), 1);
2098 assert_eq!(report.data_files_total, 0);
2099 assert!(
2100 report
2101 .problems
2102 .iter()
2103 .any(|problem| problem.message.contains("at least one data chunk"))
2104 );
2105 }
2106
2107 #[tokio::test]
2108 async fn test_verify_snapshot_rejects_skipped_chunk_data_files() {
2109 let dir = tempdir().unwrap();
2110 let manifest = test_manifest(
2111 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2112 false,
2113 true,
2114 );
2115 write_root_manifest(dir.path(), manifest);
2116 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2117 write_default_ddl_files(dir.path());
2118 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2119 write_snapshot_file(dir.path(), "data/public/2/file.parquet", b"data");
2120
2121 let storage = file_storage_for_dir(dir.path());
2122 let report = verify_snapshot(&storage).await.unwrap();
2123
2124 assert_eq!(report.error_count(), 1);
2125 assert!(
2126 report
2127 .problems
2128 .iter()
2129 .any(|problem| { problem.message.contains("Unexpected data file") })
2130 );
2131 }
2132
2133 #[tokio::test]
2134 async fn test_verify_snapshot_rejects_duplicate_chunk_ids() {
2135 let dir = tempdir().unwrap();
2136 let mut manifest = test_manifest(
2137 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2138 false,
2139 true,
2140 );
2141 let mut duplicate = ChunkMeta::new(1, TimeRange::unbounded());
2142 duplicate.mark_completed(vec!["data/public/1/file.parquet".to_string()], None);
2143 manifest.chunks.push(duplicate);
2144 write_root_manifest(dir.path(), manifest);
2145 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2146 write_default_ddl_files(dir.path());
2147 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2148
2149 let storage = file_storage_for_dir(dir.path());
2150 let report = verify_snapshot(&storage).await.unwrap();
2151
2152 assert_eq!(report.error_count(), 1);
2153 assert!(
2154 report
2155 .problems
2156 .iter()
2157 .any(|problem| problem.message.contains("duplicate chunk id"))
2158 );
2159 }
2160
2161 #[tokio::test]
2162 async fn test_verify_snapshot_requires_all_schema_ddl() {
2163 let dir = tempdir().unwrap();
2164 let manifest = test_manifest(
2165 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2166 true,
2167 true,
2168 );
2169 write_root_manifest(dir.path(), manifest);
2170 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2171 write_snapshot_file(
2172 dir.path(),
2173 "schema/ddl/public.sql",
2174 b"CREATE DATABASE public;",
2175 );
2176
2177 let storage = file_storage_for_dir(dir.path());
2178 let report = verify_snapshot(&storage).await.unwrap();
2179
2180 assert_eq!(report.error_count(), 1);
2181 assert!(
2182 report
2183 .problems
2184 .iter()
2185 .any(|problem| problem.message.contains("analytics"))
2186 );
2187 }
2188
2189 #[tokio::test]
2190 async fn test_verify_snapshot_reports_missing_ddl_dir() {
2191 let dir = tempdir().unwrap();
2192 let manifest = test_manifest(
2193 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2194 false,
2195 true,
2196 );
2197 write_root_manifest(dir.path(), manifest);
2198 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2199 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2200
2201 let storage = file_storage_for_dir(dir.path());
2202 let report = verify_snapshot(&storage).await.unwrap();
2203
2204 assert_eq!(report.error_count(), 2);
2205 assert!(
2206 report
2207 .problems
2208 .iter()
2209 .any(|problem| problem.message.contains("schema/ddl/public.sql"))
2210 );
2211 assert!(
2212 report
2213 .problems
2214 .iter()
2215 .any(|problem| problem.message.contains("schema/ddl/analytics.sql"))
2216 );
2217 }
2218
2219 #[tokio::test]
2220 async fn test_verify_snapshot_reports_manifest_version_mismatch() {
2221 let dir = tempdir().unwrap();
2222 let mut manifest = test_manifest(
2223 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2224 false,
2225 true,
2226 );
2227 manifest.version = MANIFEST_VERSION + 1;
2228 write_root_manifest(dir.path(), manifest);
2229 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2230 write_default_ddl_files(dir.path());
2231 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2232
2233 let storage = file_storage_for_dir(dir.path());
2234 let report = verify_snapshot(&storage).await.unwrap();
2235
2236 assert_eq!(report.error_count(), 1);
2237 assert!(
2238 report
2239 .problems
2240 .iter()
2241 .any(|problem| problem.message.contains("Manifest version mismatch"))
2242 );
2243 }
2244
2245 #[tokio::test]
2246 async fn test_verify_snapshot_rejects_invalid_data_file_paths() {
2247 let dir = tempdir().unwrap();
2248 let mut manifest = test_manifest(
2249 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2250 false,
2251 true,
2252 );
2253 manifest.chunks[0].files = vec!["data/public/1/../file.parquet".to_string()];
2254 write_root_manifest(dir.path(), manifest);
2255 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2256 write_default_ddl_files(dir.path());
2257
2258 let storage = file_storage_for_dir(dir.path());
2259 let report = verify_snapshot(&storage).await.unwrap();
2260
2261 assert_eq!(report.error_count(), 1);
2262 assert!(
2263 report
2264 .problems
2265 .iter()
2266 .any(|problem| problem.message.contains("invalid data file path"))
2267 );
2268 assert_eq!(report.data_files_verified, 0);
2269 }
2270
2271 #[tokio::test]
2272 async fn test_verify_snapshot_accepts_leading_slash_manifest_data_paths() {
2273 let dir = tempdir().unwrap();
2274 let mut manifest = test_manifest(
2275 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2276 false,
2277 true,
2278 );
2279 manifest.chunks[0].files = vec!["/data/public/1/file.parquet".to_string()];
2280 write_root_manifest(dir.path(), manifest);
2281 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2282 write_default_ddl_files(dir.path());
2283 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2284
2285 let storage = file_storage_for_dir(dir.path());
2286 let report = verify_snapshot(&storage).await.unwrap();
2287
2288 assert_eq!(report.error_count(), 0);
2289 assert_eq!(report.data_files_verified, 1);
2290 }
2291
2292 #[tokio::test]
2293 async fn test_verify_snapshot_rejects_unlisted_files_under_completed_chunk_prefix() {
2294 let dir = tempdir().unwrap();
2295 let manifest = test_manifest(
2296 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2297 false,
2298 true,
2299 );
2300 write_root_manifest(dir.path(), manifest);
2301 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2302 write_default_ddl_files(dir.path());
2303 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2304 write_snapshot_file(dir.path(), "data/public/1/extra.parquet", b"data");
2305
2306 let storage = file_storage_for_dir(dir.path());
2307 let report = verify_snapshot(&storage).await.unwrap();
2308
2309 assert_eq!(report.error_count(), 1);
2310 assert!(
2311 report
2312 .problems
2313 .iter()
2314 .any(|problem| problem.message.contains("Unexpected data file"))
2315 );
2316 assert_eq!(report.data_files_verified, 1);
2317 }
2318
2319 #[tokio::test]
2320 async fn test_verify_snapshot_rejects_orphan_data_files_outside_known_chunk_prefixes() {
2321 let dir = tempdir().unwrap();
2322 let manifest = test_manifest(
2323 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2324 false,
2325 true,
2326 );
2327 write_root_manifest(dir.path(), manifest);
2328 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2329 write_default_ddl_files(dir.path());
2330 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2331 write_snapshot_file(dir.path(), "data/public/99/file.parquet", b"data");
2332
2333 let storage = file_storage_for_dir(dir.path());
2334 let report = verify_snapshot(&storage).await.unwrap();
2335
2336 assert_eq!(report.error_count(), 1);
2337 assert!(
2338 report
2339 .problems
2340 .iter()
2341 .any(|problem| problem.message.contains("Unexpected data file"))
2342 );
2343 assert_eq!(report.data_files_verified, 1);
2344 }
2345
2346 #[tokio::test]
2347 async fn test_verify_snapshot_rejects_data_files_under_wrong_chunk_or_schema() {
2348 let dir = tempdir().unwrap();
2349 let mut manifest = test_manifest(
2350 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2351 false,
2352 true,
2353 );
2354 manifest.chunks[0].files = vec![
2355 "data/public/99/file.parquet".to_string(),
2356 "data/metrics/1/file.parquet".to_string(),
2357 ];
2358 write_root_manifest(dir.path(), manifest);
2359 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2360 write_default_ddl_files(dir.path());
2361 write_snapshot_file(dir.path(), "data/public/99/file.parquet", b"data");
2362 write_snapshot_file(dir.path(), "data/metrics/1/file.parquet", b"data");
2363
2364 let storage = file_storage_for_dir(dir.path());
2365 let report = verify_snapshot(&storage).await.unwrap();
2366
2367 assert_eq!(report.error_count(), 2);
2368 assert_eq!(report.data_files_verified, 0);
2369 assert!(
2370 report
2371 .problems
2372 .iter()
2373 .all(|problem| problem.message.contains("invalid data file path"))
2374 );
2375 }
2376
/// `build_verify_plan` is given only the manifest: the completed chunk contributes its file to
/// the plan, and the failed and freshly created chunks are expected to surface as problems.
#[test]
fn test_build_verify_plan_classifies_chunks_without_io() {
    let mut manifest = test_manifest(
        chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
        false,
        true,
    );

    // Chunk 3 is explicitly failed; chunk 4 is left exactly as `ChunkMeta::new` creates it.
    let mut failed_chunk = ChunkMeta::new(3, TimeRange::unbounded());
    failed_chunk.mark_failed("boom".to_string());
    let untouched_chunk = ChunkMeta::new(4, TimeRange::unbounded());
    manifest.chunks.extend([failed_chunk, untouched_chunk]);

    let plan = build_verify_plan(&manifest);

    // Only the file of completed chunk 1 is scheduled for checking.
    assert_eq!(plan.files_to_check.len(), 1);
    let scheduled = &plan.files_to_check[0];
    assert_eq!(scheduled.chunk_id, 1);
    assert_eq!(scheduled.path, "data/public/1/file.parquet");
    assert_eq!(plan.data_files_total, 1);
    assert!(
        plan.claimed_data_files
            .contains("data/public/1/file.parquet")
    );

    // One problem per unfinished chunk.
    let has_problem = |needle: &str| {
        plan.problems
            .iter()
            .any(|entry| entry.message.contains(needle))
    };
    assert_eq!(plan.problems.len(), 2);
    assert!(has_problem("status is 'failed'"));
    assert!(has_problem("status is 'pending'"));
}
2414
2415 #[tokio::test]
2416 async fn test_verify_snapshot_produces_deterministic_problem_output() {
2417 let dir = tempdir().unwrap();
2418 let manifest = test_manifest(
2419 chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
2420 false,
2421 true,
2422 );
2423 write_root_manifest(dir.path(), manifest);
2424 write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
2425 write_default_ddl_files(dir.path());
2426 write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
2427 for i in 0..50 {
2429 write_snapshot_file(
2430 dir.path(),
2431 &format!("data/public/1/orphan_{:02}.parquet", i),
2432 b"x",
2433 );
2434 }
2435
2436 let storage = file_storage_for_dir(dir.path());
2437 let messages = |report: &VerifyReport| {
2438 report
2439 .problems
2440 .iter()
2441 .map(|problem| problem.message.clone())
2442 .collect::<Vec<_>>()
2443 };
2444 let first = messages(&verify_snapshot(&storage).await.unwrap());
2445 let second = messages(&verify_snapshot(&storage).await.unwrap());
2446
2447 assert_eq!(first, second);
2449
2450 let orphans = first
2451 .iter()
2452 .filter(|message| message.contains("Unexpected data file"))
2453 .cloned()
2454 .collect::<Vec<_>>();
2455 assert_eq!(orphans.len(), 50);
2456 let mut sorted = orphans.clone();
2457 sorted.sort();
2458 assert_eq!(orphans, sorted);
2459 }
2460
2461 fn write_test_manifest(root: &std::path::Path, dir: &str, manifest: Manifest) {
2462 let snapshot_dir = root.join(dir);
2463 std::fs::create_dir_all(&snapshot_dir).unwrap();
2464 std::fs::write(
2465 snapshot_dir.join(MANIFEST_FILE),
2466 serde_json::to_vec_pretty(&manifest).unwrap(),
2467 )
2468 .unwrap();
2469 }
2470
2471 fn write_root_manifest(root: &std::path::Path, manifest: Manifest) {
2472 std::fs::write(
2473 root.join(MANIFEST_FILE),
2474 serde_json::to_vec_pretty(&manifest).unwrap(),
2475 )
2476 .unwrap();
2477 }
2478
/// Writes `content` to `relative_path` (a `/`-separated path) below `root`, creating any missing
/// parent directories. Panics on I/O failure.
fn write_snapshot_file(root: &std::path::Path, relative_path: &str, content: &[u8]) {
    // Push segment by segment so the platform's own separator is used.
    let mut target = root.to_path_buf();
    target.extend(relative_path.split('/'));

    let parent_dir = target.parent().unwrap();
    std::fs::create_dir_all(parent_dir).unwrap();
    std::fs::write(&target, content).unwrap();
}
2487
2488 fn write_default_ddl_files(root: &std::path::Path) {
2489 write_snapshot_file(root, "schema/ddl/public.sql", b"CREATE DATABASE public;");
2490 write_snapshot_file(
2491 root,
2492 "schema/ddl/analytics.sql",
2493 b"CREATE DATABASE analytics;",
2494 );
2495 }
2496
2497 fn file_storage_for_dir(root: &std::path::Path) -> OpenDalStorage {
2498 let uri = Url::from_directory_path(root).unwrap().to_string();
2499 OpenDalStorage::from_file_uri(&uri).unwrap()
2500 }
2501
2502 fn test_manifest(
2503 created_at: chrono::DateTime<chrono::Utc>,
2504 schema_only: bool,
2505 complete: bool,
2506 ) -> Manifest {
2507 let mut manifest = Manifest::new_for_export(
2508 "greptime".to_string(),
2509 vec!["public".to_string(), "analytics".to_string()],
2510 schema_only,
2511 TimeRange::unbounded(),
2512 DataFormat::Parquet,
2513 None,
2514 )
2515 .unwrap();
2516 manifest.created_at = created_at;
2517 manifest.updated_at = created_at;
2518
2519 if !schema_only {
2520 manifest.chunks.clear();
2521 let mut first = ChunkMeta::new(1, TimeRange::unbounded());
2522 first.mark_completed(vec!["data/public/1/file.parquet".to_string()], None);
2523 manifest.chunks.push(first);
2524
2525 if complete {
2526 manifest
2527 .chunks
2528 .push(ChunkMeta::skipped(2, TimeRange::unbounded()));
2529 } else {
2530 manifest
2531 .chunks
2532 .push(ChunkMeta::new(2, TimeRange::unbounded()));
2533 }
2534 }
2535
2536 manifest
2537 }
2538}