Skip to main content

cli/data/import_v2/
command.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Import V2 CLI command.
16
17use std::collections::HashSet;
18use std::time::Duration;
19
20use async_trait::async_trait;
21use clap::Parser;
22use common_error::ext::BoxedError;
23use common_telemetry::info;
24use snafu::ResultExt;
25
26use crate::Tool;
27use crate::common::ObjectStoreConfig;
28use crate::data::export_v2::manifest::MANIFEST_VERSION;
29use crate::data::import_v2::error::{
30    ManifestVersionMismatchSnafu, Result, SchemaNotInSnapshotSnafu, SnapshotStorageSnafu,
31};
32use crate::data::import_v2::executor::{DdlExecutor, DdlStatement};
33use crate::data::path::ddl_path_for_schema;
34use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
35use crate::database::{DatabaseClient, parse_proxy_opts};
36
// Command-line arguments of the import-v2 tool; `build` turns them into an `Import`.
//
// NOTE(review): clap's derive macro uses the `///` doc comments on this struct
// and its fields as `--help` text, so treat them as user-facing strings.
// Maintainer notes are therefore kept in plain `//` comments.
/// Import from a snapshot.
#[derive(Debug, Parser)]
pub struct ImportV2Command {
    // Passed unchanged to `DatabaseClient::new` in `build`.
    /// Server address to connect (e.g., 127.0.0.1:4000).
    #[clap(long)]
    addr: String,

    // Checked by `validate_uri` in `build` before the storage backend is created.
    /// Source snapshot location (e.g., s3://bucket/path, file:///tmp/backup).
    #[clap(long)]
    from: String,

    // Passed unchanged to `DatabaseClient::new` in `build`.
    /// Target catalog name.
    #[clap(long, default_value = "greptime")]
    catalog: String,

    // `build` maps an empty list to `None`, which `Import::run` treats as
    // "import every schema listed in the manifest".
    /// Schema list to import (default: all in snapshot).
    /// Can be specified multiple times or comma-separated.
    #[clap(long, value_delimiter = ',')]
    schemas: Vec<String>,

    // When set, `Import::run` prints the DDL and returns without executing it.
    /// Verify without importing (dry-run).
    #[clap(long)]
    dry_run: bool,

    // Copied into `Import::_parallelism`; nothing in this file reads it yet.
    /// Concurrency level (for future use).
    #[clap(long, default_value = "1")]
    parallelism: usize,

    // Passed unchanged to `DatabaseClient::new` in `build`.
    /// Basic authentication (user:password).
    #[clap(long)]
    auth_basic: Option<String>,

    // `build` falls back to 60 seconds when this flag is absent.
    /// Request timeout.
    #[clap(long, value_parser = humantime::parse_duration)]
    timeout: Option<Duration>,

    // Interpreted together with `no_proxy` by `parse_proxy_opts` in `build`.
    /// Proxy server address.
    ///
    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
    #[clap(long)]
    proxy: Option<String>,

    // Given both to `parse_proxy_opts` and to `DatabaseClient::new` in `build`.
    /// Disable all proxy usage (ignores `--proxy` and system proxy).
    ///
    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
    #[clap(long)]
    no_proxy: bool,

    // Flattened: the fields of `ObjectStoreConfig` become options of this command.
    /// Object store configuration for remote storage backends.
    #[clap(flatten)]
    storage: ObjectStoreConfig,
}
90
91impl ImportV2Command {
92    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
93        // Validate URI format
94        validate_uri(&self.from)
95            .context(SnapshotStorageSnafu)
96            .map_err(BoxedError::new)?;
97
98        // Parse schemas (empty vec means all schemas)
99        let schemas = if self.schemas.is_empty() {
100            None
101        } else {
102            Some(self.schemas.clone())
103        };
104
105        // Build storage
106        let storage = OpenDalStorage::from_uri(&self.from, &self.storage)
107            .context(SnapshotStorageSnafu)
108            .map_err(BoxedError::new)?;
109
110        // Build database client
111        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
112        let database_client = DatabaseClient::new(
113            self.addr.clone(),
114            self.catalog.clone(),
115            self.auth_basic.clone(),
116            self.timeout.unwrap_or(Duration::from_secs(60)),
117            proxy,
118            self.no_proxy,
119        );
120
121        Ok(Box::new(Import {
122            schemas,
123            dry_run: self.dry_run,
124            _parallelism: self.parallelism,
125            storage: Box::new(storage),
126            database_client,
127        }))
128    }
129}
130
/// Import tool implementation.
///
/// Constructed by `ImportV2Command::build`; the work is done in `Import::run`.
pub struct Import {
    /// Schema filter from the command line; `None` means every schema listed
    /// in the snapshot manifest is imported.
    schemas: Option<Vec<String>>,
    /// When `true`, the DDL is printed to stdout and nothing is executed.
    dry_run: bool,
    /// Requested concurrency level; stored but not read anywhere in this file.
    _parallelism: usize,
    /// Snapshot backend used to read the manifest and the per-schema DDL text.
    storage: Box<dyn SnapshotStorage>,
    /// Client handed to `DdlExecutor` when the DDL is executed.
    database_client: DatabaseClient,
}
139
140#[async_trait]
141impl Tool for Import {
142    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
143        self.run().await.map_err(BoxedError::new)
144    }
145}
146
147impl Import {
148    async fn run(&self) -> Result<()> {
149        // 1. Read manifest
150        let manifest = self
151            .storage
152            .read_manifest()
153            .await
154            .context(SnapshotStorageSnafu)?;
155
156        info!(
157            "Loading snapshot: {} (version: {}, schema_only: {})",
158            manifest.snapshot_id, manifest.version, manifest.schema_only
159        );
160
161        // Check version compatibility
162        if manifest.version != MANIFEST_VERSION {
163            return ManifestVersionMismatchSnafu {
164                expected: MANIFEST_VERSION,
165                found: manifest.version,
166            }
167            .fail();
168        }
169
170        info!("Snapshot contains {} schema(s)", manifest.schemas.len());
171
172        // 2. Determine schemas to import
173        let schemas_to_import = match &self.schemas {
174            Some(filter) => canonicalize_schema_filter(filter, &manifest.schemas)?,
175            None => manifest.schemas.clone(),
176        };
177
178        info!("Importing schemas: {:?}", schemas_to_import);
179
180        // 3. Read DDL statements
181        let ddl_statements = self.read_ddl_statements(&schemas_to_import).await?;
182
183        info!("Generated {} DDL statements", ddl_statements.len());
184
185        // 4. Dry-run mode: print DDL and exit
186        if self.dry_run {
187            info!("Dry-run mode - DDL statements to execute:");
188            println!();
189            for (i, stmt) in ddl_statements.iter().enumerate() {
190                println!("-- Statement {}", i + 1);
191                println!("{};", stmt.sql);
192                println!();
193            }
194            return Ok(());
195        }
196
197        // 5. Execute DDL
198        let executor = DdlExecutor::new(&self.database_client);
199        executor.execute_strict(&ddl_statements).await?;
200
201        info!(
202            "Import completed: {} DDL statements executed",
203            ddl_statements.len()
204        );
205
206        // 6. Data import would happen here for non-schema-only snapshots (M2/M3)
207        if !manifest.schema_only && !manifest.chunks.is_empty() {
208            info!(
209                "Data import not yet implemented (M3). {} chunks pending.",
210                manifest.chunks.len()
211            );
212        }
213
214        Ok(())
215    }
216
217    async fn read_ddl_statements(&self, schemas: &[String]) -> Result<Vec<DdlStatement>> {
218        let mut statements = Vec::new();
219        for schema in schemas {
220            let path = ddl_path_for_schema(schema);
221            let content = self
222                .storage
223                .read_text(&path)
224                .await
225                .context(SnapshotStorageSnafu)?;
226            statements.extend(
227                parse_ddl_statements(&content)
228                    .into_iter()
229                    .map(|sql| ddl_statement_for_schema(schema, sql)),
230            );
231        }
232
233        Ok(statements)
234    }
235}
236
/// Splits a DDL script into individual statements.
///
/// Statements are separated by `;` at the top level. Semicolons inside
/// single- or double-quoted text (with `''` / `""` as the escaped quote) and
/// inside comments do not split. `-- …` line comments and `/* … */` block
/// comments are removed from the output.
///
/// A removed comment leaves whitespace behind (a newline for a line comment,
/// a space for a block comment) so that the tokens on either side of it are
/// not fused together: `CREATE/* c */TABLE` yields `CREATE TABLE`.
///
/// Backslash escapes and nested block comments are not interpreted.
///
/// Returns the statements in source order, trimmed, without the terminating
/// `;`. Empty statements are skipped; trailing text without a `;` is returned
/// as a final statement.
fn parse_ddl_statements(content: &str) -> Vec<String> {
    /// Lexical context of the scanner; exactly one is active at a time.
    #[derive(Clone, Copy)]
    enum Scan {
        /// Plain SQL text.
        Code,
        /// Inside text delimited by the given quote character.
        Quoted(char),
        /// After `--`, until the end of the line.
        LineComment,
        /// After `/*`, until the closing `*/`.
        BlockComment,
    }

    /// Moves the trimmed buffer into `out` (unless it is empty) and resets it.
    fn flush(buffer: &mut String, out: &mut Vec<String>) {
        let statement = buffer.trim();
        if !statement.is_empty() {
            out.push(statement.to_string());
        }
        buffer.clear();
    }

    let mut statements = Vec::new();
    let mut current = String::new();
    let mut state = Scan::Code;
    let mut chars = content.chars().peekable();

    while let Some(ch) = chars.next() {
        match state {
            Scan::LineComment => {
                if ch == '\n' {
                    // Keep the line break so surrounding tokens stay separated.
                    current.push('\n');
                    state = Scan::Code;
                }
            }
            Scan::BlockComment => {
                if ch == '*' && chars.next_if_eq(&'/').is_some() {
                    // A comment counts as whitespace between tokens.
                    current.push(' ');
                    state = Scan::Code;
                }
            }
            Scan::Quoted(delimiter) => {
                current.push(ch);
                if ch == delimiter {
                    if chars.next_if_eq(&delimiter).is_some() {
                        // Doubled delimiter: an escaped quote, still inside.
                        current.push(delimiter);
                    } else {
                        state = Scan::Code;
                    }
                }
            }
            Scan::Code => match ch {
                '-' if chars.next_if_eq(&'-').is_some() => state = Scan::LineComment,
                '/' if chars.next_if_eq(&'*').is_some() => state = Scan::BlockComment,
                '\'' | '"' => {
                    current.push(ch);
                    state = Scan::Quoted(ch);
                }
                ';' => flush(&mut current, &mut statements),
                _ => current.push(ch),
            },
        }
    }

    // Trailing statement without a terminating semicolon.
    flush(&mut current, &mut statements);

    statements
}
322
323fn ddl_statement_for_schema(schema: &str, sql: String) -> DdlStatement {
324    if is_schema_scoped_statement(&sql) {
325        DdlStatement::with_execution_schema(sql, schema.to_string())
326    } else {
327        DdlStatement::new(sql)
328    }
329}
330
331fn is_schema_scoped_statement(sql: &str) -> bool {
332    let trimmed = sql.trim_start();
333    if !starts_with_keyword(trimmed, "CREATE") {
334        return false;
335    }
336
337    let Some(rest) = trimmed.get("CREATE".len()..) else {
338        return false;
339    };
340    let mut rest = rest.trim_start();
341    if starts_with_keyword(rest, "OR") {
342        let Some(next) = rest.get("OR".len()..) else {
343            return false;
344        };
345        rest = next.trim_start();
346        if !starts_with_keyword(rest, "REPLACE") {
347            return false;
348        }
349        let Some(next) = rest.get("REPLACE".len()..) else {
350            return false;
351        };
352        rest = next.trim_start();
353    }
354
355    if starts_with_keyword(rest, "EXTERNAL") {
356        let Some(next) = rest.get("EXTERNAL".len()..) else {
357            return false;
358        };
359        rest = next.trim_start();
360    }
361
362    starts_with_keyword(rest, "TABLE") || starts_with_keyword(rest, "VIEW")
363}
364
/// Reports whether `input` begins with `keyword` as a whole word.
///
/// The comparison ignores ASCII case. The keyword must be followed either by
/// the end of `input` or by a byte that is neither an ASCII letter or digit
/// nor `_`, so `CREATED` does not count as `CREATE`.
fn starts_with_keyword(input: &str, keyword: &str) -> bool {
    // `get` yields `None` when `input` is too short or the cut would fall
    // inside a multi-byte character.
    let Some(prefix) = input.get(..keyword.len()) else {
        return false;
    };
    if !prefix.eq_ignore_ascii_case(keyword) {
        return false;
    }

    match input.as_bytes().get(keyword.len()) {
        Some(following) => !(following.is_ascii_alphanumeric() || *following == b'_'),
        None => true,
    }
}
376
377fn canonicalize_schema_filter(
378    filter: &[String],
379    manifest_schemas: &[String],
380) -> Result<Vec<String>> {
381    let mut canonicalized = Vec::new();
382    let mut seen = HashSet::new();
383
384    for schema in filter {
385        let canonical = manifest_schemas
386            .iter()
387            .find(|candidate| candidate.eq_ignore_ascii_case(schema))
388            .cloned()
389            .ok_or_else(|| {
390                SchemaNotInSnapshotSnafu {
391                    schema: schema.clone(),
392                }
393                .build()
394            })?;
395
396        if seen.insert(canonical.to_ascii_lowercase()) {
397            canonicalized.push(canonical);
398        }
399    }
400
401    Ok(canonicalized)
402}
403
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns string literals into the owned list the filter helpers take.
    fn strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    // --- parse_ddl_statements -------------------------------------------

    #[test]
    fn test_parse_ddl_statements() {
        let parsed = parse_ddl_statements(
            r#"
-- Schema: public
CREATE DATABASE public;
CREATE TABLE t (ts TIMESTAMP TIME INDEX, host STRING, PRIMARY KEY (host)) ENGINE=mito;

-- comment
CREATE VIEW v AS SELECT * FROM t;
"#,
        );

        let expected_prefixes = ["CREATE DATABASE public", "CREATE TABLE t", "CREATE VIEW v"];
        assert_eq!(parsed.len(), expected_prefixes.len());
        for (statement, prefix) in parsed.iter().zip(expected_prefixes) {
            assert!(statement.starts_with(prefix));
        }
    }

    #[test]
    fn test_parse_ddl_statements_preserves_semicolons_in_string_literals() {
        let parsed = parse_ddl_statements(
            r#"
CREATE TABLE t (
    host STRING DEFAULT 'a;b'
);
CREATE VIEW v AS SELECT ';' AS marker;
"#,
        );

        assert_eq!(parsed.len(), 2);
        let (table, view) = (&parsed[0], &parsed[1]);
        assert!(table.contains("'a;b'"));
        assert!(view.contains("';' AS marker"));
    }

    #[test]
    fn test_parse_ddl_statements_handles_comments_without_splitting() {
        let parsed = parse_ddl_statements(
            r#"
-- leading comment
CREATE TABLE t (ts TIMESTAMP TIME INDEX); /* block; comment */
CREATE VIEW v AS SELECT 1;
"#,
        );

        assert_eq!(parsed.len(), 2);
        let (table, view) = (&parsed[0], &parsed[1]);
        assert!(table.starts_with("CREATE TABLE t"));
        assert!(view.starts_with("CREATE VIEW v"));
    }

    // --- canonicalize_schema_filter -------------------------------------

    #[test]
    fn test_canonicalize_schema_filter_uses_manifest_casing() {
        let requested = strings(&["TEST_DB", "PUBLIC"]);
        let in_manifest = strings(&["test_db", "public"]);

        let resolved = canonicalize_schema_filter(&requested, &in_manifest).unwrap();

        assert_eq!(resolved, vec!["test_db", "public"]);
    }

    #[test]
    fn test_canonicalize_schema_filter_dedupes_case_insensitive_matches() {
        let requested = strings(&["TEST_DB", "test_db", "PUBLIC", "public"]);
        let in_manifest = strings(&["test_db", "public"]);

        let resolved = canonicalize_schema_filter(&requested, &in_manifest).unwrap();

        assert_eq!(resolved, vec!["test_db", "public"]);
    }

    #[test]
    fn test_canonicalize_schema_filter_rejects_missing_schema() {
        let requested = strings(&["missing"]);
        let in_manifest = strings(&["test_db"]);

        let outcome = canonicalize_schema_filter(&requested, &in_manifest);
        let message = outcome
            .expect_err("missing schema should fail")
            .to_string();

        assert!(message.contains("missing"));
    }

    // --- ddl_statement_for_schema ---------------------------------------

    #[test]
    fn test_ddl_statement_for_schema_create_table_uses_execution_schema() {
        let sql = String::from("CREATE TABLE metrics (ts TIMESTAMP TIME INDEX) ENGINE=mito");
        let stmt = ddl_statement_for_schema("test_db", sql);
        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
    }

    #[test]
    fn test_ddl_statement_for_schema_create_view_uses_execution_schema() {
        let sql = String::from("CREATE VIEW metrics_view AS SELECT * FROM metrics");
        let stmt = ddl_statement_for_schema("test_db", sql);
        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
    }

    #[test]
    fn test_ddl_statement_for_schema_create_or_replace_view_uses_execution_schema() {
        let sql = String::from("CREATE OR REPLACE VIEW metrics_view AS SELECT * FROM metrics");
        let stmt = ddl_statement_for_schema("test_db", sql);
        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
    }

    #[test]
    fn test_ddl_statement_for_schema_create_external_table_uses_execution_schema() {
        let sql = String::from(
            "CREATE EXTERNAL TABLE IF NOT EXISTS ext_metrics (ts TIMESTAMP TIME INDEX) ENGINE=file",
        );
        let stmt = ddl_statement_for_schema("test_db", sql);
        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
    }

    #[test]
    fn test_ddl_statement_for_schema_create_database_uses_public_context() {
        let sql = String::from("CREATE DATABASE test_db");
        let stmt = ddl_statement_for_schema("test_db", sql);
        assert!(stmt.execution_schema.is_none());
    }

    // --- starts_with_keyword --------------------------------------------

    #[test]
    fn test_starts_with_keyword_requires_word_boundary() {
        assert!(starts_with_keyword("CREATE TABLE t", "CREATE"));

        let longer_words = [("CREATED TABLE t", "CREATE"), ("TABLESPACE foo", "TABLE")];
        for (input, keyword) in longer_words {
            assert!(!starts_with_keyword(input, keyword));
        }
    }
}