feat: copy database from (#3164)

* wip: impl COPY DATABASE FROM parser

* wip: impl copy database from

* wip: add some ut

* wip: add continue_on_error option

* test: add sqlness cases for copy database

* fix: trailing newline

* fix: typo

* fix: some cr comments

* chore: resolve confilicts

* fix: some cr comments
This commit is contained in:
Lei, HUANG
2024-01-22 14:33:54 +08:00
committed by GitHub
parent 966875ee11
commit 3834ea7422
12 changed files with 459 additions and 163 deletions

View File

@@ -54,3 +54,6 @@ substrait.workspace = true
table.workspace = true
tokio.workspace = true
tonic.workspace = true
[dev-dependencies]
common-test-util.workspace = true

View File

@@ -418,6 +418,9 @@ pub enum Error {
location: Location,
},
#[snafu(display("Invalid COPY DATABASE location, must end with '/': {}", value))]
InvalidCopyDatabasePath { value: String, location: Location },
#[snafu(display("Table metadata manager error"))]
TableMetadataManager {
source: common_meta::error::Error,
@@ -596,7 +599,9 @@ impl ErrorExt for Error {
| Error::BuildBackend { source, .. } => source.status_code(),
Error::ExecuteDdl { source, .. } => source.status_code(),
Error::InvalidCopyParameter { .. } => StatusCode::InvalidArguments,
Error::InvalidCopyParameter { .. } | Error::InvalidCopyDatabasePath { .. } => {
StatusCode::InvalidArguments
}
Error::ReadRecordBatch { source, .. } | Error::BuildColumnVectors { source, .. } => {
source.status_code()

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod backup;
mod copy_database;
mod copy_table_from;
mod copy_table_to;
mod ddl;
@@ -41,7 +41,7 @@ use query::plan::LogicalPlan;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
use sql::statements::copy::{CopyDatabaseArgument, CopyTable, CopyTableArgument};
use sql::statements::copy::{CopyDatabase, CopyDatabaseArgument, CopyTable, CopyTableArgument};
use sql::statements::statement::Statement;
use sql::statements::OptionMap;
use sql::util::format_raw_object_name;
@@ -55,7 +55,7 @@ use crate::error::{
PlanStatementSnafu, Result, TableNotFoundSnafu,
};
use crate::insert::InserterRef;
use crate::statement::backup::{COPY_DATABASE_TIME_END_KEY, COPY_DATABASE_TIME_START_KEY};
use crate::statement::copy_database::{COPY_DATABASE_TIME_END_KEY, COPY_DATABASE_TIME_START_KEY};
use crate::table::table_idents_to_full_name;
#[derive(Clone)]
@@ -131,9 +131,23 @@ impl StatementExecutor {
}
}
Statement::Copy(sql::statements::copy::Copy::CopyDatabase(arg)) => {
self.copy_database(to_copy_database_request(arg, &query_ctx)?)
.await
Statement::Copy(sql::statements::copy::Copy::CopyDatabase(copy_database)) => {
match copy_database {
CopyDatabase::To(arg) => {
self.copy_database_to(
to_copy_database_request(arg, &query_ctx)?,
query_ctx.clone(),
)
.await
}
CopyDatabase::From(arg) => {
self.copy_database_from(
to_copy_database_request(arg, &query_ctx)?,
query_ctx,
)
.await
}
}
}
Statement::CreateTable(stmt) => {

View File

@@ -1,89 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_datasource::file_format::Format;
use common_query::Output;
use common_telemetry::{info, tracing};
use session::context::QueryContextBuilder;
use snafu::{ensure, ResultExt};
use table::requests::{CopyDatabaseRequest, CopyDirection, CopyTableRequest};
use crate::error;
use crate::error::{CatalogSnafu, InvalidCopyParameterSnafu};
use crate::statement::StatementExecutor;
pub(crate) const COPY_DATABASE_TIME_START_KEY: &str = "start_time";
pub(crate) const COPY_DATABASE_TIME_END_KEY: &str = "end_time";
impl StatementExecutor {
#[tracing::instrument(skip_all)]
pub(crate) async fn copy_database(&self, req: CopyDatabaseRequest) -> error::Result<Output> {
// location must end with / so that every table is exported to a file.
ensure!(
req.location.ends_with('/'),
InvalidCopyParameterSnafu {
key: "location",
value: req.location,
}
);
info!(
"Copy database {}.{}, dir: {},. time: {:?}",
req.catalog_name, req.schema_name, req.location, req.time_range
);
let table_names = self
.catalog_manager
.table_names(&req.catalog_name, &req.schema_name)
.await
.context(CatalogSnafu)?;
let suffix = Format::try_from(&req.with)
.context(error::ParseFileFormatSnafu)?
.suffix();
let mut exported_rows = 0;
for table_name in table_names {
// TODO(hl): remove this hardcode once we've removed numbers table.
if table_name == "numbers" {
continue;
}
let mut table_file = req.location.clone();
table_file.push_str(&table_name);
table_file.push_str(suffix);
info!(
"Copy table: {}.{}.{} to {}",
req.catalog_name, req.schema_name, table_name, table_file
);
let exported = self
.copy_table_to(
CopyTableRequest {
catalog_name: req.catalog_name.clone(),
schema_name: req.schema_name.clone(),
table_name,
location: table_file,
with: req.with.clone(),
connection: req.connection.clone(),
pattern: None,
direction: CopyDirection::Export,
timestamp_range: req.time_range,
},
QueryContextBuilder::default().build(),
)
.await?;
exported_rows += exported;
}
Ok(Output::AffectedRows(exported_rows))
}
}

View File

@@ -0,0 +1,250 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::Path;
use std::str::FromStr;
use common_datasource::file_format::Format;
use common_datasource::lister::{Lister, Source};
use common_datasource::object_store::build_backend;
use common_query::Output;
use common_telemetry::{debug, error, info, tracing};
use object_store::Entry;
use regex::Regex;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use table::requests::{CopyDatabaseRequest, CopyDirection, CopyTableRequest};
use crate::error;
use crate::error::{CatalogSnafu, InvalidCopyDatabasePathSnafu};
use crate::statement::StatementExecutor;
pub(crate) const COPY_DATABASE_TIME_START_KEY: &str = "start_time";
pub(crate) const COPY_DATABASE_TIME_END_KEY: &str = "end_time";
pub(crate) const CONTINUE_ON_ERROR_KEY: &str = "continue_on_error";
impl StatementExecutor {
#[tracing::instrument(skip_all)]
pub(crate) async fn copy_database_to(
&self,
req: CopyDatabaseRequest,
ctx: QueryContextRef,
) -> error::Result<Output> {
// location must end with / so that every table is exported to a file.
ensure!(
req.location.ends_with('/'),
InvalidCopyDatabasePathSnafu {
value: req.location,
}
);
info!(
"Copy database {}.{} to dir: {}, time: {:?}",
req.catalog_name, req.schema_name, req.location, req.time_range
);
let table_names = self
.catalog_manager
.table_names(&req.catalog_name, &req.schema_name)
.await
.context(CatalogSnafu)?;
let suffix = Format::try_from(&req.with)
.context(error::ParseFileFormatSnafu)?
.suffix();
let mut exported_rows = 0;
for table_name in table_names {
// TODO(hl): also handles tables with metric engine.
// TODO(hl): remove this hardcode once we've removed numbers table.
if table_name == "numbers" {
continue;
}
let mut table_file = req.location.clone();
table_file.push_str(&table_name);
table_file.push_str(suffix);
info!(
"Copy table: {}.{}.{} to {}",
req.catalog_name, req.schema_name, table_name, table_file
);
let exported = self
.copy_table_to(
CopyTableRequest {
catalog_name: req.catalog_name.clone(),
schema_name: req.schema_name.clone(),
table_name,
location: table_file,
with: req.with.clone(),
connection: req.connection.clone(),
pattern: None,
direction: CopyDirection::Export,
timestamp_range: req.time_range,
},
ctx.clone(),
)
.await?;
exported_rows += exported;
}
Ok(Output::AffectedRows(exported_rows))
}
/// Imports data to database from a given location and returns total rows imported.
#[tracing::instrument(skip_all)]
pub(crate) async fn copy_database_from(
&self,
req: CopyDatabaseRequest,
ctx: QueryContextRef,
) -> error::Result<Output> {
// location must end with /
ensure!(
req.location.ends_with('/'),
InvalidCopyDatabasePathSnafu {
value: req.location,
}
);
info!(
"Copy database {}.{} from dir: {}, time: {:?}",
req.catalog_name, req.schema_name, req.location, req.time_range
);
let suffix = Format::try_from(&req.with)
.context(error::ParseFileFormatSnafu)?
.suffix();
let entries = list_files_to_copy(&req, suffix).await?;
let continue_on_error = req
.with
.get(CONTINUE_ON_ERROR_KEY)
.and_then(|v| bool::from_str(v).ok())
.unwrap_or(false);
let mut rows_inserted = 0;
for e in entries {
let table_name = match parse_file_name_to_copy(&e) {
Ok(table_name) => table_name,
Err(err) => {
if continue_on_error {
error!(err; "Failed to import table from file: {:?}", e);
continue;
} else {
return Err(err);
}
}
};
let req = CopyTableRequest {
catalog_name: req.catalog_name.clone(),
schema_name: req.schema_name.clone(),
table_name: table_name.clone(),
location: format!("{}/{}", req.location, e.path()),
with: req.with.clone(),
connection: req.connection.clone(),
pattern: None,
direction: CopyDirection::Import,
timestamp_range: None,
};
debug!("Copy table, arg: {:?}", req);
match self.copy_table_from(req, ctx.clone()).await {
Ok(rows) => {
rows_inserted += rows;
}
Err(err) => {
if continue_on_error {
error!(err; "Failed to import file to table: {}", table_name);
continue;
} else {
return Err(err);
}
}
}
}
Ok(Output::AffectedRows(rows_inserted))
}
}
/// Parses table names from files' names.
fn parse_file_name_to_copy(e: &Entry) -> error::Result<String> {
Path::new(e.name())
.file_stem()
.and_then(|os_str| os_str.to_str())
.map(|s| s.to_string())
.context(error::InvalidTableNameSnafu {
table_name: e.name().to_string(),
})
}
/// Lists all files with expected suffix that can be imported to database.
async fn list_files_to_copy(req: &CopyDatabaseRequest, suffix: &str) -> error::Result<Vec<Entry>> {
let object_store =
build_backend(&req.location, &req.connection).context(error::BuildBackendSnafu)?;
let pattern = Regex::try_from(format!(".*{}", suffix)).context(error::BuildRegexSnafu)?;
let lister = Lister::new(
object_store.clone(),
Source::Dir,
"/".to_string(),
Some(pattern),
);
lister.list().await.context(error::ListObjectsSnafu)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use object_store::services::Fs;
use object_store::util::normalize_dir;
use object_store::ObjectStore;
use table::requests::CopyDatabaseRequest;
use crate::statement::copy_database::{list_files_to_copy, parse_file_name_to_copy};
#[tokio::test]
async fn test_list_files_and_parse_table_name() {
let dir = common_test_util::temp_dir::create_temp_dir("test_list_files_to_copy");
let store_dir = normalize_dir(dir.path().to_str().unwrap());
let mut builder = Fs::default();
let _ = builder.root(&store_dir);
let object_store = ObjectStore::new(builder).unwrap().finish();
object_store.write("a.parquet", "").await.unwrap();
object_store.write("b.parquet", "").await.unwrap();
object_store.write("c.csv", "").await.unwrap();
object_store.write("d", "").await.unwrap();
object_store.write("e.f.parquet", "").await.unwrap();
let request = CopyDatabaseRequest {
catalog_name: "catalog_0".to_string(),
schema_name: "schema_0".to_string(),
location: store_dir,
with: [("FORMAT".to_string(), "parquet".to_string())]
.into_iter()
.collect(),
connection: Default::default(),
time_range: None,
};
let listed = list_files_to_copy(&request, ".parquet")
.await
.unwrap()
.into_iter()
.map(|e| parse_file_name_to_copy(&e).unwrap())
.collect::<HashSet<_>>();
assert_eq!(
["a".to_string(), "b".to_string(), "e.f".to_string()]
.into_iter()
.collect::<HashSet<_>>(),
listed
);
}
}

View File

@@ -111,15 +111,17 @@ impl StatementExecutor {
let table_provider = Arc::new(DfTableProviderAdapter::new(table));
let table_source = Arc::new(DefaultTableSource::new(table_provider));
let plan = LogicalPlanBuilder::scan_with_filters(
let mut builder = LogicalPlanBuilder::scan_with_filters(
df_table_ref.to_owned_reference(),
table_source,
None,
filters,
filters.clone(),
)
.context(BuildDfLogicalPlanSnafu)?
.build()
.context(BuildDfLogicalPlanSnafu)?;
for f in filters {
builder = builder.filter(f).context(BuildDfLogicalPlanSnafu)?;
}
let plan = builder.build().context(BuildDfLogicalPlanSnafu)?;
let output = self
.query_engine