feat: support creating external tables (#1372)

* feat: support creating external tables

* chore: apply suggestions from CR

* test: add create external table without ts type

* chore: apply suggestions from CR

* fix: fix import typo

* refactor: move consts to table crate

* chore: apply suggestions from CR

* refactor: rename create_table_schema
This commit is contained in:
Weny Xu
2023-04-24 15:43:12 +09:00
committed by GitHub
parent 17daf4cdff
commit f2167663b2
26 changed files with 527 additions and 72 deletions

View File

@@ -12,6 +12,7 @@ catalog = { path = "../catalog" }
chrono.workspace = true
common-base = { path = "../common/base" }
common-catalog = { path = "../common/catalog" }
common-datasource = { path = "../common/datasource" }
common-error = { path = "../common/error" }
common-function = { path = "../common/function" }
common-query = { path = "../common/query" }
@@ -29,9 +30,11 @@ futures = "0.3"
futures-util.workspace = true
humantime = "2.1"
metrics.workspace = true
object-store = { path = "../object-store" }
once_cell = "1.10"
promql = { path = "../promql" }
promql-parser = "0.1.0"
regex = "1.6"
serde.workspace = true
serde_json = "1.0"
session = { path = "../session" }

View File

@@ -126,6 +126,54 @@ pub enum Error {
#[snafu(backtrace)]
source: sql::error::Error,
},
#[snafu(display("Failed to parse SQL, source: {}", source))]
ParseSql {
#[snafu(backtrace)]
source: sql::error::Error,
},
#[snafu(display("Missing required field: {}", name))]
MissingRequiredField { name: String, location: Location },
#[snafu(display("Failed to regex, source: {}", source))]
BuildRegex {
location: Location,
source: regex::Error,
},
#[snafu(display("Failed to build data source backend, source: {}", source))]
BuildBackend {
#[snafu(backtrace)]
source: common_datasource::error::Error,
},
#[snafu(display("Failed to list objects, source: {}", source))]
ListObjects {
#[snafu(backtrace)]
source: common_datasource::error::Error,
},
#[snafu(display("Unsupported file format: {}", format))]
UnsupportedFileFormat { format: String, location: Location },
#[snafu(display("Failed to parse file format: {}", source))]
ParseFileFormat {
#[snafu(backtrace)]
source: common_datasource::error::Error,
},
#[snafu(display("Failed to infer schema: {}", source))]
InferSchema {
#[snafu(backtrace)]
source: common_datasource::error::Error,
},
#[snafu(display("Failed to convert datafusion schema, source: {}", source))]
ConvertSchema {
#[snafu(backtrace)]
source: datatypes::error::Error,
},
}
impl ErrorExt for Error {
@@ -139,12 +187,22 @@ impl ErrorExt for Error {
| SchemaNotFound { .. }
| TableNotFound { .. }
| ParseTimestamp { .. }
| ParseFloat { .. } => StatusCode::InvalidArguments,
| ParseFloat { .. }
| MissingRequiredField { .. }
| BuildRegex { .. }
| UnsupportedFileFormat { .. }
| ConvertSchema { .. } => StatusCode::InvalidArguments,
BuildBackend { .. } | ListObjects { .. } => StatusCode::StorageUnavailable,
ParseFileFormat { source, .. } | InferSchema { source, .. } => source.status_code(),
QueryAccessDenied { .. } => StatusCode::AccessDenied,
Catalog { source } => source.status_code(),
VectorComputation { source } | ConvertDatafusionSchema { source } => {
source.status_code()
}
ParseSql { source } => source.status_code(),
CreateRecordBatch { source } => source.status_code(),
QueryExecution { source } | QueryPlan { source } => source.status_code(),
DataFusion { .. } | MissingTimestampColumn { .. } => StatusCode::Internal,

View File

@@ -14,20 +14,35 @@
mod show;
use std::collections::HashMap;
use std::sync::Arc;
use catalog::CatalogManagerRef;
use common_catalog::consts::DEFAULT_CATALOG_NAME;
use common_datasource::file_format::csv::CsvFormat;
use common_datasource::file_format::json::JsonFormat;
use common_datasource::file_format::parquet::ParquetFormat;
use common_datasource::file_format::{infer_schemas, FileFormat};
use common_datasource::lister::{Lister, Source};
use common_datasource::object_store::build_backend;
use common_datasource::util::find_dir_and_filename;
use common_query::Output;
use common_recordbatch::RecordBatches;
use datatypes::prelude::*;
use datatypes::schema::{ColumnSchema, Schema};
use datatypes::schema::{ColumnSchema, RawSchema, Schema};
use datatypes::vectors::{Helper, StringVector};
use object_store::ObjectStore;
use once_cell::sync::Lazy;
use regex::Regex;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use sql::ast::ColumnDef;
use sql::statements::column_def_to_schema;
use sql::statements::create::Partitions;
use sql::statements::show::{ShowDatabases, ShowKind, ShowTables};
use table::requests::{
IMMUTABLE_TABLE_FORMAT_KEY, IMMUTABLE_TABLE_LOCATION_KEY, IMMUTABLE_TABLE_PATTERN_KEY,
};
use table::TableRef;
use crate::error::{self, Result};
@@ -250,6 +265,105 @@ fn describe_column_semantic_types(
))
}
/// Resolves the files backing an immutable (external) file table together with its schema.
///
/// The file list is always obtained from the location described by `options`. The schema is
/// built from `columns` when any are given; otherwise it is inferred from the listed files
/// using the file format named in `options`.
///
/// `columns` is taken as a slice so callers can pass a `&Vec<ColumnDef>`, an array or any
/// other contiguous view without an extra allocation.
///
/// # Errors
///
/// Propagates any error raised while listing the files, converting a column definition
/// (wrapped as `ParseSql`), parsing the file format, or inferring the schema.
pub async fn prepare_immutable_file_table_files_and_schema(
    options: &HashMap<String, String>,
    columns: &[ColumnDef],
) -> Result<(Vec<String>, RawSchema)> {
    let (object_store, files) = prepare_immutable_file_table(options).await?;

    let schema = if columns.is_empty() {
        // No explicit column definitions: derive the schema from the files themselves.
        let format = parse_immutable_file_table_format(options)?;
        infer_immutable_file_table_schema(&object_store, &*format, &files).await?
    } else {
        // Explicit column definitions take precedence over anything found in the files.
        let column_schemas = columns
            .iter()
            .map(|column| column_def_to_schema(column, false).context(error::ParseSqlSnafu))
            .collect::<Result<Vec<_>>>()?;
        RawSchema::new(column_schemas)
    };

    Ok((files, schema))
}
// lists files in the frontend to reduce unnecessary scan requests repeated in each datanode.
async fn prepare_immutable_file_table(
options: &HashMap<String, String>,
) -> Result<(ObjectStore, Vec<String>)> {
let url =
options
.get(IMMUTABLE_TABLE_LOCATION_KEY)
.context(error::MissingRequiredFieldSnafu {
name: IMMUTABLE_TABLE_LOCATION_KEY,
})?;
let (dir, filename) = find_dir_and_filename(url);
let source = if let Some(filename) = filename {
Source::Filename(filename)
} else {
Source::Dir
};
let regex = options
.get(IMMUTABLE_TABLE_PATTERN_KEY)
.map(|x| Regex::new(x))
.transpose()
.context(error::BuildRegexSnafu)?;
let object_store = build_backend(url, options).context(error::BuildBackendSnafu)?;
let lister = Lister::new(object_store.clone(), source, dir, regex);
// If we scan files in a directory every time the database restarts,
// then it might lead to a potential undefined behavior:
// If a user adds a file with an incompatible schema to that directory,
// it will make the external table unavailable.
let files = lister
.list()
.await
.context(error::ListObjectsSnafu)?
.into_iter()
.filter_map(|entry| {
if entry.path().ends_with('/') {
None
} else {
Some(entry.path().to_string())
}
})
.collect::<Vec<_>>();
Ok((object_store, files))
}
fn parse_immutable_file_table_format(
options: &HashMap<String, String>,
) -> Result<Box<dyn FileFormat>> {
let format = options
.get(IMMUTABLE_TABLE_FORMAT_KEY)
.cloned()
.unwrap_or_default()
.to_uppercase();
match format.as_str() {
"CSV" => {
let file_format = CsvFormat::try_from(options).context(error::ParseFileFormatSnafu)?;
Ok(Box::new(file_format))
}
"JSON" => {
let file_format = JsonFormat::try_from(options).context(error::ParseFileFormatSnafu)?;
Ok(Box::new(file_format))
}
"PARQUET" => Ok(Box::new(ParquetFormat {})),
format => error::UnsupportedFileFormatSnafu { format }.fail(),
}
}
async fn infer_immutable_file_table_schema(
object_store: &ObjectStore,
file_format: &dyn FileFormat,
files: &[String],
) -> Result<RawSchema> {
let merged = infer_schemas(object_store, files, file_format)
.await
.context(error::InferSchemaSnafu)?;
Ok(RawSchema::from(
&Schema::try_from(merged).context(error::ConvertSchemaSnafu)?,
))
}
#[cfg(test)]
mod test {
use std::sync::Arc;