refactor: replace Copy Format with datasource Format (#1435)

* refactor: replace Copy Format with datasource Format

* chore: apply suggestions from CR

* chore: apply suggestions from CR
This commit is contained in:
Weny Xu
2023-04-23 17:31:54 +09:00
committed by GitHub
parent c5dba29f9e
commit d374859e24
10 changed files with 148 additions and 106 deletions

View File

@@ -22,19 +22,32 @@ use url::ParseError;
#[snafu(visibility(pub))]
pub enum Error {
#[snafu(display("Unsupported compression type: {}", compression_type))]
UnsupportedCompressionType { compression_type: String },
UnsupportedCompressionType {
compression_type: String,
location: Location,
},
#[snafu(display("Unsupported backend protocol: {}", protocol))]
UnsupportedBackendProtocol { protocol: String },
UnsupportedBackendProtocol {
protocol: String,
location: Location,
},
#[snafu(display("Unsupported format protocol: {}", format))]
UnsupportedFormat { format: String, location: Location },
#[snafu(display("empty host: {}", url))]
EmptyHostPath { url: String },
EmptyHostPath { url: String, location: Location },
#[snafu(display("Invalid path: {}", path))]
InvalidPath { path: String },
InvalidPath { path: String, location: Location },
#[snafu(display("Invalid url: {}, error :{}", url, source))]
InvalidUrl { url: String, source: ParseError },
InvalidUrl {
url: String,
source: ParseError,
location: Location,
},
#[snafu(display("Failed to decompression, source: {}", source))]
Decompression {
@@ -82,7 +95,7 @@ pub enum Error {
},
#[snafu(display("Invalid connection: {}", msg))]
InvalidConnection { msg: String },
InvalidConnection { msg: String, location: Location },
#[snafu(display("Failed to join handle: {}", source))]
JoinHandle {
@@ -102,6 +115,9 @@ pub enum Error {
source: arrow_schema::ArrowError,
location: Location,
},
#[snafu(display("Missing required field: {}", name))]
MissingRequiredField { name: String, location: Location },
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -116,6 +132,7 @@ impl ErrorExt for Error {
UnsupportedBackendProtocol { .. }
| UnsupportedCompressionType { .. }
| UnsupportedFormat { .. }
| InvalidConnection { .. }
| InvalidUrl { .. }
| EmptyHostPath { .. }
@@ -124,7 +141,8 @@ impl ErrorExt for Error {
| ReadParquetSnafu { .. }
| ParquetToSchema { .. }
| ParseFormat { .. }
| MergeSchema { .. } => StatusCode::InvalidArguments,
| MergeSchema { .. }
| MissingRequiredField { .. } => StatusCode::InvalidArguments,
Decompression { .. } | JoinHandle { .. } => StatusCode::Unexpected,
}
@@ -147,13 +165,15 @@ impl ErrorExt for Error {
JoinHandle { location, .. } => Some(*location),
ParseFormat { location, .. } => Some(*location),
MergeSchema { location, .. } => Some(*location),
MissingRequiredField { location, .. } => Some(*location),
UnsupportedBackendProtocol { .. }
| EmptyHostPath { .. }
| InvalidPath { .. }
| InvalidUrl { .. }
| InvalidConnection { .. }
| UnsupportedCompressionType { .. } => None,
UnsupportedBackendProtocol { location, .. } => Some(*location),
EmptyHostPath { location, .. } => Some(*location),
InvalidPath { location, .. } => Some(*location),
InvalidUrl { location, .. } => Some(*location),
InvalidConnection { location, .. } => Some(*location),
UnsupportedCompressionType { location, .. } => Some(*location),
UnsupportedFormat { location, .. } => Some(*location),
}
}
}

View File

@@ -20,6 +20,7 @@ pub mod tests;
pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
use std::collections::HashMap;
use std::result;
use std::sync::Arc;
use std::task::Poll;
@@ -33,13 +34,42 @@ use datafusion::physical_plan::file_format::FileOpenFuture;
use futures::StreamExt;
use object_store::ObjectStore;
use self::csv::CsvFormat;
use self::json::JsonFormat;
use self::parquet::ParquetFormat;
use crate::compression::CompressionType;
use crate::error::Result;
use crate::error::{self, Result};
/// Option key for the compression type.
/// NOTE(review): presumably consumed by the per-format option parsers — confirm.
pub const FORMAT_COMPRESSION_TYPE: &str = "COMPRESSION_TYPE";
/// Option key that presumably names the field delimiter — confirm against the CSV parser.
/// NOTE(review): "DELIMTERL" looks like a misspelling of "DELIMITER". Both the constant
/// name and the key string are externally visible, so a rename needs a coordinated change.
pub const FORMAT_DELIMTERL: &str = "DELIMTERL";
/// Option key that presumably bounds how many records are read for schema inference —
/// confirm against the format parsers.
pub const FORMAT_SCHEMA_INFER_MAX_RECORD: &str = "SCHEMA_INFER_MAX_RECORD";
/// Option key that presumably toggles header handling — confirm against the CSV parser.
/// NOTE(review): unlike the sibling keys, this key string keeps the `FORMAT_` prefix;
/// verify that this is intended.
pub const FORMAT_HAS_HEADER: &str = "FORMAT_HAS_HEADER";
/// Option key under which the file format name is looked up when building a `Format`.
pub const FORMAT_TYPE: &str = "FORMAT";
/// The file formats a data source can be read as.
///
/// Each variant wraps the value type of the corresponding format module
/// (`csv`, `json`, `parquet`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
/// CSV, carrying a `CsvFormat`.
Csv(CsvFormat),
/// JSON, carrying a `JsonFormat`.
Json(JsonFormat),
/// Parquet, carrying a `ParquetFormat`.
Parquet(ParquetFormat),
}
impl TryFrom<&HashMap<String, String>> for Format {
type Error = error::Error;
fn try_from(options: &HashMap<String, String>) -> Result<Self> {
let format = options
.get(FORMAT_TYPE)
.map(|format| format.to_ascii_uppercase())
.unwrap_or_else(|| "PARQUET".to_string());
match format.as_str() {
"CSV" => Ok(Self::Csv(CsvFormat::try_from(options)?)),
"JSON" => Ok(Self::Json(JsonFormat::try_from(options)?)),
"PARQUET" => Ok(Self::Parquet(ParquetFormat::default())),
_ => error::UnsupportedFormatSnafu { format: &format }.fail(),
}
}
}
#[async_trait]
pub trait FileFormat: Send + Sync + std::fmt::Debug {

View File

@@ -31,7 +31,7 @@ use snafu::ResultExt;
use crate::error::{self, Result};
use crate::file_format::FileFormat;
#[derive(Debug, Default)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct ParquetFormat {}
#[async_trait]
@@ -142,7 +142,6 @@ impl AsyncFileReader for LazyParquetFileReader {
#[cfg(test)]
mod tests {
use super::*;
use crate::file_format::FileFormat;
use crate::test_util::{self, format_schema, test_store};
fn test_data_root() -> String {

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::assert_matches::assert_matches;
use std::collections::HashMap;
use std::sync::Arc;
use std::vec;
@@ -26,10 +28,13 @@ use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::SessionContext;
use futures::StreamExt;
use super::FORMAT_TYPE;
use crate::compression::CompressionType;
use crate::error;
use crate::file_format::csv::{CsvConfigBuilder, CsvOpener};
use crate::file_format::json::JsonOpener;
use crate::file_format::parquet::DefaultParquetFileReaderFactory;
use crate::file_format::Format;
use crate::test_util::{self, test_basic_schema, test_store};
fn scan_config(file_schema: SchemaRef, limit: Option<usize>, filename: &str) -> FileScanConfig {
@@ -204,3 +209,37 @@ async fn test_parquet_exec() {
&result
);
}
/// Checks that `Format::try_from` picks the variant named by the `FORMAT_TYPE`
/// option regardless of letter case, rejects unknown names, and falls back to
/// Parquet when the option is absent.
#[test]
fn test_format() {
    // Builds an option map holding only the format-type entry.
    let options_for = |format: &str| -> HashMap<String, String> {
        HashMap::from([(FORMAT_TYPE.to_string(), format.to_string())])
    };

    // Lower-case name.
    assert_matches!(Format::try_from(&options_for("csv")).unwrap(), Format::Csv(_));

    // Mixed-case name.
    assert_matches!(Format::try_from(&options_for("Parquet")).unwrap(), Format::Parquet(_));

    // Upper-case name.
    assert_matches!(Format::try_from(&options_for("JSON")).unwrap(), Format::Json(_));

    // Unknown name is reported as an unsupported format.
    assert_matches!(
        Format::try_from(&options_for("Foobar")).unwrap_err(),
        error::Error::UnsupportedFormat { .. }
    );

    // No format entry at all defaults to Parquet.
    let no_options = HashMap::<String, String>::new();
    assert_matches!(Format::try_from(&no_options).unwrap(), Format::Parquet(_));
}

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(assert_matches)]
pub mod compression;
pub mod error;
pub mod file_format;