mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-28 10:50:39 +00:00
feat: implement infer schema from single file (#1348)
* feat: implement infer schema from file * feat: implement compression type * refactor: remove unnecessary BufReader * refactor: remove SyncIoBridge and using tokio_util::io::SyncIoBridge instead * chore: apply suggestions from CR
This commit is contained in:
8
Cargo.lock
generated
8
Cargo.lock
generated
@@ -1561,11 +1561,19 @@ dependencies = [
|
||||
name = "common-datasource"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-schema",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"common-error",
|
||||
"common-runtime",
|
||||
"datafusion",
|
||||
"futures",
|
||||
"object-store",
|
||||
"regex",
|
||||
"snafu",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"url",
|
||||
]
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ snafu = { version = "0.7", features = ["backtraces"] }
|
||||
sqlparser = "0.32"
|
||||
tempfile = "3"
|
||||
tokio = { version = "1.24.2", features = ["full"] }
|
||||
tokio-util = "0.7"
|
||||
tokio-util = { version = "0.7", features = ["io-util"] }
|
||||
tonic = { version = "0.8", features = ["tls"] }
|
||||
uuid = { version = "1", features = ["serde", "v4", "fast-rng"] }
|
||||
|
||||
|
||||
@@ -5,9 +5,24 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
arrow.workspace = true
|
||||
arrow-schema.workspace = true
|
||||
async-compression = { version = "0.3", features = [
|
||||
"bzip2",
|
||||
"gzip",
|
||||
"xz",
|
||||
"zstd",
|
||||
"futures-io",
|
||||
"tokio",
|
||||
] }
|
||||
async-trait.workspace = true
|
||||
common-error = { path = "../error" }
|
||||
common-runtime = { path = "../runtime" }
|
||||
datafusion.workspace = true
|
||||
futures.workspace = true
|
||||
object-store = { path = "../../object-store" }
|
||||
regex = "1.7"
|
||||
snafu.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
url = "2.3"
|
||||
|
||||
84
src/common/datasource/src/compression.rs
Normal file
84
src/common/datasource/src/compression.rs
Normal file
@@ -0,0 +1,84 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt::Display;
|
||||
use std::str::FromStr;
|
||||
|
||||
use async_compression::tokio::bufread::{BzDecoder, GzipDecoder, XzDecoder, ZstdDecoder};
|
||||
use tokio::io::{AsyncRead, BufReader};
|
||||
|
||||
use crate::error::{self, Error, Result};
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum CompressionType {
|
||||
/// Gzip-ed file
|
||||
GZIP,
|
||||
/// Bzip2-ed file
|
||||
BZIP2,
|
||||
/// Xz-ed file (liblzma)
|
||||
XZ,
|
||||
/// Zstd-ed file,
|
||||
ZSTD,
|
||||
/// Uncompressed file
|
||||
UNCOMPRESSED,
|
||||
}
|
||||
|
||||
impl FromStr for CompressionType {
|
||||
type Err = Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self> {
|
||||
let s = s.to_uppercase();
|
||||
match s.as_str() {
|
||||
"GZIP" | "GZ" => Ok(Self::GZIP),
|
||||
"BZIP2" | "BZ2" => Ok(Self::BZIP2),
|
||||
"XZ" => Ok(Self::XZ),
|
||||
"ZST" | "ZSTD" => Ok(Self::ZSTD),
|
||||
"" => Ok(Self::UNCOMPRESSED),
|
||||
_ => error::UnsupportedCompressionTypeSnafu {
|
||||
compression_type: s,
|
||||
}
|
||||
.fail(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for CompressionType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(match self {
|
||||
Self::GZIP => "GZIP",
|
||||
Self::BZIP2 => "BZIP2",
|
||||
Self::XZ => "XZ",
|
||||
Self::ZSTD => "ZSTD",
|
||||
Self::UNCOMPRESSED => "",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl CompressionType {
|
||||
pub const fn is_compressed(&self) -> bool {
|
||||
!matches!(self, &Self::UNCOMPRESSED)
|
||||
}
|
||||
|
||||
pub fn convert_async_read<T: AsyncRead + Unpin + Send + 'static>(
|
||||
&self,
|
||||
s: T,
|
||||
) -> Box<dyn AsyncRead + Unpin + Send> {
|
||||
match self {
|
||||
CompressionType::GZIP => Box::new(GzipDecoder::new(BufReader::new(s))),
|
||||
CompressionType::BZIP2 => Box::new(BzDecoder::new(BufReader::new(s))),
|
||||
CompressionType::XZ => Box::new(XzDecoder::new(BufReader::new(s))),
|
||||
CompressionType::ZSTD => Box::new(ZstdDecoder::new(BufReader::new(s))),
|
||||
CompressionType::UNCOMPRESSED => Box::new(s),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -21,6 +21,9 @@ use url::ParseError;
|
||||
#[derive(Debug, Snafu)]
|
||||
#[snafu(visibility(pub))]
|
||||
pub enum Error {
|
||||
#[snafu(display("Unsupported compression type: {}", compression_type))]
|
||||
UnsupportedCompressionType { compression_type: String },
|
||||
|
||||
#[snafu(display("Unsupported backend protocol: {}", protocol))]
|
||||
UnsupportedBackendProtocol { protocol: String },
|
||||
|
||||
@@ -33,12 +36,44 @@ pub enum Error {
|
||||
#[snafu(display("Invalid url: {}, error :{}", url, source))]
|
||||
InvalidUrl { url: String, source: ParseError },
|
||||
|
||||
#[snafu(display("Failed to decompression, source: {}", source))]
|
||||
Decompression {
|
||||
source: object_store::Error,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to build backend, source: {}", source))]
|
||||
BuildBackend {
|
||||
source: object_store::Error,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to read object from path: {}, source: {}", path, source))]
|
||||
ReadObject {
|
||||
path: String,
|
||||
location: Location,
|
||||
source: object_store::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to read parquet source: {}", source))]
|
||||
ReadParquetSnafu {
|
||||
location: Location,
|
||||
source: datafusion::parquet::errors::ParquetError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to convert parquet to schema: {}", source))]
|
||||
ParquetToSchema {
|
||||
location: Location,
|
||||
source: datafusion::parquet::errors::ParquetError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to infer schema from file: {}, source: {}", path, source))]
|
||||
InferSchema {
|
||||
path: String,
|
||||
location: Location,
|
||||
source: arrow_schema::ArrowError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to list object in path: {}, source: {}", path, source))]
|
||||
ListObjects {
|
||||
path: String,
|
||||
@@ -48,6 +83,12 @@ pub enum Error {
|
||||
|
||||
#[snafu(display("Invalid connection: {}", msg))]
|
||||
InvalidConnection { msg: String },
|
||||
|
||||
#[snafu(display("Failed to join handle: {}", source))]
|
||||
JoinHandle {
|
||||
location: Location,
|
||||
source: tokio::task::JoinError,
|
||||
},
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
@@ -56,13 +97,21 @@ impl ErrorExt for Error {
|
||||
fn status_code(&self) -> StatusCode {
|
||||
use Error::*;
|
||||
match self {
|
||||
BuildBackend { .. } | ListObjects { .. } => StatusCode::StorageUnavailable,
|
||||
BuildBackend { .. } | ListObjects { .. } | ReadObject { .. } => {
|
||||
StatusCode::StorageUnavailable
|
||||
}
|
||||
|
||||
UnsupportedBackendProtocol { .. }
|
||||
| UnsupportedCompressionType { .. }
|
||||
| InvalidConnection { .. }
|
||||
| InvalidUrl { .. }
|
||||
| EmptyHostPath { .. }
|
||||
| InvalidPath { .. } => StatusCode::InvalidArguments,
|
||||
| InvalidPath { .. }
|
||||
| InferSchema { .. }
|
||||
| ReadParquetSnafu { .. }
|
||||
| ParquetToSchema { .. } => StatusCode::InvalidArguments,
|
||||
|
||||
Decompression { .. } | JoinHandle { .. } => StatusCode::Unexpected,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,14 +120,23 @@ impl ErrorExt for Error {
|
||||
}
|
||||
|
||||
fn location_opt(&self) -> Option<common_error::snafu::Location> {
|
||||
use Error::*;
|
||||
match self {
|
||||
Error::BuildBackend { location, .. } => Some(*location),
|
||||
Error::ListObjects { location, .. } => Some(*location),
|
||||
Error::UnsupportedBackendProtocol { .. }
|
||||
| Error::EmptyHostPath { .. }
|
||||
| Error::InvalidPath { .. }
|
||||
| Error::InvalidUrl { .. }
|
||||
| Error::InvalidConnection { .. } => None,
|
||||
BuildBackend { location, .. } => Some(*location),
|
||||
ReadObject { location, .. } => Some(*location),
|
||||
ListObjects { location, .. } => Some(*location),
|
||||
InferSchema { location, .. } => Some(*location),
|
||||
ReadParquetSnafu { location, .. } => Some(*location),
|
||||
ParquetToSchema { location, .. } => Some(*location),
|
||||
Decompression { location, .. } => Some(*location),
|
||||
JoinHandle { location, .. } => Some(*location),
|
||||
|
||||
UnsupportedBackendProtocol { .. }
|
||||
| EmptyHostPath { .. }
|
||||
| InvalidPath { .. }
|
||||
| InvalidUrl { .. }
|
||||
| InvalidConnection { .. }
|
||||
| UnsupportedCompressionType { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
30
src/common/datasource/src/file_format.rs
Normal file
30
src/common/datasource/src/file_format.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod csv;
|
||||
pub mod json;
|
||||
pub mod parquet;
|
||||
|
||||
pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
|
||||
|
||||
use arrow::datatypes::SchemaRef;
|
||||
use async_trait::async_trait;
|
||||
use object_store::ObjectStore;
|
||||
|
||||
use crate::error::Result;
|
||||
|
||||
#[async_trait]
|
||||
pub trait FileFormat: Send + Sync + std::fmt::Debug {
|
||||
async fn infer_schema(&self, store: &ObjectStore, path: String) -> Result<SchemaRef>;
|
||||
}
|
||||
158
src/common/datasource/src/file_format/csv.rs
Normal file
158
src/common/datasource/src/file_format/csv.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::csv::reader::infer_reader_schema as infer_csv_schema;
|
||||
use arrow_schema::SchemaRef;
|
||||
use async_trait::async_trait;
|
||||
use common_runtime;
|
||||
use object_store::ObjectStore;
|
||||
use snafu::ResultExt;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
|
||||
use crate::compression::CompressionType;
|
||||
use crate::error::{self, Result};
|
||||
use crate::file_format::{self, FileFormat};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CsvFormat {
|
||||
pub has_header: bool,
|
||||
pub delimiter: u8,
|
||||
pub schema_infer_max_record: Option<usize>,
|
||||
pub compression_type: CompressionType,
|
||||
}
|
||||
|
||||
impl Default for CsvFormat {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
has_header: true,
|
||||
delimiter: b',',
|
||||
schema_infer_max_record: Some(file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD),
|
||||
compression_type: CompressionType::UNCOMPRESSED,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileFormat for CsvFormat {
|
||||
async fn infer_schema(&self, store: &ObjectStore, path: String) -> Result<SchemaRef> {
|
||||
let reader = store
|
||||
.reader(&path)
|
||||
.await
|
||||
.context(error::ReadObjectSnafu { path: &path })?;
|
||||
|
||||
let decoded = self.compression_type.convert_async_read(reader);
|
||||
|
||||
let delimiter = self.delimiter;
|
||||
let schema_infer_max_record = self.schema_infer_max_record;
|
||||
let has_header = self.has_header;
|
||||
|
||||
common_runtime::spawn_blocking_read(move || {
|
||||
let reader = SyncIoBridge::new(decoded);
|
||||
|
||||
let (schema, _records_read) =
|
||||
infer_csv_schema(reader, delimiter, schema_infer_max_record, has_header)
|
||||
.context(error::InferSchemaSnafu { path: &path })?;
|
||||
|
||||
Ok(Arc::new(schema))
|
||||
})
|
||||
.await
|
||||
.context(error::JoinHandleSnafu)?
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::file_format::FileFormat;
|
||||
use crate::test_util::{self, format_schema, test_store};
|
||||
|
||||
fn test_data_root() -> String {
|
||||
test_util::get_data_dir("tests/csv").display().to_string()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn infer_schema_basic() {
|
||||
let csv = CsvFormat::default();
|
||||
let store = test_store(&test_data_root());
|
||||
let schema = csv
|
||||
.infer_schema(&store, "simple.csv".to_string())
|
||||
.await
|
||||
.unwrap();
|
||||
let formatted: Vec<_> = format_schema(schema);
|
||||
|
||||
assert_eq!(
|
||||
vec![
|
||||
"c1: Utf8: NULL",
|
||||
"c2: Int64: NULL",
|
||||
"c3: Int64: NULL",
|
||||
"c4: Int64: NULL",
|
||||
"c5: Int64: NULL",
|
||||
"c6: Int64: NULL",
|
||||
"c7: Int64: NULL",
|
||||
"c8: Int64: NULL",
|
||||
"c9: Int64: NULL",
|
||||
"c10: Int64: NULL",
|
||||
"c11: Float64: NULL",
|
||||
"c12: Float64: NULL",
|
||||
"c13: Utf8: NULL"
|
||||
],
|
||||
formatted,
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn infer_schema_with_limit() {
|
||||
let json = CsvFormat {
|
||||
schema_infer_max_record: Some(3),
|
||||
..CsvFormat::default()
|
||||
};
|
||||
let store = test_store(&test_data_root());
|
||||
let schema = json
|
||||
.infer_schema(&store, "schema_infer_limit.csv".to_string())
|
||||
.await
|
||||
.unwrap();
|
||||
let formatted: Vec<_> = format_schema(schema);
|
||||
|
||||
assert_eq!(
|
||||
vec![
|
||||
"a: Int64: NULL",
|
||||
"b: Float64: NULL",
|
||||
"c: Int64: NULL",
|
||||
"d: Int64: NULL"
|
||||
],
|
||||
formatted
|
||||
);
|
||||
|
||||
let json = CsvFormat::default();
|
||||
let store = test_store(&test_data_root());
|
||||
let schema = json
|
||||
.infer_schema(&store, "schema_infer_limit.csv".to_string())
|
||||
.await
|
||||
.unwrap();
|
||||
let formatted: Vec<_> = format_schema(schema);
|
||||
|
||||
assert_eq!(
|
||||
vec![
|
||||
"a: Int64: NULL",
|
||||
"b: Float64: NULL",
|
||||
"c: Int64: NULL",
|
||||
"d: Utf8: NULL"
|
||||
],
|
||||
formatted
|
||||
);
|
||||
}
|
||||
}
|
||||
121
src/common/datasource/src/file_format/json.rs
Normal file
121
src/common/datasource/src/file_format/json.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::io::BufReader;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::datatypes::SchemaRef;
|
||||
use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter};
|
||||
use async_trait::async_trait;
|
||||
use common_runtime;
|
||||
use object_store::ObjectStore;
|
||||
use snafu::ResultExt;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
|
||||
use crate::compression::CompressionType;
|
||||
use crate::error::{self, Result};
|
||||
use crate::file_format::{self, FileFormat};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct JsonFormat {
|
||||
pub schema_infer_max_record: Option<usize>,
|
||||
pub compression_type: CompressionType,
|
||||
}
|
||||
|
||||
impl Default for JsonFormat {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
schema_infer_max_record: Some(file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD),
|
||||
compression_type: CompressionType::UNCOMPRESSED,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileFormat for JsonFormat {
|
||||
async fn infer_schema(&self, store: &ObjectStore, path: String) -> Result<SchemaRef> {
|
||||
let reader = store
|
||||
.reader(&path)
|
||||
.await
|
||||
.context(error::ReadObjectSnafu { path: &path })?;
|
||||
|
||||
let decoded = self.compression_type.convert_async_read(reader);
|
||||
|
||||
let schema_infer_max_record = self.schema_infer_max_record;
|
||||
|
||||
common_runtime::spawn_blocking_read(move || {
|
||||
let mut reader = BufReader::new(SyncIoBridge::new(decoded));
|
||||
|
||||
let iter = ValueIter::new(&mut reader, schema_infer_max_record);
|
||||
|
||||
let schema = infer_json_schema_from_iterator(iter)
|
||||
.context(error::InferSchemaSnafu { path: &path })?;
|
||||
|
||||
Ok(Arc::new(schema))
|
||||
})
|
||||
.await
|
||||
.context(error::JoinHandleSnafu)?
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::file_format::FileFormat;
|
||||
use crate::test_util::{self, format_schema, test_store};
|
||||
|
||||
fn test_data_root() -> String {
|
||||
test_util::get_data_dir("tests/json").display().to_string()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn infer_schema_basic() {
|
||||
let json = JsonFormat::default();
|
||||
let store = test_store(&test_data_root());
|
||||
let schema = json
|
||||
.infer_schema(&store, "simple.json".to_string())
|
||||
.await
|
||||
.unwrap();
|
||||
let formatted: Vec<_> = format_schema(schema);
|
||||
|
||||
assert_eq!(
|
||||
vec![
|
||||
"a: Int64: NULL",
|
||||
"b: Float64: NULL",
|
||||
"c: Boolean: NULL",
|
||||
"d: Utf8: NULL",
|
||||
],
|
||||
formatted
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn infer_schema_with_limit() {
|
||||
let json = JsonFormat {
|
||||
schema_infer_max_record: Some(3),
|
||||
..JsonFormat::default()
|
||||
};
|
||||
let store = test_store(&test_data_root());
|
||||
let schema = json
|
||||
.infer_schema(&store, "schema_infer_limit.json".to_string())
|
||||
.await
|
||||
.unwrap();
|
||||
let formatted: Vec<_> = format_schema(schema);
|
||||
|
||||
assert_eq!(
|
||||
vec!["a: Int64: NULL", "b: Float64: NULL", "c: Boolean: NULL"],
|
||||
formatted
|
||||
);
|
||||
}
|
||||
}
|
||||
78
src/common/datasource/src/file_format/parquet.rs
Normal file
78
src/common/datasource/src/file_format/parquet.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::SchemaRef;
|
||||
use async_trait::async_trait;
|
||||
use datafusion::parquet::arrow::async_reader::AsyncFileReader;
|
||||
use datafusion::parquet::arrow::parquet_to_arrow_schema;
|
||||
use object_store::ObjectStore;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::file_format::FileFormat;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct ParquetFormat {}
|
||||
|
||||
#[async_trait]
|
||||
impl FileFormat for ParquetFormat {
|
||||
async fn infer_schema(&self, store: &ObjectStore, path: String) -> Result<SchemaRef> {
|
||||
let mut reader = store
|
||||
.reader(&path)
|
||||
.await
|
||||
.context(error::ReadObjectSnafu { path: &path })?;
|
||||
|
||||
let metadata = reader
|
||||
.get_metadata()
|
||||
.await
|
||||
.context(error::ReadParquetSnafuSnafu)?;
|
||||
|
||||
let file_metadata = metadata.file_metadata();
|
||||
let schema = parquet_to_arrow_schema(
|
||||
file_metadata.schema_descr(),
|
||||
file_metadata.key_value_metadata(),
|
||||
)
|
||||
.context(error::ParquetToSchemaSnafu)?;
|
||||
|
||||
Ok(Arc::new(schema))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::file_format::FileFormat;
|
||||
use crate::test_util::{self, format_schema, test_store};
|
||||
|
||||
fn test_data_root() -> String {
|
||||
test_util::get_data_dir("tests/parquet")
|
||||
.display()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn infer_schema_basic() {
|
||||
let json = ParquetFormat::default();
|
||||
let store = test_store(&test_data_root());
|
||||
let schema = json
|
||||
.infer_schema(&store, "basic.parquet".to_string())
|
||||
.await
|
||||
.unwrap();
|
||||
let formatted: Vec<_> = format_schema(schema);
|
||||
|
||||
assert_eq!(vec!["num: Int64: NULL", "str: Utf8: NULL"], formatted);
|
||||
}
|
||||
}
|
||||
@@ -12,7 +12,10 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod compression;
|
||||
pub mod error;
|
||||
pub mod file_format;
|
||||
pub mod lister;
|
||||
pub mod object_store;
|
||||
pub mod test_util;
|
||||
pub mod util;
|
||||
|
||||
48
src/common/datasource/src/test_util.rs
Normal file
48
src/common/datasource/src/test_util.rs
Normal file
@@ -0,0 +1,48 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use arrow_schema::SchemaRef;
|
||||
use object_store::services::Fs;
|
||||
use object_store::ObjectStore;
|
||||
|
||||
pub fn get_data_dir(path: &str) -> PathBuf {
|
||||
// https://doc.rust-lang.org/cargo/reference/environment-variables.html
|
||||
let dir = env!("CARGO_MANIFEST_DIR");
|
||||
|
||||
PathBuf::from(dir).join(path)
|
||||
}
|
||||
|
||||
pub fn format_schema(schema: SchemaRef) -> Vec<String> {
|
||||
schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|f| {
|
||||
format!(
|
||||
"{}: {:?}: {}",
|
||||
f.name(),
|
||||
f.data_type(),
|
||||
if f.is_nullable() { "NULL" } else { "NOT NULL" }
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn test_store(root: &str) -> ObjectStore {
|
||||
let mut builder = Fs::default();
|
||||
builder.root(root);
|
||||
|
||||
ObjectStore::new(builder).unwrap().finish()
|
||||
}
|
||||
24
src/common/datasource/tests/README.md
Normal file
24
src/common/datasource/tests/README.md
Normal file
@@ -0,0 +1,24 @@
|
||||
### Parquet
|
||||
The `parquet/basic.parquet` was converted from `csv/basic.csv` via [bdt](https://github.com/andygrove/bdt).
|
||||
|
||||
Internal of `parquet/basic.parquet`:
|
||||
|
||||
Data:
|
||||
```
|
||||
+-----+-------+
|
||||
| num | str |
|
||||
+-----+-------+
|
||||
| 5 | test |
|
||||
| 2 | hello |
|
||||
| 4 | foo |
|
||||
+-----+-------+
|
||||
```
|
||||
Schema:
|
||||
```
|
||||
+-------------+-----------+-------------+
|
||||
| column_name | data_type | is_nullable |
|
||||
+-------------+-----------+-------------+
|
||||
| num | Int64 | YES |
|
||||
| str | Utf8 | YES |
|
||||
+-------------+-----------+-------------+
|
||||
```
|
||||
4
src/common/datasource/tests/csv/basic.csv
Normal file
4
src/common/datasource/tests/csv/basic.csv
Normal file
@@ -0,0 +1,4 @@
|
||||
num,str
|
||||
5,test
|
||||
2,hello
|
||||
4,foo
|
||||
|
5
src/common/datasource/tests/csv/schema_infer_limit.csv
Normal file
5
src/common/datasource/tests/csv/schema_infer_limit.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
a,b,c,d
|
||||
1,2,3,4
|
||||
1,2,3,4
|
||||
1,2.0,3,4
|
||||
1,2,4,test
|
||||
|
11
src/common/datasource/tests/csv/simple.csv
Normal file
11
src/common/datasource/tests/csv/simple.csv
Normal file
@@ -0,0 +1,11 @@
|
||||
c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13
|
||||
c,2,1,18109,2033001162,-6513304855495910254,25,43062,1491205016,5863949479783605708,0.110830784,0.9294097332465232,6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW
|
||||
d,5,-40,22614,706441268,-7542719935673075327,155,14337,3373581039,11720144131976083864,0.69632107,0.3114712539863804,C2GT5KVyOPZpgKVl110TyZO0NcJ434
|
||||
b,1,29,-18218,994303988,5983957848665088916,204,9489,3275293996,14857091259186476033,0.53840446,0.17909035118828576,AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz
|
||||
a,1,-85,-15154,1171968280,1919439543497968449,77,52286,774637006,12101411955859039553,0.12285209,0.6864391962767343,0keZ5G8BffGwgF2RwQD59TFzMStxCB
|
||||
b,5,-82,22080,1824882165,7373730676428214987,208,34331,3342719438,3330177516592499461,0.82634634,0.40975383525297016,Ig1QcuKsjHXkproePdERo2w0mYzIqd
|
||||
b,4,-111,-1967,-4229382,1892872227362838079,67,9832,1243785310,8382489916947120498,0.06563997,0.152498292971736,Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH
|
||||
e,3,104,-25136,1738331255,300633854973581194,139,20807,3577318119,13079037564113702254,0.40154034,0.7764360990307122,DuJNG8tufSqW0ZstHqWj3aGvFLMg4A
|
||||
a,3,13,12613,1299719633,2020498574254265315,191,17835,3998790955,14881411008939145569,0.041445434,0.8813167497816289,Amn2K87Db5Es3dFQO9cw9cvpAM6h35
|
||||
d,1,38,18384,-335410409,-1632237090406591229,26,57510,2712615025,1842662804748246269,0.6064476,0.6404495093354053,4HX6feIvmNXBN7XGqgO4YVBkhu8GDI
|
||||
a,4,-38,20744,762932956,308913475857409919,7,45465,1787652631,878137512938218976,0.7459874,0.02182578039211991,ydkwycaISlYSlEq3TlkS2m15I2pcp8
|
||||
|
4
src/common/datasource/tests/json/schema_infer_limit.json
Normal file
4
src/common/datasource/tests/json/schema_infer_limit.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{"a":1}
|
||||
{"a":-10, "b":-3.5}
|
||||
{"a":2, "b":0.6, "c":false}
|
||||
{"a":1, "b":2.0, "c":false, "d":"4"}
|
||||
12
src/common/datasource/tests/json/simple.json
Normal file
12
src/common/datasource/tests/json/simple.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{"a":1, "b":2.0, "c":false, "d":"4"}
|
||||
{"a":-10, "b":-3.5, "c":true, "d":"4"}
|
||||
{"a":2, "b":0.6, "c":false, "d":"text"}
|
||||
{"a":1, "b":2.0, "c":false, "d":"4"}
|
||||
{"a":7, "b":-3.5, "c":true, "d":"4"}
|
||||
{"a":1, "b":0.6, "c":false, "d":"text"}
|
||||
{"a":1, "b":2.0, "c":false, "d":"4"}
|
||||
{"a":5, "b":-3.5, "c":true, "d":"4"}
|
||||
{"a":1, "b":0.6, "c":false, "d":"text"}
|
||||
{"a":1, "b":2.0, "c":false, "d":"4"}
|
||||
{"a":1, "b":-3.5, "c":true, "d":"4"}
|
||||
{"a":100000000000000, "b":0.6, "c":false, "d":"text"}
|
||||
BIN
src/common/datasource/tests/parquet/basic.parquet
Normal file
BIN
src/common/datasource/tests/parquet/basic.parquet
Normal file
Binary file not shown.
Reference in New Issue
Block a user