mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-18 05:50:41 +00:00
feat: implement JsonOpener and CsvOpener (#1367)
* feat: introduce JsonOpener and CsvOpener * refactor: refactor Opener * docs: add doc
This commit is contained in:
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -1566,9 +1566,11 @@ dependencies = [
|
||||
"arrow-schema",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"common-error",
|
||||
"common-runtime",
|
||||
"datafusion",
|
||||
"derive_builder 0.12.0",
|
||||
"futures",
|
||||
"object-store",
|
||||
"regex",
|
||||
|
||||
@@ -16,9 +16,11 @@ async-compression = { version = "0.3", features = [
|
||||
"tokio",
|
||||
] }
|
||||
async-trait.workspace = true
|
||||
bytes = "1.1"
|
||||
common-error = { path = "../error" }
|
||||
common-runtime = { path = "../runtime" }
|
||||
datafusion.workspace = true
|
||||
derive_builder = "0.12"
|
||||
futures.workspace = true
|
||||
object-store = { path = "../../object-store" }
|
||||
regex = "1.7"
|
||||
|
||||
@@ -13,10 +13,14 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt::Display;
|
||||
use std::io;
|
||||
use std::str::FromStr;
|
||||
|
||||
use async_compression::tokio::bufread::{BzDecoder, GzipDecoder, XzDecoder, ZstdDecoder};
|
||||
use bytes::Bytes;
|
||||
use futures::Stream;
|
||||
use tokio::io::{AsyncRead, BufReader};
|
||||
use tokio_util::io::{ReaderStream, StreamReader};
|
||||
|
||||
use crate::error::{self, Error, Result};
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
@@ -81,4 +85,25 @@ impl CompressionType {
|
||||
CompressionType::UNCOMPRESSED => Box::new(s),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn convert_stream<T: Stream<Item = io::Result<Bytes>> + Unpin + Send + 'static>(
|
||||
&self,
|
||||
s: T,
|
||||
) -> Box<dyn Stream<Item = io::Result<Bytes>> + Send + Unpin> {
|
||||
match self {
|
||||
CompressionType::GZIP => {
|
||||
Box::new(ReaderStream::new(GzipDecoder::new(StreamReader::new(s))))
|
||||
}
|
||||
CompressionType::BZIP2 => {
|
||||
Box::new(ReaderStream::new(BzDecoder::new(StreamReader::new(s))))
|
||||
}
|
||||
CompressionType::XZ => {
|
||||
Box::new(ReaderStream::new(XzDecoder::new(StreamReader::new(s))))
|
||||
}
|
||||
CompressionType::ZSTD => {
|
||||
Box::new(ReaderStream::new(ZstdDecoder::new(StreamReader::new(s))))
|
||||
}
|
||||
CompressionType::UNCOMPRESSED => Box::new(s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,16 +15,108 @@
|
||||
pub mod csv;
|
||||
pub mod json;
|
||||
pub mod parquet;
|
||||
#[cfg(test)]
|
||||
pub mod tests;
|
||||
|
||||
pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
|
||||
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
|
||||
use arrow::datatypes::SchemaRef;
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use arrow_schema::ArrowError;
|
||||
use async_trait::async_trait;
|
||||
use bytes::{Buf, Bytes};
|
||||
use datafusion::error::{DataFusionError, Result as DataFusionResult};
|
||||
use datafusion::physical_plan::file_format::FileOpenFuture;
|
||||
use futures::StreamExt;
|
||||
use object_store::ObjectStore;
|
||||
|
||||
use crate::compression::CompressionType;
|
||||
use crate::error::Result;
|
||||
|
||||
#[async_trait]
|
||||
pub trait FileFormat: Send + Sync + std::fmt::Debug {
|
||||
async fn infer_schema(&self, store: &ObjectStore, path: String) -> Result<SchemaRef>;
|
||||
}
|
||||
|
||||
pub trait ArrowDecoder: Send + 'static {
|
||||
/// Decode records from `buf` returning the number of bytes read.
|
||||
///
|
||||
/// This method returns `Ok(0)` once `batch_size` objects have been parsed since the
|
||||
/// last call to [`Self::flush`], or `buf` is exhausted.
|
||||
///
|
||||
/// Any remaining bytes should be included in the next call to [`Self::decode`].
|
||||
fn decode(&mut self, buf: &[u8]) -> result::Result<usize, ArrowError>;
|
||||
|
||||
/// Flushes the currently buffered data to a [`RecordBatch`].
|
||||
///
|
||||
/// This should only be called after [`Self::decode`] has returned `Ok(0)`,
|
||||
/// otherwise may return an error if part way through decoding a record
|
||||
///
|
||||
/// Returns `Ok(None)` if no buffered data.
|
||||
fn flush(&mut self) -> result::Result<Option<RecordBatch>, ArrowError>;
|
||||
}
|
||||
|
||||
impl ArrowDecoder for arrow::csv::reader::Decoder {
|
||||
fn decode(&mut self, buf: &[u8]) -> result::Result<usize, ArrowError> {
|
||||
self.decode(buf)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> result::Result<Option<RecordBatch>, ArrowError> {
|
||||
self.flush()
|
||||
}
|
||||
}
|
||||
|
||||
impl ArrowDecoder for arrow::json::RawDecoder {
|
||||
fn decode(&mut self, buf: &[u8]) -> result::Result<usize, ArrowError> {
|
||||
self.decode(buf)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> result::Result<Option<RecordBatch>, ArrowError> {
|
||||
self.flush()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open_with_decoder<T: ArrowDecoder, F: Fn() -> DataFusionResult<T>>(
|
||||
object_store: Arc<ObjectStore>,
|
||||
path: String,
|
||||
compression_type: CompressionType,
|
||||
decoder_factory: F,
|
||||
) -> DataFusionResult<FileOpenFuture> {
|
||||
let mut decoder = decoder_factory()?;
|
||||
Ok(Box::pin(async move {
|
||||
let reader = object_store
|
||||
.reader(&path)
|
||||
.await
|
||||
.map_err(|e| DataFusionError::External(Box::new(e)))?;
|
||||
|
||||
let mut upstream = compression_type.convert_stream(reader).fuse();
|
||||
|
||||
let mut buffered = Bytes::new();
|
||||
|
||||
let stream = futures::stream::poll_fn(move |cx| {
|
||||
loop {
|
||||
if buffered.is_empty() {
|
||||
if let Some(result) = futures::ready!(upstream.poll_next_unpin(cx)) {
|
||||
buffered = result?;
|
||||
};
|
||||
}
|
||||
|
||||
let decoded = decoder.decode(buffered.as_ref())?;
|
||||
|
||||
if decoded == 0 {
|
||||
break;
|
||||
} else {
|
||||
buffered.advance(decoded);
|
||||
}
|
||||
}
|
||||
|
||||
Poll::Ready(decoder.flush().transpose())
|
||||
});
|
||||
|
||||
Ok(stream.boxed())
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -14,17 +14,21 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::csv;
|
||||
use arrow::csv::reader::infer_reader_schema as infer_csv_schema;
|
||||
use arrow_schema::SchemaRef;
|
||||
use async_trait::async_trait;
|
||||
use common_runtime;
|
||||
use datafusion::error::Result as DataFusionResult;
|
||||
use datafusion::physical_plan::file_format::{FileMeta, FileOpenFuture, FileOpener};
|
||||
use derive_builder::Builder;
|
||||
use object_store::ObjectStore;
|
||||
use snafu::ResultExt;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
|
||||
use crate::compression::CompressionType;
|
||||
use crate::error::{self, Result};
|
||||
use crate::file_format::{self, FileFormat};
|
||||
use crate::file_format::{self, open_with_decoder, FileFormat};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CsvFormat {
|
||||
@@ -45,6 +49,67 @@ impl Default for CsvFormat {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Builder)]
|
||||
pub struct CsvConfig {
|
||||
batch_size: usize,
|
||||
file_schema: SchemaRef,
|
||||
#[builder(default = "None")]
|
||||
file_projection: Option<Vec<usize>>,
|
||||
#[builder(default = "true")]
|
||||
has_header: bool,
|
||||
#[builder(default = "b','")]
|
||||
delimiter: u8,
|
||||
}
|
||||
|
||||
impl CsvConfig {
|
||||
fn builder(&self) -> csv::ReaderBuilder {
|
||||
let mut builder = csv::ReaderBuilder::new()
|
||||
.with_schema(self.file_schema.clone())
|
||||
.with_delimiter(self.delimiter)
|
||||
.with_batch_size(self.batch_size)
|
||||
.has_header(self.has_header);
|
||||
|
||||
if let Some(proj) = &self.file_projection {
|
||||
builder = builder.with_projection(proj.clone());
|
||||
}
|
||||
|
||||
builder
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CsvOpener {
|
||||
config: Arc<CsvConfig>,
|
||||
object_store: Arc<ObjectStore>,
|
||||
compression_type: CompressionType,
|
||||
}
|
||||
|
||||
impl CsvOpener {
|
||||
/// Return a new [`CsvOpener`]. The caller must ensure [`CsvConfig`].file_schema must correspond to the opening file.
|
||||
pub fn new(
|
||||
config: CsvConfig,
|
||||
object_store: ObjectStore,
|
||||
compression_type: CompressionType,
|
||||
) -> Self {
|
||||
CsvOpener {
|
||||
config: Arc::new(config),
|
||||
object_store: Arc::new(object_store),
|
||||
compression_type,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileOpener for CsvOpener {
|
||||
fn open(&self, meta: FileMeta) -> DataFusionResult<FileOpenFuture> {
|
||||
open_with_decoder(
|
||||
self.object_store.clone(),
|
||||
meta.location().to_string(),
|
||||
self.compression_type,
|
||||
|| Ok(self.config.builder().build_decoder()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FileFormat for CsvFormat {
|
||||
async fn infer_schema(&self, store: &ObjectStore, path: String) -> Result<SchemaRef> {
|
||||
|
||||
@@ -17,15 +17,18 @@ use std::sync::Arc;
|
||||
|
||||
use arrow::datatypes::SchemaRef;
|
||||
use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter};
|
||||
use arrow::json::RawReaderBuilder;
|
||||
use async_trait::async_trait;
|
||||
use common_runtime;
|
||||
use datafusion::error::{DataFusionError, Result as DataFusionResult};
|
||||
use datafusion::physical_plan::file_format::{FileMeta, FileOpenFuture, FileOpener};
|
||||
use object_store::ObjectStore;
|
||||
use snafu::ResultExt;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
|
||||
use crate::compression::CompressionType;
|
||||
use crate::error::{self, Result};
|
||||
use crate::file_format::{self, FileFormat};
|
||||
use crate::file_format::{self, open_with_decoder, FileFormat};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct JsonFormat {
|
||||
@@ -69,6 +72,47 @@ impl FileFormat for JsonFormat {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct JsonOpener {
|
||||
batch_size: usize,
|
||||
projected_schema: SchemaRef,
|
||||
object_store: Arc<ObjectStore>,
|
||||
compression_type: CompressionType,
|
||||
}
|
||||
|
||||
impl JsonOpener {
|
||||
/// Return a new [`JsonOpener`]. Any fields not present in `projected_schema` will be ignored.
|
||||
pub fn new(
|
||||
batch_size: usize,
|
||||
projected_schema: SchemaRef,
|
||||
object_store: ObjectStore,
|
||||
compression_type: CompressionType,
|
||||
) -> Self {
|
||||
Self {
|
||||
batch_size,
|
||||
projected_schema,
|
||||
object_store: Arc::new(object_store),
|
||||
compression_type,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileOpener for JsonOpener {
|
||||
fn open(&self, meta: FileMeta) -> DataFusionResult<FileOpenFuture> {
|
||||
open_with_decoder(
|
||||
self.object_store.clone(),
|
||||
meta.location().to_string(),
|
||||
self.compression_type,
|
||||
|| {
|
||||
RawReaderBuilder::new(self.projected_schema.clone())
|
||||
.with_batch_size(self.batch_size)
|
||||
.build_decoder()
|
||||
.map_err(DataFusionError::from)
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
161
src/common/datasource/src/file_format/tests.rs
Normal file
161
src/common/datasource/src/file_format/tests.rs
Normal file
@@ -0,0 +1,161 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::vec;
|
||||
|
||||
use arrow_schema::SchemaRef;
|
||||
use datafusion::assert_batches_eq;
|
||||
use datafusion::datasource::listing::PartitionedFile;
|
||||
use datafusion::datasource::object_store::ObjectStoreUrl;
|
||||
use datafusion::physical_plan::file_format::{FileOpener, FileScanConfig, FileStream};
|
||||
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use futures::StreamExt;
|
||||
|
||||
use crate::compression::CompressionType;
|
||||
use crate::file_format::csv::{CsvConfigBuilder, CsvOpener};
|
||||
use crate::file_format::json::JsonOpener;
|
||||
use crate::test_util::{self, test_basic_schema, test_store};
|
||||
|
||||
fn scan_config(file_schema: SchemaRef, limit: Option<usize>, filename: &str) -> FileScanConfig {
|
||||
FileScanConfig {
|
||||
object_store_url: ObjectStoreUrl::parse("empty://").unwrap(), // won't be used
|
||||
file_schema,
|
||||
file_groups: vec![vec![PartitionedFile::new(filename.to_string(), 10)]],
|
||||
statistics: Default::default(),
|
||||
projection: None,
|
||||
limit,
|
||||
table_partition_cols: vec![],
|
||||
output_ordering: None,
|
||||
infinite_source: false,
|
||||
}
|
||||
}
|
||||
|
||||
struct Test<'a, T: FileOpener> {
|
||||
config: FileScanConfig,
|
||||
opener: T,
|
||||
expected: Vec<&'a str>,
|
||||
}
|
||||
|
||||
impl<'a, T: FileOpener> Test<'a, T> {
|
||||
pub async fn run(self) {
|
||||
let result = FileStream::new(
|
||||
&self.config,
|
||||
0,
|
||||
self.opener,
|
||||
&ExecutionPlanMetricsSet::new(),
|
||||
)
|
||||
.unwrap()
|
||||
.map(|b| b.unwrap())
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
|
||||
assert_batches_eq!(self.expected, &result);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_json_opener() {
|
||||
let store = test_store("/");
|
||||
|
||||
let schema = test_basic_schema();
|
||||
|
||||
let json_opener = JsonOpener::new(
|
||||
100,
|
||||
schema.clone(),
|
||||
store.clone(),
|
||||
CompressionType::UNCOMPRESSED,
|
||||
);
|
||||
|
||||
let path = &test_util::get_data_dir("tests/json/basic.json")
|
||||
.display()
|
||||
.to_string();
|
||||
let tests = [
|
||||
Test {
|
||||
config: scan_config(schema.clone(), None, path),
|
||||
opener: json_opener.clone(),
|
||||
expected: vec![
|
||||
"+-----+-------+",
|
||||
"| num | str |",
|
||||
"+-----+-------+",
|
||||
"| 5 | test |",
|
||||
"| 2 | hello |",
|
||||
"| 4 | foo |",
|
||||
"+-----+-------+",
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema.clone(), Some(1), path),
|
||||
opener: json_opener.clone(),
|
||||
expected: vec![
|
||||
"+-----+------+",
|
||||
"| num | str |",
|
||||
"+-----+------+",
|
||||
"| 5 | test |",
|
||||
"+-----+------+",
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
for test in tests {
|
||||
test.run().await;
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_csv_opener() {
|
||||
let store = test_store("/");
|
||||
|
||||
let schema = test_basic_schema();
|
||||
let path = &test_util::get_data_dir("tests/csv/basic.csv")
|
||||
.display()
|
||||
.to_string();
|
||||
let csv_conf = CsvConfigBuilder::default()
|
||||
.batch_size(test_util::TEST_BATCH_SIZE)
|
||||
.file_schema(schema.clone())
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let csv_opener = CsvOpener::new(csv_conf, store, CompressionType::UNCOMPRESSED);
|
||||
|
||||
let tests = [
|
||||
Test {
|
||||
config: scan_config(schema.clone(), None, path),
|
||||
opener: csv_opener.clone(),
|
||||
expected: vec![
|
||||
"+-----+-------+",
|
||||
"| num | str |",
|
||||
"+-----+-------+",
|
||||
"| 5 | test |",
|
||||
"| 2 | hello |",
|
||||
"| 4 | foo |",
|
||||
"+-----+-------+",
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema.clone(), Some(1), path),
|
||||
opener: csv_opener.clone(),
|
||||
expected: vec![
|
||||
"+-----+------+",
|
||||
"| num | str |",
|
||||
"+-----+------+",
|
||||
"| 5 | test |",
|
||||
"+-----+------+",
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
for test in tests {
|
||||
test.run().await;
|
||||
}
|
||||
}
|
||||
@@ -17,5 +17,6 @@ pub mod error;
|
||||
pub mod file_format;
|
||||
pub mod lister;
|
||||
pub mod object_store;
|
||||
#[cfg(test)]
|
||||
pub mod test_util;
|
||||
pub mod util;
|
||||
|
||||
@@ -13,11 +13,14 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::SchemaRef;
|
||||
use arrow_schema::{DataType, Field, Schema, SchemaRef};
|
||||
use object_store::services::Fs;
|
||||
use object_store::ObjectStore;
|
||||
|
||||
pub const TEST_BATCH_SIZE: usize = 100;
|
||||
|
||||
pub fn get_data_dir(path: &str) -> PathBuf {
|
||||
// https://doc.rust-lang.org/cargo/reference/environment-variables.html
|
||||
let dir = env!("CARGO_MANIFEST_DIR");
|
||||
@@ -46,3 +49,11 @@ pub fn test_store(root: &str) -> ObjectStore {
|
||||
|
||||
ObjectStore::new(builder).unwrap().finish()
|
||||
}
|
||||
|
||||
pub fn test_basic_schema() -> SchemaRef {
|
||||
let schema = Schema::new(vec![
|
||||
Field::new("num", DataType::Int64, false),
|
||||
Field::new("str", DataType::Utf8, false),
|
||||
]);
|
||||
Arc::new(schema)
|
||||
}
|
||||
|
||||
3
src/common/datasource/tests/json/basic.json
Normal file
3
src/common/datasource/tests/json/basic.json
Normal file
@@ -0,0 +1,3 @@
|
||||
{"num":5,"str":"test"}
|
||||
{"num":2,"str":"hello"}
|
||||
{"num":4,"str":"foo"}
|
||||
Reference in New Issue
Block a user