common_datasource/file_format.rs

pub mod csv;
pub mod json;
pub mod orc;
pub mod parquet;
#[cfg(test)]
pub mod tests;

pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;

use std::collections::HashMap;
use std::result;
use std::sync::Arc;
use std::task::Poll;

use arrow::record_batch::RecordBatch;
use arrow_schema::{ArrowError, Schema as ArrowSchema};
use async_trait::async_trait;
use bytes::{Buf, Bytes};
use common_recordbatch::DfSendableRecordBatchStream;
use datafusion::datasource::file_format::file_compression_type::FileCompressionType as DfCompressionType;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::object_store::ObjectStoreUrl;
use datafusion::datasource::physical_plan::{
    FileGroup, FileOpenFuture, FileScanConfigBuilder, FileSource, FileStream,
};
use datafusion::error::{DataFusionError, Result as DataFusionResult};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
use futures::{StreamExt, TryStreamExt};
use object_store::ObjectStore;
use snafu::ResultExt;
use tokio::io::AsyncWriteExt;
use tokio_util::compat::FuturesAsyncWriteCompatExt;

use self::csv::CsvFormat;
use self::json::JsonFormat;
use self::orc::OrcFormat;
use self::parquet::ParquetFormat;
use crate::DEFAULT_WRITE_BUFFER_SIZE;
use crate::buffered_writer::DfRecordBatchEncoder;
use crate::compressed_writer::{CompressedWriter, IntoCompressedWriter};
use crate::compression::CompressionType;
use crate::error::{self, Result};
use crate::share_buffer::SharedBuffer;

pub const FORMAT_COMPRESSION_TYPE: &str = "compression_type";
pub const FORMAT_DELIMITER: &str = "delimiter";
pub const FORMAT_SCHEMA_INFER_MAX_RECORD: &str = "schema_infer_max_record";
pub const FORMAT_HAS_HEADER: &str = "has_header";
pub const FORMAT_TYPE: &str = "format";
pub const FILE_PATTERN: &str = "pattern";
pub const TIMESTAMP_FORMAT: &str = "timestamp_format";
pub const TIME_FORMAT: &str = "time_format";
pub const DATE_FORMAT: &str = "date_format";

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Format {
    Csv(CsvFormat),
    Json(JsonFormat),
    Parquet(ParquetFormat),
    Orc(OrcFormat),
}

impl Format {
    pub fn suffix(&self) -> &'static str {
        match self {
            Format::Csv(_) => ".csv",
            Format::Json(_) => ".json",
            Format::Parquet(_) => ".parquet",
            Format::Orc(_) => ".orc",
        }
    }
}

impl TryFrom<&HashMap<String, String>> for Format {
    type Error = error::Error;

    fn try_from(options: &HashMap<String, String>) -> Result<Self> {
        let format = options
            .get(FORMAT_TYPE)
            .map(|format| format.to_ascii_uppercase())
            .unwrap_or_else(|| "PARQUET".to_string());

        match format.as_str() {
            "CSV" => Ok(Self::Csv(CsvFormat::try_from(options)?)),
            "JSON" => Ok(Self::Json(JsonFormat::try_from(options)?)),
            "PARQUET" => Ok(Self::Parquet(ParquetFormat::default())),
            "ORC" => Ok(Self::Orc(OrcFormat)),
            _ => error::UnsupportedFormatSnafu { format: &format }.fail(),
        }
    }
}

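// Illustrative only: how an option map (e.g. from a SQL `WITH` clause) is
// parsed into a `Format`. The option values below are hypothetical.
//
//     let mut options = HashMap::new();
//     options.insert(FORMAT_TYPE.to_string(), "csv".to_string());
//     options.insert(FORMAT_DELIMITER.to_string(), ",".to_string());
//     let format = Format::try_from(&options)?;
//     assert_eq!(format.suffix(), ".csv");
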
/// A file format that can infer an Arrow schema from an object stored in an
/// [`ObjectStore`].
#[async_trait]
pub trait FileFormat: Send + Sync + std::fmt::Debug {
    async fn infer_schema(&self, store: &ObjectStore, path: &str) -> Result<ArrowSchema>;
}

pub trait ArrowDecoder: Send + 'static {
    /// Decodes records from `buf`, returning the number of bytes read.
    ///
    /// Returns `Ok(0)` when a full batch has been decoded since the last call
    /// to [`Self::flush`], or when `buf` is exhausted. Any remaining bytes
    /// should be included in the next call to [`Self::decode`].
    fn decode(&mut self, buf: &[u8]) -> result::Result<usize, ArrowError>;

    /// Flushes the currently buffered data to a [`RecordBatch`].
    ///
    /// This should only be called after [`Self::decode`] has returned `Ok(0)`,
    /// otherwise it may fail part way through decoding a record.
    ///
    /// Returns `Ok(None)` if there is no buffered data.
    fn flush(&mut self) -> result::Result<Option<RecordBatch>, ArrowError>;
}

impl ArrowDecoder for arrow::csv::reader::Decoder {
    fn decode(&mut self, buf: &[u8]) -> result::Result<usize, ArrowError> {
        self.decode(buf)
    }

    fn flush(&mut self) -> result::Result<Option<RecordBatch>, ArrowError> {
        self.flush()
    }
}

impl ArrowDecoder for arrow::json::reader::Decoder {
    fn decode(&mut self, buf: &[u8]) -> result::Result<usize, ArrowError> {
        self.decode(buf)
    }

    fn flush(&mut self) -> result::Result<Option<RecordBatch>, ArrowError> {
        self.flush()
    }
}

/// Returns a [`FileOpenFuture`] that reads the object at `path`, decompresses
/// the bytes according to `compression_type`, and decodes them into
/// [`RecordBatch`]es with the decoder produced by `decoder_factory`.
pub fn open_with_decoder<T: ArrowDecoder, F: Fn() -> DataFusionResult<T>>(
    object_store: Arc<ObjectStore>,
    path: String,
    compression_type: CompressionType,
    decoder_factory: F,
) -> DataFusionResult<FileOpenFuture> {
    let mut decoder = decoder_factory()?;
    Ok(Box::pin(async move {
        let reader = object_store
            .reader(&path)
            .await
            .map_err(|e| DataFusionError::External(Box::new(e)))?
            .into_bytes_stream(..)
            .await
            .map_err(|e| DataFusionError::External(Box::new(e)))?;

        let mut upstream = compression_type.convert_stream(reader).fuse();

        let mut buffered = Bytes::new();

        let stream = futures::stream::poll_fn(move |cx| {
            loop {
                // Refill the local buffer from upstream once the previous
                // chunk has been fully consumed.
                if buffered.is_empty()
                    && let Some(result) = futures::ready!(upstream.poll_next_unpin(cx))
                {
                    buffered = result?;
                }

                let decoded = decoder.decode(buffered.as_ref())?;

                if decoded == 0 {
                    break;
                } else {
                    buffered.advance(decoded);
                }
            }

            // Emit whatever the decoder has accumulated; `None` ends the stream.
            Poll::Ready(decoder.flush().transpose())
        });

        Ok(stream.map_err(Into::into).boxed())
    }))
}

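// Illustrative only: a sketch of plugging `open_with_decoder` into a CSV scan.
// It assumes a `SchemaRef` named `schema` is already available; the path,
// compression, and batch size are hypothetical, and the decoder is built with
// arrow's `csv::ReaderBuilder`.
//
//     let opener = open_with_decoder(
//         object_store.clone(),
//         "data/input.csv.gz".to_string(),
//         CompressionType::Gzip,
//         move || {
//             Ok(arrow::csv::ReaderBuilder::new(schema.clone())
//                 .with_batch_size(8192)
//                 .build_decoder())
//         },
//     )?;
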
/// Infers the schema of each object in `files` and merges them into a single
/// [`ArrowSchema`].
pub async fn infer_schemas(
    store: &ObjectStore,
    files: &[String],
    file_format: &dyn FileFormat,
) -> Result<ArrowSchema> {
    let mut schemas = Vec::with_capacity(files.len());
    for file in files {
        schemas.push(file_format.infer_schema(store, file).await?)
    }
    ArrowSchema::try_merge(schemas).context(error::MergeSchemaSnafu)
}

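// Illustrative only: merging the schemas inferred from several objects. The
// file names are hypothetical, and using `CsvFormat::default()` assumes the
// format's `Default` implementation.
//
//     let format = CsvFormat::default();
//     let files = vec!["a.csv".to_string(), "b.csv".to_string()];
//     let merged = infer_schemas(&object_store, &files, &format).await?;
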
/// Writes `data` to `compressed_writer`, skipping the write entirely when
/// `data` is empty.
async fn write_to_compressed_writer(
    compressed_writer: &mut CompressedWriter,
    data: &[u8],
) -> Result<()> {
    if !data.is_empty() {
        compressed_writer
            .write_all(data)
            .await
            .context(error::AsyncWriteSnafu)?;
    }
    Ok(())
}

/// Streams every [`RecordBatch`] from `stream` into the object at `path`,
/// encoding batches with the encoder built by `encoder_factory` and
/// compressing the output with `compression_type`. Encoded bytes are flushed
/// to the store in `threshold`-sized chunks. Returns the number of rows
/// written.
pub async fn stream_to_file<E>(
    mut stream: SendableRecordBatchStream,
    store: ObjectStore,
    path: &str,
    threshold: usize,
    concurrency: usize,
    compression_type: CompressionType,
    encoder_factory: impl Fn(SharedBuffer) -> E,
) -> Result<usize>
where
    E: DfRecordBatchEncoder,
{
    // Multipart writer into the object store, uploading chunks concurrently.
    let writer = store
        .writer_with(path)
        .concurrent(concurrency)
        .chunk(DEFAULT_WRITE_BUFFER_SIZE.as_bytes() as usize)
        .await
        .with_context(|_| error::WriteObjectSnafu { path })?
        .into_futures_async_write()
        .compat_write();

    // Wrap the writer so bytes are compressed as they are written.
    let mut compressed_writer = writer.into_compressed_writer(compression_type);

    // Encoded batches accumulate here until at least `threshold` bytes are
    // available to flush.
    let buffer = SharedBuffer::with_capacity(threshold);
    let mut encoder = encoder_factory(buffer.clone());

    let mut rows = 0;

    while let Some(batch) = stream.next().await {
        let batch = batch.context(error::ReadRecordBatchSnafu)?;

        encoder.write(&batch)?;
        rows += batch.num_rows();

        // Drain the shared buffer in `threshold`-sized chunks, holding the
        // lock only while splitting off a chunk.
        loop {
            let chunk = {
                let mut buffer_guard = buffer.buffer.lock().unwrap();
                if buffer_guard.len() < threshold {
                    break;
                }
                buffer_guard.split_to(threshold)
            };
            write_to_compressed_writer(&mut compressed_writer, &chunk).await?;
        }
    }

    if rows != 0 {
        // Flush whatever is left below the threshold.
        let final_data = {
            let mut buffer_guard = buffer.buffer.lock().unwrap();
            buffer_guard.split()
        };
        write_to_compressed_writer(&mut compressed_writer, &final_data).await?;
    }

    // Finish the compression stream and complete the upload.
    compressed_writer.shutdown().await?;

    Ok(rows)
}

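// Illustrative only: a sketch of draining a record batch stream into a
// gzip-compressed object. `CsvEncoder` is a hypothetical stand-in for any
// `DfRecordBatchEncoder` implementation; the threshold and concurrency values
// are made up.
//
//     let rows = stream_to_file(
//         stream,
//         object_store,
//         "output/result.csv.gz",
//         8 * 1024 * 1024, // flush in 8 MiB chunks
//         4,               // up to 4 concurrent upload parts
//         CompressionType::Gzip,
//         |buffer| CsvEncoder::new(buffer),
//     )
//     .await?;
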
/// Opens the object at `filename` through `file_source` and returns a stream
/// of decoded [`RecordBatch`]es, applying the optional column `projection`
/// and decompressing with `compression_type`.
pub async fn file_to_stream(
    store: &ObjectStore,
    filename: &str,
    file_source: Arc<dyn FileSource>,
    projection: Option<Vec<usize>>,
    compression_type: CompressionType,
) -> Result<DfSendableRecordBatchStream> {
    let df_compression: DfCompressionType = compression_type.into();
    let config =
        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source.clone())
            .with_file_group(FileGroup::new(vec![PartitionedFile::new(
                filename.to_string(),
                0,
            )]))
            .with_projection_indices(projection)?
            .with_file_compression_type(df_compression)
            .build();

    let store = Arc::new(object_store::compat::OpendalStore::new(store.clone()));
    let file_opener = config.file_source().create_file_opener(store, &config, 0)?;
    let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new())?;

    Ok(Box::pin(stream))
}
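
// Illustrative only: a sketch of turning a stored file into a record batch
// stream. `csv_source` stands in for a concrete `Arc<dyn FileSource>`, the
// projection keeps only the first two columns, and
// `CompressionType::Uncompressed` is assumed to be the no-op variant.
//
//     let stream = file_to_stream(
//         &object_store,
//         "data/input.csv",
//         csv_source,
//         Some(vec![0, 1]),
//         CompressionType::Uncompressed,
//     )
//     .await?;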