mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-31 12:20:38 +00:00
feat(copy_to_csv): add date_format/timestamp_format/time_format. (#6995)
feat(copy_to_csv): add the `date_format`, `timestamp_format`, and `time_format` options to the `COPY ... TO ... WITH` syntax Signed-off-by: Yihai Lin <yihai-lin@foxmail.com>
This commit is contained in:
@@ -54,8 +54,11 @@ pub const FORMAT_SCHEMA_INFER_MAX_RECORD: &str = "schema_infer_max_record";
|
||||
pub const FORMAT_HAS_HEADER: &str = "has_header";
|
||||
pub const FORMAT_TYPE: &str = "format";
|
||||
pub const FILE_PATTERN: &str = "pattern";
|
||||
/// Option key whose value, when present, is stored in `CsvFormat::timestamp_format`.
pub const TIMESTAMP_FORMAT: &str = "timestamp_format";
|
||||
/// Option key whose value, when present, is stored in `CsvFormat::time_format`.
pub const TIME_FORMAT: &str = "time_format";
|
||||
/// Option key whose value, when present, is stored in `CsvFormat::date_format`.
pub const DATE_FORMAT: &str = "date_format";
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum Format {
|
||||
Csv(CsvFormat),
|
||||
Json(JsonFormat),
|
||||
|
||||
@@ -15,8 +15,8 @@
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
|
||||
use arrow::csv;
|
||||
use arrow::csv::reader::Format;
|
||||
use arrow::csv::{self, WriterBuilder};
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use arrow_schema::Schema;
|
||||
use async_trait::async_trait;
|
||||
@@ -33,12 +33,15 @@ use crate::error::{self, Result};
|
||||
use crate::file_format::{self, FileFormat, stream_to_file};
|
||||
use crate::share_buffer::SharedBuffer;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
/// CSV format options.
///
/// Can be built from a `HashMap<String, String>` of options via `TryFrom`.
/// The three `*_format` fields are optional format strings that
/// `stream_to_csv` forwards to the Arrow CSV `WriterBuilder` when set.
pub struct CsvFormat {
|
||||
/// NOTE(review): presumably whether the CSV carries a header row — confirm against the reader/writer code.
pub has_header: bool,
|
||||
/// Field delimiter byte; the `Default` impl uses `b','`.
pub delimiter: u8,
|
||||
/// The `Default` impl uses `Some(file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD)`.
pub schema_infer_max_record: Option<usize>,
|
||||
/// The `Default` impl uses `CompressionType::Uncompressed`.
pub compression_type: CompressionType,
|
||||
/// Format string passed to `WriterBuilder::with_timestamp_format` by `stream_to_csv`;
/// `None` leaves the builder's setting untouched.
pub timestamp_format: Option<String>,
|
||||
/// Format string passed to `WriterBuilder::with_time_format` by `stream_to_csv`;
/// `None` leaves the builder's setting untouched.
pub time_format: Option<String>,
|
||||
/// Format string passed to `WriterBuilder::with_date_format` by `stream_to_csv`;
/// `None` leaves the builder's setting untouched.
pub date_format: Option<String>,
|
||||
}
|
||||
|
||||
impl TryFrom<&HashMap<String, String>> for CsvFormat {
|
||||
@@ -79,6 +82,15 @@ impl TryFrom<&HashMap<String, String>> for CsvFormat {
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
};
|
||||
if let Some(timestamp_format) = value.get(file_format::TIMESTAMP_FORMAT) {
|
||||
format.timestamp_format = Some(timestamp_format.clone());
|
||||
}
|
||||
if let Some(time_format) = value.get(file_format::TIME_FORMAT) {
|
||||
format.time_format = Some(time_format.clone());
|
||||
}
|
||||
if let Some(date_format) = value.get(file_format::DATE_FORMAT) {
|
||||
format.date_format = Some(date_format.clone());
|
||||
}
|
||||
Ok(format)
|
||||
}
|
||||
@@ -91,6 +103,9 @@ impl Default for CsvFormat {
|
||||
delimiter: b',',
|
||||
schema_infer_max_record: Some(file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD),
|
||||
compression_type: CompressionType::Uncompressed,
|
||||
timestamp_format: None,
|
||||
time_format: None,
|
||||
date_format: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -140,9 +155,20 @@ pub async fn stream_to_csv(
|
||||
path: &str,
|
||||
threshold: usize,
|
||||
concurrency: usize,
|
||||
format: &CsvFormat,
|
||||
) -> Result<usize> {
|
||||
stream_to_file(stream, store, path, threshold, concurrency, |buffer| {
|
||||
csv::Writer::new(buffer)
|
||||
let mut builder = WriterBuilder::new();
|
||||
if let Some(timestamp_format) = &format.timestamp_format {
|
||||
builder = builder.with_timestamp_format(timestamp_format.to_owned())
|
||||
}
|
||||
if let Some(date_format) = &format.date_format {
|
||||
builder = builder.with_date_format(date_format.to_owned())
|
||||
}
|
||||
if let Some(time_format) = &format.time_format {
|
||||
builder = builder.with_time_format(time_format.to_owned())
|
||||
}
|
||||
builder.build(buffer)
|
||||
})
|
||||
.await
|
||||
}
|
||||
@@ -265,6 +291,9 @@ mod tests {
|
||||
schema_infer_max_record: Some(2000),
|
||||
delimiter: b'\t',
|
||||
has_header: false,
|
||||
timestamp_format: None,
|
||||
time_format: None,
|
||||
date_format: None
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ use object_store::ObjectStore;
|
||||
use super::FORMAT_TYPE;
|
||||
use crate::file_format::parquet::DefaultParquetFileReaderFactory;
|
||||
use crate::file_format::{FileFormat, Format, OrcFormat};
|
||||
use crate::test_util::{scan_config, test_basic_schema, test_store};
|
||||
use crate::test_util::{csv_basic_schema, scan_config, test_basic_schema, test_store};
|
||||
use crate::{error, test_util};
|
||||
|
||||
struct Test<'a> {
|
||||
@@ -107,7 +107,7 @@ async fn test_json_opener() {
|
||||
#[tokio::test]
|
||||
async fn test_csv_opener() {
|
||||
let store = test_store("/");
|
||||
let schema = test_basic_schema();
|
||||
let schema = csv_basic_schema();
|
||||
let path = &find_workspace_path("/src/common/datasource/tests/csv/basic.csv")
|
||||
.display()
|
||||
.to_string();
|
||||
@@ -121,24 +121,24 @@ async fn test_csv_opener() {
|
||||
config: scan_config(schema.clone(), None, path, file_source.clone()),
|
||||
file_source: file_source.clone(),
|
||||
expected: vec![
|
||||
"+-----+-------+",
|
||||
"| num | str |",
|
||||
"+-----+-------+",
|
||||
"| 5 | test |",
|
||||
"| 2 | hello |",
|
||||
"| 4 | foo |",
|
||||
"+-----+-------+",
|
||||
"+-----+-------+---------------------+----------+------------+",
|
||||
"| num | str | ts | t | date |",
|
||||
"+-----+-------+---------------------+----------+------------+",
|
||||
"| 5 | test | 2023-04-01T00:00:00 | 00:00:10 | 2023-04-01 |",
|
||||
"| 2 | hello | 2023-04-01T00:00:00 | 00:00:20 | 2023-04-01 |",
|
||||
"| 4 | foo | 2023-04-01T00:00:00 | 00:00:30 | 2023-04-01 |",
|
||||
"+-----+-------+---------------------+----------+------------+",
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema, Some(1), path, file_source.clone()),
|
||||
file_source,
|
||||
expected: vec![
|
||||
"+-----+------+",
|
||||
"| num | str |",
|
||||
"+-----+------+",
|
||||
"| 5 | test |",
|
||||
"+-----+------+",
|
||||
"+-----+------+---------------------+----------+------------+",
|
||||
"| num | str | ts | t | date |",
|
||||
"+-----+------+---------------------+----------+------------+",
|
||||
"| 5 | test | 2023-04-01T00:00:00 | 00:00:10 | 2023-04-01 |",
|
||||
"+-----+------+---------------------+----------+------------+",
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::{DataType, Field, Schema, SchemaRef};
|
||||
use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit};
|
||||
use common_test_util::temp_dir::{TempDir, create_temp_dir};
|
||||
use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
|
||||
use datafusion::datasource::listing::PartitionedFile;
|
||||
@@ -27,7 +27,7 @@ use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use object_store::ObjectStore;
|
||||
use object_store::services::Fs;
|
||||
|
||||
use crate::file_format::csv::stream_to_csv;
|
||||
use crate::file_format::csv::{CsvFormat, stream_to_csv};
|
||||
use crate::file_format::json::stream_to_json;
|
||||
use crate::test_util;
|
||||
|
||||
@@ -68,6 +68,17 @@ pub fn test_basic_schema() -> SchemaRef {
|
||||
Arc::new(schema)
|
||||
}
|
||||
|
||||
/// Builds the schema used by the CSV tests.
///
/// It has five fields, each created with `false` as the third `Field::new`
/// argument (nullable): `num` (`Int64`), `str` (`Utf8`),
/// `ts` (`Timestamp(Second, None)`), `t` (`Time32(Second)`) and `date` (`Date32`).
pub fn csv_basic_schema() -> SchemaRef {
|
||||
let schema = Schema::new(vec![
|
||||
Field::new("num", DataType::Int64, false),
|
||||
Field::new("str", DataType::Utf8, false),
|
||||
// Second-precision timestamp; the `None` is the timezone argument.
Field::new("ts", DataType::Timestamp(TimeUnit::Second, None), false),
|
||||
Field::new("t", DataType::Time32(TimeUnit::Second), false),
|
||||
Field::new("date", DataType::Date32, false),
|
||||
]);
|
||||
Arc::new(schema)
|
||||
}
|
||||
|
||||
pub(crate) fn scan_config(
|
||||
file_schema: SchemaRef,
|
||||
limit: Option<usize>,
|
||||
@@ -128,10 +139,14 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi
|
||||
assert_eq_lines(written.to_vec(), origin.to_vec());
|
||||
}
|
||||
|
||||
pub async fn setup_stream_to_csv_test(origin_path: &str, threshold: impl Fn(usize) -> usize) {
|
||||
pub async fn setup_stream_to_csv_test(
|
||||
origin_path: &str,
|
||||
format_path: &str,
|
||||
threshold: impl Fn(usize) -> usize,
|
||||
) {
|
||||
let store = test_store("/");
|
||||
|
||||
let schema = test_basic_schema();
|
||||
let schema = csv_basic_schema();
|
||||
|
||||
let csv_source = CsvSource::new(true, b',', b'"')
|
||||
.with_schema(schema.clone())
|
||||
@@ -150,21 +165,29 @@ pub async fn setup_stream_to_csv_test(origin_path: &str, threshold: impl Fn(usiz
|
||||
|
||||
let output_path = format!("{}/{}", dir.path().display(), "output");
|
||||
|
||||
let csv_format = CsvFormat {
|
||||
timestamp_format: Some("%m-%d-%Y".to_string()),
|
||||
date_format: Some("%m-%d-%Y".to_string()),
|
||||
time_format: Some("%Ss".to_string()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
assert!(
|
||||
stream_to_csv(
|
||||
Box::pin(stream),
|
||||
tmp_store.clone(),
|
||||
&output_path,
|
||||
threshold(size),
|
||||
8
|
||||
8,
|
||||
&csv_format,
|
||||
)
|
||||
.await
|
||||
.is_ok()
|
||||
);
|
||||
|
||||
let written = tmp_store.read(&output_path).await.unwrap();
|
||||
let origin = store.read(origin_path).await.unwrap();
|
||||
assert_eq_lines(written.to_vec(), origin.to_vec());
|
||||
let format_expect = store.read(format_path).await.unwrap();
|
||||
assert_eq_lines(written.to_vec(), format_expect.to_vec());
|
||||
}
|
||||
|
||||
// Ignore the CRLF difference across operating systems.
|
||||
|
||||
@@ -37,11 +37,15 @@ async fn test_stream_to_csv() {
|
||||
.display()
|
||||
.to_string();
|
||||
|
||||
let format_path = &find_workspace_path("/src/common/datasource/tests/csv/basic_format.csv")
|
||||
.display()
|
||||
.to_string();
|
||||
|
||||
// A small threshold
|
||||
// Triggers the flush each writes
|
||||
test_util::setup_stream_to_csv_test(origin_path, |size| size / 2).await;
|
||||
test_util::setup_stream_to_csv_test(origin_path, format_path, |size| size / 2).await;
|
||||
|
||||
// A large threshold
|
||||
// Only triggers the flush at last
|
||||
test_util::setup_stream_to_csv_test(origin_path, |size| size * 2).await;
|
||||
test_util::setup_stream_to_csv_test(origin_path, format_path, |size| size * 2).await;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
num,str
|
||||
5,test
|
||||
2,hello
|
||||
4,foo
|
||||
num,str,ts,t,date
|
||||
5,test,2023-04-01 00:00:00,10,2023-04-01
|
||||
2,hello,2023-04-01 00:00:00,20,2023-04-01
|
||||
4,foo,2023-04-01 00:00:00,30,2023-04-01
|
||||
|
||||
|
4
src/common/datasource/tests/csv/basic_format.csv
Normal file
4
src/common/datasource/tests/csv/basic_format.csv
Normal file
@@ -0,0 +1,4 @@
|
||||
num,str,ts,t,date
|
||||
5,test,04-01-2023,10s,04-01-2023
|
||||
2,hello,04-01-2023,20s,04-01-2023
|
||||
4,foo,04-01-2023,30s,04-01-2023
|
||||
|
Reference in New Issue
Block a user