mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-25 09:20:40 +00:00
chore: update datafusion family (#6675)
* chore: update datafusion family Signed-off-by: luofucong <luofc@foxmail.com> * fix ci Signed-off-by: luofucong <luofc@foxmail.com> * use official otel-arrow-rust Signed-off-by: luofucong <luofc@foxmail.com> * rebase Signed-off-by: luofucong <luofc@foxmail.com> * use the official orc-rust Signed-off-by: luofucong <luofc@foxmail.com> * resolve PR comments Signed-off-by: luofucong <luofc@foxmail.com> * remove the empty lines Signed-off-by: luofucong <luofc@foxmail.com> * try following PR comments Signed-off-by: luofucong <luofc@foxmail.com> --------- Signed-off-by: luofucong <luofc@foxmail.com>
This commit is contained in:
@@ -25,19 +25,17 @@ common-error.workspace = true
|
||||
common-macro.workspace = true
|
||||
common-recordbatch.workspace = true
|
||||
common-runtime.workspace = true
|
||||
common-telemetry.workspace = true
|
||||
datafusion.workspace = true
|
||||
datafusion-orc.workspace = true
|
||||
datatypes.workspace = true
|
||||
derive_builder.workspace = true
|
||||
futures.workspace = true
|
||||
lazy_static.workspace = true
|
||||
object-store.workspace = true
|
||||
object_store_opendal.workspace = true
|
||||
orc-rust = { git = "https://github.com/datafusion-contrib/orc-rust", rev = "3134cab581a8e91b942d6a23aca2916ea965f6bb", default-features = false, features = [
|
||||
"async",
|
||||
] }
|
||||
orc-rust = { version = "0.6.3", default-features = false, features = ["async"] }
|
||||
parquet.workspace = true
|
||||
paste.workspace = true
|
||||
rand.workspace = true
|
||||
regex = "1.7"
|
||||
serde.workspace = true
|
||||
snafu.workspace = true
|
||||
@@ -47,6 +45,4 @@ tokio-util.workspace = true
|
||||
url = "2.3"
|
||||
|
||||
[dev-dependencies]
|
||||
common-telemetry.workspace = true
|
||||
common-test-util.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
@@ -12,16 +12,11 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::{ArrowError, Schema, SchemaRef};
|
||||
use arrow_schema::Schema;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use common_recordbatch::adapter::RecordBatchStreamTypeAdapter;
|
||||
use datafusion::datasource::physical_plan::{FileMeta, FileOpenFuture, FileOpener};
|
||||
use datafusion::error::{DataFusionError, Result as DfResult};
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, StreamExt, TryStreamExt};
|
||||
use futures::FutureExt;
|
||||
use object_store::ObjectStore;
|
||||
use orc_rust::arrow_reader::ArrowReaderBuilder;
|
||||
use orc_rust::async_arrow_reader::ArrowStreamReader;
|
||||
@@ -97,67 +92,6 @@ impl FileFormat for OrcFormat {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct OrcOpener {
|
||||
object_store: Arc<ObjectStore>,
|
||||
output_schema: SchemaRef,
|
||||
projection: Option<Vec<usize>>,
|
||||
}
|
||||
|
||||
impl OrcOpener {
|
||||
pub fn new(
|
||||
object_store: ObjectStore,
|
||||
output_schema: SchemaRef,
|
||||
projection: Option<Vec<usize>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
object_store: Arc::from(object_store),
|
||||
output_schema,
|
||||
projection,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileOpener for OrcOpener {
|
||||
fn open(&self, meta: FileMeta) -> DfResult<FileOpenFuture> {
|
||||
let object_store = self.object_store.clone();
|
||||
let projected_schema = if let Some(projection) = &self.projection {
|
||||
let projected_schema = self
|
||||
.output_schema
|
||||
.project(projection)
|
||||
.map_err(|e| DataFusionError::External(Box::new(e)))?;
|
||||
Arc::new(projected_schema)
|
||||
} else {
|
||||
self.output_schema.clone()
|
||||
};
|
||||
let projection = self.projection.clone();
|
||||
Ok(Box::pin(async move {
|
||||
let path = meta.location().to_string();
|
||||
|
||||
let meta = object_store
|
||||
.stat(&path)
|
||||
.await
|
||||
.map_err(|e| DataFusionError::External(Box::new(e)))?;
|
||||
|
||||
let reader = object_store
|
||||
.reader(&path)
|
||||
.await
|
||||
.map_err(|e| DataFusionError::External(Box::new(e)))?;
|
||||
|
||||
let stream_reader =
|
||||
new_orc_stream_reader(ReaderAdapter::new(reader, meta.content_length()))
|
||||
.await
|
||||
.map_err(|e| DataFusionError::External(Box::new(e)))?;
|
||||
|
||||
let stream =
|
||||
RecordBatchStreamTypeAdapter::new(projected_schema, stream_reader, projection);
|
||||
|
||||
let adopted = stream.map_err(|e| ArrowError::ExternalError(Box::new(e)));
|
||||
Ok(adopted.boxed())
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_test_util::find_workspace_path;
|
||||
|
||||
@@ -31,6 +31,7 @@ use datatypes::schema::SchemaRef;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::StreamExt;
|
||||
use object_store::{FuturesAsyncReader, ObjectStore};
|
||||
use parquet::arrow::arrow_reader::ArrowReaderOptions;
|
||||
use parquet::arrow::AsyncArrowWriter;
|
||||
use parquet::basic::{Compression, Encoding, ZstdLevel};
|
||||
use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder};
|
||||
@@ -65,7 +66,7 @@ impl FileFormat for ParquetFormat {
|
||||
.compat();
|
||||
|
||||
let metadata = reader
|
||||
.get_metadata()
|
||||
.get_metadata(None)
|
||||
.await
|
||||
.context(error::ReadParquetSnafuSnafu)?;
|
||||
|
||||
@@ -146,7 +147,7 @@ impl LazyParquetFileReader {
|
||||
impl AsyncFileReader for LazyParquetFileReader {
|
||||
fn get_bytes(
|
||||
&mut self,
|
||||
range: std::ops::Range<usize>,
|
||||
range: std::ops::Range<u64>,
|
||||
) -> BoxFuture<'_, ParquetResult<bytes::Bytes>> {
|
||||
Box::pin(async move {
|
||||
self.maybe_initialize()
|
||||
@@ -157,13 +158,16 @@ impl AsyncFileReader for LazyParquetFileReader {
|
||||
})
|
||||
}
|
||||
|
||||
fn get_metadata(&mut self) -> BoxFuture<'_, ParquetResult<Arc<ParquetMetaData>>> {
|
||||
fn get_metadata<'a>(
|
||||
&'a mut self,
|
||||
options: Option<&'a ArrowReaderOptions>,
|
||||
) -> BoxFuture<'a, parquet::errors::Result<Arc<ParquetMetaData>>> {
|
||||
Box::pin(async move {
|
||||
self.maybe_initialize()
|
||||
.await
|
||||
.map_err(|e| ParquetError::External(Box::new(e)))?;
|
||||
// Safety: Must initialized
|
||||
self.reader.as_mut().unwrap().get_metadata().await
|
||||
self.reader.as_mut().unwrap().get_metadata(options).await
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,35 +19,39 @@ use std::vec;
|
||||
|
||||
use common_test_util::find_workspace_path;
|
||||
use datafusion::assert_batches_eq;
|
||||
use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
|
||||
use datafusion::datasource::physical_plan::{
|
||||
CsvConfig, CsvOpener, FileOpener, FileScanConfig, FileStream, JsonOpener, ParquetExec,
|
||||
CsvSource, FileScanConfig, FileSource, FileStream, JsonSource, ParquetSource,
|
||||
};
|
||||
use datafusion::datasource::source::DataSourceExec;
|
||||
use datafusion::execution::context::TaskContext;
|
||||
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use datafusion::physical_plan::ExecutionPlan;
|
||||
use datafusion::prelude::SessionContext;
|
||||
use datafusion_orc::OrcSource;
|
||||
use futures::StreamExt;
|
||||
use object_store::ObjectStore;
|
||||
|
||||
use super::FORMAT_TYPE;
|
||||
use crate::file_format::orc::{OrcFormat, OrcOpener};
|
||||
use crate::file_format::parquet::DefaultParquetFileReaderFactory;
|
||||
use crate::file_format::{FileFormat, Format};
|
||||
use crate::file_format::{FileFormat, Format, OrcFormat};
|
||||
use crate::test_util::{scan_config, test_basic_schema, test_store};
|
||||
use crate::{error, test_util};
|
||||
|
||||
struct Test<'a, T: FileOpener> {
|
||||
struct Test<'a> {
|
||||
config: FileScanConfig,
|
||||
opener: T,
|
||||
file_source: Arc<dyn FileSource>,
|
||||
expected: Vec<&'a str>,
|
||||
}
|
||||
|
||||
impl<T: FileOpener> Test<'_, T> {
|
||||
pub async fn run(self) {
|
||||
impl Test<'_> {
|
||||
async fn run(self, store: &ObjectStore) {
|
||||
let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone()));
|
||||
let file_opener = self.file_source.create_file_opener(store, &self.config, 0);
|
||||
|
||||
let result = FileStream::new(
|
||||
&self.config,
|
||||
0,
|
||||
self.opener,
|
||||
file_opener,
|
||||
&ExecutionPlanMetricsSet::new(),
|
||||
)
|
||||
.unwrap()
|
||||
@@ -62,26 +66,16 @@ impl<T: FileOpener> Test<'_, T> {
|
||||
#[tokio::test]
|
||||
async fn test_json_opener() {
|
||||
let store = test_store("/");
|
||||
let store = Arc::new(object_store_opendal::OpendalStore::new(store));
|
||||
|
||||
let schema = test_basic_schema();
|
||||
|
||||
let json_opener = || {
|
||||
JsonOpener::new(
|
||||
test_util::TEST_BATCH_SIZE,
|
||||
schema.clone(),
|
||||
FileCompressionType::UNCOMPRESSED,
|
||||
store.clone(),
|
||||
)
|
||||
};
|
||||
let file_source = Arc::new(JsonSource::new()).with_batch_size(test_util::TEST_BATCH_SIZE);
|
||||
|
||||
let path = &find_workspace_path("/src/common/datasource/tests/json/basic.json")
|
||||
.display()
|
||||
.to_string();
|
||||
let tests = [
|
||||
Test {
|
||||
config: scan_config(schema.clone(), None, path),
|
||||
opener: json_opener(),
|
||||
config: scan_config(schema.clone(), None, path, file_source.clone()),
|
||||
file_source: file_source.clone(),
|
||||
expected: vec![
|
||||
"+-----+-------+",
|
||||
"| num | str |",
|
||||
@@ -93,8 +87,8 @@ async fn test_json_opener() {
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema.clone(), Some(1), path),
|
||||
opener: json_opener(),
|
||||
config: scan_config(schema, Some(1), path, file_source.clone()),
|
||||
file_source,
|
||||
expected: vec![
|
||||
"+-----+------+",
|
||||
"| num | str |",
|
||||
@@ -106,37 +100,26 @@ async fn test_json_opener() {
|
||||
];
|
||||
|
||||
for test in tests {
|
||||
test.run().await;
|
||||
test.run(&store).await;
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_csv_opener() {
|
||||
let store = test_store("/");
|
||||
let store = Arc::new(object_store_opendal::OpendalStore::new(store));
|
||||
|
||||
let schema = test_basic_schema();
|
||||
let path = &find_workspace_path("/src/common/datasource/tests/csv/basic.csv")
|
||||
.display()
|
||||
.to_string();
|
||||
let csv_config = Arc::new(CsvConfig::new(
|
||||
test_util::TEST_BATCH_SIZE,
|
||||
schema.clone(),
|
||||
None,
|
||||
true,
|
||||
b',',
|
||||
b'"',
|
||||
None,
|
||||
store,
|
||||
None,
|
||||
));
|
||||
|
||||
let csv_opener = || CsvOpener::new(csv_config.clone(), FileCompressionType::UNCOMPRESSED);
|
||||
let file_source = CsvSource::new(true, b',', b'"')
|
||||
.with_batch_size(test_util::TEST_BATCH_SIZE)
|
||||
.with_schema(schema.clone());
|
||||
|
||||
let tests = [
|
||||
Test {
|
||||
config: scan_config(schema.clone(), None, path),
|
||||
opener: csv_opener(),
|
||||
config: scan_config(schema.clone(), None, path, file_source.clone()),
|
||||
file_source: file_source.clone(),
|
||||
expected: vec![
|
||||
"+-----+-------+",
|
||||
"| num | str |",
|
||||
@@ -148,8 +131,8 @@ async fn test_csv_opener() {
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema.clone(), Some(1), path),
|
||||
opener: csv_opener(),
|
||||
config: scan_config(schema, Some(1), path, file_source.clone()),
|
||||
file_source,
|
||||
expected: vec![
|
||||
"+-----+------+",
|
||||
"| num | str |",
|
||||
@@ -161,7 +144,7 @@ async fn test_csv_opener() {
|
||||
];
|
||||
|
||||
for test in tests {
|
||||
test.run().await;
|
||||
test.run(&store).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,12 +157,12 @@ async fn test_parquet_exec() {
|
||||
let path = &find_workspace_path("/src/common/datasource/tests/parquet/basic.parquet")
|
||||
.display()
|
||||
.to_string();
|
||||
let base_config = scan_config(schema.clone(), None, path);
|
||||
|
||||
let exec = ParquetExec::builder(base_config)
|
||||
.with_parquet_file_reader_factory(Arc::new(DefaultParquetFileReaderFactory::new(store)))
|
||||
.build();
|
||||
let parquet_source = ParquetSource::default()
|
||||
.with_parquet_file_reader_factory(Arc::new(DefaultParquetFileReaderFactory::new(store)));
|
||||
|
||||
let config = scan_config(schema, None, path, Arc::new(parquet_source));
|
||||
let exec = DataSourceExec::from_data_source(config);
|
||||
let ctx = SessionContext::new();
|
||||
|
||||
let context = Arc::new(TaskContext::from(&ctx));
|
||||
@@ -208,20 +191,18 @@ async fn test_parquet_exec() {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_orc_opener() {
|
||||
let root = find_workspace_path("/src/common/datasource/tests/orc")
|
||||
let path = &find_workspace_path("/src/common/datasource/tests/orc/test.orc")
|
||||
.display()
|
||||
.to_string();
|
||||
let store = test_store(&root);
|
||||
let schema = OrcFormat.infer_schema(&store, "test.orc").await.unwrap();
|
||||
let schema = Arc::new(schema);
|
||||
|
||||
let orc_opener = OrcOpener::new(store.clone(), schema.clone(), None);
|
||||
let path = "test.orc";
|
||||
let store = test_store("/");
|
||||
let schema = Arc::new(OrcFormat.infer_schema(&store, path).await.unwrap());
|
||||
let file_source = Arc::new(OrcSource::default());
|
||||
|
||||
let tests = [
|
||||
Test {
|
||||
config: scan_config(schema.clone(), None, path),
|
||||
opener: orc_opener.clone(),
|
||||
config: scan_config(schema.clone(), None, path, file_source.clone()),
|
||||
file_source: file_source.clone(),
|
||||
expected: vec![
|
||||
"+----------+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+",
|
||||
"| double_a | a | b | str_direct | d | e | f | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple | date_simple |",
|
||||
@@ -235,8 +216,8 @@ async fn test_orc_opener() {
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema.clone(), Some(1), path),
|
||||
opener: orc_opener.clone(),
|
||||
config: scan_config(schema.clone(), Some(1), path, file_source.clone()),
|
||||
file_source,
|
||||
expected: vec![
|
||||
"+----------+-----+------+------------+---+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+-------------------------+-------------+",
|
||||
"| double_a | a | b | str_direct | d | e | f | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple | date_simple |",
|
||||
@@ -248,7 +229,7 @@ async fn test_orc_opener() {
|
||||
];
|
||||
|
||||
for test in tests {
|
||||
test.run().await;
|
||||
test.run(&store).await;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -16,12 +16,12 @@ use std::sync::Arc;
|
||||
|
||||
use arrow_schema::{DataType, Field, Schema, SchemaRef};
|
||||
use common_test_util::temp_dir::{create_temp_dir, TempDir};
|
||||
use datafusion::common::{Constraints, Statistics};
|
||||
use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
|
||||
use datafusion::datasource::listing::PartitionedFile;
|
||||
use datafusion::datasource::object_store::ObjectStoreUrl;
|
||||
use datafusion::datasource::physical_plan::{
|
||||
CsvConfig, CsvOpener, FileScanConfig, FileStream, JsonOpener,
|
||||
CsvSource, FileGroup, FileScanConfig, FileScanConfigBuilder, FileSource, FileStream,
|
||||
JsonOpener, JsonSource,
|
||||
};
|
||||
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use object_store::services::Fs;
|
||||
@@ -68,21 +68,20 @@ pub fn test_basic_schema() -> SchemaRef {
|
||||
Arc::new(schema)
|
||||
}
|
||||
|
||||
pub fn scan_config(file_schema: SchemaRef, limit: Option<usize>, filename: &str) -> FileScanConfig {
|
||||
pub(crate) fn scan_config(
|
||||
file_schema: SchemaRef,
|
||||
limit: Option<usize>,
|
||||
filename: &str,
|
||||
file_source: Arc<dyn FileSource>,
|
||||
) -> FileScanConfig {
|
||||
// object_store only recognize the Unix style path, so make it happy.
|
||||
let filename = &filename.replace('\\', "/");
|
||||
let statistics = Statistics::new_unknown(file_schema.as_ref());
|
||||
FileScanConfig {
|
||||
object_store_url: ObjectStoreUrl::parse("empty://").unwrap(), // won't be used
|
||||
file_schema,
|
||||
file_groups: vec![vec![PartitionedFile::new(filename.to_string(), 10)]],
|
||||
constraints: Constraints::empty(),
|
||||
statistics,
|
||||
projection: None,
|
||||
limit,
|
||||
table_partition_cols: vec![],
|
||||
output_ordering: vec![],
|
||||
}
|
||||
let file_group = FileGroup::new(vec![PartitionedFile::new(filename.to_string(), 4096)]);
|
||||
|
||||
FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema, file_source)
|
||||
.with_file_group(file_group)
|
||||
.with_limit(limit)
|
||||
.build()
|
||||
}
|
||||
|
||||
pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usize) -> usize) {
|
||||
@@ -99,9 +98,14 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi
|
||||
|
||||
let size = store.read(origin_path).await.unwrap().len();
|
||||
|
||||
let config = scan_config(schema.clone(), None, origin_path);
|
||||
|
||||
let stream = FileStream::new(&config, 0, json_opener, &ExecutionPlanMetricsSet::new()).unwrap();
|
||||
let config = scan_config(schema, None, origin_path, Arc::new(JsonSource::new()));
|
||||
let stream = FileStream::new(
|
||||
&config,
|
||||
0,
|
||||
Arc::new(json_opener),
|
||||
&ExecutionPlanMetricsSet::new(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let (tmp_store, dir) = test_tmp_store("test_stream_to_json");
|
||||
|
||||
@@ -127,24 +131,17 @@ pub async fn setup_stream_to_csv_test(origin_path: &str, threshold: impl Fn(usiz
|
||||
|
||||
let schema = test_basic_schema();
|
||||
|
||||
let csv_config = Arc::new(CsvConfig::new(
|
||||
TEST_BATCH_SIZE,
|
||||
schema.clone(),
|
||||
None,
|
||||
true,
|
||||
b',',
|
||||
b'"',
|
||||
None,
|
||||
Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
|
||||
None,
|
||||
));
|
||||
|
||||
let csv_opener = CsvOpener::new(csv_config, FileCompressionType::UNCOMPRESSED);
|
||||
|
||||
let csv_source = CsvSource::new(true, b',', b'"')
|
||||
.with_schema(schema.clone())
|
||||
.with_batch_size(TEST_BATCH_SIZE);
|
||||
let config = scan_config(schema, None, origin_path, csv_source.clone());
|
||||
let size = store.read(origin_path).await.unwrap().len();
|
||||
|
||||
let config = scan_config(schema.clone(), None, origin_path);
|
||||
|
||||
let csv_opener = csv_source.create_file_opener(
|
||||
Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
|
||||
&config,
|
||||
0,
|
||||
);
|
||||
let stream = FileStream::new(&config, 0, csv_opener, &ExecutionPlanMetricsSet::new()).unwrap();
|
||||
|
||||
let (tmp_store, dir) = test_tmp_store("test_stream_to_csv");
|
||||
|
||||
Reference in New Issue
Block a user