common_datasource/file_format/
orc.rs1use arrow_schema::Schema;
16use async_trait::async_trait;
17use bytes::Bytes;
18pub use datafusion_orc::OrcSource;
19use futures::FutureExt;
20use futures::future::BoxFuture;
21use object_store::ObjectStore;
22use orc_rust::arrow_reader::ArrowReaderBuilder;
23use orc_rust::async_arrow_reader::ArrowStreamReader;
24use orc_rust::reader::AsyncChunkReader;
25use snafu::ResultExt;
26
27use crate::error::{self, Result};
28use crate::file_format::FileFormat;
29
/// Zero-sized marker type for the ORC file format.
///
/// It carries no configuration; the [`FileFormat`] implementation in this
/// module is attached to it.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct OrcFormat;
32
33#[derive(Clone)]
34pub struct ReaderAdapter {
35 reader: object_store::Reader,
36 len: u64,
37}
38
39impl ReaderAdapter {
40 pub fn new(reader: object_store::Reader, len: u64) -> Self {
41 Self { reader, len }
42 }
43}
44
45impl AsyncChunkReader for ReaderAdapter {
46 fn len(&mut self) -> BoxFuture<'_, std::io::Result<u64>> {
47 async move { Ok(self.len) }.boxed()
48 }
49
50 fn get_bytes(
51 &mut self,
52 offset_from_start: u64,
53 length: u64,
54 ) -> BoxFuture<'_, std::io::Result<Bytes>> {
55 async move {
56 let bytes = self
57 .reader
58 .read(offset_from_start..offset_from_start + length)
59 .await?;
60 Ok(bytes.to_bytes())
61 }
62 .boxed()
63 }
64}
65
66pub async fn new_orc_stream_reader(
67 reader: ReaderAdapter,
68) -> Result<ArrowStreamReader<ReaderAdapter>> {
69 let reader_build = ArrowReaderBuilder::try_new_async(reader)
70 .await
71 .context(error::OrcReaderSnafu)?;
72 Ok(reader_build.build_async())
73}
74
75pub async fn infer_orc_schema(reader: ReaderAdapter) -> Result<Schema> {
76 let reader = new_orc_stream_reader(reader).await?;
77 Ok(reader.schema().as_ref().clone())
78}
79
80#[async_trait]
81impl FileFormat for OrcFormat {
82 async fn infer_schema(&self, store: &ObjectStore, path: &str) -> Result<Schema> {
83 let meta = store
84 .stat(path)
85 .await
86 .context(error::ReadObjectSnafu { path })?;
87 let reader = store
88 .reader(path)
89 .await
90 .context(error::ReadObjectSnafu { path })?;
91 let schema = infer_orc_schema(ReaderAdapter::new(reader, meta.content_length())).await?;
92 Ok(schema)
93 }
94}
95
#[cfg(test)]
mod tests {
    use common_test_util::find_workspace_path;

    use super::*;
    use crate::file_format::FileFormat;
    use crate::test_util::{format_schema, test_store};

    /// Directory holding the ORC fixtures, rendered as a string.
    fn test_data_root() -> String {
        let root = find_workspace_path("/src/common/datasource/tests/orc");
        root.display().to_string()
    }

    /// Schema inference on `test.orc` must yield exactly these columns, in
    /// this order.
    #[tokio::test]
    async fn test_orc_infer_schema() {
        let root = test_data_root();
        let store = test_store(&root);

        let schema = OrcFormat.infer_schema(&store, "test.orc").await.unwrap();
        let formatted: Vec<_> = format_schema(schema);

        let expected = vec![
            "double_a: Float64: NULL",
            "a: Float32: NULL",
            "b: Boolean: NULL",
            "str_direct: Utf8: NULL",
            "d: Utf8: NULL",
            "e: Utf8: NULL",
            "f: Utf8: NULL",
            "int_short_repeated: Int32: NULL",
            "int_neg_short_repeated: Int32: NULL",
            "int_delta: Int32: NULL",
            "int_neg_delta: Int32: NULL",
            "int_direct: Int32: NULL",
            "int_neg_direct: Int32: NULL",
            "bigint_direct: Int64: NULL",
            "bigint_neg_direct: Int64: NULL",
            "bigint_other: Int64: NULL",
            "utf8_increase: Utf8: NULL",
            "utf8_decrease: Utf8: NULL",
            "timestamp_simple: Timestamp(Nanosecond, None): NULL",
            "date_simple: Date32: NULL",
        ];
        assert_eq!(expected, formatted);
    }
}