Skip to main content

common_datasource/file_format/
orc.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use arrow_schema::Schema;
16use async_trait::async_trait;
17use bytes::Bytes;
18pub use datafusion_orc::OrcSource;
19use futures::FutureExt;
20use futures::future::BoxFuture;
21use object_store::ObjectStore;
22use orc_rust::arrow_reader::ArrowReaderBuilder;
23use orc_rust::async_arrow_reader::ArrowStreamReader;
24use orc_rust::reader::AsyncChunkReader;
25use snafu::ResultExt;
26
27use crate::error::{self, Result};
28use crate::file_format::FileFormat;
29
/// Marker type selecting the ORC file format.
///
/// It carries no state; its behavior comes from the [`FileFormat`]
/// implementation in this module.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct OrcFormat;
32
33#[derive(Clone)]
34pub struct ReaderAdapter {
35    reader: object_store::Reader,
36    len: u64,
37}
38
39impl ReaderAdapter {
40    pub fn new(reader: object_store::Reader, len: u64) -> Self {
41        Self { reader, len }
42    }
43}
44
45impl AsyncChunkReader for ReaderAdapter {
46    fn len(&mut self) -> BoxFuture<'_, std::io::Result<u64>> {
47        async move { Ok(self.len) }.boxed()
48    }
49
50    fn get_bytes(
51        &mut self,
52        offset_from_start: u64,
53        length: u64,
54    ) -> BoxFuture<'_, std::io::Result<Bytes>> {
55        async move {
56            let bytes = self
57                .reader
58                .read(offset_from_start..offset_from_start + length)
59                .await?;
60            Ok(bytes.to_bytes())
61        }
62        .boxed()
63    }
64}
65
66pub async fn new_orc_stream_reader(
67    reader: ReaderAdapter,
68) -> Result<ArrowStreamReader<ReaderAdapter>> {
69    let reader_build = ArrowReaderBuilder::try_new_async(reader)
70        .await
71        .context(error::OrcReaderSnafu)?;
72    Ok(reader_build.build_async())
73}
74
75pub async fn infer_orc_schema(reader: ReaderAdapter) -> Result<Schema> {
76    let reader = new_orc_stream_reader(reader).await?;
77    Ok(reader.schema().as_ref().clone())
78}
79
80#[async_trait]
81impl FileFormat for OrcFormat {
82    async fn infer_schema(&self, store: &ObjectStore, path: &str) -> Result<Schema> {
83        let meta = store
84            .stat(path)
85            .await
86            .context(error::ReadObjectSnafu { path })?;
87        let reader = store
88            .reader(path)
89            .await
90            .context(error::ReadObjectSnafu { path })?;
91        let schema = infer_orc_schema(ReaderAdapter::new(reader, meta.content_length())).await?;
92        Ok(schema)
93    }
94}
95
#[cfg(test)]
mod tests {
    use common_test_util::find_workspace_path;

    use super::*;
    use crate::file_format::FileFormat;
    use crate::test_util::{format_schema, test_store};

    /// Path of the ORC fixture directory, resolved through `find_workspace_path`.
    fn test_data_root() -> String {
        let root = find_workspace_path("/src/common/datasource/tests/orc");
        root.display().to_string()
    }

    /// Schema inference on `test.orc` must yield exactly these fields, in order.
    #[tokio::test]
    async fn test_orc_infer_schema() {
        let expected = vec![
            "double_a: Float64: NULL",
            "a: Float32: NULL",
            "b: Boolean: NULL",
            "str_direct: Utf8: NULL",
            "d: Utf8: NULL",
            "e: Utf8: NULL",
            "f: Utf8: NULL",
            "int_short_repeated: Int32: NULL",
            "int_neg_short_repeated: Int32: NULL",
            "int_delta: Int32: NULL",
            "int_neg_delta: Int32: NULL",
            "int_direct: Int32: NULL",
            "int_neg_direct: Int32: NULL",
            "bigint_direct: Int64: NULL",
            "bigint_neg_direct: Int64: NULL",
            "bigint_other: Int64: NULL",
            "utf8_increase: Utf8: NULL",
            "utf8_decrease: Utf8: NULL",
            "timestamp_simple: Timestamp(Nanosecond, None): NULL",
            "date_simple: Date32: NULL",
        ];

        let store = test_store(&test_data_root());
        let inferred = OrcFormat.infer_schema(&store, "test.orc").await.unwrap();
        let formatted: Vec<_> = format_schema(inferred);

        assert_eq!(expected, formatted);
    }
}