mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-28 02:40:38 +00:00
chore: upgrade DataFusion family (#7558)
* chore: upgrade DataFusion family
  Signed-off-by: luofucong <luofc@foxmail.com>
* use main proto
  Signed-off-by: luofucong <luofc@foxmail.com>
* fix ci
  Signed-off-by: luofucong <luofc@foxmail.com>
---------
Signed-off-by: luofucong <luofc@foxmail.com>
This commit is contained in:
@@ -35,6 +35,7 @@ use mito2::sst::parquet::reader::ParquetReaderBuilder;
|
||||
use mito2::sst::parquet::{PARQUET_METADATA_KEY, WriteOptions};
|
||||
use mito2::worker::write_cache_from_config;
|
||||
use object_store::ObjectStore;
|
||||
use parquet::file::metadata::{FooterTail, KeyValue};
|
||||
use regex::Regex;
|
||||
use snafu::OptionExt;
|
||||
use store_api::metadata::{RegionMetadata, RegionMetadataRef};
|
||||
@@ -463,7 +464,6 @@ fn extract_region_metadata(
|
||||
file_path: &str,
|
||||
meta: &parquet::file::metadata::ParquetMetaData,
|
||||
) -> error::Result<RegionMetadataRef> {
|
||||
use parquet::format::KeyValue;
|
||||
let kvs: Option<&Vec<KeyValue>> = meta.file_metadata().key_value_metadata();
|
||||
let Some(kvs) = kvs else {
|
||||
return Err(error::IllegalConfigSnafu {
|
||||
@@ -608,7 +608,7 @@ async fn load_parquet_metadata(
|
||||
let buffer_len = buffer.len();
|
||||
let mut footer = [0; 8];
|
||||
footer.copy_from_slice(&buffer[buffer_len - FOOTER_SIZE..]);
|
||||
let footer = ParquetMetaDataReader::decode_footer_tail(&footer)?;
|
||||
let footer = FooterTail::try_new(&footer)?;
|
||||
let metadata_len = footer.metadata_length() as u64;
|
||||
if actual_size - (FOOTER_SIZE as u64) < metadata_len {
|
||||
return Err("invalid footer/metadata length".into());
|
||||
|
||||
@@ -27,13 +27,14 @@ common-recordbatch.workspace = true
|
||||
common-runtime.workspace = true
|
||||
common-telemetry.workspace = true
|
||||
datafusion.workspace = true
|
||||
datafusion-datasource.workspace = true
|
||||
datafusion-orc.workspace = true
|
||||
datatypes.workspace = true
|
||||
futures.workspace = true
|
||||
lazy_static.workspace = true
|
||||
object-store.workspace = true
|
||||
object_store_opendal.workspace = true
|
||||
orc-rust = { version = "0.6.3", default-features = false, features = ["async"] }
|
||||
orc-rust = { version = "0.7", default-features = false, features = ["async"] }
|
||||
parquet.workspace = true
|
||||
paste.workspace = true
|
||||
regex.workspace = true
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use async_trait::async_trait;
|
||||
use datafusion::parquet::format::FileMetaData;
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
|
||||
use crate::error::Result;
|
||||
|
||||
@@ -24,5 +24,5 @@ pub trait DfRecordBatchEncoder {
|
||||
|
||||
#[async_trait]
|
||||
pub trait ArrowWriterCloser {
|
||||
async fn close(mut self) -> Result<FileMetaData>;
|
||||
async fn close(mut self) -> Result<ParquetMetaData>;
|
||||
}
|
||||
|
||||
@@ -40,7 +40,6 @@ use datafusion::datasource::physical_plan::{
|
||||
use datafusion::error::{DataFusionError, Result as DataFusionResult};
|
||||
use datafusion::physical_plan::SendableRecordBatchStream;
|
||||
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
use futures::{StreamExt, TryStreamExt};
|
||||
use object_store::ObjectStore;
|
||||
use object_store_opendal::OpendalStore;
|
||||
@@ -303,24 +302,20 @@ where
|
||||
pub async fn file_to_stream(
|
||||
store: &ObjectStore,
|
||||
filename: &str,
|
||||
file_schema: SchemaRef,
|
||||
file_source: Arc<dyn FileSource>,
|
||||
projection: Option<Vec<usize>>,
|
||||
compression_type: CompressionType,
|
||||
) -> Result<DfSendableRecordBatchStream> {
|
||||
let df_compression: DfCompressionType = compression_type.into();
|
||||
let config = FileScanConfigBuilder::new(
|
||||
ObjectStoreUrl::local_filesystem(),
|
||||
file_schema,
|
||||
file_source.clone(),
|
||||
)
|
||||
.with_file_group(FileGroup::new(vec![PartitionedFile::new(
|
||||
filename.to_string(),
|
||||
0,
|
||||
)]))
|
||||
.with_projection(projection)
|
||||
.with_file_compression_type(df_compression)
|
||||
.build();
|
||||
let config =
|
||||
FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source.clone())
|
||||
.with_file_group(FileGroup::new(vec![PartitionedFile::new(
|
||||
filename.to_string(),
|
||||
0,
|
||||
)]))
|
||||
.with_projection_indices(projection)
|
||||
.with_file_compression_type(df_compression)
|
||||
.build();
|
||||
|
||||
let store = Arc::new(OpendalStore::new(store.clone()));
|
||||
let file_opener = file_source
|
||||
|
||||
@@ -440,14 +440,11 @@ mod tests {
|
||||
.await
|
||||
.unwrap(),
|
||||
);
|
||||
let csv_source = CsvSource::new(true, b',', b'"')
|
||||
.with_schema(schema.clone())
|
||||
.with_batch_size(8192);
|
||||
let csv_source = CsvSource::new(schema).with_batch_size(8192);
|
||||
|
||||
let stream = file_to_stream(
|
||||
&store,
|
||||
compressed_file_path_str,
|
||||
schema.clone(),
|
||||
csv_source.clone(),
|
||||
None,
|
||||
compression_type,
|
||||
|
||||
@@ -347,14 +347,11 @@ mod tests {
|
||||
.await
|
||||
.unwrap(),
|
||||
);
|
||||
let json_source = JsonSource::new()
|
||||
.with_schema(schema.clone())
|
||||
.with_batch_size(8192);
|
||||
let json_source = JsonSource::new(schema).with_batch_size(8192);
|
||||
|
||||
let stream = file_to_stream(
|
||||
&store,
|
||||
compressed_file_path_str,
|
||||
schema.clone(),
|
||||
json_source.clone(),
|
||||
None,
|
||||
compression_type,
|
||||
|
||||
@@ -18,15 +18,15 @@ use std::sync::Arc;
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use arrow_schema::Schema;
|
||||
use async_trait::async_trait;
|
||||
use datafusion::datasource::physical_plan::{FileMeta, ParquetFileReaderFactory};
|
||||
use datafusion::datasource::physical_plan::ParquetFileReaderFactory;
|
||||
use datafusion::error::Result as DatafusionResult;
|
||||
use datafusion::parquet::arrow::async_reader::AsyncFileReader;
|
||||
use datafusion::parquet::arrow::{ArrowWriter, parquet_to_arrow_schema};
|
||||
use datafusion::parquet::errors::{ParquetError, Result as ParquetResult};
|
||||
use datafusion::parquet::file::metadata::ParquetMetaData;
|
||||
use datafusion::parquet::format::FileMetaData;
|
||||
use datafusion::physical_plan::SendableRecordBatchStream;
|
||||
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use datafusion_datasource::PartitionedFile;
|
||||
use datatypes::schema::SchemaRef;
|
||||
use futures::StreamExt;
|
||||
use futures::future::BoxFuture;
|
||||
@@ -100,11 +100,11 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory {
|
||||
fn create_reader(
|
||||
&self,
|
||||
_partition_index: usize,
|
||||
file_meta: FileMeta,
|
||||
partitioned_file: PartitionedFile,
|
||||
_metadata_size_hint: Option<usize>,
|
||||
_metrics: &ExecutionPlanMetricsSet,
|
||||
) -> DatafusionResult<Box<dyn AsyncFileReader + Send>> {
|
||||
let path = file_meta.location().to_string();
|
||||
let path = partitioned_file.path().to_string();
|
||||
let object_store = self.object_store.clone();
|
||||
|
||||
Ok(Box::new(LazyParquetFileReader::new(object_store, path)))
|
||||
@@ -180,7 +180,7 @@ impl DfRecordBatchEncoder for ArrowWriter<SharedBuffer> {
|
||||
|
||||
#[async_trait]
|
||||
impl ArrowWriterCloser for ArrowWriter<SharedBuffer> {
|
||||
async fn close(self) -> Result<FileMetaData> {
|
||||
async fn close(self) -> Result<ParquetMetaData> {
|
||||
self.close().context(error::EncodeRecordBatchSnafu)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,14 +67,14 @@ impl Test<'_> {
|
||||
async fn test_json_opener() {
|
||||
let store = test_store("/");
|
||||
let schema = test_basic_schema();
|
||||
let file_source = Arc::new(JsonSource::new()).with_batch_size(test_util::TEST_BATCH_SIZE);
|
||||
let file_source = Arc::new(JsonSource::new(schema)).with_batch_size(test_util::TEST_BATCH_SIZE);
|
||||
|
||||
let path = &find_workspace_path("/src/common/datasource/tests/json/basic.json")
|
||||
.display()
|
||||
.to_string();
|
||||
let tests = [
|
||||
Test {
|
||||
config: scan_config(schema.clone(), None, path, file_source.clone()),
|
||||
config: scan_config(None, path, file_source.clone()),
|
||||
file_source: file_source.clone(),
|
||||
expected: vec![
|
||||
"+-----+-------+",
|
||||
@@ -87,7 +87,7 @@ async fn test_json_opener() {
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema, Some(1), path, file_source.clone()),
|
||||
config: scan_config(Some(1), path, file_source.clone()),
|
||||
file_source,
|
||||
expected: vec![
|
||||
"+-----+------+",
|
||||
@@ -112,13 +112,11 @@ async fn test_csv_opener() {
|
||||
.display()
|
||||
.to_string();
|
||||
|
||||
let file_source = CsvSource::new(true, b',', b'"')
|
||||
.with_batch_size(test_util::TEST_BATCH_SIZE)
|
||||
.with_schema(schema.clone());
|
||||
let file_source = CsvSource::new(schema).with_batch_size(test_util::TEST_BATCH_SIZE);
|
||||
|
||||
let tests = [
|
||||
Test {
|
||||
config: scan_config(schema.clone(), None, path, file_source.clone()),
|
||||
config: scan_config(None, path, file_source.clone()),
|
||||
file_source: file_source.clone(),
|
||||
expected: vec![
|
||||
"+-----+-------+---------------------+----------+------------+",
|
||||
@@ -131,7 +129,7 @@ async fn test_csv_opener() {
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema, Some(1), path, file_source.clone()),
|
||||
config: scan_config(Some(1), path, file_source.clone()),
|
||||
file_source,
|
||||
expected: vec![
|
||||
"+-----+------+---------------------+----------+------------+",
|
||||
@@ -158,10 +156,10 @@ async fn test_parquet_exec() {
|
||||
.display()
|
||||
.to_string();
|
||||
|
||||
let parquet_source = ParquetSource::default()
|
||||
let parquet_source = ParquetSource::new(schema)
|
||||
.with_parquet_file_reader_factory(Arc::new(DefaultParquetFileReaderFactory::new(store)));
|
||||
|
||||
let config = scan_config(schema, None, path, Arc::new(parquet_source));
|
||||
let config = scan_config(None, path, Arc::new(parquet_source));
|
||||
let exec = DataSourceExec::from_data_source(config);
|
||||
let ctx = SessionContext::new();
|
||||
|
||||
@@ -197,11 +195,11 @@ async fn test_orc_opener() {
|
||||
|
||||
let store = test_store("/");
|
||||
let schema = Arc::new(OrcFormat.infer_schema(&store, path).await.unwrap());
|
||||
let file_source = Arc::new(OrcSource::default());
|
||||
let file_source = Arc::new(OrcSource::new(schema.into()));
|
||||
|
||||
let tests = [
|
||||
Test {
|
||||
config: scan_config(schema.clone(), None, path, file_source.clone()),
|
||||
config: scan_config(None, path, file_source.clone()),
|
||||
file_source: file_source.clone(),
|
||||
expected: vec![
|
||||
"+----------+-----+-------+------------+-----+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+----------------------------+-------------+",
|
||||
@@ -216,7 +214,7 @@ async fn test_orc_opener() {
|
||||
],
|
||||
},
|
||||
Test {
|
||||
config: scan_config(schema.clone(), Some(1), path, file_source.clone()),
|
||||
config: scan_config(Some(1), path, file_source.clone()),
|
||||
file_source,
|
||||
expected: vec![
|
||||
"+----------+-----+------+------------+---+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+-------------------------+-------------+",
|
||||
|
||||
@@ -80,7 +80,6 @@ pub fn csv_basic_schema() -> SchemaRef {
|
||||
}
|
||||
|
||||
pub(crate) fn scan_config(
|
||||
file_schema: SchemaRef,
|
||||
limit: Option<usize>,
|
||||
filename: &str,
|
||||
file_source: Arc<dyn FileSource>,
|
||||
@@ -89,7 +88,7 @@ pub(crate) fn scan_config(
|
||||
let filename = &filename.replace('\\', "/");
|
||||
let file_group = FileGroup::new(vec![PartitionedFile::new(filename.clone(), 4096)]);
|
||||
|
||||
FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema, file_source)
|
||||
FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
|
||||
.with_file_group(file_group)
|
||||
.with_limit(limit)
|
||||
.build()
|
||||
@@ -109,7 +108,7 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi
|
||||
|
||||
let size = store.read(origin_path).await.unwrap().len();
|
||||
|
||||
let config = scan_config(schema, None, origin_path, Arc::new(JsonSource::new()));
|
||||
let config = scan_config(None, origin_path, Arc::new(JsonSource::new(schema)));
|
||||
let stream = FileStream::new(
|
||||
&config,
|
||||
0,
|
||||
@@ -151,10 +150,8 @@ pub async fn setup_stream_to_csv_test(
|
||||
|
||||
let schema = csv_basic_schema();
|
||||
|
||||
let csv_source = CsvSource::new(true, b',', b'"')
|
||||
.with_schema(schema.clone())
|
||||
.with_batch_size(TEST_BATCH_SIZE);
|
||||
let config = scan_config(schema, None, origin_path, csv_source.clone());
|
||||
let csv_source = CsvSource::new(schema).with_batch_size(TEST_BATCH_SIZE);
|
||||
let config = scan_config(None, origin_path, csv_source.clone());
|
||||
let size = store.read(origin_path).await.unwrap().len();
|
||||
|
||||
let csv_opener = csv_source.create_file_opener(
|
||||
|
||||
@@ -104,7 +104,8 @@ mod tests {
|
||||
assert!(matches!(f.signature(),
|
||||
datafusion_expr::Signature {
|
||||
type_signature: datafusion_expr::TypeSignature::Uniform(1, valid_types),
|
||||
volatility: datafusion_expr::Volatility::Immutable
|
||||
volatility: datafusion_expr::Volatility::Immutable,
|
||||
..
|
||||
} if valid_types == &ConcreteDataType::numerics().into_iter().map(|dt| { use datatypes::data_type::DataType; dt.as_arrow_type() }).collect::<Vec<_>>()));
|
||||
}
|
||||
|
||||
|
||||
@@ -331,7 +331,8 @@ mod tests {
|
||||
assert!(matches!(f.signature(),
|
||||
datafusion_expr::Signature {
|
||||
type_signature: datafusion_expr::TypeSignature::Uniform(1, valid_types),
|
||||
volatility: datafusion_expr::Volatility::Immutable
|
||||
volatility: datafusion_expr::Volatility::Immutable,
|
||||
..
|
||||
} if valid_types == &vec![ArrowDataType::Utf8]));
|
||||
}
|
||||
|
||||
|
||||
@@ -145,7 +145,8 @@ mod tests {
|
||||
assert!(matches!(f.signature(),
|
||||
datafusion_expr::Signature {
|
||||
type_signature: datafusion_expr::TypeSignature::OneOf(sigs),
|
||||
volatility: datafusion_expr::Volatility::Immutable
|
||||
volatility: datafusion_expr::Volatility::Immutable,
|
||||
..
|
||||
} if sigs.len() == 2));
|
||||
}
|
||||
|
||||
|
||||
@@ -341,6 +341,7 @@ impl AggregateUDFImpl for StateWrapper {
|
||||
name: acc_args.name,
|
||||
is_distinct: acc_args.is_distinct,
|
||||
exprs: acc_args.exprs,
|
||||
expr_fields: acc_args.expr_fields,
|
||||
};
|
||||
self.inner.accumulator(acc_args)?
|
||||
};
|
||||
|
||||
@@ -650,7 +650,7 @@ async fn test_last_value_order_by_udaf() {
|
||||
DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
|
||||
true
|
||||
), // ordering field is added to state fields too
|
||||
Field::new("is_set", DataType::Boolean, true)
|
||||
Field::new("last_value[last_value_is_set]", DataType::Boolean, true)
|
||||
]
|
||||
.into()
|
||||
),
|
||||
@@ -735,7 +735,7 @@ async fn test_last_value_order_by_udaf() {
|
||||
DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
|
||||
true,
|
||||
),
|
||||
Field::new("is_set", DataType::Boolean, true),
|
||||
Field::new("last_value[last_value_is_set]", DataType::Boolean, true),
|
||||
]
|
||||
.into(),
|
||||
vec![
|
||||
|
||||
@@ -122,7 +122,8 @@ mod tests {
|
||||
matches!(f.signature(),
|
||||
Signature {
|
||||
type_signature: TypeSignature::OneOf(sigs),
|
||||
volatility: Volatility::Immutable
|
||||
volatility: Volatility::Immutable,
|
||||
..
|
||||
} if sigs.len() == 15),
|
||||
"{:?}",
|
||||
f.signature()
|
||||
|
||||
@@ -193,7 +193,8 @@ mod tests {
|
||||
assert!(matches!(f.signature(),
|
||||
Signature {
|
||||
type_signature: TypeSignature::OneOf(sigs),
|
||||
volatility: Volatility::Immutable
|
||||
volatility: Volatility::Immutable,
|
||||
..
|
||||
} if sigs.len() == 6));
|
||||
}
|
||||
|
||||
|
||||
@@ -120,7 +120,8 @@ mod tests {
|
||||
matches!(f.signature(),
|
||||
Signature {
|
||||
type_signature: TypeSignature::OneOf(sigs),
|
||||
volatility: Volatility::Immutable
|
||||
volatility: Volatility::Immutable,
|
||||
..
|
||||
} if sigs.len() == 15),
|
||||
"{:?}",
|
||||
f.signature()
|
||||
|
||||
@@ -25,7 +25,6 @@ use datafusion_common::arrow::array::{
|
||||
};
|
||||
use datafusion_common::arrow::datatypes::DataType;
|
||||
use datafusion_common::{DataFusionError, Result};
|
||||
use datafusion_expr::type_coercion::aggregates::STRINGS;
|
||||
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
|
||||
use datatypes::arrow_array::{int_array_value_at_index, string_array_value_at_index};
|
||||
use datatypes::json::JsonStructureSettings;
|
||||
@@ -519,7 +518,7 @@ impl Default for JsonGetObject {
|
||||
DataType::LargeBinary,
|
||||
DataType::BinaryView,
|
||||
],
|
||||
STRINGS.to_vec(),
|
||||
vec![DataType::UInt8, DataType::LargeUtf8, DataType::Utf8View],
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,7 +99,8 @@ mod tests {
|
||||
assert!(matches!(rate.signature(),
|
||||
Signature {
|
||||
type_signature: TypeSignature::Uniform(2, valid_types),
|
||||
volatility: Volatility::Immutable
|
||||
volatility: Volatility::Immutable,
|
||||
..
|
||||
} if valid_types == NUMERICS
|
||||
));
|
||||
let values = vec![1.0, 3.0, 6.0];
|
||||
|
||||
@@ -19,8 +19,10 @@ use datafusion_common::DataFusionError;
|
||||
use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder};
|
||||
use datafusion_common::arrow::compute;
|
||||
use datafusion_common::arrow::datatypes::DataType;
|
||||
use datafusion_expr::type_coercion::aggregates::BINARYS;
|
||||
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
|
||||
use datafusion_common::types::logical_binary;
|
||||
use datafusion_expr::{
|
||||
Coercion, ColumnarValue, ScalarFunctionArgs, Signature, TypeSignatureClass, Volatility,
|
||||
};
|
||||
use datatypes::types::vector_type_value_to_string;
|
||||
|
||||
use crate::function::{Function, extract_args};
|
||||
@@ -35,11 +37,10 @@ pub struct VectorToStringFunction {
|
||||
impl Default for VectorToStringFunction {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
signature: Signature::one_of(
|
||||
vec![
|
||||
TypeSignature::Uniform(1, vec![DataType::BinaryView]),
|
||||
TypeSignature::Uniform(1, BINARYS.to_vec()),
|
||||
],
|
||||
signature: Signature::coercible(
|
||||
vec![Coercion::new_exact(TypeSignatureClass::Native(
|
||||
logical_binary(),
|
||||
))],
|
||||
Volatility::Immutable,
|
||||
),
|
||||
}
|
||||
|
||||
@@ -15,10 +15,10 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use datafusion::arrow::datatypes::DataType;
|
||||
use datafusion::logical_expr::ColumnarValue;
|
||||
use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignature, TypeSignatureClass};
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_expr::type_coercion::aggregates::{BINARYS, STRINGS};
|
||||
use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
|
||||
use datafusion_common::types::{logical_binary, logical_string};
|
||||
use datafusion_expr::{ScalarFunctionArgs, Signature, Volatility};
|
||||
use nalgebra::DVectorView;
|
||||
|
||||
use crate::function::Function;
|
||||
@@ -36,9 +36,12 @@ impl Default for ElemAvgFunction {
|
||||
Self {
|
||||
signature: Signature::one_of(
|
||||
vec![
|
||||
TypeSignature::Uniform(1, STRINGS.to_vec()),
|
||||
TypeSignature::Uniform(1, BINARYS.to_vec()),
|
||||
TypeSignature::Uniform(1, vec![DataType::BinaryView]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_binary()),
|
||||
)]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_string()),
|
||||
)]),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
),
|
||||
|
||||
@@ -15,10 +15,10 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use datafusion::arrow::datatypes::DataType;
|
||||
use datafusion::logical_expr::ColumnarValue;
|
||||
use datafusion::logical_expr_common::type_coercion::aggregates::{BINARYS, STRINGS};
|
||||
use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignature, TypeSignatureClass};
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
|
||||
use datafusion_common::types::{logical_binary, logical_string};
|
||||
use datafusion_expr::{ScalarFunctionArgs, Signature, Volatility};
|
||||
use nalgebra::DVectorView;
|
||||
|
||||
use crate::function::Function;
|
||||
@@ -49,9 +49,12 @@ impl Default for ElemProductFunction {
|
||||
Self {
|
||||
signature: Signature::one_of(
|
||||
vec![
|
||||
TypeSignature::Uniform(1, STRINGS.to_vec()),
|
||||
TypeSignature::Uniform(1, BINARYS.to_vec()),
|
||||
TypeSignature::Uniform(1, vec![DataType::BinaryView]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_binary()),
|
||||
)]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_string()),
|
||||
)]),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
),
|
||||
|
||||
@@ -15,9 +15,9 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use datafusion::arrow::datatypes::DataType;
|
||||
use datafusion::logical_expr::ColumnarValue;
|
||||
use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignatureClass};
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_expr::type_coercion::aggregates::{BINARYS, STRINGS};
|
||||
use datafusion_common::types::{logical_binary, logical_string};
|
||||
use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
|
||||
use nalgebra::DVectorView;
|
||||
|
||||
@@ -36,9 +36,12 @@ impl Default for ElemSumFunction {
|
||||
Self {
|
||||
signature: Signature::one_of(
|
||||
vec![
|
||||
TypeSignature::Uniform(1, STRINGS.to_vec()),
|
||||
TypeSignature::Uniform(1, BINARYS.to_vec()),
|
||||
TypeSignature::Uniform(1, vec![DataType::BinaryView]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_binary()),
|
||||
)]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_string()),
|
||||
)]),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
),
|
||||
|
||||
@@ -15,9 +15,9 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use datafusion::arrow::datatypes::DataType;
|
||||
use datafusion::logical_expr::ColumnarValue;
|
||||
use datafusion::logical_expr_common::type_coercion::aggregates::{BINARYS, STRINGS};
|
||||
use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignatureClass};
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_common::types::{logical_binary, logical_string};
|
||||
use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
|
||||
|
||||
use crate::function::Function;
|
||||
@@ -49,8 +49,12 @@ impl Default for VectorDimFunction {
|
||||
Self {
|
||||
signature: Signature::one_of(
|
||||
vec![
|
||||
TypeSignature::Uniform(1, STRINGS.to_vec()),
|
||||
TypeSignature::Uniform(1, BINARYS.to_vec()),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_binary()),
|
||||
)]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_string()),
|
||||
)]),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
),
|
||||
|
||||
@@ -15,9 +15,9 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use datafusion::arrow::datatypes::DataType;
|
||||
use datafusion::logical_expr::ColumnarValue;
|
||||
use datafusion::logical_expr_common::type_coercion::aggregates::{BINARYS, STRINGS};
|
||||
use datafusion::logical_expr::{Coercion, ColumnarValue, TypeSignatureClass};
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_common::types::{logical_binary, logical_string};
|
||||
use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
|
||||
use nalgebra::DVectorView;
|
||||
|
||||
@@ -52,9 +52,12 @@ impl Default for VectorNormFunction {
|
||||
Self {
|
||||
signature: Signature::one_of(
|
||||
vec![
|
||||
TypeSignature::Uniform(1, STRINGS.to_vec()),
|
||||
TypeSignature::Uniform(1, BINARYS.to_vec()),
|
||||
TypeSignature::Uniform(1, vec![DataType::BinaryView]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_binary()),
|
||||
)]),
|
||||
TypeSignature::Coercible(vec![Coercion::new_exact(
|
||||
TypeSignatureClass::Native(logical_string()),
|
||||
)]),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
),
|
||||
|
||||
@@ -106,7 +106,8 @@ mod tests {
|
||||
assert!(matches!(f.signature(),
|
||||
datafusion_expr::Signature {
|
||||
type_signature: datafusion_expr::TypeSignature::Uniform(1, valid_types),
|
||||
volatility: datafusion_expr::Volatility::Immutable
|
||||
volatility: datafusion_expr::Volatility::Immutable,
|
||||
..
|
||||
} if valid_types == &vec![ArrowDataType::Utf8]));
|
||||
}
|
||||
|
||||
|
||||
@@ -103,10 +103,11 @@ impl FlightEncoder {
|
||||
FlightMessage::RecordBatch(record_batch) => {
|
||||
let (encoded_dictionaries, encoded_batch) = self
|
||||
.data_gen
|
||||
.encoded_batch(
|
||||
.encode(
|
||||
&record_batch,
|
||||
&mut self.dictionary_tracker,
|
||||
&self.write_options,
|
||||
&mut Default::default(),
|
||||
)
|
||||
.expect("DictionaryTracker configured above to not fail on replacement");
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ license.workspace = true
|
||||
[dependencies]
|
||||
common-error.workspace = true
|
||||
common-macro.workspace = true
|
||||
prost.workspace = true
|
||||
snafu.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
|
||||
@@ -1145,10 +1145,11 @@ impl TryFrom<ScalarValue> for Value {
|
||||
ScalarValue::List(array) => {
|
||||
// this is for item type
|
||||
let datatype = ConcreteDataType::try_from(&array.value_type())?;
|
||||
let items = ScalarValue::convert_array_to_scalar_vec(array.as_ref())
|
||||
.context(ConvertArrowArrayToScalarsSnafu)?
|
||||
let scalar_values = ScalarValue::convert_array_to_scalar_vec(array.as_ref())
|
||||
.context(ConvertArrowArrayToScalarsSnafu)?;
|
||||
let items = scalar_values
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.flat_map(|v| v.unwrap_or_else(|| vec![ScalarValue::Null]))
|
||||
.map(|x| x.try_into())
|
||||
.collect::<Result<Vec<Value>>>()?;
|
||||
Value::List(ListValue::new(items, Arc::new(datatype)))
|
||||
@@ -2997,6 +2998,7 @@ pub(crate) mod tests {
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.flatten()
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
vs,
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::any::Any;
|
||||
use std::borrow::Borrow;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{Array, ArrayBuilder, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder};
|
||||
@@ -69,8 +68,8 @@ impl From<Vec<Option<bool>>> for BooleanVector {
|
||||
}
|
||||
}
|
||||
|
||||
impl<Ptr: Borrow<Option<bool>>> FromIterator<Ptr> for BooleanVector {
|
||||
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
|
||||
impl FromIterator<Option<bool>> for BooleanVector {
|
||||
fn from_iter<T: IntoIterator<Item = Option<bool>>>(iter: T) -> Self {
|
||||
BooleanVector {
|
||||
array: BooleanArray::from_iter(iter),
|
||||
}
|
||||
@@ -303,7 +302,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_boolean_vector_from_iter() {
|
||||
let input = vec![Some(false), Some(true), Some(false), Some(true)];
|
||||
let vec = input.iter().collect::<BooleanVector>();
|
||||
let vec = input.iter().cloned().collect::<BooleanVector>();
|
||||
assert_eq!(4, vec.len());
|
||||
for (i, v) in input.into_iter().enumerate() {
|
||||
assert_eq!(v, vec.get_data(i), "Failed at {i}")
|
||||
|
||||
@@ -83,8 +83,6 @@ impl Decimal128Vector {
|
||||
/// For example:
|
||||
/// value = 12345, precision = 3, return error.
|
||||
pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result<Self> {
|
||||
// validate if precision is too small
|
||||
self.validate_decimal_precision(precision)?;
|
||||
let array = self
|
||||
.array
|
||||
.with_precision_and_scale(precision, scale)
|
||||
@@ -124,7 +122,7 @@ impl Decimal128Vector {
|
||||
}
|
||||
|
||||
/// Validate decimal precision, if precision is invalid, return error.
|
||||
fn validate_decimal_precision(&self, precision: u8) -> Result<()> {
|
||||
pub fn validate_decimal_precision(&self, precision: u8) -> Result<()> {
|
||||
self.array
|
||||
.validate_decimal_precision(precision)
|
||||
.context(ValueExceedsPrecisionSnafu { precision })
|
||||
@@ -564,7 +562,9 @@ pub mod tests {
|
||||
let decimal_vector = decimal_builder.finish();
|
||||
assert_eq!(decimal_vector.precision(), 38);
|
||||
assert_eq!(decimal_vector.scale(), 10);
|
||||
let result = decimal_vector.with_precision_and_scale(3, 2);
|
||||
let result = decimal_vector
|
||||
.with_precision_and_scale(3, 2)
|
||||
.and_then(|x| x.validate_decimal_precision(3));
|
||||
assert_eq!(
|
||||
"Value exceeds the precision 3 bound",
|
||||
result.unwrap_err().to_string()
|
||||
|
||||
@@ -170,10 +170,11 @@ impl Helper {
|
||||
ScalarValue::List(array) => {
|
||||
let item_type = Arc::new(ConcreteDataType::try_from(&array.value_type())?);
|
||||
let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1);
|
||||
let values = ScalarValue::convert_array_to_scalar_vec(array.as_ref())
|
||||
.context(ConvertArrowArrayToScalarsSnafu)?
|
||||
let scalar_values = ScalarValue::convert_array_to_scalar_vec(array.as_ref())
|
||||
.context(ConvertArrowArrayToScalarsSnafu)?;
|
||||
let values = scalar_values
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.flat_map(|v| v.unwrap_or_else(|| vec![ScalarValue::Null]))
|
||||
.map(ScalarValue::try_into)
|
||||
.collect::<Result<Vec<Value>>>()?;
|
||||
builder.push(Some(ListValueRef::Ref {
|
||||
|
||||
@@ -18,6 +18,7 @@ use common_datasource::file_format::Format;
|
||||
use common_datasource::file_format::csv::CsvFormat;
|
||||
use common_datasource::file_format::parquet::DefaultParquetFileReaderFactory;
|
||||
use datafusion::common::ToDFSchema;
|
||||
use datafusion::config::CsvOptions;
|
||||
use datafusion::datasource::listing::PartitionedFile;
|
||||
use datafusion::datasource::object_store::ObjectStoreUrl;
|
||||
use datafusion::datasource::physical_plan::{
|
||||
@@ -34,7 +35,6 @@ use datafusion::prelude::SessionContext;
|
||||
use datafusion_expr::expr::Expr;
|
||||
use datafusion_expr::utils::conjunction;
|
||||
use datafusion_orc::OrcSource;
|
||||
use datatypes::arrow::datatypes::Schema as ArrowSchema;
|
||||
use datatypes::schema::SchemaRef;
|
||||
use object_store::ObjectStore;
|
||||
use snafu::ResultExt;
|
||||
@@ -45,7 +45,6 @@ const DEFAULT_BATCH_SIZE: usize = 8192;
|
||||
|
||||
fn build_record_batch_stream(
|
||||
scan_plan_config: &ScanPlanConfig,
|
||||
file_schema: Arc<ArrowSchema>,
|
||||
limit: Option<usize>,
|
||||
file_source: Arc<dyn FileSource>,
|
||||
) -> Result<DfSendableRecordBatchStream> {
|
||||
@@ -55,15 +54,12 @@ fn build_record_batch_stream(
|
||||
.map(|filename| PartitionedFile::new(filename.clone(), 0))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let config = FileScanConfigBuilder::new(
|
||||
ObjectStoreUrl::local_filesystem(),
|
||||
file_schema,
|
||||
file_source.clone(),
|
||||
)
|
||||
.with_projection(scan_plan_config.projection.cloned())
|
||||
.with_limit(limit)
|
||||
.with_file_group(FileGroup::new(files))
|
||||
.build();
|
||||
let config =
|
||||
FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source.clone())
|
||||
.with_projection_indices(scan_plan_config.projection.cloned())
|
||||
.with_limit(limit)
|
||||
.with_file_group(FileGroup::new(files))
|
||||
.build();
|
||||
|
||||
let store = Arc::new(object_store_opendal::OpendalStore::new(
|
||||
scan_plan_config.store.clone(),
|
||||
@@ -89,11 +85,14 @@ fn new_csv_stream(
|
||||
// push down limit only if there is no filter
|
||||
let limit = config.filters.is_empty().then_some(config.limit).flatten();
|
||||
|
||||
let csv_source = CsvSource::new(format.has_header, format.delimiter, b'"')
|
||||
.with_schema(file_schema.clone())
|
||||
let options = CsvOptions::default()
|
||||
.with_has_header(format.has_header)
|
||||
.with_delimiter(format.delimiter);
|
||||
let csv_source = CsvSource::new(file_schema)
|
||||
.with_csv_options(options)
|
||||
.with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
|
||||
build_record_batch_stream(config, file_schema, limit, csv_source)
|
||||
build_record_batch_stream(config, limit, csv_source)
|
||||
}
|
||||
|
||||
fn new_json_stream(config: &ScanPlanConfig) -> Result<DfSendableRecordBatchStream> {
|
||||
@@ -102,8 +101,8 @@ fn new_json_stream(config: &ScanPlanConfig) -> Result<DfSendableRecordBatchStrea
|
||||
// push down limit only if there is no filter
|
||||
let limit = config.filters.is_empty().then_some(config.limit).flatten();
|
||||
|
||||
let file_source = JsonSource::new().with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
build_record_batch_stream(config, file_schema, limit, file_source)
|
||||
let file_source = JsonSource::new(file_schema).with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
build_record_batch_stream(config, limit, file_source)
|
||||
}
|
||||
|
||||
fn new_parquet_stream_with_exec_plan(
|
||||
@@ -126,9 +125,10 @@ fn new_parquet_stream_with_exec_plan(
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
|
||||
let mut parquet_source = ParquetSource::default().with_parquet_file_reader_factory(Arc::new(
|
||||
DefaultParquetFileReaderFactory::new(store.clone()),
|
||||
));
|
||||
let mut parquet_source = ParquetSource::new(file_schema.clone())
|
||||
.with_parquet_file_reader_factory(Arc::new(DefaultParquetFileReaderFactory::new(
|
||||
store.clone(),
|
||||
)));
|
||||
|
||||
// build predicate filter
|
||||
let filters = filters.to_vec();
|
||||
@@ -143,15 +143,12 @@ fn new_parquet_stream_with_exec_plan(
|
||||
parquet_source = parquet_source.with_predicate(filters);
|
||||
};
|
||||
|
||||
let file_scan_config = FileScanConfigBuilder::new(
|
||||
ObjectStoreUrl::local_filesystem(),
|
||||
file_schema,
|
||||
Arc::new(parquet_source),
|
||||
)
|
||||
.with_file_group(file_group)
|
||||
.with_projection(projection.cloned())
|
||||
.with_limit(*limit)
|
||||
.build();
|
||||
let file_scan_config =
|
||||
FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), Arc::new(parquet_source))
|
||||
.with_file_group(file_group)
|
||||
.with_projection_indices(projection.cloned())
|
||||
.with_limit(*limit)
|
||||
.build();
|
||||
|
||||
// TODO(ruihang): get this from upper layer
|
||||
let task_ctx = SessionContext::default().task_ctx();
|
||||
@@ -170,8 +167,8 @@ fn new_orc_stream(config: &ScanPlanConfig) -> Result<DfSendableRecordBatchStream
|
||||
// push down limit only if there is no filter
|
||||
let limit = config.filters.is_empty().then_some(config.limit).flatten();
|
||||
|
||||
let file_source = OrcSource::default().with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
build_record_batch_stream(config, file_schema, limit, file_source)
|
||||
let file_source = OrcSource::new(file_schema.into()).with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
build_record_batch_stream(config, limit, file_source)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
|
||||
@@ -35,7 +35,7 @@ use index::result_cache::IndexResultCache;
|
||||
use moka::notification::RemovalCause;
|
||||
use moka::sync::Cache;
|
||||
use object_store::ObjectStore;
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
|
||||
use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef};
|
||||
use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector};
|
||||
|
||||
@@ -85,13 +85,13 @@ impl CacheStrategy {
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
metrics: &mut MetadataCacheMetrics,
|
||||
page_index_policy: PageIndexPolicy,
|
||||
) -> Option<Arc<ParquetMetaData>> {
|
||||
match self {
|
||||
CacheStrategy::EnableAll(cache_manager) => {
|
||||
cache_manager.get_parquet_meta_data(file_id, metrics).await
|
||||
}
|
||||
CacheStrategy::Compaction(cache_manager) => {
|
||||
cache_manager.get_parquet_meta_data(file_id, metrics).await
|
||||
CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
|
||||
cache_manager
|
||||
.get_parquet_meta_data(file_id, metrics, page_index_policy)
|
||||
.await
|
||||
}
|
||||
CacheStrategy::Disabled => {
|
||||
metrics.cache_miss += 1;
|
||||
@@ -340,6 +340,7 @@ impl CacheManager {
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
metrics: &mut MetadataCacheMetrics,
|
||||
page_index_policy: PageIndexPolicy,
|
||||
) -> Option<Arc<ParquetMetaData>> {
|
||||
// Try to get metadata from sst meta cache
|
||||
if let Some(metadata) = self.get_parquet_meta_data_from_mem_cache(file_id) {
|
||||
@@ -352,7 +353,7 @@ impl CacheManager {
|
||||
if let Some(write_cache) = &self.write_cache
|
||||
&& let Some(metadata) = write_cache
|
||||
.file_cache()
|
||||
.get_parquet_meta_data(key, metrics)
|
||||
.get_parquet_meta_data(key, metrics, page_index_policy)
|
||||
.await
|
||||
{
|
||||
metrics.file_cache_hit += 1;
|
||||
@@ -893,7 +894,7 @@ mod tests {
|
||||
cache.put_parquet_meta_data(file_id, metadata);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.get_parquet_meta_data(file_id, &mut metrics, Default::default())
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
@@ -923,7 +924,7 @@ mod tests {
|
||||
let file_id = RegionFileId::new(region_id, FileId::random());
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.get_parquet_meta_data(file_id, &mut metrics, Default::default())
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
@@ -931,14 +932,14 @@ mod tests {
|
||||
cache.put_parquet_meta_data(file_id, metadata);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.get_parquet_meta_data(file_id, &mut metrics, Default::default())
|
||||
.await
|
||||
.is_some()
|
||||
);
|
||||
cache.remove_parquet_meta_data(file_id);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.get_parquet_meta_data(file_id, &mut metrics, Default::default())
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
|
||||
8
src/mito2/src/cache/cache_size.rs
vendored
8
src/mito2/src/cache/cache_size.rs
vendored
@@ -16,11 +16,13 @@
|
||||
|
||||
use std::mem;
|
||||
|
||||
use parquet::basic::ColumnOrder;
|
||||
use parquet::file::metadata::{
|
||||
FileMetaData, ParquetColumnIndex, ParquetMetaData, ParquetOffsetIndex, RowGroupMetaData,
|
||||
FileMetaData, KeyValue, ParquetColumnIndex, ParquetMetaData, ParquetOffsetIndex,
|
||||
RowGroupMetaData,
|
||||
};
|
||||
use parquet::file::page_index::index::Index;
|
||||
use parquet::format::{ColumnOrder, KeyValue, PageLocation};
|
||||
use parquet::file::page_index::column_index::ColumnIndexMetaData as Index;
|
||||
use parquet::file::page_index::offset_index::PageLocation;
|
||||
use parquet::schema::types::{ColumnDescriptor, SchemaDescriptor, Type};
|
||||
|
||||
/// Returns estimated size of [ParquetMetaData].
|
||||
|
||||
6
src/mito2/src/cache/file_cache.rs
vendored
6
src/mito2/src/cache/file_cache.rs
vendored
@@ -28,7 +28,7 @@ use moka::notification::RemovalCause;
|
||||
use moka::policy::EvictionPolicy;
|
||||
use object_store::util::join_path;
|
||||
use object_store::{ErrorKind, ObjectStore, Reader};
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
|
||||
use snafu::ResultExt;
|
||||
use store_api::storage::{FileId, RegionId};
|
||||
use tokio::sync::mpsc::{Sender, UnboundedReceiver};
|
||||
@@ -571,6 +571,7 @@ impl FileCache {
|
||||
&self,
|
||||
key: IndexKey,
|
||||
cache_metrics: &mut MetadataCacheMetrics,
|
||||
page_index_policy: PageIndexPolicy,
|
||||
) -> Option<ParquetMetaData> {
|
||||
// Check if file cache contains the key
|
||||
if let Some(index_value) = self.inner.parquet_index.get(&key).await {
|
||||
@@ -578,7 +579,8 @@ impl FileCache {
|
||||
let local_store = self.local_store();
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
let file_size = index_value.file_size as u64;
|
||||
let metadata_loader = MetadataLoader::new(local_store, &file_path, file_size);
|
||||
let mut metadata_loader = MetadataLoader::new(local_store, &file_path, file_size);
|
||||
metadata_loader.with_page_index_policy(page_index_policy);
|
||||
|
||||
match metadata_loader.load(cache_metrics).await {
|
||||
Ok(metadata) => {
|
||||
|
||||
58
src/mito2/src/cache/test_util.rs
vendored
58
src/mito2/src/cache/test_util.rs
vendored
@@ -24,6 +24,7 @@ use object_store::services::Fs;
|
||||
use parquet::arrow::ArrowWriter;
|
||||
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use parquet::file::statistics::Statistics;
|
||||
|
||||
/// Returns a parquet meta data.
|
||||
pub(crate) fn parquet_meta() -> Arc<ParquetMetaData> {
|
||||
@@ -49,3 +50,60 @@ pub(crate) fn new_fs_store(path: &str) -> ObjectStore {
|
||||
let builder = Fs::default();
|
||||
ObjectStore::new(builder.root(path)).unwrap().finish()
|
||||
}
|
||||
|
||||
pub(crate) fn assert_parquet_metadata_equal(x: Arc<ParquetMetaData>, y: Arc<ParquetMetaData>) {
|
||||
// Normalize the statistics in parquet metadata because the flag "min_max_backwards_compatible"
|
||||
// is not persisted across parquet metadata writer and reader.
|
||||
fn normalize_statistics(metadata: ParquetMetaData) -> ParquetMetaData {
|
||||
let unset_min_max_backwards_compatible_flag = |stats: Statistics| -> Statistics {
|
||||
match stats {
|
||||
Statistics::Boolean(stats) => {
|
||||
Statistics::Boolean(stats.with_backwards_compatible_min_max(false))
|
||||
}
|
||||
Statistics::Int32(stats) => {
|
||||
Statistics::Int32(stats.with_backwards_compatible_min_max(false))
|
||||
}
|
||||
Statistics::Int64(stats) => {
|
||||
Statistics::Int64(stats.with_backwards_compatible_min_max(false))
|
||||
}
|
||||
Statistics::Int96(stats) => {
|
||||
Statistics::Int96(stats.with_backwards_compatible_min_max(false))
|
||||
}
|
||||
Statistics::Float(stats) => {
|
||||
Statistics::Float(stats.with_backwards_compatible_min_max(false))
|
||||
}
|
||||
Statistics::Double(stats) => {
|
||||
Statistics::Double(stats.with_backwards_compatible_min_max(false))
|
||||
}
|
||||
Statistics::ByteArray(stats) => {
|
||||
Statistics::ByteArray(stats.with_backwards_compatible_min_max(false))
|
||||
}
|
||||
Statistics::FixedLenByteArray(stats) => {
|
||||
Statistics::FixedLenByteArray(stats.with_backwards_compatible_min_max(false))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut metadata_builder = metadata.into_builder();
|
||||
for rg in metadata_builder.take_row_groups() {
|
||||
let mut rg_builder = rg.into_builder();
|
||||
for col in rg_builder.take_columns() {
|
||||
let stats = col
|
||||
.statistics()
|
||||
.cloned()
|
||||
.map(unset_min_max_backwards_compatible_flag);
|
||||
let mut col_builder = col.into_builder().clear_statistics();
|
||||
if let Some(stats) = stats {
|
||||
col_builder = col_builder.set_statistics(stats);
|
||||
}
|
||||
rg_builder = rg_builder.add_column_metadata(col_builder.build().unwrap());
|
||||
}
|
||||
metadata_builder = metadata_builder.add_row_group(rg_builder.build().unwrap());
|
||||
}
|
||||
metadata_builder.build()
|
||||
}
|
||||
|
||||
let x = normalize_statistics(Arc::unwrap_or_clone(x));
|
||||
let y = normalize_statistics(Arc::unwrap_or_clone(y));
|
||||
assert_eq!(x, y);
|
||||
}
|
||||
|
||||
11
src/mito2/src/cache/write_cache.rs
vendored
11
src/mito2/src/cache/write_cache.rs
vendored
@@ -470,11 +470,12 @@ impl UploadTracker {
|
||||
mod tests {
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use object_store::ATOMIC_WRITE_DIR;
|
||||
use parquet::file::metadata::PageIndexPolicy;
|
||||
use store_api::region_request::PathType;
|
||||
|
||||
use super::*;
|
||||
use crate::access_layer::OperationType;
|
||||
use crate::cache::test_util::new_fs_store;
|
||||
use crate::cache::test_util::{assert_parquet_metadata_equal, new_fs_store};
|
||||
use crate::cache::{CacheManager, CacheStrategy};
|
||||
use crate::error::InvalidBatchSnafu;
|
||||
use crate::read::Source;
|
||||
@@ -482,8 +483,7 @@ mod tests {
|
||||
use crate::sst::parquet::reader::ParquetReaderBuilder;
|
||||
use crate::test_util::TestEnv;
|
||||
use crate::test_util::sst_util::{
|
||||
assert_parquet_metadata_eq, new_batch_by_range, new_source, sst_file_handle_with_file_id,
|
||||
sst_region_metadata,
|
||||
new_batch_by_range, new_source, sst_file_handle_with_file_id, sst_region_metadata,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
@@ -652,11 +652,12 @@ mod tests {
|
||||
handle.clone(),
|
||||
mock_store.clone(),
|
||||
)
|
||||
.cache(CacheStrategy::EnableAll(cache_manager.clone()));
|
||||
.cache(CacheStrategy::EnableAll(cache_manager.clone()))
|
||||
.page_index_policy(PageIndexPolicy::Optional);
|
||||
let reader = builder.build().await.unwrap();
|
||||
|
||||
// Check parquet metadata
|
||||
assert_parquet_metadata_eq(write_parquet_metadata, reader.parquet_metadata());
|
||||
assert_parquet_metadata_equal(write_parquet_metadata, reader.parquet_metadata());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -601,14 +601,6 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Invalid file metadata"))]
|
||||
ConvertMetaData {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: parquet::errors::ParquetError,
|
||||
},
|
||||
|
||||
#[snafu(display("Column not found, column: {column}"))]
|
||||
ColumnNotFound {
|
||||
column: String,
|
||||
@@ -1284,7 +1276,6 @@ impl ErrorExt for Error {
|
||||
| Join { .. }
|
||||
| WorkerStopped { .. }
|
||||
| Recv { .. }
|
||||
| ConvertMetaData { .. }
|
||||
| DecodeWal { .. }
|
||||
| ComputeArrow { .. }
|
||||
| BiErrors { .. }
|
||||
|
||||
@@ -71,7 +71,6 @@ use crate::sst::index::IndexOutput;
|
||||
use crate::sst::parquet::file_range::{PreFilterMode, row_group_contains_delete};
|
||||
use crate::sst::parquet::flat_format::primary_key_column_index;
|
||||
use crate::sst::parquet::format::{PrimaryKeyArray, PrimaryKeyArrayBuilder, ReadFormat};
|
||||
use crate::sst::parquet::helper::parse_parquet_metadata;
|
||||
use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo};
|
||||
use crate::sst::{SeriesEstimator, to_sst_arrow_schema};
|
||||
|
||||
@@ -1197,7 +1196,7 @@ impl BulkPartEncoder {
|
||||
metrics.num_rows += total_rows;
|
||||
|
||||
let buf = Bytes::from(buf);
|
||||
let parquet_metadata = Arc::new(parse_parquet_metadata(file_metadata)?);
|
||||
let parquet_metadata = Arc::new(file_metadata);
|
||||
let num_series = series_estimator.finish();
|
||||
|
||||
Ok(Some(EncodedBulkPart {
|
||||
@@ -1232,7 +1231,7 @@ impl BulkPartEncoder {
|
||||
};
|
||||
|
||||
let buf = Bytes::from(buf);
|
||||
let parquet_metadata = Arc::new(parse_parquet_metadata(file_metadata)?);
|
||||
let parquet_metadata = Arc::new(file_metadata);
|
||||
|
||||
Ok(Some(EncodedBulkPart {
|
||||
data: buf,
|
||||
|
||||
@@ -115,7 +115,7 @@ mod tests {
|
||||
use object_store::ObjectStore;
|
||||
use parquet::arrow::AsyncArrowWriter;
|
||||
use parquet::basic::{Compression, Encoding, ZstdLevel};
|
||||
use parquet::file::metadata::KeyValue;
|
||||
use parquet::file::metadata::{KeyValue, PageIndexPolicy};
|
||||
use parquet::file::properties::WriterProperties;
|
||||
use store_api::codec::PrimaryKeyEncoding;
|
||||
use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
|
||||
@@ -126,6 +126,7 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::access_layer::{FilePathProvider, Metrics, RegionFilePathFactory, WriteType};
|
||||
use crate::cache::test_util::assert_parquet_metadata_equal;
|
||||
use crate::cache::{CacheManager, CacheStrategy, PageKey};
|
||||
use crate::config::IndexConfig;
|
||||
use crate::read::{BatchBuilder, BatchReader, FlatSource};
|
||||
@@ -143,9 +144,9 @@ mod tests {
|
||||
DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, location, to_flat_sst_arrow_schema,
|
||||
};
|
||||
use crate::test_util::sst_util::{
|
||||
assert_parquet_metadata_eq, build_test_binary_test_region_metadata, new_batch_by_range,
|
||||
new_batch_with_binary, new_batch_with_custom_sequence, new_primary_key, new_source,
|
||||
new_sparse_primary_key, sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata,
|
||||
build_test_binary_test_region_metadata, new_batch_by_range, new_batch_with_binary,
|
||||
new_batch_with_custom_sequence, new_primary_key, new_source, new_sparse_primary_key,
|
||||
sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata,
|
||||
sst_region_metadata_with_encoding,
|
||||
};
|
||||
use crate::test_util::{TestEnv, check_reader_result};
|
||||
@@ -377,11 +378,12 @@ mod tests {
|
||||
PathType::Bare,
|
||||
handle.clone(),
|
||||
object_store,
|
||||
);
|
||||
)
|
||||
.page_index_policy(PageIndexPolicy::Optional);
|
||||
let reader = builder.build().await.unwrap();
|
||||
let reader_metadata = reader.parquet_metadata();
|
||||
|
||||
assert_parquet_metadata_eq(writer_metadata, reader_metadata)
|
||||
assert_parquet_metadata_equal(writer_metadata, reader_metadata);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -13,82 +13,11 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use bytes::Bytes;
|
||||
use common_telemetry::trace;
|
||||
use object_store::ObjectStore;
|
||||
use parquet::basic::ColumnOrder;
|
||||
use parquet::file::metadata::{FileMetaData, ParquetMetaData, RowGroupMetaData};
|
||||
use parquet::format;
|
||||
use parquet::schema::types::{SchemaDescriptor, from_thrift};
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error;
|
||||
use crate::error::Result;
|
||||
|
||||
// Refer to https://github.com/apache/arrow-rs/blob/7e134f4d277c0b62c27529fc15a4739de3ad0afd/parquet/src/file/footer.rs#L74-L90
|
||||
/// Convert [format::FileMetaData] to [ParquetMetaData]
|
||||
pub fn parse_parquet_metadata(t_file_metadata: format::FileMetaData) -> Result<ParquetMetaData> {
|
||||
let schema = from_thrift(&t_file_metadata.schema).context(error::ConvertMetaDataSnafu)?;
|
||||
let schema_desc_ptr = Arc::new(SchemaDescriptor::new(schema));
|
||||
|
||||
let mut row_groups = Vec::with_capacity(t_file_metadata.row_groups.len());
|
||||
for rg in t_file_metadata.row_groups {
|
||||
row_groups.push(
|
||||
RowGroupMetaData::from_thrift(schema_desc_ptr.clone(), rg)
|
||||
.context(error::ConvertMetaDataSnafu)?,
|
||||
);
|
||||
}
|
||||
let column_orders = parse_column_orders(t_file_metadata.column_orders, &schema_desc_ptr);
|
||||
|
||||
let file_metadata = FileMetaData::new(
|
||||
t_file_metadata.version,
|
||||
t_file_metadata.num_rows,
|
||||
t_file_metadata.created_by,
|
||||
t_file_metadata.key_value_metadata,
|
||||
schema_desc_ptr,
|
||||
column_orders,
|
||||
);
|
||||
// There may be a problem owing to lacking of column_index and offset_index,
|
||||
// if we open page index in the future.
|
||||
Ok(ParquetMetaData::new(file_metadata, row_groups))
|
||||
}
|
||||
|
||||
// Port from https://github.com/apache/arrow-rs/blob/7e134f4d277c0b62c27529fc15a4739de3ad0afd/parquet/src/file/footer.rs#L106-L137
|
||||
/// Parses column orders from Thrift definition.
|
||||
/// If no column orders are defined, returns `None`.
|
||||
fn parse_column_orders(
|
||||
t_column_orders: Option<Vec<format::ColumnOrder>>,
|
||||
schema_descr: &SchemaDescriptor,
|
||||
) -> Option<Vec<ColumnOrder>> {
|
||||
match t_column_orders {
|
||||
Some(orders) => {
|
||||
// Should always be the case
|
||||
assert_eq!(
|
||||
orders.len(),
|
||||
schema_descr.num_columns(),
|
||||
"Column order length mismatch"
|
||||
);
|
||||
let mut res = Vec::with_capacity(schema_descr.num_columns());
|
||||
for (i, column) in schema_descr.columns().iter().enumerate() {
|
||||
match orders[i] {
|
||||
format::ColumnOrder::TYPEORDER(_) => {
|
||||
let sort_order = ColumnOrder::get_sort_order(
|
||||
column.logical_type(),
|
||||
column.converted_type(),
|
||||
column.physical_type(),
|
||||
);
|
||||
res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order));
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(res)
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
const FETCH_PARALLELISM: usize = 8;
|
||||
pub(crate) const MERGE_GAP: usize = 512 * 1024;
|
||||
|
||||
@@ -21,7 +21,7 @@ use futures::future::BoxFuture;
|
||||
use object_store::ObjectStore;
|
||||
use parquet::arrow::async_reader::MetadataFetch;
|
||||
use parquet::errors::{ParquetError, Result as ParquetResult};
|
||||
use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
|
||||
use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader};
|
||||
use snafu::{IntoError as _, ResultExt};
|
||||
|
||||
use crate::error::{self, Result};
|
||||
@@ -37,6 +37,7 @@ pub(crate) struct MetadataLoader<'a> {
|
||||
file_path: &'a str,
|
||||
// The size of parquet file
|
||||
file_size: u64,
|
||||
page_index_policy: PageIndexPolicy,
|
||||
}
|
||||
|
||||
impl<'a> MetadataLoader<'a> {
|
||||
@@ -50,9 +51,14 @@ impl<'a> MetadataLoader<'a> {
|
||||
object_store,
|
||||
file_path,
|
||||
file_size,
|
||||
page_index_policy: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn with_page_index_policy(&mut self, page_index_policy: PageIndexPolicy) {
|
||||
self.page_index_policy = page_index_policy;
|
||||
}
|
||||
|
||||
/// Get the size of parquet file. If file_size is 0, stat the object store to get the size.
|
||||
async fn get_file_size(&self) -> Result<u64> {
|
||||
let file_size = match self.file_size {
|
||||
@@ -70,8 +76,9 @@ impl<'a> MetadataLoader<'a> {
|
||||
pub async fn load(&self, cache_metrics: &mut MetadataCacheMetrics) -> Result<ParquetMetaData> {
|
||||
let path = self.file_path;
|
||||
let file_size = self.get_file_size().await?;
|
||||
let reader =
|
||||
ParquetMetaDataReader::new().with_prefetch_hint(Some(DEFAULT_PREFETCH_SIZE as usize));
|
||||
let reader = ParquetMetaDataReader::new()
|
||||
.with_prefetch_hint(Some(DEFAULT_PREFETCH_SIZE as usize))
|
||||
.with_page_index_policy(self.page_index_policy);
|
||||
|
||||
let num_reads = AtomicUsize::new(0);
|
||||
let bytes_read = AtomicU64::new(0);
|
||||
|
||||
@@ -33,8 +33,7 @@ use mito_codec::row_converter::build_primary_key_codec;
|
||||
use object_store::ObjectStore;
|
||||
use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection};
|
||||
use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels};
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use parquet::format::KeyValue;
|
||||
use parquet::file::metadata::{KeyValue, PageIndexPolicy, ParquetMetaData};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef};
|
||||
use store_api::region_request::PathType;
|
||||
@@ -142,6 +141,7 @@ pub struct ParquetReaderBuilder {
|
||||
pre_filter_mode: PreFilterMode,
|
||||
/// Whether to decode primary key values eagerly when reading primary key format SSTs.
|
||||
decode_primary_key_values: bool,
|
||||
page_index_policy: PageIndexPolicy,
|
||||
}
|
||||
|
||||
impl ParquetReaderBuilder {
|
||||
@@ -172,6 +172,7 @@ impl ParquetReaderBuilder {
|
||||
compaction: false,
|
||||
pre_filter_mode: PreFilterMode::All,
|
||||
decode_primary_key_values: false,
|
||||
page_index_policy: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -276,6 +277,12 @@ impl ParquetReaderBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn page_index_policy(mut self, page_index_policy: PageIndexPolicy) -> Self {
|
||||
self.page_index_policy = page_index_policy;
|
||||
self
|
||||
}
|
||||
|
||||
/// Builds a [ParquetReader].
|
||||
///
|
||||
/// This needs to perform IO operation.
|
||||
@@ -314,7 +321,12 @@ impl ParquetReaderBuilder {
|
||||
|
||||
// Loads parquet metadata of the file.
|
||||
let (parquet_meta, cache_miss) = self
|
||||
.read_parquet_metadata(&file_path, file_size, &mut metrics.metadata_cache_metrics)
|
||||
.read_parquet_metadata(
|
||||
&file_path,
|
||||
file_size,
|
||||
&mut metrics.metadata_cache_metrics,
|
||||
self.page_index_policy,
|
||||
)
|
||||
.await?;
|
||||
// Decodes region metadata.
|
||||
let key_value_meta = parquet_meta.file_metadata().key_value_metadata();
|
||||
@@ -479,6 +491,7 @@ impl ParquetReaderBuilder {
|
||||
file_path: &str,
|
||||
file_size: u64,
|
||||
cache_metrics: &mut MetadataCacheMetrics,
|
||||
page_index_policy: PageIndexPolicy,
|
||||
) -> Result<(Arc<ParquetMetaData>, bool)> {
|
||||
let start = Instant::now();
|
||||
let _t = READ_STAGE_ELAPSED
|
||||
@@ -489,7 +502,7 @@ impl ParquetReaderBuilder {
|
||||
// Tries to get from cache with metrics tracking.
|
||||
if let Some(metadata) = self
|
||||
.cache_strategy
|
||||
.get_parquet_meta_data(file_id, cache_metrics)
|
||||
.get_parquet_meta_data(file_id, cache_metrics, page_index_policy)
|
||||
.await
|
||||
{
|
||||
cache_metrics.metadata_load_cost += start.elapsed();
|
||||
@@ -497,7 +510,9 @@ impl ParquetReaderBuilder {
|
||||
}
|
||||
|
||||
// Cache miss, load metadata directly.
|
||||
let metadata_loader = MetadataLoader::new(self.object_store.clone(), file_path, file_size);
|
||||
let mut metadata_loader =
|
||||
MetadataLoader::new(self.object_store.clone(), file_path, file_size);
|
||||
metadata_loader.with_page_index_policy(page_index_policy);
|
||||
let metadata = metadata_loader.load(cache_metrics).await?;
|
||||
|
||||
let metadata = Arc::new(metadata);
|
||||
|
||||
@@ -55,7 +55,6 @@ use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::{IndexOutput, Indexer, IndexerBuilder};
|
||||
use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index};
|
||||
use crate::sst::parquet::format::PrimaryKeyWriteFormat;
|
||||
use crate::sst::parquet::helper::parse_parquet_metadata;
|
||||
use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo, WriteOptions};
|
||||
use crate::sst::{
|
||||
DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator,
|
||||
@@ -205,14 +204,12 @@ where
|
||||
}
|
||||
current_writer.flush().await.context(WriteParquetSnafu)?;
|
||||
|
||||
let file_meta = current_writer.close().await.context(WriteParquetSnafu)?;
|
||||
let parquet_metadata = current_writer.close().await.context(WriteParquetSnafu)?;
|
||||
let file_size = self.bytes_written.load(Ordering::Relaxed) as u64;
|
||||
|
||||
// Safety: num rows > 0 so we must have min/max.
|
||||
let time_range = stats.time_range.unwrap();
|
||||
|
||||
// convert FileMetaData to ParquetMetaData
|
||||
let parquet_metadata = parse_parquet_metadata(file_meta)?;
|
||||
let max_row_group_uncompressed_size: u64 = parquet_metadata
|
||||
.row_groups()
|
||||
.iter()
|
||||
|
||||
@@ -23,7 +23,6 @@ use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::{ColumnSchema, SkippingIndexOptions};
|
||||
use datatypes::value::ValueRef;
|
||||
use mito_codec::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortField};
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use store_api::metadata::{
|
||||
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
|
||||
};
|
||||
@@ -277,30 +276,6 @@ pub fn new_batch_with_binary(tags: &[&str], start: usize, end: usize) -> Batch {
|
||||
builder.build().unwrap()
|
||||
}
|
||||
|
||||
/// ParquetMetaData doesn't implement `PartialEq` trait, check internal fields manually
|
||||
pub fn assert_parquet_metadata_eq(a: Arc<ParquetMetaData>, b: Arc<ParquetMetaData>) {
|
||||
macro_rules! assert_metadata {
|
||||
( $a:expr, $b:expr, $($method:ident,)+ ) => {
|
||||
$(
|
||||
assert_eq!($a.$method(), $b.$method());
|
||||
)+
|
||||
}
|
||||
}
|
||||
|
||||
assert_metadata!(
|
||||
a.file_metadata(),
|
||||
b.file_metadata(),
|
||||
version,
|
||||
num_rows,
|
||||
created_by,
|
||||
key_value_metadata,
|
||||
schema_descr,
|
||||
column_orders,
|
||||
);
|
||||
|
||||
assert_metadata!(a, b, row_groups, column_index, offset_index,);
|
||||
}
|
||||
|
||||
/// Creates a new region metadata for testing SSTs with binary datatype.
|
||||
///
|
||||
/// Schema: tag_0(string), field_0(binary), ts
|
||||
|
||||
@@ -21,6 +21,7 @@ use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::{info, warn};
|
||||
use parquet::file::metadata::PageIndexPolicy;
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::RegionId;
|
||||
@@ -523,7 +524,11 @@ async fn edit_region(
|
||||
let mut cache_metrics = Default::default();
|
||||
let _ = write_cache
|
||||
.file_cache()
|
||||
.get_parquet_meta_data(index_key, &mut cache_metrics)
|
||||
.get_parquet_meta_data(
|
||||
index_key,
|
||||
&mut cache_metrics,
|
||||
PageIndexPolicy::Optional,
|
||||
)
|
||||
.await;
|
||||
|
||||
listener.on_file_cache_filled(index_key.file_id);
|
||||
|
||||
@@ -33,6 +33,7 @@ use common_telemetry::{debug, tracing};
|
||||
use datafusion::datasource::physical_plan::{CsvSource, FileSource, JsonSource};
|
||||
use datafusion::parquet::arrow::ParquetRecordBatchStreamBuilder;
|
||||
use datafusion::parquet::arrow::arrow_reader::ArrowReaderMetadata;
|
||||
use datafusion_common::config::CsvOptions;
|
||||
use datafusion_expr::Expr;
|
||||
use datatypes::arrow::compute::can_cast_types;
|
||||
use datatypes::arrow::datatypes::{DataType as ArrowDataType, Schema, SchemaRef};
|
||||
@@ -214,13 +215,15 @@ impl StatementExecutor {
|
||||
.context(error::ProjectSchemaSnafu)?,
|
||||
);
|
||||
|
||||
let csv_source = CsvSource::new(format.has_header, format.delimiter, b'"')
|
||||
.with_schema(schema.clone())
|
||||
let options = CsvOptions::default()
|
||||
.with_has_header(format.has_header)
|
||||
.with_delimiter(format.delimiter);
|
||||
let csv_source = CsvSource::new(schema.clone())
|
||||
.with_csv_options(options)
|
||||
.with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
let stream = file_to_stream(
|
||||
object_store,
|
||||
path,
|
||||
schema.clone(),
|
||||
csv_source,
|
||||
Some(projection),
|
||||
format.compression_type,
|
||||
@@ -247,13 +250,11 @@ impl StatementExecutor {
|
||||
.context(error::ProjectSchemaSnafu)?,
|
||||
);
|
||||
|
||||
let json_source = JsonSource::new()
|
||||
.with_schema(schema.clone())
|
||||
.with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
let json_source =
|
||||
JsonSource::new(schema.clone()).with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
let stream = file_to_stream(
|
||||
object_store,
|
||||
path,
|
||||
schema.clone(),
|
||||
json_source,
|
||||
Some(projection),
|
||||
format.compression_type,
|
||||
|
||||
@@ -995,7 +995,7 @@ mod test {
|
||||
#[tokio::test]
|
||||
async fn interval_30s_range_90s() {
|
||||
let expected = String::from(
|
||||
"PrimitiveArray<Timestamp(Millisecond, None)>\n[\n \
|
||||
"PrimitiveArray<Timestamp(ms)>\n[\n \
|
||||
1970-01-01T00:00:00,\n \
|
||||
1970-01-01T00:00:30,\n \
|
||||
1970-01-01T00:01:00,\n \
|
||||
@@ -1015,7 +1015,7 @@ mod test {
|
||||
ranges: [Some(0..1), Some(0..2), Some(0..3), Some(0..4), Some(1..5), Some(2..5), Some(3..6), Some(4..6), Some(5..7), Some(5..8), Some(6..10)] \
|
||||
}\nStringArray\n[\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n]\n\
|
||||
RangeArray { \
|
||||
base array: PrimitiveArray<Timestamp(Millisecond, None)>\n[\n 1970-01-01T00:00:00,\n 1970-01-01T00:00:30,\n 1970-01-01T00:01:00,\n 1970-01-01T00:01:30,\n 1970-01-01T00:02:00,\n 1970-01-01T00:03:00,\n 1970-01-01T00:04:00,\n 1970-01-01T00:04:01,\n 1970-01-01T00:04:31,\n 1970-01-01T00:04:51,\n], \
|
||||
base array: PrimitiveArray<Timestamp(ms)>\n[\n 1970-01-01T00:00:00,\n 1970-01-01T00:00:30,\n 1970-01-01T00:01:00,\n 1970-01-01T00:01:30,\n 1970-01-01T00:02:00,\n 1970-01-01T00:03:00,\n 1970-01-01T00:04:00,\n 1970-01-01T00:04:01,\n 1970-01-01T00:04:31,\n 1970-01-01T00:04:51,\n], \
|
||||
ranges: [Some(0..1), Some(0..2), Some(0..3), Some(0..4), Some(1..5), Some(2..5), Some(3..6), Some(4..6), Some(5..7), Some(5..8), Some(6..10)] \
|
||||
}",
|
||||
);
|
||||
@@ -1028,7 +1028,7 @@ mod test {
|
||||
#[tokio::test]
|
||||
async fn small_empty_range() {
|
||||
let expected = String::from(
|
||||
"PrimitiveArray<Timestamp(Millisecond, None)>\n[\n \
|
||||
"PrimitiveArray<Timestamp(ms)>\n[\n \
|
||||
1970-01-01T00:00:00.001,\n \
|
||||
1970-01-01T00:00:03.001,\n \
|
||||
1970-01-01T00:00:06.001,\n \
|
||||
@@ -1041,7 +1041,7 @@ mod test {
|
||||
ranges: [Some(0..1), Some(0..0), Some(0..0), Some(0..0)] \
|
||||
}\nStringArray\n[\n \"foo\",\n \"foo\",\n \"foo\",\n \"foo\",\n]\n\
|
||||
RangeArray { \
|
||||
base array: PrimitiveArray<Timestamp(Millisecond, None)>\n[\n 1970-01-01T00:00:00,\n 1970-01-01T00:00:30,\n 1970-01-01T00:01:00,\n 1970-01-01T00:01:30,\n 1970-01-01T00:02:00,\n 1970-01-01T00:03:00,\n 1970-01-01T00:04:00,\n 1970-01-01T00:04:01,\n 1970-01-01T00:04:31,\n 1970-01-01T00:04:51,\n], \
|
||||
base array: PrimitiveArray<Timestamp(ms)>\n[\n 1970-01-01T00:00:00,\n 1970-01-01T00:00:30,\n 1970-01-01T00:01:00,\n 1970-01-01T00:01:30,\n 1970-01-01T00:02:00,\n 1970-01-01T00:03:00,\n 1970-01-01T00:04:00,\n 1970-01-01T00:04:01,\n 1970-01-01T00:04:31,\n 1970-01-01T00:04:51,\n], \
|
||||
ranges: [Some(0..1), Some(0..0), Some(0..0), Some(0..0)] \
|
||||
}",
|
||||
);
|
||||
|
||||
@@ -255,9 +255,9 @@ fn metrics_to_string(metrics: RecordBatchMetrics, format: AnalyzeFormat) -> DfRe
|
||||
match format {
|
||||
AnalyzeFormat::JSON => Ok(JsonMetrics::from_record_batch_metrics(metrics).to_string()),
|
||||
AnalyzeFormat::TEXT => Ok(metrics.to_string()),
|
||||
AnalyzeFormat::GRAPHVIZ => Err(DataFusionError::NotImplemented(
|
||||
"GRAPHVIZ format is not supported for metrics output".to_string(),
|
||||
)),
|
||||
format => Err(DataFusionError::NotImplemented(format!(
|
||||
"AnalyzeFormat {format}",
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -316,18 +316,15 @@ impl DatafusionQueryEngine {
|
||||
return state
|
||||
.create_physical_plan(logical_plan)
|
||||
.await
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu);
|
||||
.map_err(Into::into);
|
||||
}
|
||||
|
||||
// analyze first
|
||||
let analyzed_plan = state
|
||||
.analyzer()
|
||||
.execute_and_check(logical_plan.clone(), state.config_options(), |_, _| {})
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)?;
|
||||
let analyzed_plan = state.analyzer().execute_and_check(
|
||||
logical_plan.clone(),
|
||||
state.config_options(),
|
||||
|_, _| {},
|
||||
)?;
|
||||
|
||||
logger.after_analyze = Some(analyzed_plan.clone());
|
||||
|
||||
@@ -341,10 +338,7 @@ impl DatafusionQueryEngine {
|
||||
} else {
|
||||
state
|
||||
.optimizer()
|
||||
.optimize(analyzed_plan, state, |_, _| {})
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)?
|
||||
.optimize(analyzed_plan, state, |_, _| {})?
|
||||
};
|
||||
|
||||
common_telemetry::debug!("Create physical plan, optimized plan: {optimized_plan}");
|
||||
@@ -371,19 +365,10 @@ impl DatafusionQueryEngine {
|
||||
// Optimized by extension rules
|
||||
let optimized_plan = self
|
||||
.state
|
||||
.optimize_by_extension_rules(plan.clone(), context)
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)?;
|
||||
.optimize_by_extension_rules(plan.clone(), context)?;
|
||||
|
||||
// Optimized by datafusion optimizer
|
||||
let optimized_plan = self
|
||||
.state
|
||||
.session_state()
|
||||
.optimize(&optimized_plan)
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)?;
|
||||
let optimized_plan = self.state.session_state().optimize(&optimized_plan)?;
|
||||
|
||||
Ok(optimized_plan)
|
||||
}
|
||||
@@ -516,11 +501,7 @@ impl QueryEngine for DatafusionQueryEngine {
|
||||
}
|
||||
|
||||
fn read_table(&self, table: TableRef) -> Result<DataFrame> {
|
||||
self.state
|
||||
.read_table(table)
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)
|
||||
self.state.read_table(table).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn engine_context(&self, query_ctx: QueryContextRef) -> QueryEngineContext {
|
||||
@@ -543,7 +524,8 @@ impl QueryEngine for DatafusionQueryEngine {
|
||||
}
|
||||
|
||||
// configure execution options
|
||||
state.config_mut().options_mut().execution.time_zone = query_ctx.timezone().to_string();
|
||||
state.config_mut().options_mut().execution.time_zone =
|
||||
Some(query_ctx.timezone().to_string());
|
||||
|
||||
// usually it's impossible to have both `set variable` set by sql client and
|
||||
// hint in header by grpc client, so only need to deal with them separately
|
||||
@@ -619,11 +601,7 @@ impl QueryExecutor for DatafusionQueryEngine {
|
||||
Ok(Box::pin(EmptyRecordBatchStream::new(schema)))
|
||||
}
|
||||
1 => {
|
||||
let df_stream = plan
|
||||
.execute(0, task_ctx)
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)?;
|
||||
let df_stream = plan.execute(0, task_ctx)?;
|
||||
let mut stream = RecordBatchStreamAdapter::try_new_with_span(df_stream, span)
|
||||
.context(error::ConvertDfRecordBatchStreamSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
@@ -652,11 +630,7 @@ impl QueryExecutor for DatafusionQueryEngine {
|
||||
.output_partitioning()
|
||||
.partition_count()
|
||||
);
|
||||
let df_stream = merged_plan
|
||||
.execute(0, task_ctx)
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)?;
|
||||
let df_stream = merged_plan.execute(0, task_ctx)?;
|
||||
let mut stream = RecordBatchStreamAdapter::try_new_with_span(df_stream, span)
|
||||
.context(error::ConvertDfRecordBatchStreamSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
|
||||
@@ -25,7 +25,7 @@ use snafu::{Location, Snafu};
|
||||
#[snafu(visibility(pub))]
|
||||
#[stack_trace_debug]
|
||||
pub enum InnerError {
|
||||
#[snafu(display("DataFusion error"))]
|
||||
#[snafu(transparent)]
|
||||
Datafusion {
|
||||
#[snafu(source)]
|
||||
error: DataFusionError,
|
||||
|
||||
@@ -1170,7 +1170,7 @@ fn test_simplify_select_now_expression() {
|
||||
let expected = [
|
||||
"Projection: now()",
|
||||
" MergeScan [is_placeholder=false, remote_input=[",
|
||||
r#"Projection: TimestampNanosecond(<TIME>, Some("+00:00")) AS now()"#,
|
||||
r#"Projection: TimestampNanosecond(<TIME>, None) AS now()"#,
|
||||
" TableScan: t",
|
||||
"]]",
|
||||
]
|
||||
|
||||
@@ -143,7 +143,7 @@ mod tests {
|
||||
let plan = create_test_plan_with_project(proj);
|
||||
let result = StringNormalizationRule.analyze(plan, config).unwrap();
|
||||
let expected = format!(
|
||||
"Projection: CAST(Utf8(\"2017-07-23 13:10:11\") AS Timestamp({:#?}, None))\n TableScan: t",
|
||||
"Projection: CAST(Utf8(\"2017-07-23 13:10:11\") AS Timestamp({}))\n TableScan: t",
|
||||
time_unit
|
||||
);
|
||||
assert_eq!(expected, result.to_string());
|
||||
@@ -162,7 +162,7 @@ mod tests {
|
||||
.analyze(int_to_timestamp_plan, config)
|
||||
.unwrap();
|
||||
let expected = String::from(
|
||||
"Projection: CAST(Int64(158412331400600000) AS Timestamp(Nanosecond, None))\n TableScan: t",
|
||||
"Projection: CAST(Int64(158412331400600000) AS Timestamp(ns))\n TableScan: t",
|
||||
);
|
||||
assert_eq!(expected, result.to_string());
|
||||
|
||||
|
||||
@@ -4687,11 +4687,11 @@ mod test {
|
||||
assert_eq!(
|
||||
plan.display_indent_schema().to_string(),
|
||||
"PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-1000, None) AND metrics.timestamp <= TimestampMillisecond(100001000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(Millisecond, None)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]"
|
||||
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-1000, None) AND metrics.timestamp <= TimestampMillisecond(100001000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]"
|
||||
);
|
||||
let plan = PromPlanner::stmt_to_plan(
|
||||
DfTableSourceProvider::new(
|
||||
@@ -4717,14 +4717,14 @@ mod test {
|
||||
assert_eq!(
|
||||
plan.display_indent_schema().to_string(),
|
||||
"Filter: prom_avg_over_time(timestamp_range,field) IS NOT NULL [timestamp:Timestamp(Millisecond, None), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
|
||||
\n Projection: metrics.timestamp, prom_avg_over_time(timestamp_range, field) AS prom_avg_over_time(timestamp_range,field), metrics.tag [timestamp:Timestamp(Millisecond, None), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
|
||||
\n PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[5000], time index=[timestamp], values=[\"field\"] [field:Dictionary(Int64, Float64);N, tag:Utf8, timestamp:Timestamp(Millisecond, None), timestamp_range:Dictionary(Int64, Timestamp(Millisecond, None))]\
|
||||
\n PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-6000, None) AND metrics.timestamp <= TimestampMillisecond(100001000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(Millisecond, None)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]"
|
||||
\n Projection: metrics.timestamp, prom_avg_over_time(timestamp_range, field) AS prom_avg_over_time(timestamp_range,field), metrics.tag [timestamp:Timestamp(Millisecond, None), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
|
||||
\n PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[5000], time index=[timestamp], values=[\"field\"] [field:Dictionary(Int64, Float64);N, tag:Utf8, timestamp:Timestamp(Millisecond, None), timestamp_range:Dictionary(Int64, Timestamp(Millisecond, None))]\
|
||||
\n PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-6000, None) AND metrics.timestamp <= TimestampMillisecond(100001000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
|
||||
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -123,7 +123,7 @@ tokio-rustls.workspace = true
|
||||
tokio-stream = { workspace = true, features = ["net"] }
|
||||
tokio-util.workspace = true
|
||||
tonic.workspace = true
|
||||
tonic-reflection = "0.13"
|
||||
tonic-reflection = "0.14"
|
||||
tower = { workspace = true, features = ["full"] }
|
||||
tower-http = { version = "0.6", features = ["full"] }
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -715,7 +715,7 @@ fn replace_params_with_values(
|
||||
if let Some(Some(t)) = param_types.get(&format_placeholder(i + 1)) {
|
||||
let value = helper::convert_value(param, t)?;
|
||||
|
||||
values.push(value);
|
||||
values.push(value.into());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -744,7 +744,7 @@ fn replace_params_with_exprs(
|
||||
if let Some(Some(t)) = param_types.get(&format_placeholder(i + 1)) {
|
||||
let value = helper::convert_expr_to_scalar_value(param, t)?;
|
||||
|
||||
values.push(value);
|
||||
values.push(value.into());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -13,8 +13,10 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use arrow_schema::Field;
|
||||
use chrono::NaiveDate;
|
||||
use common_query::prelude::ScalarValue;
|
||||
use common_sql::convert::sql_value_to_value;
|
||||
@@ -87,8 +89,8 @@ pub fn fix_placeholder_types(plan: &mut LogicalPlan) -> Result<()> {
|
||||
let give_placeholder_types = |mut e: datafusion_expr::Expr| {
|
||||
if let datafusion_expr::Expr::Cast(cast) = &mut e {
|
||||
if let datafusion_expr::Expr::Placeholder(ph) = &mut *cast.expr {
|
||||
if ph.data_type.is_none() {
|
||||
ph.data_type = Some(cast.data_type.clone());
|
||||
if ph.field.is_none() {
|
||||
ph.field = Some(Arc::new(Field::new("", cast.data_type.clone(), true)));
|
||||
common_telemetry::debug!(
|
||||
"give placeholder type {:?} to {:?}",
|
||||
cast.data_type,
|
||||
|
||||
@@ -324,11 +324,12 @@ impl ExtendedQueryHandler for PostgresServerHandlerInner {
|
||||
}
|
||||
|
||||
let output = if let Some(plan) = &sql_plan.plan {
|
||||
let values = parameters_to_scalar_values(plan, portal)?;
|
||||
let plan = plan
|
||||
.clone()
|
||||
.replace_params_with_values(&ParamValues::List(parameters_to_scalar_values(
|
||||
plan, portal,
|
||||
)?))
|
||||
.replace_params_with_values(&ParamValues::List(
|
||||
values.into_iter().map(Into::into).collect(),
|
||||
))
|
||||
.context(DataFusionSnafu)
|
||||
.map_err(convert_err)?;
|
||||
self.query_handler
|
||||
|
||||
@@ -225,7 +225,7 @@ impl QueryContext {
|
||||
/// Create a new datafusion's ConfigOptions instance based on the current QueryContext.
|
||||
pub fn create_config_options(&self) -> ConfigOptions {
|
||||
let mut config = ConfigOptions::default();
|
||||
config.execution.time_zone = self.timezone().to_string();
|
||||
config.execution.time_zone = Some(self.timezone().to_string());
|
||||
config
|
||||
}
|
||||
|
||||
|
||||
@@ -271,7 +271,7 @@ pub fn sql_data_type_to_concrete_data_type(
|
||||
})?
|
||||
.map(|t| ConcreteDataType::timestamp_datatype(t.unit()))
|
||||
.unwrap_or(ConcreteDataType::timestamp_millisecond_datatype())),
|
||||
SqlDataType::Interval => Ok(ConcreteDataType::interval_month_day_nano_datatype()),
|
||||
SqlDataType::Interval { .. } => Ok(ConcreteDataType::interval_month_day_nano_datatype()),
|
||||
SqlDataType::Decimal(exact_info) => match exact_info {
|
||||
ExactNumberInfo::None => Ok(ConcreteDataType::decimal128_default_datatype()),
|
||||
// refer to https://dev.mysql.com/doc/refman/8.0/en/fixed-point-types.html
|
||||
@@ -333,7 +333,7 @@ pub fn concrete_data_type_to_sql_data_type(data_type: &ConcreteDataType) -> Resu
|
||||
ConcreteDataType::Int8(_) => Ok(SqlDataType::TinyInt(None)),
|
||||
ConcreteDataType::UInt8(_) => Ok(SqlDataType::TinyIntUnsigned(None)),
|
||||
ConcreteDataType::String(_) => Ok(SqlDataType::String(None)),
|
||||
ConcreteDataType::Float32(_) => Ok(SqlDataType::Float(None)),
|
||||
ConcreteDataType::Float32(_) => Ok(SqlDataType::Float(ExactNumberInfo::None)),
|
||||
ConcreteDataType::Float64(_) => Ok(SqlDataType::Double(ExactNumberInfo::None)),
|
||||
ConcreteDataType::Boolean(_) => Ok(SqlDataType::Boolean),
|
||||
ConcreteDataType::Date(_) => Ok(SqlDataType::Date),
|
||||
@@ -345,10 +345,13 @@ pub fn concrete_data_type_to_sql_data_type(data_type: &ConcreteDataType) -> Resu
|
||||
Some(time_type.precision()),
|
||||
TimezoneInfo::None,
|
||||
)),
|
||||
ConcreteDataType::Interval(_) => Ok(SqlDataType::Interval),
|
||||
ConcreteDataType::Interval(_) => Ok(SqlDataType::Interval {
|
||||
fields: None,
|
||||
precision: None,
|
||||
}),
|
||||
ConcreteDataType::Binary(_) => Ok(SqlDataType::Varbinary(None)),
|
||||
ConcreteDataType::Decimal128(d) => Ok(SqlDataType::Decimal(
|
||||
ExactNumberInfo::PrecisionAndScale(d.precision() as u64, d.scale() as u64),
|
||||
ExactNumberInfo::PrecisionAndScale(d.precision() as u64, d.scale() as i64),
|
||||
)),
|
||||
ConcreteDataType::Json(_) => Ok(SqlDataType::JSON),
|
||||
ConcreteDataType::Vector(v) => Ok(SqlDataType::Custom(
|
||||
@@ -412,7 +415,7 @@ mod tests {
|
||||
ConcreteDataType::string_datatype(),
|
||||
);
|
||||
check_type(
|
||||
SqlDataType::Float(None),
|
||||
SqlDataType::Float(ExactNumberInfo::None),
|
||||
ConcreteDataType::float32_datatype(),
|
||||
);
|
||||
check_type(
|
||||
@@ -450,7 +453,10 @@ mod tests {
|
||||
ConcreteDataType::timestamp_microsecond_datatype(),
|
||||
);
|
||||
check_type(
|
||||
SqlDataType::Interval,
|
||||
SqlDataType::Interval {
|
||||
fields: None,
|
||||
precision: None,
|
||||
},
|
||||
ConcreteDataType::interval_month_day_nano_datatype(),
|
||||
);
|
||||
check_type(SqlDataType::JSON, ConcreteDataType::json_datatype());
|
||||
|
||||
@@ -114,7 +114,7 @@ impl TransformRule for ExpandIntervalTransformRule {
|
||||
kind,
|
||||
format,
|
||||
} => {
|
||||
if DataType::Interval == *data_type {
|
||||
if matches!(data_type, DataType::Interval { .. }) {
|
||||
match &**cast_exp {
|
||||
Expr::Value(ValueWithSpan {
|
||||
value: Value::SingleQuotedString(value),
|
||||
@@ -129,7 +129,7 @@ impl TransformRule for ExpandIntervalTransformRule {
|
||||
*expr = Expr::Cast {
|
||||
kind: kind.clone(),
|
||||
expr: single_quoted_string_expr(interval_value),
|
||||
data_type: DataType::Interval,
|
||||
data_type: data_type.clone(),
|
||||
format: std::mem::take(format),
|
||||
}
|
||||
}
|
||||
@@ -392,7 +392,10 @@ mod tests {
|
||||
|
||||
let mut cast_to_interval_expr = Expr::Cast {
|
||||
expr: single_quoted_string_expr("3y2mon".to_string()),
|
||||
data_type: DataType::Interval,
|
||||
data_type: DataType::Interval {
|
||||
fields: None,
|
||||
precision: None,
|
||||
},
|
||||
format: None,
|
||||
kind: sqlparser::ast::CastKind::Cast,
|
||||
};
|
||||
@@ -407,7 +410,10 @@ mod tests {
|
||||
expr: Box::new(Expr::Value(
|
||||
Value::SingleQuotedString("3 years 2 months".to_string()).into()
|
||||
)),
|
||||
data_type: DataType::Interval,
|
||||
data_type: DataType::Interval {
|
||||
fields: None,
|
||||
precision: None,
|
||||
},
|
||||
format: None,
|
||||
}
|
||||
);
|
||||
|
||||
@@ -178,9 +178,9 @@ pub(crate) fn get_type_by_alias(data_type: &DataType) -> Option<DataType> {
|
||||
DataType::UInt16 => Some(DataType::SmallIntUnsigned(None)),
|
||||
DataType::UInt32 => Some(DataType::IntUnsigned(None)),
|
||||
DataType::UInt64 => Some(DataType::BigIntUnsigned(None)),
|
||||
DataType::Float4 => Some(DataType::Float(None)),
|
||||
DataType::Float4 => Some(DataType::Float(ExactNumberInfo::None)),
|
||||
DataType::Float8 => Some(DataType::Double(ExactNumberInfo::None)),
|
||||
DataType::Float32 => Some(DataType::Float(None)),
|
||||
DataType::Float32 => Some(DataType::Float(ExactNumberInfo::None)),
|
||||
DataType::Float64 => Some(DataType::Double(ExactNumberInfo::None)),
|
||||
DataType::Bool => Some(DataType::Boolean),
|
||||
DataType::Datetime(_) => Some(DataType::Timestamp(Some(6), TimezoneInfo::None)),
|
||||
@@ -222,9 +222,9 @@ pub(crate) fn get_data_type_by_alias_name(name: &str) -> Option<DataType> {
|
||||
"UINT16" => Some(DataType::SmallIntUnsigned(None)),
|
||||
"UINT32" => Some(DataType::IntUnsigned(None)),
|
||||
"UINT64" => Some(DataType::BigIntUnsigned(None)),
|
||||
"FLOAT4" => Some(DataType::Float(None)),
|
||||
"FLOAT4" => Some(DataType::Float(ExactNumberInfo::None)),
|
||||
"FLOAT8" => Some(DataType::Double(ExactNumberInfo::None)),
|
||||
"FLOAT32" => Some(DataType::Float(None)),
|
||||
"FLOAT32" => Some(DataType::Float(ExactNumberInfo::None)),
|
||||
"FLOAT64" => Some(DataType::Double(ExactNumberInfo::None)),
|
||||
// String type alias
|
||||
"TINYTEXT" | "MEDIUMTEXT" | "LONGTEXT" => Some(DataType::Text),
|
||||
@@ -256,7 +256,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
get_data_type_by_alias_name("float32"),
|
||||
Some(DataType::Float(None))
|
||||
Some(DataType::Float(ExactNumberInfo::None))
|
||||
);
|
||||
assert_eq!(
|
||||
get_data_type_by_alias_name("float8"),
|
||||
@@ -264,7 +264,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
get_data_type_by_alias_name("float4"),
|
||||
Some(DataType::Float(None))
|
||||
Some(DataType::Float(ExactNumberInfo::None))
|
||||
);
|
||||
assert_eq!(
|
||||
get_data_type_by_alias_name("int8"),
|
||||
@@ -370,7 +370,7 @@ mod tests {
|
||||
match &stmts[0] {
|
||||
Statement::Query(q) => assert_eq!(
|
||||
format!(
|
||||
"SELECT arrow_cast(TIMESTAMP '2020-01-01 01:23:45.12345678', 'Timestamp({expected}, None)')"
|
||||
"SELECT arrow_cast(TIMESTAMP '2020-01-01 01:23:45.12345678', 'Timestamp({expected})')"
|
||||
),
|
||||
q.to_string()
|
||||
),
|
||||
@@ -402,19 +402,19 @@ mod tests {
|
||||
#[test]
|
||||
fn test_transform_timestamp_alias() {
|
||||
// Timestamp[Second | Millisecond | Microsecond | Nanosecond]
|
||||
test_timestamp_alias("TimestampSecond", "Second");
|
||||
test_timestamp_alias("Timestamp_s", "Second");
|
||||
test_timestamp_alias("TimestampMillisecond", "Millisecond");
|
||||
test_timestamp_alias("Timestamp_ms", "Millisecond");
|
||||
test_timestamp_alias("TimestampMicrosecond", "Microsecond");
|
||||
test_timestamp_alias("Timestamp_us", "Microsecond");
|
||||
test_timestamp_alias("TimestampNanosecond", "Nanosecond");
|
||||
test_timestamp_alias("Timestamp_ns", "Nanosecond");
|
||||
test_timestamp_alias("TimestampSecond", "s");
|
||||
test_timestamp_alias("Timestamp_s", "s");
|
||||
test_timestamp_alias("TimestampMillisecond", "ms");
|
||||
test_timestamp_alias("Timestamp_ms", "ms");
|
||||
test_timestamp_alias("TimestampMicrosecond", "µs");
|
||||
test_timestamp_alias("Timestamp_us", "µs");
|
||||
test_timestamp_alias("TimestampNanosecond", "ns");
|
||||
test_timestamp_alias("Timestamp_ns", "ns");
|
||||
// Timestamp(precision)
|
||||
test_timestamp_precision_type(0, "Second");
|
||||
test_timestamp_precision_type(3, "Millisecond");
|
||||
test_timestamp_precision_type(6, "Microsecond");
|
||||
test_timestamp_precision_type(9, "Nanosecond");
|
||||
test_timestamp_precision_type(0, "s");
|
||||
test_timestamp_precision_type(3, "ms");
|
||||
test_timestamp_precision_type(6, "µs");
|
||||
test_timestamp_precision_type(9, "ns");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user