feat: optimize some parquet writer parameter (#1758)

This commit is contained in:
Lei, HUANG
2023-06-12 11:46:45 +08:00
committed by GitHub
parent ddcee052b2
commit 67c16dd631

View File

@@ -43,8 +43,9 @@ use parquet::basic::{Compression, Encoding, ZstdLevel};
use parquet::file::metadata::KeyValue;
use parquet::file::properties::WriterProperties;
use parquet::format::FileMetaData;
use parquet::schema::types::SchemaDescriptor;
use parquet::schema::types::{ColumnPath, SchemaDescriptor};
use snafu::{OptionExt, ResultExt};
use store_api::storage::consts::SEQUENCE_COLUMN_NAME;
use table::predicate::Predicate;
use tokio::io::BufReader;
@@ -87,7 +88,8 @@ impl<'a> ParquetWriter<'a> {
opts: &sst::WriteOptions,
) -> Result<Option<SstInfo>> {
let schema = self.source.schema();
let writer_props = WriterProperties::builder()
let mut props_builder = WriterProperties::builder()
.set_compression(Compression::ZSTD(ZstdLevel::default()))
.set_encoding(Encoding::PLAIN)
.set_max_row_group_size(self.max_row_group_size)
@@ -96,7 +98,23 @@ impl<'a> ParquetWriter<'a> {
.map(|(k, v)| KeyValue::new(k.clone(), v.clone()))
.collect::<Vec<_>>()
}))
.build();
.set_column_encoding(
ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]),
Encoding::DELTA_BINARY_PACKED,
)
.set_column_dictionary_enabled(
ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]),
false,
);
if let Some(ts_col) = schema.timestamp_column() {
props_builder = props_builder.set_column_encoding(
ColumnPath::new(vec![ts_col.name.clone()]),
Encoding::DELTA_BINARY_PACKED,
);
}
let writer_props = props_builder.build();
let mut buffered_writer = BufferedWriter::try_new(
self.file_path.to_string(),