feat: buffered parquet writer (#1263)

* wip: use

* rebase develop

* chore: fix typos

* feat: replace export parquet writer with buffered writer

* fix: some cr comments

* feat: add sst_write_buffer_size config item to config how many bytes to buffer before flush to underlying storage

* chore: reabse onto develop
This commit is contained in:
Lei, HUANG
2023-04-01 17:21:19 +08:00
committed by GitHub
parent 6a05f617a4
commit 0253136333
20 changed files with 346 additions and 172 deletions

View File

@@ -150,6 +150,8 @@ pub struct CompactionConfig {
pub max_files_in_level0: usize,
/// Max task number for SST purge task after compaction.
pub max_purge_tasks: usize,
/// Buffer threshold while writing SST files
pub sst_write_buffer_size: ReadableSize,
}
impl Default for CompactionConfig {
@@ -158,6 +160,7 @@ impl Default for CompactionConfig {
max_inflight_tasks: 4,
max_files_in_level0: 8,
max_purge_tasks: 32,
sst_write_buffer_size: ReadableSize::mb(8),
}
}
}
@@ -177,6 +180,7 @@ impl From<&DatanodeOptions> for StorageEngineConfig {
manifest_gc_duration: value.storage.manifest.gc_duration,
max_files_in_l0: value.storage.compaction.max_files_in_level0,
max_purge_tasks: value.storage.compaction.max_purge_tasks,
sst_write_buffer_size: value.storage.compaction.sst_write_buffer_size,
}
}
}

View File

@@ -439,12 +439,6 @@ pub enum Error {
backtrace: Backtrace,
},
#[snafu(display("Failed to write parquet file, source: {}", source))]
WriteParquet {
source: parquet::errors::ParquetError,
backtrace: Backtrace,
},
#[snafu(display("Failed to poll stream, source: {}", source))]
PollStream {
source: datafusion_common::DataFusionError,
@@ -514,6 +508,12 @@ pub enum Error {
#[snafu(backtrace)]
source: BoxedError,
},
#[snafu(display("Failed to copy table to parquet file, source: {}", source))]
WriteParquet {
#[snafu(backtrace)]
source: storage::error::Error,
},
}
pub type Result<T> = std::result::Result<T, Error>;

View File

@@ -12,24 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::pin::Pin;
use common_datasource;
use common_datasource::object_store::{build_backend, parse_url};
use common_query::physical_plan::SessionContext;
use common_query::Output;
use common_recordbatch::adapter::DfRecordBatchStreamAdapter;
use datafusion::parquet::arrow::ArrowWriter;
use datafusion::parquet::basic::{Compression, Encoding, ZstdLevel};
use datafusion::parquet::file::properties::WriterProperties;
use datafusion::physical_plan::RecordBatchStream;
use futures::TryStreamExt;
use object_store::ObjectStore;
use snafu::ResultExt;
use storage::sst::SstInfo;
use storage::{ParquetWriter, Source};
use table::engine::TableReference;
use table::requests::CopyTableRequest;
use crate::error::{self, Result};
use crate::error::{self, Result, WriteParquetSnafu};
use crate::sql::SqlHandler;
impl SqlHandler {
@@ -51,99 +44,20 @@ impl SqlHandler {
let stream = stream
.execute(0, SessionContext::default().task_ctx())
.context(error::TableScanExecSnafu)?;
let stream = Box::pin(DfRecordBatchStreamAdapter::new(stream));
let (_schema, _host, path) = parse_url(&req.location).context(error::ParseUrlSnafu)?;
let object_store =
build_backend(&req.location, req.connection).context(error::BuildBackendSnafu)?;
let mut parquet_writer = ParquetWriter::new(path.to_string(), stream, object_store);
// TODO(jiachun):
// For now, COPY is implemented synchronously.
// When copying large table, it will be blocked for a long time.
// Maybe we should make "copy" runs in background?
// Like PG: https://www.postgresql.org/docs/current/sql-copy.html
let rows = parquet_writer.flush().await?;
let writer = ParquetWriter::new(&path, Source::Stream(stream), object_store);
Ok(Output::AffectedRows(rows))
}
}
type DfRecordBatchStream = Pin<Box<DfRecordBatchStreamAdapter>>;
struct ParquetWriter {
file_name: String,
stream: DfRecordBatchStream,
object_store: ObjectStore,
max_row_group_size: usize,
max_rows_in_segment: usize,
}
impl ParquetWriter {
pub fn new(file_name: String, stream: DfRecordBatchStream, object_store: ObjectStore) -> Self {
Self {
file_name,
stream,
object_store,
// TODO(jiachun): make these configurable: WITH (max_row_group_size=xxx, max_rows_in_segment=xxx)
max_row_group_size: 4096,
max_rows_in_segment: 5000000, // default 5M rows per segment
}
}
pub async fn flush(&mut self) -> Result<usize> {
let schema = self.stream.as_ref().schema();
let writer_props = WriterProperties::builder()
.set_compression(Compression::ZSTD(ZstdLevel::default()))
.set_encoding(Encoding::PLAIN)
.set_max_row_group_size(self.max_row_group_size)
.build();
let mut total_rows = 0;
loop {
let mut buf = vec![];
let mut arrow_writer =
ArrowWriter::try_new(&mut buf, schema.clone(), Some(writer_props.clone()))
.context(error::WriteParquetSnafu)?;
let mut rows = 0;
let mut end_loop = true;
// TODO(hl & jiachun): Since OpenDAL's writer is async and ArrowWriter requires a `std::io::Write`,
// here we use a Vec<u8> to buffer all parquet bytes in memory and write to object store
// at a time. Maybe we should find a better way to bridge ArrowWriter and OpenDAL's object.
while let Some(batch) = self
.stream
.try_next()
.await
.context(error::PollStreamSnafu)?
{
arrow_writer
.write(&batch)
.context(error::WriteParquetSnafu)?;
rows += batch.num_rows();
if rows >= self.max_rows_in_segment {
end_loop = false;
break;
}
}
let start_row_num = total_rows + 1;
total_rows += rows;
arrow_writer.close().context(error::WriteParquetSnafu)?;
// if rows == 0, we just end up with an empty file.
//
// file_name like:
// "file_name_1_1000000" (row num: 1 ~ 1000000),
// "file_name_1000001_xxx" (row num: 1000001 ~ xxx)
let file_name = format!("{}_{}_{}", self.file_name, start_row_num, total_rows);
self.object_store
.write(&file_name, buf)
.await
.context(error::WriteObjectSnafu { path: file_name })?;
if end_loop {
return Ok(total_rows);
}
}
let rows_copied = writer
.write_sst(&storage::sst::WriteOptions::default())
.await
.context(WriteParquetSnafu)?
.map(|SstInfo { num_rows, .. }| num_rows)
.unwrap_or(0);
Ok(Output::AffectedRows(rows_copied))
}
}