mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-27 18:30:38 +00:00
feat: buffered parquet writer (#1263)
* wip: use * rebase develop * chore: fix typos * feat: replace export parquet writer with buffered writer * fix: some cr comments * feat: add sst_write_buffer_size config item to configure how many bytes to buffer before flushing to underlying storage * chore: rebase onto develop
This commit is contained in:
@@ -150,6 +150,8 @@ pub struct CompactionConfig {
|
||||
pub max_files_in_level0: usize,
|
||||
/// Max task number for SST purge task after compaction.
|
||||
pub max_purge_tasks: usize,
|
||||
/// Buffer threshold while writing SST files
|
||||
pub sst_write_buffer_size: ReadableSize,
|
||||
}
|
||||
|
||||
impl Default for CompactionConfig {
|
||||
@@ -158,6 +160,7 @@ impl Default for CompactionConfig {
|
||||
max_inflight_tasks: 4,
|
||||
max_files_in_level0: 8,
|
||||
max_purge_tasks: 32,
|
||||
sst_write_buffer_size: ReadableSize::mb(8),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -177,6 +180,7 @@ impl From<&DatanodeOptions> for StorageEngineConfig {
|
||||
manifest_gc_duration: value.storage.manifest.gc_duration,
|
||||
max_files_in_l0: value.storage.compaction.max_files_in_level0,
|
||||
max_purge_tasks: value.storage.compaction.max_purge_tasks,
|
||||
sst_write_buffer_size: value.storage.compaction.sst_write_buffer_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -439,12 +439,6 @@ pub enum Error {
|
||||
backtrace: Backtrace,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to write parquet file, source: {}", source))]
|
||||
WriteParquet {
|
||||
source: parquet::errors::ParquetError,
|
||||
backtrace: Backtrace,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to poll stream, source: {}", source))]
|
||||
PollStream {
|
||||
source: datafusion_common::DataFusionError,
|
||||
@@ -514,6 +508,12 @@ pub enum Error {
|
||||
#[snafu(backtrace)]
|
||||
source: BoxedError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to copy table to parquet file, source: {}", source))]
|
||||
WriteParquet {
|
||||
#[snafu(backtrace)]
|
||||
source: storage::error::Error,
|
||||
},
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
@@ -12,24 +12,17 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::pin::Pin;
|
||||
|
||||
use common_datasource;
|
||||
use common_datasource::object_store::{build_backend, parse_url};
|
||||
use common_query::physical_plan::SessionContext;
|
||||
use common_query::Output;
|
||||
use common_recordbatch::adapter::DfRecordBatchStreamAdapter;
|
||||
use datafusion::parquet::arrow::ArrowWriter;
|
||||
use datafusion::parquet::basic::{Compression, Encoding, ZstdLevel};
|
||||
use datafusion::parquet::file::properties::WriterProperties;
|
||||
use datafusion::physical_plan::RecordBatchStream;
|
||||
use futures::TryStreamExt;
|
||||
use object_store::ObjectStore;
|
||||
use snafu::ResultExt;
|
||||
use storage::sst::SstInfo;
|
||||
use storage::{ParquetWriter, Source};
|
||||
use table::engine::TableReference;
|
||||
use table::requests::CopyTableRequest;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::error::{self, Result, WriteParquetSnafu};
|
||||
use crate::sql::SqlHandler;
|
||||
|
||||
impl SqlHandler {
|
||||
@@ -51,99 +44,20 @@ impl SqlHandler {
|
||||
let stream = stream
|
||||
.execute(0, SessionContext::default().task_ctx())
|
||||
.context(error::TableScanExecSnafu)?;
|
||||
let stream = Box::pin(DfRecordBatchStreamAdapter::new(stream));
|
||||
|
||||
let (_schema, _host, path) = parse_url(&req.location).context(error::ParseUrlSnafu)?;
|
||||
let object_store =
|
||||
build_backend(&req.location, req.connection).context(error::BuildBackendSnafu)?;
|
||||
|
||||
let mut parquet_writer = ParquetWriter::new(path.to_string(), stream, object_store);
|
||||
// TODO(jiachun):
|
||||
// For now, COPY is implemented synchronously.
|
||||
// When copying large table, it will be blocked for a long time.
|
||||
// Maybe we should make "copy" runs in background?
|
||||
// Like PG: https://www.postgresql.org/docs/current/sql-copy.html
|
||||
let rows = parquet_writer.flush().await?;
|
||||
let writer = ParquetWriter::new(&path, Source::Stream(stream), object_store);
|
||||
|
||||
Ok(Output::AffectedRows(rows))
|
||||
}
|
||||
}
|
||||
|
||||
type DfRecordBatchStream = Pin<Box<DfRecordBatchStreamAdapter>>;
|
||||
|
||||
struct ParquetWriter {
|
||||
file_name: String,
|
||||
stream: DfRecordBatchStream,
|
||||
object_store: ObjectStore,
|
||||
max_row_group_size: usize,
|
||||
max_rows_in_segment: usize,
|
||||
}
|
||||
|
||||
impl ParquetWriter {
|
||||
pub fn new(file_name: String, stream: DfRecordBatchStream, object_store: ObjectStore) -> Self {
|
||||
Self {
|
||||
file_name,
|
||||
stream,
|
||||
object_store,
|
||||
// TODO(jiachun): make these configurable: WITH (max_row_group_size=xxx, max_rows_in_segment=xxx)
|
||||
max_row_group_size: 4096,
|
||||
max_rows_in_segment: 5000000, // default 5M rows per segment
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn flush(&mut self) -> Result<usize> {
|
||||
let schema = self.stream.as_ref().schema();
|
||||
let writer_props = WriterProperties::builder()
|
||||
.set_compression(Compression::ZSTD(ZstdLevel::default()))
|
||||
.set_encoding(Encoding::PLAIN)
|
||||
.set_max_row_group_size(self.max_row_group_size)
|
||||
.build();
|
||||
let mut total_rows = 0;
|
||||
loop {
|
||||
let mut buf = vec![];
|
||||
let mut arrow_writer =
|
||||
ArrowWriter::try_new(&mut buf, schema.clone(), Some(writer_props.clone()))
|
||||
.context(error::WriteParquetSnafu)?;
|
||||
|
||||
let mut rows = 0;
|
||||
let mut end_loop = true;
|
||||
// TODO(hl & jiachun): Since OpenDAL's writer is async and ArrowWriter requires a `std::io::Write`,
|
||||
// here we use a Vec<u8> to buffer all parquet bytes in memory and write to object store
|
||||
// at a time. Maybe we should find a better way to bridge ArrowWriter and OpenDAL's object.
|
||||
while let Some(batch) = self
|
||||
.stream
|
||||
.try_next()
|
||||
.await
|
||||
.context(error::PollStreamSnafu)?
|
||||
{
|
||||
arrow_writer
|
||||
.write(&batch)
|
||||
.context(error::WriteParquetSnafu)?;
|
||||
rows += batch.num_rows();
|
||||
if rows >= self.max_rows_in_segment {
|
||||
end_loop = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let start_row_num = total_rows + 1;
|
||||
total_rows += rows;
|
||||
arrow_writer.close().context(error::WriteParquetSnafu)?;
|
||||
|
||||
// if rows == 0, we just end up with an empty file.
|
||||
//
|
||||
// file_name like:
|
||||
// "file_name_1_1000000" (row num: 1 ~ 1000000),
|
||||
// "file_name_1000001_xxx" (row num: 1000001 ~ xxx)
|
||||
let file_name = format!("{}_{}_{}", self.file_name, start_row_num, total_rows);
|
||||
self.object_store
|
||||
.write(&file_name, buf)
|
||||
.await
|
||||
.context(error::WriteObjectSnafu { path: file_name })?;
|
||||
|
||||
if end_loop {
|
||||
return Ok(total_rows);
|
||||
}
|
||||
}
|
||||
let rows_copied = writer
|
||||
.write_sst(&storage::sst::WriteOptions::default())
|
||||
.await
|
||||
.context(WriteParquetSnafu)?
|
||||
.map(|SstInfo { num_rows, .. }| num_rows)
|
||||
.unwrap_or(0);
|
||||
|
||||
Ok(Output::AffectedRows(rows_copied))
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user