Move functions for creating/extracting tarballs into utils

Useful for other code paths which will handle zstd compression and
decompression.
This commit is contained in:
Tristan Partin
2024-03-14 14:35:34 -05:00
committed by Tristan Partin
parent a8384a074e
commit 64c6dfd3e4
6 changed files with 97 additions and 74 deletions

3
Cargo.lock generated
View File

@@ -6468,6 +6468,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"async-trait",
"bincode",
"byteorder",
@@ -6506,12 +6507,14 @@ dependencies = [
"thiserror",
"tokio",
"tokio-stream",
"tokio-tar",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"uuid",
"walkdir",
"workspace_hack",
]

View File

@@ -13,6 +13,7 @@ testing = ["fail/failpoints"]
[dependencies]
arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
async-trait.workspace = true
anyhow.workspace = true
bincode.workspace = true
@@ -36,6 +37,7 @@ serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
@@ -46,6 +48,7 @@ strum.workspace = true
strum_macros.workspace = true
url.workspace = true
uuid.workspace = true
walkdir.workspace = true
pq_proto.workspace = true
postgres_connection.workspace = true

View File

@@ -87,6 +87,8 @@ pub mod failpoint_support;
pub mod yielding_loop;
pub mod zstd;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

78
libs/utils/src/zstd.rs Normal file
View File

@@ -0,0 +1,78 @@
use std::io::SeekFrom;
use anyhow::{Context, Result};
use async_compression::{
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
zstd::CParameter,
Level,
};
use camino::Utf8Path;
use nix::NixPath;
use tokio::{
fs::{File, OpenOptions},
io::AsyncBufRead,
io::AsyncSeekExt,
io::AsyncWriteExt,
};
use tokio_tar::{Archive, Builder, HeaderMode};
use walkdir::WalkDir;
/// Creates a Zstandard tarball.
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tarball)
.await
.with_context(|| format!("tempfile creation {tarball}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for p in paths {
let rel_path = p.strip_prefix(path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&p, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
/// Creates a Zstandard tarball.
pub async fn extract_zst_tarball(
path: &Utf8Path,
tarball: impl AsyncBufRead + Unpin,
) -> Result<()> {
let decoder = Box::pin(ZstdDecoder::new(tarball));
let mut archive = Archive::new(decoder);
archive.unpack(path).await?;
Ok(())
}

View File

@@ -2,28 +2,20 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! a neon Timeline.
//!
use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use anyhow::{bail, ensure, Context, Result};
use async_compression::tokio::bufread::ZstdDecoder;
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use nix::NixPath;
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tokio_tar::Builder;
use tokio_tar::HeaderMode;
use tracing::*;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
@@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
reader.read_to_end(&mut buf).await?;
Ok(Bytes::from(buf))
}
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tmp_path)
.await
.with_context(|| format!("tempfile creation {tmp_path}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(pgdata_path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for path in paths {
let rel_path = path.strip_prefix(pgdata_path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&path, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
}
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
pub async fn extract_tar_zst(
pgdata_path: &Utf8Path,
tar_zst: impl AsyncBufRead + Unpin,
) -> Result<()> {
let tar = Box::pin(ZstdDecoder::new(tar_zst));
let mut archive = Archive::new(tar);
archive.unpack(pgdata_path).await?;
Ok(())
}

View File

@@ -43,6 +43,8 @@ use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
use utils::timeout::timeout_cancellable;
use utils::timeout::TimeoutCancellableError;
use utils::zstd::create_zst_tarball;
use utils::zstd::extract_zst_tarball;
use self::config::AttachedLocationConfig;
use self::config::AttachmentMode;
@@ -3042,8 +3044,13 @@ impl Tenant {
}
}
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
warn!(
"compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
);
}
pausable_failpoint!("before-initdb-upload");
@@ -3143,7 +3150,7 @@ impl Tenant {
let buf_read =
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
extract_zst_tarball(&pgdata_path, buf_read)
.await
.context("extract initdb tar")?;
} else {