mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-09 06:22:57 +00:00
Remove S3 archiving
This commit is contained in:
committed by
Kirill Bulatov
parent
44bfc529f6
commit
3e6087a12f
44
Cargo.lock
generated
44
Cargo.lock
generated
@@ -55,20 +55,6 @@ dependencies = [
|
||||
"backtrace",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-compression"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2bf394cfbbe876f0ac67b13b6ca819f9c9f2fb9ec67223cceb1555fbab1c31a"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"zstd",
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-stream"
|
||||
version = "0.3.3"
|
||||
@@ -1508,7 +1494,6 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -3428,32 +3413,3 @@ name = "zeroize"
|
||||
version = "1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c88870063c39ee00ec285a2f8d6a966e5b6fb2becc4e8dac77ed0d370ed6006"
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.10.0+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b1365becbe415f3f0fcd024e2f7b45bacfb5bdd055f0dc113571394114e7bdd"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "4.1.4+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f7cd17c9af1a4d6c24beb1cc54b17e2ef7b593dc92f19e9d9acad8b182bbaee"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "1.6.3+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
@@ -46,7 +46,6 @@ fail = "0.5.0"
|
||||
rusoto_core = "0.47"
|
||||
rusoto_s3 = "0.47"
|
||||
async-trait = "0.1"
|
||||
async-compression = {version = "0.3", features = ["zstd", "tokio"]}
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
|
||||
@@ -293,7 +293,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"http_endpoint_thread",
|
||||
false,
|
||||
move || {
|
||||
let router = http::make_router(conf, auth_cloned, remote_index);
|
||||
let router = http::make_router(conf, auth_cloned, remote_index)?;
|
||||
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
|
||||
},
|
||||
)?;
|
||||
|
||||
@@ -1,334 +0,0 @@
|
||||
//! A CLI helper to deal with remote storage (S3, usually) blobs as archives.
|
||||
//! See [`compression`] for more details about the archives.
|
||||
|
||||
use std::{collections::BTreeSet, path::Path};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use clap::{App, Arg};
|
||||
use pageserver::{
|
||||
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
remote_storage::compression,
|
||||
};
|
||||
use tokio::{fs, io};
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
const LIST_SUBCOMMAND: &str = "list";
|
||||
const ARCHIVE_ARG_NAME: &str = "archive";
|
||||
|
||||
const EXTRACT_SUBCOMMAND: &str = "extract";
|
||||
const TARGET_DIRECTORY_ARG_NAME: &str = "target_directory";
|
||||
|
||||
const CREATE_SUBCOMMAND: &str = "create";
|
||||
const SOURCE_DIRECTORY_ARG_NAME: &str = "source_directory";
|
||||
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("pageserver zst blob [un]compressor utility")
|
||||
.version(GIT_VERSION)
|
||||
.subcommands(vec![
|
||||
App::new(LIST_SUBCOMMAND)
|
||||
.about("List the archive contents")
|
||||
.arg(
|
||||
Arg::new(ARCHIVE_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("An archive to list the contents of"),
|
||||
),
|
||||
App::new(EXTRACT_SUBCOMMAND)
|
||||
.about("Extracts the archive into the directory")
|
||||
.arg(
|
||||
Arg::new(ARCHIVE_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("An archive to extract"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(TARGET_DIRECTORY_ARG_NAME)
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.help("A directory to extract the archive into. Optional, will use the current directory if not specified"),
|
||||
),
|
||||
App::new(CREATE_SUBCOMMAND)
|
||||
.about("Creates an archive with the contents of a directory (only the first level files are taken, metadata file has to be present in the same directory)")
|
||||
.arg(
|
||||
Arg::new(SOURCE_DIRECTORY_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("A directory to use for creating the archive"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(TARGET_DIRECTORY_ARG_NAME)
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.help("A directory to create the archive in. Optional, will use the current directory if not specified"),
|
||||
),
|
||||
])
|
||||
.get_matches();
|
||||
|
||||
let subcommand_name = match arg_matches.subcommand_name() {
|
||||
Some(name) => name,
|
||||
None => bail!("No subcommand specified"),
|
||||
};
|
||||
|
||||
let subcommand_matches = match arg_matches.subcommand_matches(subcommand_name) {
|
||||
Some(matches) => matches,
|
||||
None => bail!(
|
||||
"No subcommand arguments were recognized for subcommand '{}'",
|
||||
subcommand_name
|
||||
),
|
||||
};
|
||||
|
||||
let target_dir = Path::new(
|
||||
subcommand_matches
|
||||
.value_of(TARGET_DIRECTORY_ARG_NAME)
|
||||
.unwrap_or("./"),
|
||||
);
|
||||
|
||||
match subcommand_name {
|
||||
LIST_SUBCOMMAND => {
|
||||
let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
|
||||
Some(archive) => Path::new(archive),
|
||||
None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
|
||||
};
|
||||
list_archive(archive).await
|
||||
}
|
||||
EXTRACT_SUBCOMMAND => {
|
||||
let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
|
||||
Some(archive) => Path::new(archive),
|
||||
None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
|
||||
};
|
||||
extract_archive(archive, target_dir).await
|
||||
}
|
||||
CREATE_SUBCOMMAND => {
|
||||
let source_dir = match subcommand_matches.value_of(SOURCE_DIRECTORY_ARG_NAME) {
|
||||
Some(source) => Path::new(source),
|
||||
None => bail!("No '{}' argument is specified", SOURCE_DIRECTORY_ARG_NAME),
|
||||
};
|
||||
create_archive(source_dir, target_dir).await
|
||||
}
|
||||
unknown => bail!("Unknown subcommand {}", unknown),
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_archive(archive: &Path) -> anyhow::Result<()> {
|
||||
let archive = archive.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the archive path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
archive.is_file(),
|
||||
"Path '{}' is not an archive file",
|
||||
archive.display()
|
||||
);
|
||||
println!("Listing an archive at path '{}'", archive.display());
|
||||
let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
|
||||
Some(name) => name,
|
||||
None => bail!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
),
|
||||
};
|
||||
|
||||
let archive_bytes = fs::read(&archive)
|
||||
.await
|
||||
.context("Failed to read the archive bytes")?;
|
||||
|
||||
let header = compression::read_archive_header(archive_name, &mut archive_bytes.as_slice())
|
||||
.await
|
||||
.context("Failed to read the archive header")?;
|
||||
|
||||
let empty_path = Path::new("");
|
||||
println!("-------------------------------");
|
||||
|
||||
let longest_path_in_archive = header
|
||||
.files
|
||||
.iter()
|
||||
.filter_map(|file| Some(file.subpath.as_path(empty_path).to_str()?.len()))
|
||||
.max()
|
||||
.unwrap_or_default()
|
||||
.max(METADATA_FILE_NAME.len());
|
||||
|
||||
for regular_file in &header.files {
|
||||
println!(
|
||||
"File: {:width$} uncompressed size: {} bytes",
|
||||
regular_file.subpath.as_path(empty_path).display(),
|
||||
regular_file.size,
|
||||
width = longest_path_in_archive,
|
||||
)
|
||||
}
|
||||
println!(
|
||||
"File: {:width$} uncompressed size: {} bytes",
|
||||
METADATA_FILE_NAME,
|
||||
header.metadata_file_size,
|
||||
width = longest_path_in_archive,
|
||||
);
|
||||
println!("-------------------------------");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn extract_archive(archive: &Path, target_dir: &Path) -> anyhow::Result<()> {
|
||||
let archive = archive.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the archive path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
archive.is_file(),
|
||||
"Path '{}' is not an archive file",
|
||||
archive.display()
|
||||
);
|
||||
let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
|
||||
Some(name) => name,
|
||||
None => bail!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
),
|
||||
};
|
||||
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create the target dir at path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
let target_dir = target_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the target dir path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
target_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
target_dir.display()
|
||||
);
|
||||
let mut dir_contents = fs::read_dir(&target_dir)
|
||||
.await
|
||||
.context("Failed to list the target directory contents")?;
|
||||
let dir_entry = dir_contents
|
||||
.next_entry()
|
||||
.await
|
||||
.context("Failed to list the target directory contents")?;
|
||||
ensure!(
|
||||
dir_entry.is_none(),
|
||||
"Target directory '{}' is not empty",
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
println!(
|
||||
"Extracting an archive at path '{}' into directory '{}'",
|
||||
archive.display(),
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
let mut archive_file = fs::File::open(&archive).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
let header = compression::read_archive_header(archive_name, &mut archive_file)
|
||||
.await
|
||||
.context("Failed to read the archive header")?;
|
||||
compression::uncompress_with_header(&BTreeSet::new(), &target_dir, header, &mut archive_file)
|
||||
.await
|
||||
.context("Failed to extract the archive")
|
||||
}
|
||||
|
||||
async fn create_archive(source_dir: &Path, target_dir: &Path) -> anyhow::Result<()> {
|
||||
let source_dir = source_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the source dir path '{}'",
|
||||
source_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
source_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
source_dir.display()
|
||||
);
|
||||
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create the target dir at path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
let target_dir = target_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the target dir path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
target_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
println!(
|
||||
"Compressing directory '{}' and creating resulting archive in directory '{}'",
|
||||
source_dir.display(),
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
let mut metadata_file_contents = None;
|
||||
let mut files_co_archive = Vec::new();
|
||||
|
||||
let mut source_dir_contents = fs::read_dir(&source_dir)
|
||||
.await
|
||||
.context("Failed to read the source directory contents")?;
|
||||
|
||||
while let Some(source_dir_entry) = source_dir_contents
|
||||
.next_entry()
|
||||
.await
|
||||
.context("Failed to read a source dir entry")?
|
||||
{
|
||||
let entry_path = source_dir_entry.path();
|
||||
if entry_path.is_file() {
|
||||
if entry_path.file_name().and_then(|name| name.to_str()) == Some(METADATA_FILE_NAME) {
|
||||
let metadata_bytes = fs::read(entry_path)
|
||||
.await
|
||||
.context("Failed to read metata file bytes in the source dir")?;
|
||||
metadata_file_contents = Some(
|
||||
TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.context("Failed to parse metata file contents in the source dir")?,
|
||||
);
|
||||
} else {
|
||||
files_co_archive.push(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let metadata = match metadata_file_contents {
|
||||
Some(metadata) => metadata,
|
||||
None => bail!(
|
||||
"No metadata file found in the source dir '{}', cannot create the archive",
|
||||
source_dir.display()
|
||||
),
|
||||
};
|
||||
|
||||
let _ = compression::archive_files_as_stream(
|
||||
&source_dir,
|
||||
files_co_archive.iter(),
|
||||
&metadata,
|
||||
move |mut archive_streamer, archive_name| async move {
|
||||
let archive_target = target_dir.join(&archive_name);
|
||||
let mut archive_file = fs::File::create(&archive_target).await?;
|
||||
io::copy(&mut archive_streamer, &mut archive_file).await?;
|
||||
Ok(archive_target)
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("Failed to create an archive")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -409,6 +409,7 @@ components:
|
||||
type: object
|
||||
required:
|
||||
- awaits_download
|
||||
- remote_consistent_lsn
|
||||
properties:
|
||||
awaits_download:
|
||||
type: boolean
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use tracing::*;
|
||||
@@ -21,7 +21,10 @@ use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId};
|
||||
use super::models::{
|
||||
StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest,
|
||||
};
|
||||
use crate::remote_storage::{schedule_timeline_download, RemoteIndex};
|
||||
use crate::config::RemoteStorageKind;
|
||||
use crate::remote_storage::{
|
||||
download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket,
|
||||
};
|
||||
use crate::repository::Repository;
|
||||
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
|
||||
@@ -31,6 +34,12 @@ struct State {
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
}
|
||||
|
||||
enum GenericRemoteStorage {
|
||||
Local(LocalFs),
|
||||
S3(S3Bucket),
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -38,17 +47,34 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> Self {
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
Self {
|
||||
// Note that this remote storage is created separately from the main one in the sync_loop.
|
||||
// It's fine since it's stateless and some code duplication saves us from bloating the code around with generics.
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
LocalFs::new(root.clone(), &conf.workdir).map(GenericRemoteStorage::Local)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
S3Bucket::new(s3_config, &conf.workdir).map(GenericRemoteStorage::S3)
|
||||
}
|
||||
})
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_index,
|
||||
}
|
||||
remote_storage,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,8 +148,8 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.get_awaits_download(),
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -184,8 +210,8 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.get_awaits_download(),
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
})
|
||||
};
|
||||
|
||||
@@ -212,41 +238,105 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let span = info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id);
|
||||
info!(
|
||||
"Handling timeline {} attach for tenant: {}",
|
||||
timeline_id, tenant_id,
|
||||
);
|
||||
|
||||
let span = tokio::task::spawn_blocking(move || {
|
||||
let entered = span.entered();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).is_ok() {
|
||||
// TODO: maybe answer with 309 Not Modified here?
|
||||
anyhow::bail!("Timeline is already present locally")
|
||||
};
|
||||
Ok(entered.exit())
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let mut remote_index_write = get_state(&request).remote_index.write().await;
|
||||
let sync_id = ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
|
||||
let _enter = span.entered(); // entered guard cannot live across awaits (non Send)
|
||||
let index_entry = remote_index_write
|
||||
.timeline_entry_mut(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.ok_or_else(|| ApiError::NotFound("Unknown remote timeline".to_string()))?;
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if index_entry.get_awaits_download() {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
remote_timeline.awaits_download = true;
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
} else {
|
||||
// no timeline in the index, release the lock to make the potentially lengthy download opetation
|
||||
drop(index_accessor);
|
||||
}
|
||||
|
||||
index_entry.set_awaits_download(true);
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
let new_timeline = match try_download_shard_data(state, sync_id).await {
|
||||
Ok(Some(mut new_timeline)) => {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")?;
|
||||
new_timeline.awaits_download = true;
|
||||
new_timeline
|
||||
}
|
||||
Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())),
|
||||
Err(e) => {
|
||||
error!("Failed to retrieve remote timeline data: {:?}", e);
|
||||
return Err(ApiError::NotFound(
|
||||
"Failed to retrieve remote timeline".to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
match index_accessor.timeline_entry_mut(&sync_id) {
|
||||
Some(remote_timeline) => {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
remote_timeline.awaits_download = true;
|
||||
}
|
||||
None => index_accessor.add_timeline_entry(sync_id, new_timeline),
|
||||
}
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn try_download_shard_data(
|
||||
state: &State,
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> anyhow::Result<Option<RemoteTimeline>> {
|
||||
let shard = match state.remote_storage.as_ref() {
|
||||
Some(GenericRemoteStorage::Local(local_storage)) => {
|
||||
download_index_part(state.conf, local_storage, sync_id).await
|
||||
}
|
||||
Some(GenericRemoteStorage::S3(s3_storage)) => {
|
||||
download_index_part(state.conf, s3_storage, sync_id).await
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
.with_context(|| format!("Failed to download index shard for timeline {}", sync_id))?;
|
||||
|
||||
let timeline_path = state
|
||||
.conf
|
||||
.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
RemoteTimeline::from_index_part(&timeline_path, shard)
|
||||
.map(Some)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to convert index shard into remote timeline for timeline {}",
|
||||
sync_id
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
@@ -317,7 +407,7 @@ pub fn make_router(
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
if auth.is_some() {
|
||||
@@ -331,8 +421,10 @@ pub fn make_router(
|
||||
}))
|
||||
}
|
||||
|
||||
router
|
||||
.data(Arc::new(State::new(conf, auth, remote_index)))
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_index).context("Failed to initialize router state")?,
|
||||
))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
@@ -350,5 +442,5 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
|
||||
timeline_detach_handler,
|
||||
)
|
||||
.any(handler_404)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -387,8 +387,6 @@ impl Repository for LayeredRepository {
|
||||
timeline_id, timeline_sync_status_update
|
||||
);
|
||||
match timeline_sync_status_update {
|
||||
TimelineSyncStatusUpdate::Uploaded => { /* nothing to do, remote consistent lsn is managed by the remote storage */
|
||||
}
|
||||
TimelineSyncStatusUpdate::Downloaded => {
|
||||
match self.timelines.lock().unwrap().entry(timeline_id) {
|
||||
Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
|
||||
@@ -650,7 +648,8 @@ impl LayeredRepository {
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult> {
|
||||
let _span_guard =
|
||||
info_span!("gc iteration", tenant = %self.tenantid, timeline = ?target_timelineid);
|
||||
info_span!("gc iteration", tenant = %self.tenantid, timeline = ?target_timelineid)
|
||||
.entered();
|
||||
let mut totals: GcResult = Default::default();
|
||||
let now = Instant::now();
|
||||
|
||||
@@ -1548,7 +1547,7 @@ impl LayeredTimeline {
|
||||
schedule_timeline_checkpoint_upload(
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
vec![new_delta_path],
|
||||
new_delta_path,
|
||||
metadata,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
//!
|
||||
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
|
||||
//! Synchronization internals are split into submodules
|
||||
//! * [`storage_sync::compression`] for a custom remote storage format used to store timeline files in archives
|
||||
//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files
|
||||
//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively
|
||||
//!
|
||||
@@ -54,25 +53,32 @@
|
||||
//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
|
||||
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
|
||||
//!
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, [`TimelineSyncState`],
|
||||
//! submitted via [`crate::tenant_mgr::set_timeline_states`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`],
|
||||
//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Such submissions happen in two cases:
|
||||
//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
|
||||
//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
|
||||
//!
|
||||
//! When the pageserver terminates, the upload loop finishes a current sync task (if any) and exits.
|
||||
//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits.
|
||||
//!
|
||||
//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`).
|
||||
//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file).
|
||||
//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
|
||||
//! by the storage upload, if enabled.
|
||||
//! Yet timeline cannot alter already existing files, and normally cannot remote those too: only a GC process is capable of removing unused files.
|
||||
//! Yet timeline cannot alter already existing files, and cannot remove those too: only a GC process is capable of removing unused files.
|
||||
//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
|
||||
//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
|
||||
//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
|
||||
//! when the newer image is downloaded
|
||||
//!
|
||||
//! To optimize S3 storage (and access), the sync loop compresses the checkpoint files before placing them to S3, and uncompresses them back, keeping track of timeline files and metadata.
|
||||
//! Also, the remote file list is queried once only, at startup, to avoid possible extra costs and latency issues.
|
||||
//! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure.
|
||||
//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files.
|
||||
//! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download.
|
||||
//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`],
|
||||
//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files.
|
||||
//!
|
||||
//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed.
|
||||
//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only,
|
||||
//! when a new timeline is scheduled for the download.
|
||||
//!
|
||||
//! NOTES:
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage
|
||||
@@ -86,7 +92,7 @@ mod s3_bucket;
|
||||
mod storage_sync;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
collections::{HashMap, HashSet},
|
||||
ffi, fs,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
@@ -94,22 +100,36 @@ use std::{
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::io;
|
||||
use tracing::{debug, error, info};
|
||||
use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
pub use self::storage_sync::index::{RemoteIndex, TimelineIndexEntry};
|
||||
pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download};
|
||||
use self::{local_fs::LocalFs, s3_bucket::S3Bucket};
|
||||
use crate::layered_repository::ephemeral_file::is_ephemeral_file;
|
||||
pub use self::{
|
||||
local_fs::LocalFs,
|
||||
s3_bucket::S3Bucket,
|
||||
storage_sync::{
|
||||
download_index_part,
|
||||
index::{IndexPart, RemoteIndex, RemoteTimeline},
|
||||
schedule_timeline_checkpoint_upload, schedule_timeline_download,
|
||||
},
|
||||
};
|
||||
use crate::{
|
||||
config::{PageServerConf, RemoteStorageKind},
|
||||
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
layered_repository::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
},
|
||||
};
|
||||
use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
pub use storage_sync::compression;
|
||||
|
||||
/// A timeline status to share with pageserver's sync counterpart,
|
||||
/// after comparing local and remote timeline state.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum LocalTimelineInitStatus {
|
||||
/// The timeline has every remote layer present locally.
|
||||
/// There could be some layers requiring uploading,
|
||||
/// but this does not block the timeline from any user interaction.
|
||||
LocallyComplete,
|
||||
/// A timeline has some files remotely, that are not present locally and need downloading.
|
||||
/// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only,
|
||||
/// so the data needs to be downloaded first before the timeline can be used.
|
||||
NeedsSync,
|
||||
}
|
||||
|
||||
@@ -179,7 +199,7 @@ pub fn start_local_timeline_sync(
|
||||
|
||||
fn local_tenant_timeline_files(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>> {
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
let mut local_tenant_timeline_files = HashMap::new();
|
||||
let tenants_dir = config.tenants_path();
|
||||
for tenants_dir_entry in fs::read_dir(&tenants_dir)
|
||||
@@ -214,9 +234,8 @@ fn local_tenant_timeline_files(
|
||||
fn collect_timelines_for_tenant(
|
||||
config: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>> {
|
||||
let mut timelines: HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)> =
|
||||
HashMap::new();
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
let mut timelines = HashMap::new();
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(ffi::OsStr::to_str)
|
||||
@@ -265,8 +284,8 @@ fn collect_timelines_for_tenant(
|
||||
// NOTE: ephemeral files are excluded from the list
|
||||
fn collect_timeline_files(
|
||||
timeline_dir: &Path,
|
||||
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec<PathBuf>)> {
|
||||
let mut timeline_files = Vec::new();
|
||||
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet<PathBuf>)> {
|
||||
let mut timeline_files = HashSet::new();
|
||||
let mut timeline_metadata_path = None;
|
||||
|
||||
let timeline_id = timeline_dir
|
||||
@@ -286,7 +305,7 @@ fn collect_timeline_files(
|
||||
debug!("skipping ephemeral file {}", entry_path.display());
|
||||
continue;
|
||||
} else {
|
||||
timeline_files.push(entry_path);
|
||||
timeline_files.insert(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -307,7 +326,7 @@ fn collect_timeline_files(
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
trait RemoteStorage: Send + Sync {
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type StoragePath;
|
||||
|
||||
@@ -324,9 +343,9 @@ trait RemoteStorage: Send + Sync {
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
/// S3 PUT request requires the content length to be specified,
|
||||
/// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
from_size_kb: usize,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
from_size_bytes: usize,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
# Non-implementation details
|
||||
|
||||
This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are the future development plans.
|
||||
Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules.
|
||||
Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
|
||||
|
||||
## Approach
|
||||
|
||||
Backup functionality is a new component, appeared way after the core DB functionality was implemented.
|
||||
Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time.
|
||||
|
||||
To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
|
||||
This way, the backups are managed in background, not affecting directly other pageserver parts: this way the backup and restoration process may lag behind, but eventually keep up with the reality. To track that, a set of prometheus metrics is exposed from pageserver.
|
||||
|
||||
## What's done
|
||||
|
||||
Current implementation
|
||||
* provides remote storage wrappers for AWS S3 and local FS
|
||||
* synchronizes the differences with local timelines and remote states as fast as possible
|
||||
* uploads new layer files
|
||||
* downloads and registers timelines, found on the remote storage, but missing locally, if those are requested somehow via pageserver (e.g. http api, gc)
|
||||
* uses compression when deals with files, for better S3 usage
|
||||
* maintains an index of what's stored remotely
|
||||
* evicts failing tasks and stops the corresponding timelines
|
||||
|
||||
The tasks are delayed with every retry and the retries are capped, to avoid poisonous tasks.
|
||||
After any task eviction, or any error at startup checks (e.g. obviously different and wrong local and remote states fot the same timeline),
|
||||
the timeline has to be stopped from submitting further checkpoint upload tasks, which is done along the corresponding timeline status change.
|
||||
|
||||
No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time.
|
||||
It's planned to deal with all questions that are currently on and prepare the feature to be enabled by default in cloud environments.
|
||||
|
||||
### Peculiarities
|
||||
|
||||
As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
|
||||
Here's the list of known compromises with comments:
|
||||
|
||||
* Remote storage file model is currently a custom archive format, that's not possible to deserialize without a particular Rust code of ours (including `serde`).
|
||||
We also don't optimize the archivation and pack every timeline checkpoint separately, so the resulting blob's size that gets on S3 could be arbitrary.
|
||||
But, it's a single blob, which is way better than storing ~780 small files separately.
|
||||
|
||||
* Archive index restoration requires reading every blob's head.
|
||||
This could be avoided by a background thread/future storing the serialized index in the remote storage.
|
||||
|
||||
* no proper file comparison
|
||||
|
||||
No file checksum assertion is done currently, but should be (AWS S3 returns file checksums during the `list` operation)
|
||||
|
||||
* gc is ignored
|
||||
|
||||
So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage.
|
||||
Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.
|
||||
@@ -105,7 +105,7 @@ impl RemoteStorage for LocalFs {
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_kb: usize,
|
||||
from_size_bytes: usize,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -129,7 +129,11 @@ impl RemoteStorage for LocalFs {
|
||||
})?,
|
||||
);
|
||||
|
||||
io::copy(&mut from.take(from_size_kb as u64), &mut destination)
|
||||
let from_size_bytes = from_size_bytes as u64;
|
||||
// Require to read 1 byte more than the expected to check later, that the stream and its size match.
|
||||
let mut buffer_to_read = from.take(from_size_bytes + 1);
|
||||
|
||||
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
@@ -138,6 +142,19 @@ impl RemoteStorage for LocalFs {
|
||||
)
|
||||
})?;
|
||||
|
||||
ensure!(
|
||||
bytes_read == from_size_bytes,
|
||||
"Provided stream has actual size {} fthat is smaller than the given stream size {}",
|
||||
bytes_read,
|
||||
from_size_bytes
|
||||
);
|
||||
|
||||
ensure!(
|
||||
buffer_to_read.read(&mut [0]).await? == 0,
|
||||
"Provided stream has bigger size than the given stream size {}",
|
||||
from_size_bytes
|
||||
);
|
||||
|
||||
destination.flush().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload (flush temp) file to the local storage at '{}'",
|
||||
|
||||
@@ -17,7 +17,7 @@ use rusoto_s3::{
|
||||
};
|
||||
use tokio::io;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::{debug, trace};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{
|
||||
config::S3Config,
|
||||
@@ -70,10 +70,6 @@ pub struct S3Bucket {
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
// TODO kb check this
|
||||
// Keeping a single client may cause issues due to timeouts.
|
||||
// https://github.com/rusoto/rusoto/issues/1686
|
||||
|
||||
debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
@@ -91,10 +87,10 @@ impl S3Bucket {
|
||||
let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
|
||||
let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none()
|
||||
{
|
||||
trace!("Using IAM-based AWS access");
|
||||
debug!("Using IAM-based AWS access");
|
||||
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
|
||||
} else {
|
||||
trace!("Using credentials-based AWS access");
|
||||
debug!("Using credentials-based AWS access");
|
||||
S3Client::new_with(
|
||||
request_dispatcher,
|
||||
StaticProvider::new_minimal(
|
||||
@@ -180,7 +176,7 @@ impl RemoteStorage for S3Bucket {
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_kb: usize,
|
||||
from_size_bytes: usize,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -188,7 +184,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.put_object(PutObjectRequest {
|
||||
body: Some(StreamingBody::new_with_size(
|
||||
ReaderStream::new(from),
|
||||
from_size_kb,
|
||||
from_size_bytes,
|
||||
)),
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: to.key().to_owned(),
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,612 +0,0 @@
|
||||
//! A set of structs to represent a compressed part of the timeline, and methods to asynchronously compress and uncompress a stream of data,
|
||||
//! without holding the entire data in memory.
|
||||
//! For the latter, both compress and uncompress functions operate buffered streams (currently hardcoded size of [`ARCHIVE_STREAM_BUFFER_SIZE_BYTES`]),
|
||||
//! not attempting to hold the entire archive in memory.
|
||||
//!
|
||||
//! The compression is done with <a href="https://datatracker.ietf.org/doc/html/rfc8878">zstd</a> streaming algorithm via the `async-compression` crate.
|
||||
//! The crate does not contain any knobs to tweak the compression, but otherwise is one of the only ones that's both async and has an API to manage the part of an archive.
|
||||
//! Zstd was picked as the best algorithm among the ones available in the crate, after testing the initial timeline file compression.
|
||||
//!
|
||||
//! Archiving is almost agnostic to timeline file types, with an exception of the metadata file, that's currently distinguished in the [un]compression code.
|
||||
//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file.
|
||||
//! When compressed, the metadata file is always required and stored as the last file in the archive stream.
|
||||
//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other layer files are decompressed successfully first.
|
||||
//!
|
||||
//! Archive structure:
|
||||
//! +----------------------------------------+
|
||||
//! | header | file_1, ..., file_k, metadata |
|
||||
//! +----------------------------------------+
|
||||
//!
|
||||
//! The archive consists of two separate zstd archives:
|
||||
//! * header archive, that contains all files names and their sizes and relative paths in the timeline directory
|
||||
//! Header is a Rust structure, serialized into bytes and compressed with zstd.
|
||||
//! * files archive, that has metadata file as the last one, all compressed with zstd into a single binary blob
|
||||
//!
|
||||
//! Header offset is stored in the file name, along with the `disk_consistent_lsn` from the metadata file.
|
||||
//! See [`parse_archive_name`] and [`ARCHIVE_EXTENSION`] for the name details, example: `00000000016B9150-.zst_9732`.
|
||||
//! This way, the header could be retrieved without reading an entire archive file.
|
||||
|
||||
use std::{
|
||||
collections::BTreeSet,
|
||||
future::Future,
|
||||
io::Cursor,
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncWriteExt},
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME};
|
||||
|
||||
use super::index::RelativePath;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct ArchiveHeader {
|
||||
/// All regular timeline files, excluding the metadata file.
|
||||
pub files: Vec<FileEntry>,
|
||||
// Metadata file name is known to the system, as its location relative to the timeline dir,
|
||||
// so no need to store anything but its size in bytes.
|
||||
pub metadata_file_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct FileEntry {
|
||||
/// Uncompressed file size, bytes.
|
||||
pub size: u64,
|
||||
/// A path, relative to the directory root, used when compressing the directory contents.
|
||||
pub subpath: RelativePath,
|
||||
}
|
||||
|
||||
const ARCHIVE_EXTENSION: &str = "-.zst_";
|
||||
const ARCHIVE_STREAM_BUFFER_SIZE_BYTES: usize = 4 * 1024 * 1024;
|
||||
|
||||
/// Streams an archive of files given into a stream target, defined by the closure.
|
||||
///
|
||||
/// The closure approach is picked for cases like S3, where we would need a name of the file before we can get a stream to write the bytes into.
|
||||
/// Current idea is to place the header size in the name of the file, to enable the fast partial remote file index restoration without actually reading remote storage file contents.
|
||||
///
|
||||
/// Performs the compression in multiple steps:
|
||||
/// * prepares an archive header, stripping the `source_dir` prefix from the `files`
|
||||
/// * generates the name of the archive
|
||||
/// * prepares archive producer future, knowing the header and the file list
|
||||
/// An `impl AsyncRead` and `impl AsyncWrite` pair of connected streams is created to implement the partial contents streaming.
|
||||
/// The writer end gets into the archive producer future, to put the header and a stream of compressed files.
|
||||
/// * prepares archive consumer future, by executing the provided closure
|
||||
/// The closure gets the reader end stream and the name of the file to create a future that would stream the file contents elsewhere.
|
||||
/// * runs and waits for both futures to complete
|
||||
/// * on a successful completion of both futures, header, its size and the user-defined consumer future return data is returned
|
||||
/// Due to the design above, the archive name and related data is visible inside the consumer future only, so it's possible to return the data,
|
||||
/// needed for future processing.
|
||||
pub async fn archive_files_as_stream<Cons, ConsRet, Fut>(
|
||||
source_dir: &Path,
|
||||
files: impl Iterator<Item = &PathBuf>,
|
||||
metadata: &TimelineMetadata,
|
||||
create_archive_consumer: Cons,
|
||||
) -> anyhow::Result<(ArchiveHeader, u64, ConsRet)>
|
||||
where
|
||||
Cons: FnOnce(Box<dyn io::AsyncRead + Unpin + Send + Sync + 'static>, String) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
Fut: Future<Output = anyhow::Result<ConsRet>> + Send + 'static,
|
||||
ConsRet: Send + Sync + 'static,
|
||||
{
|
||||
let metadata_bytes = metadata
|
||||
.to_bytes()
|
||||
.context("Failed to create metadata bytes")?;
|
||||
let (archive_header, compressed_header_bytes) =
|
||||
prepare_header(source_dir, files, &metadata_bytes)
|
||||
.await
|
||||
.context("Failed to prepare file for archivation")?;
|
||||
|
||||
let header_size = compressed_header_bytes.len() as u64;
|
||||
let (write, read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
|
||||
let archive_filler = write_archive_contents(
|
||||
source_dir.to_path_buf(),
|
||||
archive_header.clone(),
|
||||
metadata_bytes,
|
||||
write,
|
||||
);
|
||||
let archive_name = archive_name(metadata.disk_consistent_lsn(), header_size);
|
||||
let archive_stream =
|
||||
Cursor::new(compressed_header_bytes).chain(ZstdEncoder::new(io::BufReader::new(read)));
|
||||
|
||||
let (archive_creation_result, archive_upload_result) = tokio::join!(
|
||||
tokio::spawn(archive_filler),
|
||||
tokio::spawn(async move {
|
||||
create_archive_consumer(Box::new(archive_stream), archive_name).await
|
||||
})
|
||||
);
|
||||
archive_creation_result
|
||||
.context("Failed to spawn archive creation future")?
|
||||
.context("Failed to create an archive")?;
|
||||
let upload_return_value = archive_upload_result
|
||||
.context("Failed to spawn archive upload future")?
|
||||
.context("Failed to upload the archive")?;
|
||||
|
||||
Ok((archive_header, header_size, upload_return_value))
|
||||
}
|
||||
|
||||
/// Similar to [`archive_files_as_stream`], creates a pair of streams to uncompress the 2nd part of the archive,
|
||||
/// that contains files and is located after the header.
|
||||
/// S3 allows downloading partial file contents for a given file key (i.e. name), to accommodate this retrieval,
|
||||
/// a closure is used.
|
||||
/// Same concepts with two concurrent futures, user-defined closure, future and return value apply here, but the
|
||||
/// consumer and the receiver ends are swapped, since the uncompression happens.
|
||||
pub async fn uncompress_file_stream_with_index<Prod, ProdRet, Fut>(
|
||||
destination_dir: PathBuf,
|
||||
files_to_skip: Arc<BTreeSet<PathBuf>>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
header: ArchiveHeader,
|
||||
header_size: u64,
|
||||
create_archive_file_part: Prod,
|
||||
) -> anyhow::Result<ProdRet>
|
||||
where
|
||||
Prod: FnOnce(Box<dyn io::AsyncWrite + Unpin + Send + Sync + 'static>, String) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
Fut: Future<Output = anyhow::Result<ProdRet>> + Send + 'static,
|
||||
ProdRet: Send + Sync + 'static,
|
||||
{
|
||||
let (write, mut read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
|
||||
let archive_name = archive_name(disk_consistent_lsn, header_size);
|
||||
|
||||
let (archive_download_result, archive_uncompress_result) = tokio::join!(
|
||||
tokio::spawn(async move { create_archive_file_part(Box::new(write), archive_name).await }),
|
||||
tokio::spawn(async move {
|
||||
uncompress_with_header(&files_to_skip, &destination_dir, header, &mut read).await
|
||||
})
|
||||
);
|
||||
|
||||
let download_value = archive_download_result
|
||||
.context("Failed to spawn archive download future")?
|
||||
.context("Failed to download an archive")?;
|
||||
archive_uncompress_result
|
||||
.context("Failed to spawn archive uncompress future")?
|
||||
.context("Failed to uncompress the archive")?;
|
||||
|
||||
Ok(download_value)
|
||||
}
|
||||
|
||||
/// Reads archive header from the stream given:
|
||||
/// * parses the file name to get the header size
|
||||
/// * reads the exact amount of bytes
|
||||
/// * uncompresses and deserializes those
|
||||
pub async fn read_archive_header<A: io::AsyncRead + Send + Sync + Unpin>(
|
||||
archive_name: &str,
|
||||
from: &mut A,
|
||||
) -> anyhow::Result<ArchiveHeader> {
|
||||
let (_, header_size) = parse_archive_name(Path::new(archive_name))?;
|
||||
|
||||
let mut compressed_header_bytes = vec![0; header_size as usize];
|
||||
from.read_exact(&mut compressed_header_bytes)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to read header header from the archive {}",
|
||||
archive_name
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut header_bytes = Vec::new();
|
||||
ZstdDecoder::new(io::BufReader::new(compressed_header_bytes.as_slice()))
|
||||
.read_to_end(&mut header_bytes)
|
||||
.await
|
||||
.context("Failed to decompress a header from the archive")?;
|
||||
|
||||
ArchiveHeader::des(&header_bytes).context("Failed to deserialize a header from the archive")
|
||||
}
|
||||
|
||||
/// Reads the archive metadata out of the archive name:
|
||||
/// * `disk_consistent_lsn` of the checkpoint that was archived
|
||||
/// * size of the archive header
|
||||
pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> {
|
||||
let archive_name = archive_path
|
||||
.file_name()
|
||||
.with_context(|| format!("Archive '{}' has no file name", archive_path.display()))?
|
||||
.to_string_lossy();
|
||||
let (lsn_str, header_size_str) =
|
||||
archive_name
|
||||
.rsplit_once(ARCHIVE_EXTENSION)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has incorrect extension, expected to contain '{}'",
|
||||
archive_path.display(),
|
||||
ARCHIVE_EXTENSION
|
||||
)
|
||||
})?;
|
||||
let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has an invalid disk consistent lsn in its extension",
|
||||
archive_path.display(),
|
||||
)
|
||||
})?;
|
||||
let header_size = header_size_str.parse::<u64>().with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has an invalid a header offset number in its extension",
|
||||
archive_path.display(),
|
||||
)
|
||||
})?;
|
||||
Ok((disk_consistent_lsn, header_size))
|
||||
}
|
||||
|
||||
fn archive_name(disk_consistent_lsn: Lsn, header_size: u64) -> String {
|
||||
let archive_name = format!(
|
||||
"{:016X}{ARCHIVE_EXTENSION}{}",
|
||||
u64::from(disk_consistent_lsn),
|
||||
header_size,
|
||||
ARCHIVE_EXTENSION = ARCHIVE_EXTENSION,
|
||||
);
|
||||
archive_name
|
||||
}
|
||||
|
||||
pub async fn uncompress_with_header(
|
||||
files_to_skip: &BTreeSet<PathBuf>,
|
||||
destination_dir: &Path,
|
||||
header: ArchiveHeader,
|
||||
archive_after_header: impl io::AsyncRead + Send + Sync + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Uncompressing archive into {}", destination_dir.display());
|
||||
let mut archive = ZstdDecoder::new(io::BufReader::new(archive_after_header));
|
||||
|
||||
if !destination_dir.exists() {
|
||||
fs::create_dir_all(&destination_dir)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create target directory at {}",
|
||||
destination_dir.display()
|
||||
)
|
||||
})?;
|
||||
} else if !destination_dir.is_dir() {
|
||||
bail!(
|
||||
"Destination path '{}' is not a valid directory",
|
||||
destination_dir.display()
|
||||
);
|
||||
}
|
||||
debug!("Will extract {} files from the archive", header.files.len());
|
||||
for entry in header.files {
|
||||
uncompress_entry(
|
||||
&mut archive,
|
||||
&entry.subpath.as_path(destination_dir),
|
||||
entry.size,
|
||||
files_to_skip,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to uncompress archive entry {:?}", entry))?;
|
||||
}
|
||||
uncompress_entry(
|
||||
&mut archive,
|
||||
&destination_dir.join(METADATA_FILE_NAME),
|
||||
header.metadata_file_size,
|
||||
files_to_skip,
|
||||
)
|
||||
.await
|
||||
.context("Failed to uncompress the metadata entry")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn uncompress_entry(
|
||||
archive: &mut ZstdDecoder<io::BufReader<impl io::AsyncRead + Send + Sync + Unpin>>,
|
||||
destination_path: &Path,
|
||||
entry_size: u64,
|
||||
files_to_skip: &BTreeSet<PathBuf>,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(parent) = destination_path.parent() {
|
||||
fs::create_dir_all(parent).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create parent directory for {}",
|
||||
destination_path.display()
|
||||
)
|
||||
})?;
|
||||
};
|
||||
|
||||
if files_to_skip.contains(destination_path) {
|
||||
debug!("Skipping {}", destination_path.display());
|
||||
copy_n_bytes(entry_size, archive, &mut io::sink())
|
||||
.await
|
||||
.context("Failed to skip bytes in the archive")?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut destination =
|
||||
io::BufWriter::new(fs::File::create(&destination_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open file {} for extraction",
|
||||
destination_path.display()
|
||||
)
|
||||
})?);
|
||||
copy_n_bytes(entry_size, archive, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to write extracted archive contents into file {}",
|
||||
destination_path.display()
|
||||
)
|
||||
})?;
|
||||
destination
|
||||
.flush()
|
||||
.await
|
||||
.context("Failed to flush the streaming archive bytes")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn write_archive_contents(
|
||||
source_dir: PathBuf,
|
||||
header: ArchiveHeader,
|
||||
metadata_bytes: Vec<u8>,
|
||||
mut archive_input: io::DuplexStream,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Starting writing files into archive");
|
||||
for file_entry in header.files {
|
||||
let path = file_entry.subpath.as_path(&source_dir);
|
||||
let mut source_file =
|
||||
io::BufReader::new(fs::File::open(&path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open file for archiving to path {}",
|
||||
path.display()
|
||||
)
|
||||
})?);
|
||||
let bytes_written = io::copy(&mut source_file, &mut archive_input)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open add a file into archive, file path {}",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
file_entry.size == bytes_written,
|
||||
"File {} was written to the archive incompletely",
|
||||
path.display()
|
||||
);
|
||||
trace!(
|
||||
"Added file '{}' ({} bytes) into the archive",
|
||||
path.display(),
|
||||
bytes_written
|
||||
);
|
||||
}
|
||||
let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input)
|
||||
.await
|
||||
.context("Failed to add metadata into the archive")?;
|
||||
ensure!(
|
||||
header.metadata_file_size == metadata_bytes_written,
|
||||
"Metadata file was written to the archive incompletely",
|
||||
);
|
||||
|
||||
archive_input
|
||||
.shutdown()
|
||||
.await
|
||||
.context("Failed to finalize the archive")?;
|
||||
debug!("Successfully streamed all files into the archive");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn prepare_header(
|
||||
source_dir: &Path,
|
||||
files: impl Iterator<Item = &PathBuf>,
|
||||
metadata_bytes: &[u8],
|
||||
) -> anyhow::Result<(ArchiveHeader, Vec<u8>)> {
|
||||
let mut archive_files = Vec::new();
|
||||
for file_path in files {
|
||||
let file_metadata = fs::metadata(file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata during archive indexing for {}",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
file_metadata.is_file(),
|
||||
"Archive indexed path {} is not a file",
|
||||
file_path.display()
|
||||
);
|
||||
|
||||
if file_path.file_name().and_then(|name| name.to_str()) != Some(METADATA_FILE_NAME) {
|
||||
let entry = FileEntry {
|
||||
subpath: RelativePath::new(source_dir, file_path).with_context(|| {
|
||||
format!(
|
||||
"File '{}' does not belong to pageserver workspace",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
size: file_metadata.len(),
|
||||
};
|
||||
archive_files.push(entry);
|
||||
}
|
||||
}
|
||||
|
||||
let header = ArchiveHeader {
|
||||
files: archive_files,
|
||||
metadata_file_size: metadata_bytes.len() as u64,
|
||||
};
|
||||
|
||||
debug!("Appending a header for {} files", header.files.len());
|
||||
let header_bytes = header.ser().context("Failed to serialize a header")?;
|
||||
debug!("Header bytes len {}", header_bytes.len());
|
||||
let mut compressed_header_bytes = Vec::new();
|
||||
ZstdEncoder::new(io::BufReader::new(header_bytes.as_slice()))
|
||||
.read_to_end(&mut compressed_header_bytes)
|
||||
.await
|
||||
.context("Failed to compress header bytes")?;
|
||||
debug!(
|
||||
"Compressed header bytes len {}",
|
||||
compressed_header_bytes.len()
|
||||
);
|
||||
Ok((header, compressed_header_bytes))
|
||||
}
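// Illustrative note (not part of the original code): the archive is laid out as the
// zstd-compressed serialized header produced here, immediately followed by the file bodies
// and the metadata file streamed by `write_archive_contents` (compressed by the caller that
// owns the other end of the duplex stream). The length of `compressed_header_bytes` is
// presumably what becomes the `header_size` embedded in the archive name, so a reader can
// seek (or `download_range`) past the header and feed the rest to `uncompress_with_header`.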
|
||||
|
||||
async fn copy_n_bytes(
|
||||
n: u64,
|
||||
from: &mut (impl io::AsyncRead + Send + Sync + Unpin),
|
||||
into: &mut (impl io::AsyncWrite + Send + Sync + Unpin),
|
||||
) -> anyhow::Result<()> {
|
||||
let bytes_written = io::copy(&mut from.take(n), into).await?;
|
||||
ensure!(
|
||||
bytes_written == n,
|
||||
"Failed to read exactly {} bytes from the input, bytes written: {}",
|
||||
n,
|
||||
bytes_written,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tokio::{fs, io::AsyncSeekExt};
|
||||
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn compress_and_uncompress() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("compress_and_uncompress")?;
|
||||
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
init_directory(
|
||||
&timeline_dir,
|
||||
vec![
|
||||
("first", "first_contents"),
|
||||
("second", "second_contents"),
|
||||
(METADATA_FILE_NAME, "wrong_metadata"),
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
let timeline_files = list_file_paths_with_contents(&timeline_dir).await?;
|
||||
assert_eq!(
|
||||
timeline_files,
|
||||
vec![
|
||||
(
|
||||
timeline_dir.join("first"),
|
||||
FileContents::Text("first_contents".to_string())
|
||||
),
|
||||
(
|
||||
timeline_dir.join(METADATA_FILE_NAME),
|
||||
FileContents::Text("wrong_metadata".to_string())
|
||||
),
|
||||
(
|
||||
timeline_dir.join("second"),
|
||||
FileContents::Text("second_contents".to_string())
|
||||
),
|
||||
],
|
||||
"Initial timeline contents should contain two normal files and a wrong metadata file"
|
||||
);
|
||||
|
||||
let metadata = TimelineMetadata::new(Lsn(0x30), None, None, Lsn(0), Lsn(0), Lsn(0));
|
||||
let paths_to_archive = timeline_files
|
||||
.into_iter()
|
||||
.map(|(path, _)| path)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let tempdir = tempfile::tempdir()?;
|
||||
let base_path = tempdir.path().to_path_buf();
|
||||
let (header, header_size, archive_target) = archive_files_as_stream(
|
||||
&timeline_dir,
|
||||
paths_to_archive.iter(),
|
||||
&metadata,
|
||||
move |mut archive_streamer, archive_name| async move {
|
||||
let archive_target = base_path.join(&archive_name);
|
||||
let mut archive_file = fs::File::create(&archive_target).await?;
|
||||
io::copy(&mut archive_streamer, &mut archive_file).await?;
|
||||
Ok(archive_target)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut file = fs::File::open(&archive_target).await?;
|
||||
file.seek(io::SeekFrom::Start(header_size)).await?;
|
||||
let target_dir = tempdir.path().join("extracted");
|
||||
uncompress_with_header(&BTreeSet::new(), &target_dir, header, file).await?;
|
||||
|
||||
let extracted_files = list_file_paths_with_contents(&target_dir).await?;
|
||||
|
||||
assert_eq!(
|
||||
extracted_files,
|
||||
vec![
|
||||
(
|
||||
target_dir.join("first"),
|
||||
FileContents::Text("first_contents".to_string())
|
||||
),
|
||||
(
|
||||
target_dir.join(METADATA_FILE_NAME),
|
||||
FileContents::Binary(metadata.to_bytes()?)
|
||||
),
|
||||
(
|
||||
target_dir.join("second"),
|
||||
FileContents::Text("second_contents".to_string())
|
||||
),
|
||||
],
|
||||
"Extracted files should contain all local timeline files besides its metadata, which should be taken from the arguments"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn init_directory(
|
||||
root: &Path,
|
||||
files_with_contents: Vec<(&str, &str)>,
|
||||
) -> anyhow::Result<()> {
|
||||
fs::create_dir_all(root).await?;
|
||||
for (file_name, contents) in files_with_contents {
|
||||
fs::File::create(root.join(file_name))
|
||||
.await?
|
||||
.write_all(contents.as_bytes())
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum FileContents {
|
||||
Text(String),
|
||||
Binary(Vec<u8>),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for FileContents {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Text(text) => f.debug_tuple("Text").field(text).finish(),
|
||||
Self::Binary(bytes) => f
|
||||
.debug_tuple("Binary")
|
||||
.field(&format!("{} bytes", bytes.len()))
|
||||
.finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_file_paths_with_contents(
|
||||
root: &Path,
|
||||
) -> anyhow::Result<Vec<(PathBuf, FileContents)>> {
|
||||
let mut file_paths = Vec::new();
|
||||
|
||||
let mut dir_listings = vec![fs::read_dir(root).await?];
|
||||
while let Some(mut dir_listing) = dir_listings.pop() {
|
||||
while let Some(entry) = dir_listing.next_entry().await? {
|
||||
let entry_path = entry.path();
|
||||
if entry_path.is_file() {
|
||||
let contents = match String::from_utf8(fs::read(&entry_path).await?) {
|
||||
Ok(text) => FileContents::Text(text),
|
||||
Err(e) => FileContents::Binary(e.into_bytes()),
|
||||
};
|
||||
file_paths.push((entry_path, contents));
|
||||
} else if entry_path.is_dir() {
|
||||
dir_listings.push(fs::read_dir(entry_path).await?);
|
||||
} else {
|
||||
info!(
|
||||
"Skipping path '{}' as it's not a file or a directory",
|
||||
entry_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
file_paths.sort();
|
||||
Ok(file_paths)
|
||||
}
|
||||
}
|
||||
@@ -1,30 +1,76 @@
|
||||
//! Timeline synchronization logic to put files from archives on remote storage into pageserver's local directory.
//! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory.
|
||||
|
||||
use std::{collections::BTreeSet, path::PathBuf, sync::Arc};
|
||||
use std::fmt::Debug;
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tokio::fs;
|
||||
use tracing::{debug, error, trace, warn};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::metadata::{metadata_path, TimelineMetadata},
|
||||
layered_repository::metadata::metadata_path,
|
||||
remote_storage::{
|
||||
storage_sync::{
|
||||
compression, fetch_full_index, index::TimelineIndexEntryInner, sync_queue, SyncKind,
|
||||
SyncTask,
|
||||
},
|
||||
storage_sync::{sync_queue, SyncTask},
|
||||
RemoteStorage, ZTenantTimelineId,
|
||||
},
|
||||
};
|
||||
|
||||
use super::{
|
||||
index::{ArchiveId, RemoteTimeline},
|
||||
RemoteIndex, TimelineDownload,
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
SyncData, TimelineDownload,
|
||||
};
|
||||
|
||||
/// Retrieves index data from the remote storage for a given timeline.
|
||||
pub async fn download_index_part<P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> anyhow::Result<IndexPart>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let part_storage_path = storage.storage_path(&index_part_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut index_part_bytes = Vec::new();
|
||||
storage
|
||||
.download(&part_storage_path, &mut index_part_bytes)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download an index part from storage path '{:?}'",
|
||||
part_storage_path
|
||||
)
|
||||
})?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize index part file from storage path '{:?}'",
|
||||
part_storage_path
|
||||
)
|
||||
})?;
|
||||
|
||||
let missing_files = index_part.missing_files();
|
||||
if !missing_files.is_empty() {
|
||||
warn!(
|
||||
"Found missing layers in index part for timeline {}: {:?}",
|
||||
sync_id, missing_files
|
||||
);
|
||||
}
|
||||
|
||||
Ok(index_part)
|
||||
}
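// Illustrative note (not part of the original code): given the path construction above, the
// index part is expected to live next to the timeline's metadata file, i.e. at something
// like `tenants/<tenant_id>/timelines/<timeline_id>/index_part.json` relative to the
// pageserver workdir; the exact layout comes from `metadata_path` and the configured
// `RemoteStorage` backend, both defined elsewhere.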
|
||||
|
||||
/// Timeline download result, with extra data, needed for downloading.
|
||||
#[derive(Debug)]
|
||||
pub(super) enum DownloadedTimeline {
|
||||
/// Remote timeline data is either absent or corrupt, no download possible.
|
||||
Abort,
|
||||
@@ -33,222 +79,136 @@ pub(super) enum DownloadedTimeline {
|
||||
FailedAndRescheduled,
|
||||
/// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
|
||||
/// Initial download successful.
|
||||
Successful,
|
||||
Successful(SyncData<TimelineDownload>),
|
||||
}
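// Illustrative note (not part of the original code): callers are expected to treat the
// variants roughly as "give up" (`Abort`), "the task has already been re-queued, nothing
// more to do" (`FailedAndRescheduled`) and "proceed with the returned
// `SyncData<TimelineDownload>`" (`Successful`), which is how the tests below match on them.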
|
||||
|
||||
/// Attempts to download and uncompress files from all remote archives for the timeline given.
|
||||
/// Attempts to download all of the given timeline's layers.
/// Timeline files that already exist locally are skipped during the download, but the local metadata file is
/// updated at the end of every checkpoint archive extraction.
/// updated at the end, if the remote one contains a newer disk_consistent_lsn.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the download, with updated archive skip list
|
||||
/// (for any new successful archive downloads and extractions).
|
||||
pub(super) async fn download_timeline<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RemoteIndex)>,
|
||||
/// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task.
|
||||
pub(super) async fn download_timeline_layers<'a, P, S>(
|
||||
storage: &'a S,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut download: TimelineDownload,
|
||||
retries: u32,
|
||||
) -> DownloadedTimeline {
|
||||
debug!("Downloading layers for sync id {}", sync_id);
|
||||
|
||||
let ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = sync_id;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let index_read = index.read().await;
|
||||
let remote_timeline = match index_read.timeline_entry(&sync_id) {
|
||||
mut download_data: SyncData<TimelineDownload>,
|
||||
) -> DownloadedTimeline
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let remote_timeline = match remote_timeline {
|
||||
Some(remote_timeline) => {
|
||||
if !remote_timeline.awaits_download {
|
||||
error!("Timeline with sync id {} is not awaiting download", sync_id);
|
||||
return DownloadedTimeline::Abort;
|
||||
}
|
||||
remote_timeline
|
||||
}
|
||||
None => {
|
||||
error!("Cannot download: no timeline is present in the index for given id");
|
||||
drop(index_read);
|
||||
error!(
|
||||
"Timeline with sync id {} is not present in the remote index",
|
||||
sync_id
|
||||
);
|
||||
return DownloadedTimeline::Abort;
|
||||
}
|
||||
|
||||
Some(index_entry) => match index_entry.inner() {
|
||||
TimelineIndexEntryInner::Full(remote_timeline) => {
|
||||
let cloned = remote_timeline.clone();
|
||||
drop(index_read);
|
||||
cloned
|
||||
}
|
||||
TimelineIndexEntryInner::Description(_) => {
|
||||
// we do not check here for awaits_download because it is ok
|
||||
// to call this function while the download is in progress
|
||||
// so it is not a concurrent download, it is the same one
|
||||
|
||||
let remote_disk_consistent_lsn = index_entry.disk_consistent_lsn();
|
||||
drop(index_read);
|
||||
debug!("Found timeline description for the given ids, downloading the full index");
|
||||
match fetch_full_index(
|
||||
remote_assets.as_ref(),
|
||||
&conf.timeline_path(&timeline_id, &tenant_id),
|
||||
sync_id,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(remote_timeline) => remote_timeline,
|
||||
Err(e) => {
|
||||
error!("Failed to download full timeline index: {:?}", e);
|
||||
|
||||
return match remote_disk_consistent_lsn {
|
||||
Some(_) => {
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
DownloadedTimeline::FailedAndRescheduled
|
||||
}
|
||||
None => {
|
||||
error!("Cannot download: no disk consistent Lsn is present for the index entry");
|
||||
DownloadedTimeline::Abort
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
};
|
||||
if remote_timeline.checkpoints().max().is_none() {
|
||||
debug!("Cannot download: no disk consistent Lsn is present for the remote timeline");
|
||||
return DownloadedTimeline::Abort;
|
||||
};
|
||||
|
||||
debug!("Downloading timeline archives");
|
||||
let archives_to_download = remote_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.filter(|remote_archive| !download.archives_to_skip.contains(remote_archive))
|
||||
debug!("Downloading timeline layers for sync id {}", sync_id);
|
||||
let download = &mut download_data.data;
|
||||
|
||||
let layers_to_download = remote_timeline
|
||||
.stored_files()
|
||||
.difference(&download.layers_to_skip)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let archives_total = archives_to_download.len();
|
||||
debug!("Downloading {} archives of a timeline", archives_total);
|
||||
trace!("Archives to download: {:?}", archives_to_download);
|
||||
trace!("Layers to download: {:?}", layers_to_download);
|
||||
|
||||
for (archives_downloaded, archive_id) in archives_to_download.into_iter().enumerate() {
|
||||
match try_download_archive(
|
||||
conf,
|
||||
sync_id,
|
||||
Arc::clone(&remote_assets),
|
||||
&remote_timeline,
|
||||
archive_id,
|
||||
Arc::clone(&download.files_to_skip),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
let archives_left = archives_total - archives_downloaded;
|
||||
error!(
|
||||
"Failed to download archive {:?} (archives downloaded: {}; archives left: {}) for tenant {} timeline {}, requeueing the download: {:?}",
|
||||
archive_id, archives_downloaded, archives_left, tenant_id, timeline_id, e
|
||||
let mut download_tasks = layers_to_download
|
||||
.into_iter()
|
||||
.map(|layer_desination_path| async move {
|
||||
if layer_desination_path.exists() {
|
||||
debug!(
|
||||
"Layer already exists locally, skipping download: {}",
|
||||
layer_desination_path.display()
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
return DownloadedTimeline::FailedAndRescheduled;
|
||||
} else {
|
||||
let layer_storage_path = storage
|
||||
.storage_path(&layer_desination_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
layer_desination_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut destination_file = fs::File::create(&layer_desination_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
layer_desination_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.download(&layer_storage_path, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download a layer from storage path '{:?}'",
|
||||
layer_storage_path
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(()) => {
|
||||
debug!("Successfully downloaded archive {:?}", archive_id);
|
||||
download.archives_to_skip.insert(archive_id);
|
||||
Ok::<_, anyhow::Error>(layer_desination_path)
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
debug!("Downloading {} layers of a timeline", download_tasks.len());
|
||||
|
||||
let mut errors_happened = false;
|
||||
while let Some(download_result) = download_tasks.next().await {
|
||||
match download_result {
|
||||
Ok(downloaded_path) => {
|
||||
download.layers_to_skip.insert(downloaded_path);
|
||||
}
|
||||
Err(e) => {
|
||||
errors_happened = true;
|
||||
error!(
|
||||
"Failed to download a layer for timeline {}: {:?}",
|
||||
sync_id, e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Finished downloading all timeline's archives");
|
||||
DownloadedTimeline::Successful
|
||||
}
|
||||
|
||||
async fn try_download_archive<
|
||||
P: Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: ZTenantTimelineId,
|
||||
remote_assets: Arc<(S, RemoteIndex)>,
|
||||
remote_timeline: &RemoteTimeline,
|
||||
archive_id: ArchiveId,
|
||||
files_to_skip: Arc<BTreeSet<PathBuf>>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Downloading archive {:?}", archive_id);
|
||||
let archive_to_download = remote_timeline
|
||||
.archive_data(archive_id)
|
||||
.with_context(|| format!("Archive {:?} not found in remote storage", archive_id))?;
|
||||
let (archive_header, header_size) = remote_timeline
|
||||
.restore_header(archive_id)
|
||||
.context("Failed to restore header when downloading an archive")?;
|
||||
|
||||
match read_local_metadata(conf, timeline_id, tenant_id).await {
|
||||
Ok(local_metadata) => ensure!(
|
||||
// need to allow `<=` instead of `<` due to cases when a failed archive can be redownloaded
|
||||
local_metadata.disk_consistent_lsn() <= archive_to_download.disk_consistent_lsn(),
|
||||
"Cannot download archive with Lsn {} since it's earlier than local Lsn {}",
|
||||
archive_to_download.disk_consistent_lsn(),
|
||||
local_metadata.disk_consistent_lsn()
|
||||
),
|
||||
Err(e) => warn!("Failed to read local metadata file, assuming it's safe to override its with the download. Read: {:#}", e),
|
||||
if errors_happened {
|
||||
debug!("Reenqueuing failed download task for timeline {}", sync_id);
|
||||
download_data.retries += 1;
|
||||
sync_queue::push(sync_id, SyncTask::Download(download_data));
|
||||
DownloadedTimeline::FailedAndRescheduled
|
||||
} else {
|
||||
debug!("Finished downloading all timeline's layers");
|
||||
DownloadedTimeline::Successful(download_data)
|
||||
}
|
||||
compression::uncompress_file_stream_with_index(
|
||||
conf.timeline_path(&timeline_id, &tenant_id),
|
||||
files_to_skip,
|
||||
archive_to_download.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
move |mut archive_target, archive_name| async move {
|
||||
let archive_local_path = conf
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(&archive_name);
|
||||
let remote_storage = &remote_assets.0;
|
||||
remote_storage
|
||||
.download_range(
|
||||
&remote_storage.storage_path(&archive_local_path)?,
|
||||
header_size,
|
||||
None,
|
||||
&mut archive_target,
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn read_local_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: zenith_utils::zid::ZTimelineId,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let local_metadata_path = metadata_path(conf, timeline_id, tenant_id);
|
||||
let local_metadata_bytes = fs::read(&local_metadata_path)
|
||||
.await
|
||||
.context("Failed to read local metadata file bytes")?;
|
||||
TimelineMetadata::from_bytes(&local_metadata_bytes)
|
||||
.context("Failed to read local metadata files bytes")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
|
||||
use tempfile::tempdir;
|
||||
use tokio::fs;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
local_fs::LocalFs,
|
||||
storage_sync::test_utils::{
|
||||
assert_index_descriptions, assert_timeline_files_match, create_local_timeline,
|
||||
dummy_metadata, ensure_correct_timeline_upload, expect_timeline,
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
},
|
||||
LocalFs,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
@@ -256,80 +216,185 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_download_timeline() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("test_download_timeline")?;
|
||||
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RemoteIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
async fn download_timeline() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("download_timeline")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"];
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
let remote_path = storage.storage_path(&local_path)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
}
|
||||
fs::copy(&local_path, &remote_path).await?;
|
||||
}
|
||||
let mut read_dir = fs::read_dir(&local_timeline_path).await?;
|
||||
while let Some(dir_entry) = read_dir.next_entry().await? {
|
||||
if dir_entry.file_name().to_str() == Some("layer_to_keep_locally") {
|
||||
continue;
|
||||
} else {
|
||||
fs::remove_file(dir_entry.path()).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut remote_timeline = RemoteTimeline::new(metadata.clone());
|
||||
remote_timeline.awaits_download = true;
|
||||
remote_timeline.add_timeline_layers(
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer| local_timeline_path.join(layer)),
|
||||
);
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let storage = &remote_assets.0;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let regular_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
let regular_timeline = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
dummy_metadata(Lsn(0x30)),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
regular_timeline,
|
||||
)
|
||||
.await;
|
||||
// upload multiple checkpoints for the same timeline
|
||||
let regular_timeline = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["c", "d"],
|
||||
dummy_metadata(Lsn(0x40)),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
regular_timeline,
|
||||
)
|
||||
.await;
|
||||
|
||||
fs::remove_dir_all(®ular_timeline_path).await?;
|
||||
let remote_regular_timeline = expect_timeline(index, sync_id).await;
|
||||
|
||||
download_timeline(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
let download_data = match download_timeline_layers(
|
||||
&storage,
|
||||
Some(&remote_timeline),
|
||||
sync_id,
|
||||
TimelineDownload {
|
||||
files_to_skip: Arc::new(BTreeSet::new()),
|
||||
archives_to_skip: BTreeSet::new(),
|
||||
},
|
||||
0,
|
||||
SyncData::new(
|
||||
current_retries,
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]),
|
||||
},
|
||||
),
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(
|
||||
index,
|
||||
&RemoteIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
remote_assets
|
||||
.0
|
||||
.list()
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
.await
|
||||
{
|
||||
DownloadedTimeline::Successful(data) => data,
|
||||
wrong_result => panic!(
|
||||
"Expected a successful download for timeline, but got: {:?}",
|
||||
wrong_result
|
||||
),
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
current_retries, download_data.retries,
|
||||
"On successful download, retries are not expected to change"
|
||||
);
|
||||
assert_eq!(
|
||||
download_data
|
||||
.data
|
||||
.layers_to_skip
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer| local_timeline_path.join(layer))
|
||||
.collect(),
|
||||
"On successful download, layers to skip should contain all downloaded files and present layers that were skipped"
|
||||
);
|
||||
|
||||
let mut downloaded_files = BTreeSet::new();
|
||||
let mut read_dir = fs::read_dir(&local_timeline_path).await?;
|
||||
while let Some(dir_entry) = read_dir.next_entry().await? {
|
||||
downloaded_files.insert(dir_entry.path());
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
downloaded_files,
|
||||
layer_files
|
||||
.iter()
|
||||
.filter(|layer| layer != &&"layer_to_skip")
|
||||
.map(|layer| local_timeline_path.join(layer))
|
||||
.collect(),
|
||||
"On successful download, all layers that were not skipped, should be downloaded"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_timeline_negatives() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("download_timeline_negatives")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
|
||||
|
||||
let empty_remote_timeline_download = download_timeline_layers(
|
||||
&storage,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
0,
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
},
|
||||
),
|
||||
)
|
||||
.await;
|
||||
assert_timeline_files_match(&repo_harness, TIMELINE_ID, remote_regular_timeline);
|
||||
assert!(
|
||||
matches!(empty_remote_timeline_download, DownloadedTimeline::Abort),
|
||||
"Should not allow downloading for empty remote timeline"
|
||||
);
|
||||
|
||||
let not_expecting_download_remote_timeline = RemoteTimeline::new(dummy_metadata(Lsn(5)));
|
||||
assert!(
|
||||
!not_expecting_download_remote_timeline.awaits_download,
|
||||
"Should not expect download for the timeline"
|
||||
);
|
||||
let already_downloading_remote_timeline_download = download_timeline_layers(
|
||||
&storage,
|
||||
Some(¬_expecting_download_remote_timeline),
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
0,
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
},
|
||||
),
|
||||
)
|
||||
.await;
|
||||
assert!(
|
||||
matches!(
|
||||
dbg!(already_downloading_remote_timeline_download),
|
||||
DownloadedTimeline::Abort,
|
||||
),
|
||||
"Should not allow downloading for remote timeline that does not expect it"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_download_index_part() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("test_download_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
HashSet::from([
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?,
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?,
|
||||
]),
|
||||
HashSet::from([RelativePath::new(
|
||||
&local_timeline_path,
|
||||
local_timeline_path.join("three"),
|
||||
)?]),
|
||||
metadata.disk_consistent_lsn(),
|
||||
metadata.to_bytes()?,
|
||||
);
|
||||
|
||||
let local_index_part_path =
|
||||
metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let storage_path = storage.storage_path(&local_index_part_path)?;
|
||||
fs::create_dir_all(storage_path.parent().unwrap()).await?;
|
||||
fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?;
|
||||
|
||||
let downloaded_index_part = download_index_part(harness.conf, &storage, sync_id).await?;
|
||||
|
||||
assert_eq!(
|
||||
downloaded_index_part, index_part,
|
||||
"Downloaded index part should be the same as the one in storage"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,63 +1,56 @@
|
||||
//! In-memory index to track the tenant files on the remote storage, mitigating the storage format differences between the local and remote files.
//! Able to restore itself from the storage archive data and reconstruct archive indices on demand.
//!
//! The index is intended to be portable, so deliberately does not store any local paths inside.
//! This way, in the future, the index could be restored quickly from its serialized stored form.
//! In-memory index to track the tenant files on the remote storage.
//! Able to restore itself from the storage index parts that are located in every timeline's remote directory and contain all data about
//! remote timeline layers and their metadata.
|
||||
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet, HashMap},
|
||||
collections::{HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use anyhow::{Context, Ok};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::*;
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::TIMELINES_SEGMENT_NAME,
|
||||
remote_storage::{
|
||||
storage_sync::compression::{parse_archive_name, FileEntry},
|
||||
ZTenantTimelineId,
|
||||
},
|
||||
config::PageServerConf, layered_repository::metadata::TimelineMetadata,
|
||||
remote_storage::ZTenantTimelineId,
|
||||
};
|
||||
|
||||
use super::compression::ArchiveHeader;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// A part of the filesystem path, that needs a root to become a path again.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct RelativePath(String);
|
||||
|
||||
impl RelativePath {
|
||||
/// Attempts to strip off the base from path, producing a relative path or an error.
|
||||
pub fn new<P: AsRef<Path>>(base: &Path, path: P) -> anyhow::Result<Self> {
|
||||
let relative = path
|
||||
.as_ref()
|
||||
.strip_prefix(base)
|
||||
.context("path is not relative to base")?;
|
||||
let path = path.as_ref();
|
||||
let relative = path.strip_prefix(base).with_context(|| {
|
||||
format!(
|
||||
"path '{}' is not relative to base '{}'",
|
||||
path.display(),
|
||||
base.display()
|
||||
)
|
||||
})?;
|
||||
Ok(RelativePath(relative.to_string_lossy().to_string()))
|
||||
}
|
||||
|
||||
/// Joins the relative path with the base path.
|
||||
pub fn as_path(&self, base: &Path) -> PathBuf {
|
||||
fn as_path(&self, base: &Path) -> PathBuf {
|
||||
base.join(&self.0)
|
||||
}
|
||||
}
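// Illustrative sketch (not part of the original code): for a timeline directory `base`, a
// layer path round-trips through `RelativePath` like this (the names are made up):
//
//     let base = Path::new("/data/tenants/some_tenant/timelines/some_timeline");
//     let rel = RelativePath::new(base, base.join("some_layer_file"))?; // RelativePath("some_layer_file")
//     assert_eq!(rel.as_path(base), base.join("some_layer_file"));
//
// A path outside of `base` makes `RelativePath::new` fail with the "not relative to base"
// error above.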
|
||||
|
||||
/// An index to track tenant files that exist on the remote storage.
|
||||
/// Currently, timeline archive files are tracked only.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteTimelineIndex {
|
||||
timeline_entries: HashMap<ZTenantTimelineId, TimelineIndexEntry>,
|
||||
timeline_entries: HashMap<ZTenantTimelineId, RemoteTimeline>,
|
||||
}
|
||||
|
||||
/// A wrapper to synchronize access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
|
||||
/// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
|
||||
pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);
|
||||
|
||||
impl RemoteIndex {
|
||||
@@ -67,27 +60,22 @@ impl RemoteIndex {
|
||||
})))
|
||||
}
|
||||
|
||||
/// Attempts to parse file paths (not checking the file contents) and find files
|
||||
/// that can be tracked with the index.
/// On parse failures, logs the error and continues, so an empty index can be created from unsuitable paths.
|
||||
pub fn try_parse_descriptions_from_paths<P: AsRef<Path>>(
|
||||
pub fn from_parts(
|
||||
conf: &'static PageServerConf,
|
||||
paths: impl Iterator<Item = P>,
|
||||
) -> Self {
|
||||
let mut index = RemoteTimelineIndex {
|
||||
timeline_entries: HashMap::new(),
|
||||
};
|
||||
for path in paths {
|
||||
if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) {
|
||||
debug!(
|
||||
"Failed to parse path '{}' as index entry: {:#}",
|
||||
path.as_ref().display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
index_parts: HashMap<ZTenantTimelineId, IndexPart>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut timeline_entries = HashMap::new();
|
||||
|
||||
for (sync_id, index_part) in index_parts {
|
||||
let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.context("Failed to restore remote timeline data from index part")?;
|
||||
timeline_entries.insert(sync_id, remote_timeline);
|
||||
}
|
||||
|
||||
Self(Arc::new(RwLock::new(index)))
|
||||
Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex {
|
||||
timeline_entries,
|
||||
}))))
|
||||
}
|
||||
|
||||
pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> {
|
||||
@@ -106,39 +94,18 @@ impl Clone for RemoteIndex {
|
||||
}
|
||||
|
||||
impl RemoteTimelineIndex {
|
||||
pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> {
|
||||
pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&RemoteTimeline> {
|
||||
self.timeline_entries.get(id)
|
||||
}
|
||||
|
||||
pub fn timeline_entry_mut(
|
||||
&mut self,
|
||||
id: &ZTenantTimelineId,
|
||||
) -> Option<&mut TimelineIndexEntry> {
|
||||
pub fn timeline_entry_mut(&mut self, id: &ZTenantTimelineId) -> Option<&mut RemoteTimeline> {
|
||||
self.timeline_entries.get_mut(id)
|
||||
}
|
||||
|
||||
pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: TimelineIndexEntry) {
|
||||
pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: RemoteTimeline) {
|
||||
self.timeline_entries.insert(id, entry);
|
||||
}
|
||||
|
||||
pub fn upgrade_timeline_entry(
|
||||
&mut self,
|
||||
id: &ZTenantTimelineId,
|
||||
remote_timeline: RemoteTimeline,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut entry = self.timeline_entries.get_mut(id).ok_or(anyhow::anyhow!(
|
||||
"timeline is unexpectedly missing from remote index"
|
||||
))?;
|
||||
|
||||
if !matches!(entry.inner, TimelineIndexEntryInner::Description(_)) {
|
||||
anyhow::bail!("timeline entry is not a description entry")
|
||||
};
|
||||
|
||||
entry.inner = TimelineIndexEntryInner::Full(remote_timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
|
||||
self.timeline_entries.keys().copied()
|
||||
}
|
||||
@@ -150,351 +117,295 @@ impl RemoteTimelineIndex {
|
||||
) -> anyhow::Result<()> {
|
||||
self.timeline_entry_mut(id)
|
||||
.ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))?
|
||||
.set_awaits_download(awaits_download);
|
||||
.awaits_download = awaits_download;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Default)]
|
||||
pub struct DescriptionTimelineIndexEntry {
|
||||
pub description: BTreeMap<ArchiveId, ArchiveDescription>,
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct FullTimelineIndexEntry {
|
||||
pub remote_timeline: RemoteTimeline,
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum TimelineIndexEntryInner {
|
||||
Description(BTreeMap<ArchiveId, ArchiveDescription>),
|
||||
Full(RemoteTimeline),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TimelineIndexEntry {
|
||||
inner: TimelineIndexEntryInner,
|
||||
awaits_download: bool,
|
||||
}
|
||||
|
||||
impl TimelineIndexEntry {
|
||||
pub fn new(inner: TimelineIndexEntryInner, awaits_download: bool) -> Self {
|
||||
Self {
|
||||
inner,
|
||||
awaits_download,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn inner(&self) -> &TimelineIndexEntryInner {
|
||||
&self.inner
|
||||
}
|
||||
|
||||
pub fn inner_mut(&mut self) -> &mut TimelineIndexEntryInner {
|
||||
&mut self.inner
|
||||
}
|
||||
|
||||
pub fn uploaded_checkpoints(&self) -> BTreeSet<Lsn> {
|
||||
match &self.inner {
|
||||
TimelineIndexEntryInner::Description(description) => {
|
||||
description.keys().map(|archive_id| archive_id.0).collect()
|
||||
}
|
||||
TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline
|
||||
.checkpoint_archives
|
||||
.keys()
|
||||
.map(|archive_id| archive_id.0)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the latest uploaded checkpoint's disk consistent Lsn for the corresponding timeline.
|
||||
pub fn disk_consistent_lsn(&self) -> Option<Lsn> {
|
||||
match &self.inner {
|
||||
TimelineIndexEntryInner::Description(description) => {
|
||||
description.keys().map(|archive_id| archive_id.0).max()
|
||||
}
|
||||
TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline
|
||||
.checkpoint_archives
|
||||
.keys()
|
||||
.map(|archive_id| archive_id.0)
|
||||
.max(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_awaits_download(&self) -> bool {
|
||||
self.awaits_download
|
||||
}
|
||||
|
||||
pub fn set_awaits_download(&mut self, awaits_download: bool) {
|
||||
self.awaits_download = awaits_download;
|
||||
}
|
||||
}
|
||||
|
||||
/// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct ArchiveId(pub(super) Lsn);
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
struct FileId(ArchiveId, ArchiveEntryNumber);
|
||||
|
||||
type ArchiveEntryNumber = usize;
|
||||
|
||||
/// All archives and files in them, representing a certain timeline.
|
||||
/// Uses file and archive IDs to reference those without ownership issues.
|
||||
/// Restored index part data about the timeline, stored in the remote index.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct RemoteTimeline {
|
||||
timeline_files: BTreeMap<FileId, FileEntry>,
|
||||
checkpoint_archives: BTreeMap<ArchiveId, CheckpointArchive>,
|
||||
}
|
||||
timeline_layers: HashSet<PathBuf>,
|
||||
missing_layers: HashSet<PathBuf>,
|
||||
|
||||
/// Archive metadata, enough to restore a header with the timeline data.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct CheckpointArchive {
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_file_size: u64,
|
||||
files: BTreeSet<FileId>,
|
||||
archive_header_size: u64,
|
||||
}
|
||||
|
||||
impl CheckpointArchive {
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
pub metadata: TimelineMetadata,
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
impl RemoteTimeline {
|
||||
pub fn empty() -> Self {
|
||||
pub fn new(metadata: TimelineMetadata) -> Self {
|
||||
Self {
|
||||
timeline_files: BTreeMap::new(),
|
||||
checkpoint_archives: BTreeMap::new(),
|
||||
timeline_layers: HashSet::new(),
|
||||
missing_layers: HashSet::new(),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn checkpoints(&self) -> impl Iterator<Item = Lsn> + '_ {
|
||||
self.checkpoint_archives
|
||||
.values()
|
||||
.map(CheckpointArchive::disk_consistent_lsn)
|
||||
pub fn add_timeline_layers(&mut self, new_layers: impl IntoIterator<Item = PathBuf>) {
|
||||
self.timeline_layers.extend(new_layers.into_iter());
|
||||
}
|
||||
|
||||
pub fn add_upload_failures(&mut self, upload_failures: impl IntoIterator<Item = PathBuf>) {
|
||||
self.missing_layers.extend(upload_failures.into_iter());
|
||||
}
|
||||
|
||||
/// Lists all layer files in the given remote timeline. Omits the metadata file.
|
||||
pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet<PathBuf> {
|
||||
self.timeline_files
|
||||
.values()
|
||||
.map(|file_entry| file_entry.subpath.as_path(timeline_dir))
|
||||
.collect()
|
||||
pub fn stored_files(&self) -> &HashSet<PathBuf> {
|
||||
&self.timeline_layers
|
||||
}
|
||||
|
||||
pub fn contains_checkpoint_at(&self, disk_consistent_lsn: Lsn) -> bool {
|
||||
self.checkpoint_archives
|
||||
.contains_key(&ArchiveId(disk_consistent_lsn))
|
||||
pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result<Self> {
|
||||
let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?;
|
||||
Ok(Self {
|
||||
timeline_layers: to_local_paths(timeline_path, index_part.timeline_layers),
|
||||
missing_layers: to_local_paths(timeline_path, index_part.missing_layers),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn archive_data(&self, archive_id: ArchiveId) -> Option<&CheckpointArchive> {
|
||||
self.checkpoint_archives.get(&archive_id)
|
||||
}
|
||||
/// Part of the remote index, corresponding to a certain timeline.
|
||||
/// Contains the data about all timeline files present remotely, and the timeline's metadata.
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexPart {
|
||||
timeline_layers: HashSet<RelativePath>,
|
||||
/// Currently not really used in the pageserver,
/// present to manually keep track of the layer files that the pageserver might never retrieve.
///
/// Such "holes" might appear if any upload task was evicted on an error threshold:
/// such a layer will only be rescheduled for upload on pageserver restart.
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
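// Illustrative sketch (not part of the original code): serialized with serde_json as in
// `download_index_part`, an `index_part.json` would look roughly like this (the layer names,
// Lsn and metadata bytes are made up; `RelativePath` is `#[serde(transparent)]`, so layers
// serialize as plain strings, and the Lsn goes through `DisplayFromStr`):
//
//     {
//       "timeline_layers": ["layer_1", "layer_2"],
//       "missing_layers": [],
//       "disk_consistent_lsn": "0/1696070",
//       "metadata_bytes": [113, 11, 159, 210]
//     }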
|
||||
|
||||
/// Restores a header of a certain remote archive from the memory data.
|
||||
/// Returns the header and its compressed size in the archive, both of which can be used to uncompress that archive.
|
||||
pub fn restore_header(&self, archive_id: ArchiveId) -> anyhow::Result<(ArchiveHeader, u64)> {
|
||||
let archive = self
|
||||
.checkpoint_archives
|
||||
.get(&archive_id)
|
||||
.with_context(|| format!("Archive {:?} not found", archive_id))?;
|
||||
impl IndexPart {
|
||||
pub const FILE_NAME: &'static str = "index_part";
|
||||
pub const FILE_EXTENSION: &'static str = "json";
|
||||
|
||||
let mut header_files = Vec::with_capacity(archive.files.len());
|
||||
for (expected_archive_position, archive_file) in archive.files.iter().enumerate() {
|
||||
let &FileId(archive_id, archive_position) = archive_file;
|
||||
ensure!(
|
||||
expected_archive_position == archive_position,
|
||||
"Archive header is corrupt, file # {} from archive {:?} header is missing",
|
||||
expected_archive_position,
|
||||
archive_id,
|
||||
);
|
||||
|
||||
let timeline_file = self.timeline_files.get(archive_file).with_context(|| {
|
||||
format!(
|
||||
"File with id {:?} not found for archive {:?}",
|
||||
archive_file, archive_id
|
||||
)
|
||||
})?;
|
||||
header_files.push(timeline_file.clone());
|
||||
}
|
||||
|
||||
Ok((
|
||||
ArchiveHeader {
|
||||
files: header_files,
|
||||
metadata_file_size: archive.metadata_file_size,
|
||||
},
|
||||
archive.archive_header_size,
|
||||
))
|
||||
}
|
||||
|
||||
/// Updates (creates, if necessary) the data about certain archive contents.
|
||||
pub fn update_archive_contents(
|
||||
&mut self,
|
||||
#[cfg(test)]
|
||||
pub fn new(
|
||||
timeline_layers: HashSet<RelativePath>,
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
header: ArchiveHeader,
|
||||
header_size: u64,
|
||||
) {
|
||||
let archive_id = ArchiveId(disk_consistent_lsn);
|
||||
let mut common_archive_files = BTreeSet::new();
|
||||
for (file_index, file_entry) in header.files.into_iter().enumerate() {
|
||||
let file_id = FileId(archive_id, file_index);
|
||||
self.timeline_files.insert(file_id, file_entry);
|
||||
common_archive_files.insert(file_id);
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
Self {
|
||||
timeline_layers,
|
||||
missing_layers,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
}
|
||||
}
|
||||
|
||||
let metadata_file_size = header.metadata_file_size;
|
||||
self.checkpoint_archives
|
||||
.entry(archive_id)
|
||||
.or_insert_with(|| CheckpointArchive {
|
||||
metadata_file_size,
|
||||
files: BTreeSet::new(),
|
||||
archive_header_size: header_size,
|
||||
disk_consistent_lsn,
|
||||
})
|
||||
.files
|
||||
.extend(common_archive_files.into_iter());
|
||||
pub fn missing_files(&self) -> &HashSet<RelativePath> {
|
||||
&self.missing_layers
|
||||
}
|
||||
|
||||
pub fn from_remote_timeline(
|
||||
timeline_path: &Path,
|
||||
remote_timeline: RemoteTimeline,
|
||||
) -> anyhow::Result<Self> {
|
||||
let metadata_bytes = remote_timeline.metadata.to_bytes()?;
|
||||
Ok(Self {
|
||||
timeline_layers: to_relative_paths(timeline_path, remote_timeline.timeline_layers)
|
||||
.context("Failed to convert timeline layers' paths to relative ones")?,
|
||||
missing_layers: to_relative_paths(timeline_path, remote_timeline.missing_layers)
|
||||
.context("Failed to convert missing layers' paths to relative ones")?,
|
||||
disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(),
|
||||
metadata_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata about a timeline checkpoint archive, parsed from its remote storage path.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ArchiveDescription {
|
||||
pub header_size: u64,
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
pub archive_name: String,
|
||||
fn to_local_paths(
|
||||
timeline_path: &Path,
|
||||
paths: impl IntoIterator<Item = RelativePath>,
|
||||
) -> HashSet<PathBuf> {
|
||||
paths
|
||||
.into_iter()
|
||||
.map(|path| path.as_path(timeline_path))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn try_parse_index_entry(
|
||||
index: &mut RemoteTimelineIndex,
|
||||
conf: &'static PageServerConf,
|
||||
path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenants_dir = conf.tenants_path();
|
||||
let tenant_id = path
|
||||
.strip_prefix(&tenants_dir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Path '{}' does not belong to tenants directory '{}'",
|
||||
path.display(),
|
||||
tenants_dir.display(),
|
||||
)
|
||||
})?
|
||||
.iter()
|
||||
.next()
|
||||
.with_context(|| format!("Found no tenant id in path '{}'", path.display()))?
|
||||
.to_string_lossy()
|
||||
.parse::<ZTenantId>()
|
||||
.with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
|
||||
|
||||
let timelines_path = conf.timelines_path(&tenant_id);
|
||||
match path.strip_prefix(&timelines_path) {
|
||||
Ok(timelines_subpath) => {
|
||||
let mut segments = timelines_subpath.iter();
|
||||
let timeline_id = segments
|
||||
.next()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"{} directory of tenant {} (path '{}') is not an index entry",
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
tenant_id,
|
||||
path.display()
|
||||
)
|
||||
})?
|
||||
.to_string_lossy()
|
||||
.parse::<ZTimelineId>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse timeline id from path '{}'", path.display())
|
||||
})?;
|
||||
|
||||
let (disk_consistent_lsn, header_size) =
|
||||
parse_archive_name(path).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse archive name out in path '{}'",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let archive_name = path
|
||||
.file_name()
|
||||
.with_context(|| format!("Archive '{}' has no file name", path.display()))?
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
let sync_id = ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
let timeline_index_entry = index.timeline_entries.entry(sync_id).or_insert_with(|| {
|
||||
TimelineIndexEntry::new(
|
||||
TimelineIndexEntryInner::Description(BTreeMap::default()),
|
||||
false,
|
||||
)
|
||||
});
|
||||
match timeline_index_entry.inner_mut() {
|
||||
TimelineIndexEntryInner::Description(description) => {
|
||||
description.insert(
ArchiveId(disk_consistent_lsn),
ArchiveDescription {
header_size,
disk_consistent_lsn,
archive_name,
},
);
}
TimelineIndexEntryInner::Full(_) => {
bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id)
}
}
}
Err(timelines_strip_error) => {
bail!(
"Path '{}' is not an archive entry '{}'",
path.display(),
timelines_strip_error,
)
}
}
Ok(())
fn to_relative_paths(
timeline_path: &Path,
paths: impl IntoIterator<Item = PathBuf>,
) -> anyhow::Result<HashSet<RelativePath>> {
paths
.into_iter()
.map(|path| RelativePath::new(timeline_path, path))
.collect()
}

#[cfg(test)]
mod tests {
use std::collections::BTreeSet;

use super::*;
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};

#[test]
fn header_restoration_preserves_file_order() {
let header = ArchiveHeader {
files: vec![
FileEntry {
size: 5,
subpath: RelativePath("one".to_string()),
},
FileEntry {
size: 1,
subpath: RelativePath("two".to_string()),
},
FileEntry {
size: 222,
subpath: RelativePath("zero".to_string()),
},
],
metadata_file_size: 5,
fn index_part_conversion() {
let harness = RepoHarness::create("index_part_conversion").unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let metadata =
TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));
let remote_timeline = RemoteTimeline {
timeline_layers: HashSet::from([
timeline_path.join("layer_1"),
timeline_path.join("layer_2"),
]),
missing_layers: HashSet::from([
timeline_path.join("missing_1"),
timeline_path.join("missing_2"),
]),
metadata: metadata.clone(),
awaits_download: false,
};

let lsn = Lsn(1);
let mut remote_timeline = RemoteTimeline::empty();
remote_timeline.update_archive_contents(lsn, header.clone(), 15);

let (restored_header, _) = remote_timeline
.restore_header(ArchiveId(lsn))
.expect("Should be able to restore header from a valid remote timeline");
let index_part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline.clone())
.expect("Correct remote timeline should be convertible to index part");

assert_eq!(
header, restored_header,
"Header restoration should preserve file order"
index_part.timeline_layers.iter().collect::<BTreeSet<_>>(),
BTreeSet::from([
&RelativePath("layer_1".to_string()),
&RelativePath("layer_2".to_string())
]),
"Index part should have all remote timeline layers after the conversion"
);
assert_eq!(
index_part.missing_layers.iter().collect::<BTreeSet<_>>(),
BTreeSet::from([
&RelativePath("missing_1".to_string()),
&RelativePath("missing_2".to_string())
]),
"Index part should have all missing remote timeline layers after the conversion"
);
assert_eq!(
index_part.disk_consistent_lsn,
metadata.disk_consistent_lsn(),
"Index part should have disk consistent lsn from the timeline"
);
assert_eq!(
index_part.metadata_bytes,
metadata
.to_bytes()
.expect("Failed to serialize correct metadata into bytes"),
"Index part should have all missing remote timeline layers after the conversion"
);

let restored_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
.expect("Correct index part should be convertible to remote timeline");

let original_metadata = &remote_timeline.metadata;
let restored_metadata = &restored_timeline.metadata;
// we have to compare the metadata this way, since its header is different after creation and restoration,
// but that is now considered ok.
assert_eq!(
original_metadata.disk_consistent_lsn(),
restored_metadata.disk_consistent_lsn(),
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
);
assert_eq!(
original_metadata.prev_record_lsn(),
restored_metadata.prev_record_lsn(),
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
);
assert_eq!(
original_metadata.ancestor_timeline(),
restored_metadata.ancestor_timeline(),
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
);
assert_eq!(
original_metadata.ancestor_lsn(),
restored_metadata.ancestor_lsn(),
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
);
assert_eq!(
original_metadata.latest_gc_cutoff_lsn(),
restored_metadata.latest_gc_cutoff_lsn(),
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
);
assert_eq!(
original_metadata.initdb_lsn(),
restored_metadata.initdb_lsn(),
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
);

assert_eq!(
remote_timeline.awaits_download, restored_timeline.awaits_download,
"remote timeline -> index part -> remote timeline conversion should not lose download flag"
);

assert_eq!(
remote_timeline
.timeline_layers
.into_iter()
.collect::<BTreeSet<_>>(),
restored_timeline
.timeline_layers
.into_iter()
.collect::<BTreeSet<_>>(),
"remote timeline -> index part -> remote timeline conversion should not lose layer data"
);
assert_eq!(
remote_timeline
.missing_layers
.into_iter()
.collect::<BTreeSet<_>>(),
restored_timeline
.missing_layers
.into_iter()
.collect::<BTreeSet<_>>(),
"remote timeline -> index part -> remote timeline conversion should not lose missing file data"
);
}

#[test]
fn index_part_conversion_negatives() {
let harness = RepoHarness::create("index_part_conversion_negatives").unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let metadata =
TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));

let conversion_result = IndexPart::from_remote_timeline(
&timeline_path,
RemoteTimeline {
timeline_layers: HashSet::from([
PathBuf::from("bad_path"),
timeline_path.join("layer_2"),
]),
missing_layers: HashSet::from([
timeline_path.join("missing_1"),
timeline_path.join("missing_2"),
]),
metadata: metadata.clone(),
awaits_download: false,
},
);
assert!(conversion_result.is_err(), "Should not be able to convert metadata with layer paths that are not in the timeline directory");

let conversion_result = IndexPart::from_remote_timeline(
&timeline_path,
RemoteTimeline {
timeline_layers: HashSet::from([
timeline_path.join("layer_1"),
timeline_path.join("layer_2"),
]),
missing_layers: HashSet::from([
PathBuf::from("bad_path"),
timeline_path.join("missing_2"),
]),
metadata,
awaits_download: false,
},
);
assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory");
}
}
@@ -1,520 +1,456 @@
//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.

use std::{collections::BTreeSet, path::PathBuf, sync::Arc};
use std::{fmt::Debug, path::PathBuf};

use tracing::{debug, error, warn};
use anyhow::Context;
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::fs;
use tracing::{debug, error, trace, warn};

use crate::{
config::PageServerConf,
layered_repository::metadata::metadata_path,
remote_storage::{
storage_sync::{
compression, fetch_full_index,
index::{RemoteTimeline, TimelineIndexEntry, TimelineIndexEntryInner},
sync_queue, SyncKind, SyncTask,
},
storage_sync::{index::RemoteTimeline, sync_queue, SyncTask},
RemoteStorage, ZTenantTimelineId,
},
};

use super::{compression::ArchiveHeader, NewCheckpoint, RemoteIndex};
use super::{index::IndexPart, SyncData, TimelineUpload};

/// Attempts to compress and upload given checkpoint files.
/// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
/// Serializes and uploads the given index part data to the remote storage.
pub(super) async fn upload_index_part<P, S>(
conf: &'static PageServerConf,
storage: &S,
sync_id: ZTenantTimelineId,
index_part: IndexPart,
) -> anyhow::Result<()>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
{
let index_part_bytes = serde_json::to_vec(&index_part)
.context("Failed to serialize index part file into bytes")?;
let index_part_size = index_part_bytes.len();
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));

let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
.with_file_name(IndexPart::FILE_NAME)
.with_extension(IndexPart::FILE_EXTENSION);
let index_part_storage_path = storage.storage_path(&index_part_path).with_context(|| {
format!(
"Failed to get the index part storage path for local path '{}'",
index_part_path.display()
)
})?;

storage
.upload(
index_part_bytes,
index_part_size,
&index_part_storage_path,
None,
)
.await
.with_context(|| {
format!(
"Failed to upload index part to the storage path '{:?}'",
index_part_storage_path
)
})
}
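A minimal, self-contained sketch of the serialize-then-upload flow used by upload_index_part above: serialize with serde_json, remember the byte count, and wrap the bytes in a buffered reader that can be streamed to a storage backend. FakeIndexPart is an illustrative stand-in, not the crate's real IndexPart type; only the serde_json/tokio pieces mirror the code above.

use serde::Serialize;

#[derive(Serialize)]
struct FakeIndexPart {
    timeline_layers: Vec<String>,
    disk_consistent_lsn: u64,
}

fn main() -> anyhow::Result<()> {
    let index_part = FakeIndexPart {
        timeline_layers: vec!["layer_1".into(), "layer_2".into()],
        disk_consistent_lsn: 0x40,
    };

    // Serialize to bytes and keep the size: storage backends need both.
    let bytes = serde_json::to_vec(&index_part)?;
    let size = bytes.len();

    // The real code hands a reader like this to RemoteStorage::upload together with `size`.
    let _reader = tokio::io::BufReader::new(std::io::Cursor::new(bytes));
    println!("would upload {size} bytes of index part JSON");
    Ok(())
}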
/// Timeline upload result, with extra data, needed for uploading.
#[derive(Debug)]
pub(super) enum UploadedTimeline {
/// Upload failed due to some error, the upload task is rescheduled for another retry.
FailedAndRescheduled,
/// No issues happened during the upload, all task files were put into the remote storage.
Successful(SyncData<TimelineUpload>),
/// No failures happened during the upload, but some files were removed locally before the upload task completed
/// (could happen due to retries, for instance, if GC happens in the interim).
/// Such files are considered "not needed" and ignored, but the task's metadata should be discarded and the new one loaded from the local file.
SuccessfulAfterLocalFsUpdate(SyncData<TimelineUpload>),
}
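A sketch of how a caller might branch on the three outcomes, using a simplified stand-in enum rather than the crate's UploadedTimeline/SyncData types; the reactions in the match arms are assumptions drawn from the doc comments above, not code from this commit.

#[derive(Debug)]
enum Outcome<T> {
    FailedAndRescheduled,
    Successful(T),
    SuccessfulAfterLocalFsUpdate(T),
}

fn handle(outcome: Outcome<&str>) {
    match outcome {
        // The upload code already re-queued the task, so there is nothing to publish yet.
        Outcome::FailedAndRescheduled => println!("upload re-queued, try again later"),
        // Every layer made it to remote storage; the index can be updated from the task data.
        Outcome::Successful(data) => println!("publish index for {data}"),
        // Upload finished, but the local fs changed under it; reload metadata before publishing.
        Outcome::SuccessfulAfterLocalFsUpdate(data) => {
            println!("reload local metadata, then publish index for {data}")
        }
    }
}

fn main() {
    handle(Outcome::Successful("timeline"));
    handle(Outcome::FailedAndRescheduled);
}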
/// Attempts to upload given layer files.
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
///
/// On an error, bumps the retries count and reschedules the entire task.
/// On success, populates index data with new downloads.
pub(super) async fn upload_timeline_checkpoint<
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
config: &'static PageServerConf,
remote_assets: Arc<(S, RemoteIndex)>,
pub(super) async fn upload_timeline_layers<'a, P, S>(
storage: &'a S,
remote_timeline: Option<&'a RemoteTimeline>,
sync_id: ZTenantTimelineId,
new_checkpoint: NewCheckpoint,
retries: u32,
) -> Option<bool> {
debug!("Uploading checkpoint for sync id {}", sync_id);
let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();
mut upload_data: SyncData<TimelineUpload>,
) -> UploadedTimeline
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
{
let upload = &mut upload_data.data;
let new_upload_lsn = upload.metadata.disk_consistent_lsn();
debug!(
"Uploading timeline layers for sync id {}, new lsn: {}",
sync_id, new_upload_lsn
);

let index = &remote_assets.1;

let ZTenantTimelineId {
tenant_id,
timeline_id,
} = sync_id;
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);

let index_read = index.read().await;
let remote_timeline = match index_read.timeline_entry(&sync_id) {
None => {
drop(index_read);
None
}
Some(entry) => match entry.inner() {
TimelineIndexEntryInner::Full(remote_timeline) => {
let r = Some(remote_timeline.clone());
drop(index_read);
r
}
TimelineIndexEntryInner::Description(_) => {
drop(index_read);
debug!("Found timeline description for the given ids, downloading the full index");
match fetch_full_index(remote_assets.as_ref(), &timeline_dir, sync_id).await {
Ok(remote_timeline) => Some(remote_timeline),
Err(e) => {
error!("Failed to download full timeline index: {:?}", e);
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Upload(new_checkpoint),
));
return Some(false);
}
}
}
},
};

let already_contains_upload_lsn = remote_timeline
.as_ref()
.map(|remote_timeline| remote_timeline.contains_checkpoint_at(new_upload_lsn))
.unwrap_or(false);
if already_contains_upload_lsn {
warn!(
"Received a checkpoint with Lsn {} that's already been uploaded to remote storage, skipping the upload.",
new_upload_lsn
);
return None;
}

let already_uploaded_files = remote_timeline
.map(|timeline| timeline.stored_files(&timeline_dir))
let already_uploaded_layers = remote_timeline
.map(|timeline| timeline.stored_files())
.cloned()
.unwrap_or_default();

match try_upload_checkpoint(
config,
Arc::clone(&remote_assets),
sync_id,
&new_checkpoint,
already_uploaded_files,
)
.await
{
Some(Ok((archive_header, header_size))) => {
let mut index_write = index.write().await;
match index_write
.timeline_entry_mut(&sync_id)
.map(|e| e.inner_mut())
{
None => {
let mut new_timeline = RemoteTimeline::empty();
new_timeline.update_archive_contents(
new_checkpoint.metadata.disk_consistent_lsn(),
archive_header,
header_size,
);
index_write.add_timeline_entry(
sync_id,
TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false),
let layers_to_upload = upload
.layers_to_upload
.difference(&already_uploaded_layers)
.cloned()
.collect::<Vec<_>>();

trace!("Layers to upload: {:?}", layers_to_upload);

let mut upload_tasks = layers_to_upload
.into_iter()
.map(|source_path| async move {
let storage_path = storage
.storage_path(&source_path)
.with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
source_path.display()
)
}
Some(TimelineIndexEntryInner::Full(remote_timeline)) => {
remote_timeline.update_archive_contents(
new_checkpoint.metadata.disk_consistent_lsn(),
archive_header,
header_size,
);
}
Some(TimelineIndexEntryInner::Description(_)) => {
let mut new_timeline = RemoteTimeline::empty();
new_timeline.update_archive_contents(
new_checkpoint.metadata.disk_consistent_lsn(),
archive_header,
header_size,
);
index_write.add_timeline_entry(
sync_id,
TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false),
})
.map_err(UploadError::Other)?;

let source_file = match fs::File::open(&source_path).await.with_context(|| {
format!(
"Failed to open a source file for layer '{}'",
source_path.display()
)
}) {
Ok(file) => file,
Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)),
};

let source_size = source_file
.metadata()
.await
.with_context(|| {
format!(
"Failed to get the source file metadata for layer '{}'",
source_path.display()
)
}
})
.map_err(UploadError::Other)?
.len() as usize;

match storage
.upload(source_file, source_size, &storage_path, None)
.await
.with_context(|| {
format!(
"Failed to upload a layer from local path '{}'",
source_path.display()
)
}) {
Ok(()) => Ok(source_path),
Err(e) => Err(UploadError::MissingLocalFile(source_path, e)),
}
debug!("Checkpoint uploaded successfully");
Some(true)
})
.collect::<FuturesUnordered<_>>();

debug!("uploading {} layers of a timeline", upload_tasks.len());

let mut errors_happened = false;
let mut local_fs_updated = false;
while let Some(upload_result) = upload_tasks.next().await {
match upload_result {
Ok(uploaded_path) => {
upload.layers_to_upload.remove(&uploaded_path);
upload.uploaded_layers.insert(uploaded_path);
}
Err(e) => match e {
UploadError::Other(e) => {
errors_happened = true;
error!("Failed to upload a layer for timeline {}: {:?}", sync_id, e);
}
UploadError::MissingLocalFile(source_path, e) => {
if source_path.exists() {
errors_happened = true;
error!("Failed to upload a layer for timeline {}: {:?}", sync_id, e);
} else {
local_fs_updated = true;
upload.layers_to_upload.remove(&source_path);
warn!("Missing locally a layer file scheduled for upload, skipping");
}
}
},
}
Some(Err(e)) => {
error!(
"Failed to upload checkpoint: {:?}, requeueing the upload",
e
);
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Upload(new_checkpoint),
));
Some(false)
}

if errors_happened {
debug!("Reenqueuing failed upload task for timeline {}", sync_id);
upload_data.retries += 1;
sync_queue::push(sync_id, SyncTask::Upload(upload_data));
UploadedTimeline::FailedAndRescheduled
} else {
debug!("Finished uploading all timeline's layers");
if local_fs_updated {
UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data)
} else {
UploadedTimeline::Successful(upload_data)
}
None => Some(true),
}
}
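The upload loop above fans each layer out as its own future and drains them with FuturesUnordered, handling results in whatever order they finish. A minimal, self-contained sketch of that pattern, with upload_one standing in for the real storage.upload() call:

use futures::stream::{FuturesUnordered, StreamExt};

// Stand-in for uploading a single layer file; the real code returns the uploaded path.
async fn upload_one(name: &'static str) -> Result<&'static str, String> {
    Ok(name)
}

#[tokio::main]
async fn main() {
    let mut tasks = ["layer_a", "layer_b", "layer_c"]
        .into_iter()
        .map(upload_one)
        .collect::<FuturesUnordered<_>>();

    // Results arrive as tasks complete, not in submission order.
    while let Some(result) = tasks.next().await {
        match result {
            Ok(uploaded) => println!("uploaded {uploaded}"),
            Err(e) => eprintln!("upload failed: {e}"),
        }
    }
}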
async fn try_upload_checkpoint<
P: Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
config: &'static PageServerConf,
remote_assets: Arc<(S, RemoteIndex)>,
sync_id: ZTenantTimelineId,
new_checkpoint: &NewCheckpoint,
files_to_skip: BTreeSet<PathBuf>,
) -> Option<anyhow::Result<(ArchiveHeader, u64)>> {
let ZTenantTimelineId {
tenant_id,
timeline_id,
} = sync_id;
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);

let files_to_upload = new_checkpoint
.layers
.iter()
.filter(|&path_to_upload| {
if files_to_skip.contains(path_to_upload) {
warn!(
"Skipping file upload '{}', since it was already uploaded",
path_to_upload.display()
);
false
} else {
true
}
})
.collect::<Vec<_>>();

if files_to_upload.is_empty() {
warn!(
"No files to upload. Upload request was: {:?}, already uploaded files: {:?}",
new_checkpoint.layers, files_to_skip
);
return None;
}

let upload_result = compression::archive_files_as_stream(
&timeline_dir,
files_to_upload.into_iter(),
&new_checkpoint.metadata,
move |archive_streamer, archive_name| async move {
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
let remote_storage = &remote_assets.0;
remote_storage
.upload(
archive_streamer,
&remote_storage.storage_path(&timeline_dir.join(&archive_name))?,
None,
)
.await
},
)
.await
.map(|(header, header_size, _)| (header, header_size));

Some(upload_result)
enum UploadError {
MissingLocalFile(PathBuf, anyhow::Error),
Other(anyhow::Error),
}
#[cfg(test)]
mod tests {
use std::collections::{BTreeSet, HashSet};

use tempfile::tempdir;
use zenith_utils::lsn::Lsn;

use crate::{
remote_storage::{
local_fs::LocalFs,
storage_sync::{
index::ArchiveId,
test_utils::{
assert_index_descriptions, create_local_timeline, dummy_metadata,
ensure_correct_timeline_upload, expect_timeline,
},
index::RelativePath,
test_utils::{create_local_timeline, dummy_metadata},
},
LocalFs,
},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};

use super::*;
use super::{upload_index_part, *};

#[tokio::test]
async fn reupload_timeline() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("reupload_timeline")?;
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
let index = RemoteIndex::try_parse_descriptions_from_paths(
repo_harness.conf,
storage
.list()
.await?
.into_iter()
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
);
let remote_assets = Arc::new((storage, index));
let index = &remote_assets.1;
async fn regular_layer_upload() -> anyhow::Result<()> {
let harness = RepoHarness::create("regular_layer_upload")?;
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);

let first_upload_metadata = dummy_metadata(Lsn(0x10));
let first_checkpoint = create_local_timeline(
&repo_harness,
TIMELINE_ID,
&["a", "b"],
first_upload_metadata.clone(),
)?;
let local_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
ensure_correct_timeline_upload(
&repo_harness,
Arc::clone(&remote_assets),
TIMELINE_ID,
first_checkpoint,
let layer_files = ["a", "b"];
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
let current_retries = 3;
let metadata = dummy_metadata(Lsn(0x30));
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
let timeline_upload =
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
assert!(
storage.list().await?.is_empty(),
"Storage should be empty before any uploads are made"
);

let upload_result = upload_timeline_layers(
&storage,
None,
sync_id,
SyncData::new(current_retries, timeline_upload.clone()),
)
.await;

let uploaded_timeline = expect_timeline(index, sync_id).await;
let uploaded_archives = uploaded_timeline
.checkpoints()
.map(ArchiveId)
.collect::<Vec<_>>();
let upload_data = match upload_result {
UploadedTimeline::Successful(upload_data) => upload_data,
wrong_result => panic!(
"Expected a successful upload for timeline, but got: {:?}",
wrong_result
),
};

assert_eq!(
uploaded_archives.len(),
1,
"Only one archive is expected after a first upload"
current_retries, upload_data.retries,
"On successful upload, retries are not expected to change"
);
let first_uploaded_archive = uploaded_archives.first().copied().unwrap();
assert_eq!(
uploaded_timeline.checkpoints().last(),
Some(first_upload_metadata.disk_consistent_lsn()),
"Metadata that was uploaded, should have its Lsn stored"
let upload = &upload_data.data;
assert!(
upload.layers_to_upload.is_empty(),
"Successful upload should have no layers left to upload"
);
assert_eq!(
uploaded_timeline
.archive_data(uploaded_archives.first().copied().unwrap())
.unwrap()
.disk_consistent_lsn(),
first_upload_metadata.disk_consistent_lsn(),
"Uploaded archive should have corresponding Lsn"
);
assert_eq!(
uploaded_timeline.stored_files(&local_timeline_path),
vec![local_timeline_path.join("a"), local_timeline_path.join("b")]
.into_iter()
upload
.uploaded_layers
.iter()
.cloned()
.collect::<BTreeSet<_>>(),
layer_files
.iter()
.map(|layer_file| local_timeline_path.join(layer_file))
.collect(),
"Should have all files from the first checkpoint"
"Successful upload should have all layers uploaded"
);
assert_eq!(
upload.metadata, metadata,
"Successful upload should not change its metadata"
);

let second_upload_metadata = dummy_metadata(Lsn(0x40));
let second_checkpoint = create_local_timeline(
&repo_harness,
TIMELINE_ID,
&["b", "c"],
second_upload_metadata.clone(),
)?;
assert!(
first_upload_metadata.disk_consistent_lsn()
< second_upload_metadata.disk_consistent_lsn()
let storage_files = storage.list().await?;
assert_eq!(
storage_files.len(),
layer_files.len(),
"All layers should be uploaded"
);
ensure_correct_timeline_upload(
&repo_harness,
Arc::clone(&remote_assets),
TIMELINE_ID,
second_checkpoint,
assert_eq!(
storage_files
.into_iter()
.map(|storage_path| storage.local_path(&storage_path))
.collect::<anyhow::Result<BTreeSet<_>>>()?,
layer_files
.into_iter()
.map(|file| local_timeline_path.join(file))
.collect(),
"Uploaded files should match with the local ones"
);

Ok(())
}
// Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario.
#[tokio::test]
async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> {
let harness = RepoHarness::create("layer_upload_after_local_fs_update")?;
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);

let layer_files = ["a1", "b1"];
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
let current_retries = 5;
let metadata = dummy_metadata(Lsn(0x40));

let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
let layers_to_upload = {
let mut layers = layer_files.to_vec();
layers.push("layer_to_remove");
layers
};
let timeline_upload =
create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone())
.await?;
assert!(
storage.list().await?.is_empty(),
"Storage should be empty before any uploads are made"
);

fs::remove_file(local_timeline_path.join("layer_to_remove")).await?;

let upload_result = upload_timeline_layers(
&storage,
None,
sync_id,
SyncData::new(current_retries, timeline_upload.clone()),
)
.await;

let updated_timeline = expect_timeline(index, sync_id).await;
let mut updated_archives = updated_timeline
.checkpoints()
.map(ArchiveId)
.collect::<Vec<_>>();
let upload_data = match upload_result {
UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) => upload_data,
wrong_result => panic!(
"Expected a successful after local fs upload for timeline, but got: {:?}",
wrong_result
),
};

assert_eq!(
updated_archives.len(),
2,
"Two archives are expected after a successful update of the upload"
current_retries, upload_data.retries,
"On successful upload, retries are not expected to change"
);
updated_archives.retain(|archive_id| archive_id != &first_uploaded_archive);
let upload = &upload_data.data;
assert!(
upload.layers_to_upload.is_empty(),
"Successful upload should have no layers left to upload, even those that were removed from the local fs"
);
assert_eq!(
updated_archives.len(),
1,
"Only one new archive is expected among the uploaded"
);
let second_uploaded_archive = updated_archives.last().copied().unwrap();
assert_eq!(
updated_timeline.checkpoints().max(),
Some(second_upload_metadata.disk_consistent_lsn()),
"Metadata that was uploaded, should have its Lsn stored"
upload
.uploaded_layers
.iter()
.cloned()
.collect::<BTreeSet<_>>(),
layer_files
.iter()
.map(|layer_file| local_timeline_path.join(layer_file))
.collect(),
"Successful upload should have all layers uploaded"
);
assert_eq!(
updated_timeline
.archive_data(second_uploaded_archive)
.unwrap()
.disk_consistent_lsn(),
second_upload_metadata.disk_consistent_lsn(),
"Uploaded archive should have corresponding Lsn"
);
assert_eq!(
updated_timeline.stored_files(&local_timeline_path),
vec![
local_timeline_path.join("a"),
local_timeline_path.join("b"),
local_timeline_path.join("c"),
]
.into_iter()
.collect(),
"Should have all files from both checkpoints without duplicates"
upload.metadata, metadata,
"Successful upload should not change its metadata"
);

let third_upload_metadata = dummy_metadata(Lsn(0x20));
let third_checkpoint = create_local_timeline(
&repo_harness,
TIMELINE_ID,
&["d"],
third_upload_metadata.clone(),
)?;
assert_ne!(
third_upload_metadata.disk_consistent_lsn(),
first_upload_metadata.disk_consistent_lsn()
);
assert!(
third_upload_metadata.disk_consistent_lsn()
< second_upload_metadata.disk_consistent_lsn()
);
ensure_correct_timeline_upload(
&repo_harness,
Arc::clone(&remote_assets),
TIMELINE_ID,
third_checkpoint,
)
.await;

let updated_timeline = expect_timeline(index, sync_id).await;
let mut updated_archives = updated_timeline
.checkpoints()
.map(ArchiveId)
.collect::<Vec<_>>();
let storage_files = storage.list().await?;
assert_eq!(
updated_archives.len(),
3,
"Three archives are expected after two successful updates of the upload"
);
updated_archives.retain(|archive_id| {
archive_id != &first_uploaded_archive && archive_id != &second_uploaded_archive
});
assert_eq!(
updated_archives.len(),
1,
"Only one new archive is expected among the uploaded"
);
let third_uploaded_archive = updated_archives.last().copied().unwrap();
assert!(
updated_timeline.checkpoints().max().unwrap()
> third_upload_metadata.disk_consistent_lsn(),
"Should not influence the last lsn by uploading an older checkpoint"
storage_files.len(),
layer_files.len(),
"All layers should be uploaded"
);
assert_eq!(
updated_timeline
.archive_data(third_uploaded_archive)
.unwrap()
.disk_consistent_lsn(),
third_upload_metadata.disk_consistent_lsn(),
"Uploaded archive should have corresponding Lsn"
);
assert_eq!(
updated_timeline.stored_files(&local_timeline_path),
vec![
local_timeline_path.join("a"),
local_timeline_path.join("b"),
local_timeline_path.join("c"),
local_timeline_path.join("d"),
]
.into_iter()
.collect(),
"Should have all files from three checkpoints without duplicates"
storage_files
.into_iter()
.map(|storage_path| storage.local_path(&storage_path))
.collect::<anyhow::Result<BTreeSet<_>>>()?,
layer_files
.into_iter()
.map(|file| local_timeline_path.join(file))
.collect(),
"Uploaded files should match with the local ones"
);

Ok(())
}
#[tokio::test]
async fn reupload_timeline_rejected() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("reupload_timeline_rejected")?;
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
let index = RemoteIndex::try_parse_descriptions_from_paths(
repo_harness.conf,
storage
.list()
.await?
.into_iter()
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
);
let remote_assets = Arc::new((storage, index));
let storage = &remote_assets.0;
let index = &remote_assets.1;
async fn test_upload_index_part() -> anyhow::Result<()> {
let harness = RepoHarness::create("test_upload_index_part")?;
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);

let first_upload_metadata = dummy_metadata(Lsn(0x10));
let first_checkpoint = create_local_timeline(
&repo_harness,
TIMELINE_ID,
&["a", "b"],
first_upload_metadata.clone(),
)?;
ensure_correct_timeline_upload(
&repo_harness,
Arc::clone(&remote_assets),
TIMELINE_ID,
first_checkpoint,
)
.await;
let after_first_uploads = RemoteIndex::try_parse_descriptions_from_paths(
repo_harness.conf,
remote_assets
.0
.list()
.await
.unwrap()
.into_iter()
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
let metadata = dummy_metadata(Lsn(0x40));
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);

let index_part = IndexPart::new(
HashSet::from([
RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?,
RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?,
]),
HashSet::from([RelativePath::new(
&local_timeline_path,
local_timeline_path.join("three"),
)?]),
metadata.disk_consistent_lsn(),
metadata.to_bytes()?,
);

let normal_upload_metadata = dummy_metadata(Lsn(0x20));
assert_ne!(
normal_upload_metadata.disk_consistent_lsn(),
first_upload_metadata.disk_consistent_lsn()
assert!(
storage.list().await?.is_empty(),
"Storage should be empty before any uploads are made"
);
upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?;

let storage_files = storage.list().await?;
assert_eq!(
storage_files.len(),
1,
"Should have only the index part file uploaded"
);

let checkpoint_with_no_files = create_local_timeline(
&repo_harness,
TIMELINE_ID,
&[],
normal_upload_metadata.clone(),
)?;
upload_timeline_checkpoint(
repo_harness.conf,
Arc::clone(&remote_assets),
sync_id,
checkpoint_with_no_files,
0,
)
.await;
assert_index_descriptions(index, &after_first_uploads).await;
let index_part_path = storage_files.first().unwrap();
assert_eq!(
index_part_path.file_stem().and_then(|name| name.to_str()),
Some(IndexPart::FILE_NAME),
"Remote index part should have the correct name"
);
assert_eq!(
index_part_path
.extension()
.and_then(|extension| extension.to_str()),
Some(IndexPart::FILE_EXTENSION),
"Remote index part should have the correct extension"
);

let checkpoint_with_uploaded_lsn = create_local_timeline(
&repo_harness,
TIMELINE_ID,
&["something", "new"],
first_upload_metadata.clone(),
)?;
upload_timeline_checkpoint(
repo_harness.conf,
Arc::clone(&remote_assets),
sync_id,
checkpoint_with_uploaded_lsn,
0,
)
.await;
assert_index_descriptions(index, &after_first_uploads).await;
let remote_index_part: IndexPart =
serde_json::from_slice(&fs::read(&index_part_path).await?)?;
assert_eq!(
index_part, remote_index_part,
"Remote index part should match the local one"
);

Ok(())
}
@@ -182,14 +182,12 @@ impl Value {

#[derive(Clone, Copy, Debug)]
pub enum TimelineSyncStatusUpdate {
Uploaded,
Downloaded,
}

impl Display for TimelineSyncStatusUpdate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s = match self {
TimelineSyncStatusUpdate::Uploaded => "Uploaded",
TimelineSyncStatusUpdate::Downloaded => "Downloaded",
};
f.write_str(s)

@@ -95,7 +95,7 @@ pub fn load_local_repo(
/// Updates tenants' repositories, changing their timelines state in memory.
pub fn apply_timeline_sync_status_updates(
conf: &'static PageServerConf,
remote_index: RemoteIndex,
remote_index: &RemoteIndex,
sync_status_updates: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>,
) {
if sync_status_updates.is_empty() {
@@ -109,7 +109,7 @@ pub fn apply_timeline_sync_status_updates(
trace!("Sync status updates: {:?}", sync_status_updates);

for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates {
let repo = load_local_repo(conf, tenant_id, &remote_index);
let repo = load_local_repo(conf, tenant_id, remote_index);

for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates {
match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update)

@@ -114,8 +114,8 @@ impl LocalTimelineInfo {
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RemoteTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
pub remote_consistent_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
pub awaits_download: bool,
}
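The RemoteTimelineInfo change above switches remote_consistent_lsn from Option<Lsn> to a plain Lsn serialized through DisplayFromStr, so it always appears as a string in the JSON. A self-contained sketch of that serde_with pattern, with FakeLsn as an illustrative stand-in for the real Lsn type (the exact serde_with version and enabled features are assumptions):

use std::{fmt, str::FromStr};

use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};

// Illustrative stand-in for zenith_utils::lsn::Lsn; only Display/FromStr matter here.
#[derive(Debug, PartialEq)]
struct FakeLsn(u64);

impl fmt::Display for FakeLsn {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:X}/{:X}", self.0 >> 32, self.0 & 0xffff_ffff)
    }
}

impl FromStr for FakeLsn {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let (hi, lo) = s.split_once('/').ok_or("expected HI/LO")?;
        let hi = u64::from_str_radix(hi, 16).map_err(|e| e.to_string())?;
        let lo = u64::from_str_radix(lo, 16).map_err(|e| e.to_string())?;
        Ok(FakeLsn((hi << 32) | lo))
    }
}

#[serde_as]
#[derive(Serialize, Deserialize)]
struct RemoteInfo {
    // DisplayFromStr serializes via Display and deserializes via FromStr.
    #[serde_as(as = "DisplayFromStr")]
    remote_consistent_lsn: FakeLsn,
    awaits_download: bool,
}

fn main() -> anyhow::Result<()> {
    let info = RemoteInfo {
        remote_consistent_lsn: FakeLsn(0x0169_60E8),
        awaits_download: false,
    };
    // Prints {"remote_consistent_lsn":"0/16960E8","awaits_download":false}
    println!("{}", serde_json::to_string(&info)?);
    Ok(())
}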
@@ -305,7 +305,7 @@ fn walreceiver_main(
tenant_id,
timeline_id,
})
.and_then(|e| e.disk_consistent_lsn())
.map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn())
.unwrap_or(Lsn(0)) // no checkpoint was uploaded
});

@@ -18,6 +18,7 @@ import pytest
# * starts a pageserver with remote storage, stores specific data in its tables
# * triggers a checkpoint (which produces a local data scheduled for backup), gets the corresponding timeline id
# * polls the timeline status to ensure it's copied remotely
# * inserts more data in the pageserver and repeats the process, to check multiple checkpoints case
# * stops the pageserver, clears all local directories
#
# 2. Second pageserver
@@ -50,27 +51,30 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]

with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute(f'''
CREATE TABLE t1(id int primary key, secret text);
INSERT INTO t1 VALUES ({data_id}, '{data_secret}');
''')
cur.execute("SELECT pg_current_wal_flush_lsn()")
current_lsn = lsn_from_hex(cur.fetchone()[0])
checkpoint_numbers = range(1, 3)

# wait until pageserver receives that data
wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn)
for checkpoint_number in checkpoint_numbers:
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute(f'''
CREATE TABLE t{checkpoint_number}(id int primary key, secret text);
INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}');
''')
cur.execute("SELECT pg_current_wal_flush_lsn()")
current_lsn = lsn_from_hex(cur.fetchone()[0])

# run checkpoint manually to be sure that data landed in remote storage
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor() as pscur:
pscur.execute(f"checkpoint {tenant_id} {timeline_id}")
# wait until pageserver receives that data
wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn)

log.info("waiting for upload")
# wait until pageserver successfully uploaded a checkpoint to remote storage
wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn)
log.info("upload is done")
# run checkpoint manually to be sure that data landed in remote storage
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor() as pscur:
pscur.execute(f"checkpoint {tenant_id} {timeline_id}")

log.info(f'waiting for checkpoint {checkpoint_number} upload')
# wait until pageserver successfully uploaded a checkpoint to remote storage
wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn)
log.info(f'upload of checkpoint {checkpoint_number} is done')

##### Stop the first pageserver instance, erase all its data
env.postgres.stop_all()
@@ -93,5 +97,6 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
pg = env.postgres.create_start('main')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};')
assert cur.fetchone() == (data_secret, )
for checkpoint_number in checkpoint_numbers:
cur.execute(f'SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};')
assert cur.fetchone() == (f'{data_secret}|{checkpoint_number}', )

@@ -550,7 +550,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
let tenant_id = get_tenant_id(create_match, env)?;
let new_branch_name = create_match
.value_of("branch-name")
.ok_or(anyhow!("No branch name provided"))?;
.ok_or_else(|| anyhow!("No branch name provided"))?;
let timeline = pageserver
.timeline_create(tenant_id, None, None, None)?
.ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?;
@@ -571,7 +571,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
let tenant_id = get_tenant_id(branch_match, env)?;
let new_branch_name = branch_match
.value_of("branch-name")
.ok_or(anyhow!("No branch name provided"))?;
.ok_or_else(|| anyhow!("No branch name provided"))?;
let ancestor_branch_name = branch_match
.value_of("ancestor-branch-name")
.unwrap_or(DEFAULT_BRANCH_NAME);