mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 11:40:38 +00:00
Add core relish backup and restore functionality
This commit is contained in:
committed by
Kirill Bulatov
parent
8c42dcc041
commit
04fb0a0342
426
Cargo.lock
generated
426
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
|
||||
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt", "io-util"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
@@ -45,3 +45,4 @@ workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
|
||||
@@ -41,7 +41,7 @@ Legend:
|
||||
+--+
|
||||
|
||||
....
|
||||
. . Component that we will need, but doesn't exist at the moment. A TODO.
|
||||
. . Component at its early development phase.
|
||||
....
|
||||
|
||||
---> Data flow
|
||||
@@ -116,13 +116,49 @@ Remove old on-disk layer files that are no longer needed according to the
|
||||
PITR retention policy
|
||||
|
||||
|
||||
TODO: Backup service
|
||||
--------------------
|
||||
### Backup service
|
||||
|
||||
The backup service is responsible for periodically pushing the chunks to S3.
|
||||
The backup service is responsible for storing pageserver recovery data externally.
|
||||
|
||||
TODO: How/when do restore from S3? Whenever we get a GetPage@LSN request for
|
||||
a chunk we don't currently have? Or when an external Control Plane tells us?
|
||||
Currently, pageserver stores its files in a filesystem directory it's pointed to.
|
||||
That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
|
||||
Therefore, the server interacts with external, more reliable storage to back up and restore its state.
|
||||
|
||||
The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait.
|
||||
There are the following implementations present:
|
||||
* local filesystem — to use in tests mainly
|
||||
* AWS S3 - to use in production
|
||||
|
||||
Implementation details are covered in the [storage readme](./src/relish_storage/README.md) and corresponding Rust file docs.
|
||||
|
||||
The backup service is disabled by default and can be enabled to interact with a single remote storage.
|
||||
|
||||
CLI examples:
|
||||
* Local FS: `${PAGESERVER_BIN} --relish-storage-local-path="/some/local/path/"`
|
||||
* AWS S3 : `${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" --relish-storage-region="eu-north-1" --relish-storage-access-key="SOMEKEYAAAAASADSAH*#" --relish-storage-secret-access-key="SOMEsEcReTsd292v"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, or on the AWS Settings page for a certain user. Also note that bucket names do not contain any protocols when used on AWS.
|
||||
For local S3 installations, refer to their documentation for name format and credentials.
|
||||
|
||||
Similar to other pageserver settings, a toml config file can be used to configure either of the storages as backup targets.
|
||||
Required sections are:
|
||||
|
||||
```toml
|
||||
[relish_storage]
|
||||
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```toml
|
||||
[relish_storage]
|
||||
bucket_name = 'some-sample-bucket'
|
||||
bucket_region = 'eu-north-1'
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
```
|
||||
|
||||
Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
|
||||
|
||||
TODO: Sharding
|
||||
--------------------
|
||||
|
||||
@@ -113,6 +113,9 @@ lazy_static! {
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
/// Parts of the `.zenith/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
|
||||
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
|
||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
|
||||
///
|
||||
/// Repository consists of multiple timelines. Keep them in a hash table.
|
||||
@@ -266,7 +269,7 @@ impl LayeredRepository {
|
||||
|
||||
let mut timeline = LayeredTimeline::new(
|
||||
self.conf,
|
||||
metadata,
|
||||
metadata.clone(),
|
||||
ancestor,
|
||||
timelineid,
|
||||
self.tenantid,
|
||||
@@ -276,15 +279,9 @@ impl LayeredRepository {
|
||||
)?;
|
||||
|
||||
// List the layers on disk, and load them into the layer map
|
||||
let _loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?;
|
||||
let loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?;
|
||||
if self.upload_relishes {
|
||||
schedule_timeline_upload(());
|
||||
// schedule_timeline_upload(
|
||||
// self.tenantid,
|
||||
// timelineid,
|
||||
// loaded_layers,
|
||||
// disk_consistent_lsn,
|
||||
// );
|
||||
schedule_timeline_upload(self.tenantid, timelineid, loaded_layers, metadata);
|
||||
}
|
||||
|
||||
// needs to be after load_layer_map
|
||||
@@ -412,13 +409,7 @@ impl LayeredRepository {
|
||||
.create_new(first_save)
|
||||
.open(&path)?;
|
||||
|
||||
let mut metadata_bytes = TimelineMetadata::ser(data)?;
|
||||
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
|
||||
|
||||
if file.write(&metadata_bytes)? != metadata_bytes.len() {
|
||||
bail!("Could not write all the metadata bytes in a single call");
|
||||
@@ -445,20 +436,7 @@ impl LayeredRepository {
|
||||
) -> Result<TimelineMetadata> {
|
||||
let path = metadata_path(conf, timelineid, tenantid);
|
||||
let metadata_bytes = std::fs::read(&path)?;
|
||||
ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(calculated_checksum == expected_checksum);
|
||||
|
||||
let data = TimelineMetadata::des_prefix(data)?;
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
}
|
||||
|
||||
//
|
||||
@@ -586,9 +564,11 @@ impl LayeredRepository {
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
/// [`Lsn`] that corresponds to the corresponding timeline directory
|
||||
/// contents, stored locally in the pageserver workdir.
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
@@ -600,10 +580,45 @@ pub struct TimelineMetadata {
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
pub prev_record_lsn: Option<Lsn>,
|
||||
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
pub ancestor_timeline: Option<ZTimelineId>,
|
||||
pub ancestor_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(
|
||||
calculated_checksum == expected_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
let data = TimelineMetadata::des_prefix(data)?;
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let mut metadata_bytes = TimelineMetadata::ser(self)?;
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayeredTimeline {
|
||||
@@ -1374,6 +1389,7 @@ impl LayeredTimeline {
|
||||
ancestor_timeline: ancestor_timelineid,
|
||||
ancestor_lsn: self.ancestor_lsn,
|
||||
};
|
||||
|
||||
LayeredRepository::save_metadata(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
@@ -1381,19 +1397,12 @@ impl LayeredTimeline {
|
||||
&metadata,
|
||||
false,
|
||||
)?;
|
||||
if self.upload_relishes {
|
||||
schedule_timeline_upload(self.tenantid, self.timelineid, layer_uploads, metadata);
|
||||
}
|
||||
|
||||
// Also update the in-memory copy
|
||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
||||
|
||||
if self.upload_relishes {
|
||||
schedule_timeline_upload(())
|
||||
// schedule_timeline_upload(
|
||||
// self.tenantid,
|
||||
// self.timelineid,
|
||||
// layer_uploads,
|
||||
// disk_consistent_lsn,
|
||||
// });
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -1947,7 +1956,7 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn metadata_path(
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
|
||||
@@ -442,12 +442,7 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
);
|
||||
let path = self.path();
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
@@ -91,7 +92,7 @@ impl PageServerConf {
|
||||
//
|
||||
|
||||
fn tenants_path(&self) -> PathBuf {
|
||||
self.workdir.join("tenants")
|
||||
self.workdir.join(TENANTS_SEGMENT_NAME)
|
||||
}
|
||||
|
||||
fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
|
||||
@@ -115,7 +116,7 @@ impl PageServerConf {
|
||||
}
|
||||
|
||||
fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.tenant_path(tenantid).join("timelines")
|
||||
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
|
||||
@@ -1,60 +1,138 @@
|
||||
//! Abstractions for the page server to store its relish layer data in the external storage.
|
||||
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
|
||||
//! This particular module serves as a public API border between pageserver and the internal storage machinery.
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! Main purpose of this module subtree is to provide a set of abstractions to manage the storage state
|
||||
//! in a way, optimal for page server.
|
||||
//! There are a few components the storage machinery consists of:
|
||||
//! * [`RelishStorage`] trait: a CRUD-like generic abstraction to use for adapting external storages, with a few implementations:
|
||||
//! * [`local_fs`] allows to use local file system as an external storage
|
||||
//! * [`rust_s3`] uses AWS S3 bucket entirely as an external storage
|
||||
//!
|
||||
//! The abstractions hide multiple custom external storage API implementations,
|
||||
//! such as AWS S3, local filesystem, etc., located in the submodules.
|
||||
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
|
||||
//!
|
||||
//! * public API to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_upload`]
|
||||
//!
|
||||
//! Here's a schematic overview of all interactions relish storage and the rest of the pageserver perform:
|
||||
//!
|
||||
//! +------------------------+ +--------->-------+
|
||||
//! | | - - - (init async loop) - - - -> | |
|
||||
//! | | | |
|
||||
//! | | -------------------------------> | async |
|
||||
//! | pageserver | (schedule frozen layer upload) | upload/download |
|
||||
//! | | | loop |
|
||||
//! | | <------------------------------- | |
|
||||
//! | | (register downloaded layers) | |
|
||||
//! +------------------------+ +---------<-------+
|
||||
//! |
|
||||
//! |
|
||||
//! CRUD layer file operations |
|
||||
//! (upload/download/delete/list, etc.) |
|
||||
//! V
|
||||
//! +------------------------+
|
||||
//! | |
|
||||
//! | [`RelishStorage`] impl |
|
||||
//! | |
|
||||
//! | pageserver assumes it |
|
||||
//! | owns exclusive write |
|
||||
//! | access to this storage |
|
||||
//! +------------------------+
|
||||
//!
|
||||
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
|
||||
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
|
||||
//! If the storage sync loop was successfully started before, pageserver schedules the new image uploads after every checkpoint.
|
||||
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
|
||||
//!
|
||||
//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`).
|
||||
//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
|
||||
//! by the storage upload, if enabled.
|
||||
//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
|
||||
//! No files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
|
||||
//! when the newer timeline is downloaded.
|
||||
//!
|
||||
//! Meanwhile, the loop inits the storage connection and checks the remote files stored.
|
||||
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
|
||||
//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
|
||||
//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
|
||||
//!
|
||||
//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
|
||||
//!
|
||||
//! NOTES:
|
||||
//! * pageserver assumes it has exclusive write access to the relish storage. If supported, multiple pageservers can be separated in the same storage
|
||||
//! (i.e. using different directories in the local filesystem external storage), but this is totally up to the storage implementation and is not covered by the trait API.
|
||||
//!
|
||||
//! * the uploads do not happen right after pageserver startup, they are registered when
|
||||
//! 1. pageserver does the checkpoint, which happens further in the future after the server start
|
||||
//! 2. pageserver loads the timeline from disk for the first time
|
||||
//!
|
||||
//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with bigger priority could be waiting already
|
||||
//!
|
||||
//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happens on an image scale: a big set of relish files,
|
||||
//! enough to represent (and recover, if needed) a certain timeline state. On the contrary, all internal storage CRUD calls are made per relish file from those images.
|
||||
//! This way, the synchronization is able to download the image partially, if some state was synced before, but exposes correctly synced images only.
|
||||
|
||||
mod local_fs;
|
||||
mod rust_s3;
|
||||
/// A queue-based storage with the background machinery behind it to synchronize
|
||||
/// local page server layer files with external storage.
|
||||
mod synced_storage;
|
||||
mod storage_sync;
|
||||
|
||||
use std::{path::Path, thread};
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
pub use self::synced_storage::schedule_timeline_upload;
|
||||
use self::{local_fs::LocalFs, rust_s3::RustS3};
|
||||
use crate::{PageServerConf, RelishStorageKind};
|
||||
pub use self::storage_sync::schedule_timeline_upload;
|
||||
use self::{local_fs::LocalFs, rust_s3::S3};
|
||||
use crate::{
|
||||
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
|
||||
PageServerConf, RelishStorageKind,
|
||||
};
|
||||
|
||||
/// Based on the config, initiates the remote storage connection and starts a separate thread
|
||||
/// that ensures that pageserver and the remote storage are in sync with each other.
|
||||
/// If no external storage configuration is given, no thread or storage initialization is done.
|
||||
pub fn run_storage_sync_thread(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
match &config.relish_storage_config {
|
||||
Some(relish_storage_config) => {
|
||||
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
|
||||
match &relish_storage_config.storage {
|
||||
RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
|
||||
let handle = match &relish_storage_config.storage {
|
||||
RelishStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
LocalFs::new(root.clone())?,
|
||||
LocalFs::new(root.clone(), &config.workdir)?,
|
||||
max_concurrent_sync,
|
||||
),
|
||||
RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
|
||||
RelishStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
RustS3::new(s3_config)?,
|
||||
S3::new(s3_config, &config.workdir)?,
|
||||
max_concurrent_sync,
|
||||
),
|
||||
}
|
||||
};
|
||||
handle.map(Some)
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations with storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RelishStorage: Send + Sync {
|
||||
trait RelishStorage: Send + Sync {
|
||||
/// A way to uniquely reference relish in the remote storage.
|
||||
type RelishStoragePath;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath>;
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath>;
|
||||
|
||||
/// Gets the layered storage information about the given entry.
|
||||
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
@@ -65,6 +143,7 @@ pub trait RelishStorage: Send + Sync {
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the local file contents into the remote storage entry.
|
||||
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut tokio::io::BufReader<R>,
|
||||
@@ -72,16 +151,173 @@ pub trait RelishStorage: Send + Sync {
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
fn strip_workspace_prefix<'a>(
|
||||
page_server_workdir: &'a Path,
|
||||
relish_local_path: &'a Path,
|
||||
) -> anyhow::Result<&'a Path> {
|
||||
relish_local_path
|
||||
.strip_prefix(page_server_workdir)
|
||||
.with_context(|| {
|
||||
/// Information about a certain remote storage entry.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
struct RemoteRelishInfo {
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
/// Path in the pageserver workdir where the file should go to.
|
||||
download_destination: PathBuf,
|
||||
is_metadata: bool,
|
||||
}
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Unexpected: relish local path '{}' is not relevant to server workdir",
|
||||
relish_local_path.display(),
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_ids_from_path<'a, R: std::fmt::Display>(
|
||||
path_segments: impl Iterator<Item = &'a str>,
|
||||
path_log_representation: &R,
|
||||
) -> anyhow::Result<(ZTenantId, ZTimelineId)> {
|
||||
let mut segments = path_segments.skip_while(|&segment| segment != TENANTS_SEGMENT_NAME);
|
||||
let tenants_segment = segments.next().ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Found no '{}' segment in the storage path '{}'",
|
||||
TENANTS_SEGMENT_NAME,
|
||||
path_log_representation
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
tenants_segment == TENANTS_SEGMENT_NAME,
|
||||
"Failed to extract '{}' segment from storage path '{}'",
|
||||
TENANTS_SEGMENT_NAME,
|
||||
path_log_representation
|
||||
);
|
||||
let tenant_id = segments
|
||||
.next()
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Found no tenant id in the storage path '{}'",
|
||||
path_log_representation
|
||||
)
|
||||
})?
|
||||
.parse::<ZTenantId>()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to parse tenant id from storage path '{}'",
|
||||
path_log_representation
|
||||
)
|
||||
})?;
|
||||
|
||||
let timelines_segment = segments.next().ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Found no '{}' segment in the storage path '{}'",
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
path_log_representation
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
timelines_segment == TIMELINES_SEGMENT_NAME,
|
||||
"Failed to extract '{}' segment from storage path '{}'",
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
path_log_representation
|
||||
);
|
||||
let timeline_id = segments
|
||||
.next()
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Found no timeline id in the storage path '{}'",
|
||||
path_log_representation
|
||||
)
|
||||
})?
|
||||
.parse::<ZTimelineId>()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to parse timeline id from storage path '{}'",
|
||||
path_log_representation
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok((tenant_id, timeline_id))
|
||||
}
|
||||
|
||||
/// Helpers shared between the unit tests of the relish storage module tree.
#[cfg(test)]
mod test_utils {
    use std::path::{Path, PathBuf};

    use anyhow::ensure;

    use crate::{
        layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
        repository::repo_harness::{RepoHarness, TIMELINE_ID},
    };

    /// Returns the harness timeline path, relative to the pageserver workdir.
    pub fn relative_timeline_path(harness: &RepoHarness) -> anyhow::Result<PathBuf> {
        let absolute_path = harness.timeline_path(&TIMELINE_ID);
        let relative_path = absolute_path.strip_prefix(&harness.conf.workdir)?;
        Ok(relative_path.to_path_buf())
    }

    /// Builds a copy of the path with the segment after the tenants segment
    /// replaced by `new_tenant_id`.
    /// Handy for emulating paths with wrong ids; fails if nothing was replaced.
    pub fn custom_tenant_id_path(
        path_with_tenant_id: &Path,
        new_tenant_id: &str,
    ) -> anyhow::Result<PathBuf> {
        let (new_path, tenant_id_replaced) =
            replace_segment_after(path_with_tenant_id, TENANTS_SEGMENT_NAME, new_tenant_id);
        ensure!(tenant_id_replaced, "Found no tenant id segment to replace");
        Ok(new_path)
    }

    /// Builds a copy of the path with the segment after the timelines segment
    /// replaced by `new_timeline_id`.
    /// Handy for emulating paths with wrong ids; fails if nothing was replaced.
    pub fn custom_timeline_id_path(
        path_with_timeline_id: &Path,
        new_timeline_id: &str,
    ) -> anyhow::Result<PathBuf> {
        let (new_path, timeline_id_replaced) = replace_segment_after(
            path_with_timeline_id,
            TIMELINES_SEGMENT_NAME,
            new_timeline_id,
        );
        ensure!(
            timeline_id_replaced,
            "Found no timeline id segment to replace"
        );
        Ok(new_path)
    }

    /// Rebuilds `path` segment by segment, substituting `replacement` for every UTF-8 segment
    /// that follows a `marker` segment.
    /// Returns the rebuilt path and whether any substitution happened.
    fn replace_segment_after(path: &Path, marker: &str, replacement: &str) -> (PathBuf, bool) {
        let mut rebuilt = PathBuf::new();
        let mut expecting_id = false;
        let mut replaced = false;

        for segment in path {
            let segment_name = segment.to_str();
            if segment_name == Some(marker) {
                // The marker itself is kept, the segment after it is the one to replace.
                expecting_id = true;
                rebuilt.push(segment);
            } else if expecting_id && segment_name.is_some() {
                expecting_id = false;
                replaced = true;
                rebuilt.push(replacement);
            } else {
                rebuilt.push(segment);
            }
        }

        (rebuilt, replaced)
    }
}
|
||||
|
||||
82
pageserver/src/relish_storage/README.md
Normal file
82
pageserver/src/relish_storage/README.md
Normal file
@@ -0,0 +1,82 @@
|
||||
# Non-implementation details
|
||||
|
||||
This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are, and the future development plans.
|
||||
Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../relish_storage.rs) and its submodules.
|
||||
Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
|
||||
|
||||
## Approach
|
||||
|
||||
Backup functionality is a new component, appeared way after the core DB functionality was implemented.
|
||||
Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time.
|
||||
|
||||
To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
|
||||
This way, the backups are managed in background, not affecting directly other pageserver parts: this way the backup and restoration process may lag behind, but eventually keep up with the reality. To track that, a set of prometheus metrics is exposed from pageserver.
|
||||
|
||||
## What's done
|
||||
|
||||
Current implementation
|
||||
* provides remote storage wrappers for AWS S3 and local FS
|
||||
* uploads layers, frozen by pageserver checkpoint thread
|
||||
* downloads and registers layers, found on the remote storage, but missing locally
|
||||
|
||||
No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time.
|
||||
It's planned to deal with all questions that are currently on and prepare the feature to be enabled by default in cloud environments.
|
||||
|
||||
### Peculiarities
|
||||
|
||||
As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
|
||||
Here's the list of known compromises with comments:
|
||||
|
||||
* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
|
||||
This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
|
||||
AWS charges both per API call and for traffic, and layers are expected to be updated frequently, so this model is most probably ineffective.
|
||||
Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
|
||||
|
||||
The storage sync API operates on images when backing up or restoring a backup, so we're free to repack the layer contents the way we want to, which most probably will be done later.
|
||||
|
||||
* no proper file comparison
|
||||
|
||||
Currently, every layer contains `Lsn` in their name, to map the data it holds against a certain DB state.
|
||||
Then the images with same ids and different `Lsn`'s are compared, files are considered equal if their local file paths are equal (for remote files, "local file path" is their download destination).
|
||||
No file contents assertion is done currently, but should be.
|
||||
AWS S3 returns file checksums during the `list` operation, so that can be used to ensure the backup consistency, but that needs further research, since the current pageserver impl also needs to deal with layer file checksums.
|
||||
|
||||
For now, due to this, we consider local workdir files as source of truth, not removing them ever and adjusting remote files instead, if image files mismatch.
|
||||
|
||||
* no proper retry management
|
||||
|
||||
Now, the storage sync attempts to redo the upload/download operation for the image files that failed.
|
||||
No proper task eviction or backpressure is implemented currently: the tasks will stay in the queue forever, reattempting the downloads.
|
||||
|
||||
This will be fixed when more details on the file consistency model will be agreed on.
|
||||
|
||||
* sad rust-s3 api
|
||||
|
||||
rust-s3 is not very pleasant to use:
|
||||
1. it returns `anyhow::Result` and it's hard to distinguish "missing file" cases from "no connection" one, for instance
|
||||
2. at least one function in its API that we need (`get_object_stream`) has `async` keyword and blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
|
||||
3. it's a prerelease library with unclear maintenance status
|
||||
4. noisy on debug level
|
||||
|
||||
But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
|
||||
Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking.
|
||||
|
||||
|
||||
* gc and branches are ignored
|
||||
|
||||
So far, we don't consider non-main images and don't adjust the remote storage based on GC thread loop results.
|
||||
Only checkpointer loop affects the remote storage.
|
||||
|
||||
* more layers should be downloaded on demand
|
||||
|
||||
Since we download and load remote layers into pageserver, there's a possibility that a need for those layers' ancestors arises.
|
||||
Most probably, every downloaded image's ancestor is not present locally either, but currently there's no logic for downloading such ancestors and their metadata,
|
||||
so the pageserver is unable to respond properly to requests to such ancestors.
|
||||
|
||||
To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and handle the state changes.
|
||||
[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about initial state management updates needed.
|
||||
|
||||
* no IT tests
|
||||
|
||||
Automated S3 testing is lacking currently, due to no convenient way to enable backups during the tests.
|
||||
After it's fixed, benchmark runs should also be carried out to find bottlenecks.
|
||||
@@ -1,13 +1,11 @@
|
||||
//! Local filesystem relish storage.
|
||||
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
|
||||
//!
|
||||
//! Page server already stores layer data on the server, when freezing it.
|
||||
//! This storage serves a way to
|
||||
//!
|
||||
//! * test things locally simply
|
||||
//! * allow to compabre both binary sets
|
||||
//! * help validating the relish storage API
|
||||
//! This storage is used in pageserver tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
ffi::OsStr,
|
||||
future::Future,
|
||||
io::Write,
|
||||
path::{Path, PathBuf},
|
||||
@@ -16,25 +14,31 @@ use std::{
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::{fs, io};
|
||||
use tracing::*;
|
||||
|
||||
use super::{strip_workspace_prefix, RelishStorage};
|
||||
use super::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo};
|
||||
use crate::layered_repository::METADATA_FILE_NAME;
|
||||
|
||||
/// Relish storage backed by a directory on the local filesystem.
pub struct LocalFs {
|
||||
/// Pageserver working directory: stripped from local paths in `storage_path`
/// and used as the base of the download destination in `info`.
pageserver_workdir: &'static Path,
|
||||
/// Storage root directory that storage paths are joined onto.
root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Atetmpts to create local FS relish storage, also creates the directory provided, if not exists.
|
||||
pub fn new(root: PathBuf) -> anyhow::Result<Self> {
|
||||
/// Attempts to create local FS relish storage, along with the storage root directory.
|
||||
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
"Failed to create all directories in the given root path {}",
|
||||
"Failed to create all directories in the given root path '{}'",
|
||||
root.display(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(Self { root })
|
||||
Ok(Self {
|
||||
pageserver_workdir,
|
||||
root,
|
||||
})
|
||||
}
|
||||
|
||||
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
|
||||
@@ -55,11 +59,29 @@ impl LocalFs {
|
||||
impl RelishStorage for LocalFs {
|
||||
type RelishStoragePath = PathBuf;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
Ok(strip_workspace_prefix(page_server_workdir, relish_local_path)?.to_path_buf())
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
Ok(self.root.join(
|
||||
strip_path_prefix(self.pageserver_workdir, local_path)
|
||||
.context("local path does not belong to this storage")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
|
||||
let is_metadata =
|
||||
storage_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME);
|
||||
let relative_path = strip_path_prefix(&self.root, storage_path)
|
||||
.context("local path does not belong to this storage")?;
|
||||
let download_destination = self.pageserver_workdir.join(relative_path);
|
||||
let (tenant_id, timeline_id) = parse_ids_from_path(
|
||||
relative_path.iter().filter_map(|segment| segment.to_str()),
|
||||
&relative_path.display(),
|
||||
)?;
|
||||
Ok(RemoteRelishInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
download_destination,
|
||||
is_metadata,
|
||||
})
|
||||
}
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
@@ -72,6 +94,7 @@ impl RelishStorage for LocalFs {
|
||||
mut to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let updated_buffer = tokio::task::spawn_blocking(move || {
|
||||
let mut source = std::io::BufReader::new(
|
||||
@@ -104,7 +127,7 @@ impl RelishStorage for LocalFs {
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(tokio::fs::remove_file(file_path).await?)
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
@@ -152,12 +175,12 @@ where
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
log::debug!("{:?} us a symlink, skipping", entry_path)
|
||||
debug!("{:?} us a symlink, skipping", entry_path)
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(entry_path).await?.into_iter())
|
||||
} else {
|
||||
@@ -183,7 +206,369 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
|
||||
),
|
||||
};
|
||||
if !target_dir.exists() {
|
||||
tokio::fs::create_dir_all(target_dir).await?;
|
||||
fs::create_dir_all(target_dir).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use crate::{
|
||||
relish_storage::test_utils::{
|
||||
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
/// A path inside the pageserver workdir maps to the same relative path under the storage root.
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
    let harness = RepoHarness::create("storage_path_positive")?;
    let root = Path::new("somewhere").join("else");
    let local_fs = LocalFs {
        pageserver_workdir: &harness.conf.workdir,
        root: root.clone(),
    };

    let relish_path = harness.timeline_path(&TIMELINE_ID).join("relish_name");
    let path_in_workdir = relish_path.strip_prefix(&harness.conf.workdir)?;
    let expected_path = root.join(path_in_workdir);
    let actual_path = local_fs
        .storage_path(&relish_path)
        .expect("Matching path should map to storage path normally");

    assert_eq!(
        expected_path,
        actual_path,
        "Relish paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
    );

    Ok(())
}
|
||||
|
||||
/// Paths that are not strictly inside the pageserver workdir must be rejected by `storage_path`.
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
    /// Returns the debug-formatted error for a path that must not resolve;
    /// panics if the path unexpectedly maps to a storage path.
    #[track_caller]
    fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
        match storage.storage_path(mismatching_path) {
            Err(e) => format!("{:?}", e),
            Ok(wrong_path) => panic!(
                "Expected path '{}' to error, but got storage path: {:?}",
                mismatching_path.display(),
                wrong_path,
            ),
        }
    }

    let harness = RepoHarness::create("storage_path_negatives")?;
    let local_fs = LocalFs {
        pageserver_workdir: &harness.conf.workdir,
        root: Path::new("somewhere").join("else"),
    };

    // The workdir itself is not a relish path.
    let workdir_error = storage_path_error(&local_fs, &harness.conf.workdir);
    assert!(workdir_error.contains("does not belong to this storage"));
    assert!(workdir_error.contains(harness.conf.workdir.to_str().unwrap()));

    // A path outside of the workdir.
    let foreign_path = "/something/else";
    let foreign_error = storage_path_error(&local_fs, Path::new(foreign_path));
    assert!(
        foreign_error.contains(foreign_path),
        "Error should mention wrong path"
    );
    assert!(
        foreign_error.contains(harness.conf.workdir.to_str().unwrap()),
        "Error should mention server workdir"
    );
    assert!(foreign_error.contains("does not belong to this storage"));

    Ok(())
}
|
||||
|
||||
/// `info` parses ids, download destination and the metadata flag for valid storage paths.
#[test]
fn info_positive() -> anyhow::Result<()> {
    let harness = RepoHarness::create("info_positive")?;
    let root = Path::new("somewhere").join("else");
    let local_fs = LocalFs {
        pageserver_workdir: &harness.conf.workdir,
        root: root.clone(),
    };

    // A regular (non-metadata) relish.
    let relish_local_path = harness.timeline_path(&TIMELINE_ID).join("not a metadata");
    let expected_relish_info = RemoteRelishInfo {
        tenant_id: harness.tenant_id,
        timeline_id: TIMELINE_ID,
        download_destination: relish_local_path.clone(),
        is_metadata: false,
    };
    let relish_storage_path = root.join(relish_local_path.strip_prefix(&harness.conf.workdir)?);
    assert_eq!(
        expected_relish_info,
        local_fs
            .info(&relish_storage_path)
            .expect("For a valid input, valid S3 info should be parsed"),
        "Should be able to parse metadata out of the correctly named remote delta relish"
    );

    // The timeline metadata file.
    let metadata_local_path = harness
        .timeline_path(&TIMELINE_ID)
        .join(METADATA_FILE_NAME);
    let metadata_storage_path = local_fs.storage_path(&metadata_local_path)?;
    let expected_metadata_info = RemoteRelishInfo {
        tenant_id: harness.tenant_id,
        timeline_id: TIMELINE_ID,
        download_destination: metadata_local_path,
        is_metadata: true,
    };
    assert_eq!(
        expected_metadata_info,
        local_fs
            .info(&metadata_storage_path)
            .expect("For a valid input, valid S3 info should be parsed"),
        "Should be able to parse metadata out of the correctly named remote metadata file"
    );

    Ok(())
}
|
||||
|
||||
/// `info` must fail, mentioning the offending path, for storage paths it cannot parse.
#[test]
fn info_negatives() -> anyhow::Result<()> {
    /// Returns the debug-formatted error of `storage.info`; panics when info is parsed successfully.
    #[track_caller]
    #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.info` parameter requirements
    fn storage_info_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
        match storage.info(storage_path) {
            Err(e) => format!("{:?}", e),
            Ok(wrong_info) => panic!(
                "Expected storage path input {:?} to cause an error, but got relish info: {:?}",
                storage_path, wrong_info,
            ),
        }
    }

    let harness = RepoHarness::create("info_negatives")?;
    let root = Path::new("somewhere").join("else");
    let local_fs = LocalFs {
        pageserver_workdir: &harness.conf.workdir,
        root: root.clone(),
    };

    // A path that is not under the storage root at all.
    let unrelated_path = "wrong_wrong_wrong";
    let unrelated_error = storage_info_error(&local_fs, &PathBuf::from(unrelated_path));
    assert!(unrelated_error.contains(unrelated_path));

    let timeline_subpath = relative_timeline_path(&harness)?;

    // Storage path with a malformed tenant id.
    let bad_tenant_subpath =
        custom_tenant_id_path(&timeline_subpath, "wrong_tenant_id")?.join("wrong_tenant_id_name");
    let bad_tenant_error = storage_info_error(&local_fs, &root.join(&bad_tenant_subpath));
    assert!(
        bad_tenant_error.contains(bad_tenant_subpath.to_str().unwrap()),
        "Error message '{}' does not contain the expected substring",
        bad_tenant_error
    );

    // Storage path with a malformed timeline id.
    let bad_timeline_subpath = custom_timeline_id_path(&timeline_subpath, "wrong_timeline_id")?
        .join("wrong_timeline_id_name");
    let bad_timeline_error = storage_info_error(&local_fs, &root.join(&bad_timeline_subpath));
    assert!(
        bad_timeline_error.contains(bad_timeline_subpath.to_str().unwrap()),
        "Error message '{}' does not contain the expected substring",
        bad_timeline_error
    );

    Ok(())
}
|
||||
|
||||
/// Round trip: local path -> storage path -> download destination yields the initial local path.
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
    let harness = RepoHarness::create("download_destination_matches_original_path")?;
    let original_path = harness.timeline_path(&TIMELINE_ID).join("some name");

    let local_fs = LocalFs {
        pageserver_workdir: &harness.conf.workdir,
        root: Path::new("somewhere").join("else"),
    };

    let path_in_storage = local_fs.storage_path(&original_path)?;
    let relish_info = local_fs.info(&path_in_storage)?;

    assert_eq!(
        original_path, relish_info.download_destination,
        "'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
    );

    Ok(())
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod fs_tests {
|
||||
use crate::{
|
||||
relish_storage::test_utils::relative_timeline_path, repository::repo_harness::RepoHarness,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_relish() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("upload_relish")?;
|
||||
let storage = create_storage()?;
|
||||
|
||||
let mut source = create_file_for_upload(
|
||||
&storage.pageserver_workdir.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
let target_path = PathBuf::from("/").join("somewhere").join("else");
|
||||
match storage.upload_relish(&mut source, &target_path).await {
|
||||
Ok(()) => panic!("Should not allow storing files with wrong target path"),
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
assert!(message.contains(&target_path.display().to_string()));
|
||||
assert!(message.contains("does not belong to the current storage"));
|
||||
}
|
||||
}
|
||||
assert!(storage.list_relishes().await?.is_empty());
|
||||
|
||||
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
|
||||
assert_eq!(
|
||||
storage.list_relishes().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
|
||||
assert_eq!(
|
||||
list_relishes_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
"Should list a two different files after second upload"
|
||||
);
|
||||
|
||||
// match storage.upload_relish(&mut source, &target_path_1).await {
|
||||
// Ok(()) => panic!("Should not allow reuploading storage files"),
|
||||
// Err(e) => {
|
||||
// let message = format!("{:?}", e);
|
||||
// assert!(message.contains(&target_path_1.display().to_string()));
|
||||
// assert!(message.contains("File exists"));
|
||||
// }
|
||||
// }
|
||||
assert_eq!(
|
||||
list_relishes_sorted(&storage).await?,
|
||||
vec![target_path_1, target_path_2],
|
||||
"Should list a two different files after all upload attempts"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
|
||||
Ok(storage)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_relish() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_relish")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let contents_bytes = storage
|
||||
.download_relish(&upload_target, std::io::BufWriter::new(Vec::new()))
|
||||
.await?
|
||||
.into_inner()?;
|
||||
let contents = String::from_utf8(contents_bytes)?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
contents,
|
||||
"We should upload and download the same contents"
|
||||
);
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage
|
||||
.download_relish(&non_existing_path, std::io::BufWriter::new(Vec::new()))
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&non_existing_path.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_relish() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("delete_relish")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
storage.delete_relish(&upload_target).await?;
|
||||
assert!(storage.list_relishes().await?.is_empty());
|
||||
|
||||
match storage.delete_relish(&upload_target).await {
|
||||
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&upload_target.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
harness: &RepoHarness,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let storage_path = storage
|
||||
.root
|
||||
.join(relative_timeline_path(harness)?)
|
||||
.join(name);
|
||||
storage
|
||||
.upload_relish(
|
||||
&mut create_file_for_upload(
|
||||
&storage.pageserver_workdir.join(name),
|
||||
&dummy_contents(name),
|
||||
)
|
||||
.await?,
|
||||
&storage_path,
|
||||
)
|
||||
.await?;
|
||||
Ok(storage_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
path: &Path,
|
||||
contents: &str,
|
||||
) -> anyhow::Result<io::BufReader<fs::File>> {
|
||||
std::fs::create_dir_all(path.parent().unwrap())?;
|
||||
let mut file_for_writing = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(path)?;
|
||||
write!(file_for_writing, "{}", contents)?;
|
||||
drop(file_for_writing);
|
||||
Ok(io::BufReader::new(
|
||||
fs::OpenOptions::new().read(true).open(&path).await?,
|
||||
))
|
||||
}
|
||||
|
||||
/// Produces the deterministic file contents used for the test upload with the given name.
fn dummy_contents(name: &str) -> String {
    let mut contents = String::from("contents for ");
    contents.push_str(name);
    contents
}
|
||||
|
||||
/// Lists all relishes in the storage, sorted by path so that the result can be compared deterministically.
async fn list_relishes_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
    storage.list_relishes().await.map(|mut listed| {
        listed.sort();
        listed
    })
}
|
||||
}
|
||||
|
||||
@@ -1,35 +1,45 @@
|
||||
//! A wrapper around AWS S3 client library `rust_s3` to be used a relish storage.
|
||||
//! AWS S3 relish storage wrapper around `rust_s3` library.
|
||||
//! Currently does not allow multiple pageservers to use the same bucket concurrently: relishes are
|
||||
//! placed in the root of the bucket.
|
||||
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::{
|
||||
io::Write,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
|
||||
use crate::{
|
||||
relish_storage::{strip_workspace_prefix, RelishStorage},
|
||||
layered_repository::METADATA_FILE_NAME,
|
||||
relish_storage::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo},
|
||||
S3Config,
|
||||
};
|
||||
|
||||
const S3_FILE_SEPARATOR: char = '/';
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
fn key(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
|
||||
fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
|
||||
pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 relish storage.
|
||||
pub struct RustS3 {
|
||||
pub struct S3 {
|
||||
pageserver_workdir: &'static Path,
|
||||
bucket: Bucket,
|
||||
}
|
||||
|
||||
impl RustS3 {
|
||||
impl S3 {
|
||||
/// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
let region = aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
@@ -49,19 +59,17 @@ impl RustS3 {
|
||||
credentials,
|
||||
)
|
||||
.context("Failed to create the s3 bucket")?,
|
||||
pageserver_workdir,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RelishStorage for RustS3 {
|
||||
impl RelishStorage for S3 {
|
||||
type RelishStoragePath = S3ObjectKey;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
|
||||
let mut key = String::new();
|
||||
for segment in relative_path {
|
||||
key.push(S3_FILE_SEPARATOR);
|
||||
@@ -70,6 +78,21 @@ impl RelishStorage for RustS3 {
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
|
||||
let storage_path_key = &storage_path.0;
|
||||
let is_metadata =
|
||||
storage_path_key.ends_with(&format!("{}{}", S3_FILE_SEPARATOR, METADATA_FILE_NAME));
|
||||
let download_destination = storage_path.download_destination(self.pageserver_workdir);
|
||||
let (tenant_id, timeline_id) =
|
||||
parse_ids_from_path(storage_path_key.split(S3_FILE_SEPARATOR), storage_path_key)?;
|
||||
Ok(RemoteRelishInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
download_destination,
|
||||
is_metadata,
|
||||
})
|
||||
}
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
let list_response = self
|
||||
.bucket
|
||||
@@ -101,11 +124,11 @@ impl RelishStorage for RustS3 {
|
||||
))
|
||||
} else {
|
||||
tokio::task::spawn_blocking(move || {
|
||||
to.flush().context("Failed to fluch the downoad buffer")?;
|
||||
to.flush().context("Failed to flush the download buffer")?;
|
||||
Ok::<_, anyhow::Error>(to)
|
||||
})
|
||||
.await
|
||||
.context("Failed to joim the download buffer flush task")?
|
||||
.context("Failed to join the download buffer flush task")?
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,3 +170,226 @@ impl RelishStorage for RustS3 {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
relish_storage::test_utils::{
|
||||
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
/// The download destination of a key is its relative path joined onto the pageserver workdir.
#[test]
fn download_destination() -> anyhow::Result<()> {
    let harness = RepoHarness::create("download_destination")?;

    let local_path = harness.timeline_path(&TIMELINE_ID).join("test_name");
    let path_in_workdir = local_path.strip_prefix(&harness.conf.workdir)?;

    // Build the key by hand: a leading separator followed by the separator-joined segments.
    let separator = S3_FILE_SEPARATOR.to_string();
    let segments: Vec<&str> = path_in_workdir
        .iter()
        .map(|segment| segment.to_str().unwrap())
        .collect();
    let key = S3ObjectKey(format!(
        "{}{}",
        S3_FILE_SEPARATOR,
        segments.join(&separator),
    ));

    assert_eq!(
        local_path,
        key.download_destination(&harness.conf.workdir),
        "Download destination should consist of s3 path joined with the pageserver workdir prefix"
    );

    Ok(())
}
|
||||
|
||||
/// A path inside the pageserver workdir maps to a key made of its separator-prefixed segments.
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
    let harness = RepoHarness::create("storage_path_positive")?;

    let first_segment = "matching";
    let second_segment = "relish";
    let local_path = harness
        .conf
        .workdir
        .join(first_segment)
        .join(second_segment);
    let expected_key = S3ObjectKey(format!(
        "{sep}{}{sep}{}",
        first_segment,
        second_segment,
        sep = S3_FILE_SEPARATOR,
    ));

    let storage = dummy_storage(&harness.conf.workdir);
    let actual_key = storage
        .storage_path(&local_path)
        .expect("Matching path should map to S3 path normally");
    assert_eq!(
        expected_key,
        actual_key,
        "S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
    );

    Ok(())
}
|
||||
|
||||
/// Paths that are not strictly inside the pageserver workdir must not map to S3 keys.
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
    /// Returns the error text for a path that must not resolve; panics if a key is produced.
    #[track_caller]
    fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
        match storage.storage_path(mismatching_path) {
            Err(e) => e.to_string(),
            Ok(wrong_key) => panic!(
                "Expected path '{}' to error, but got S3 key: {:?}",
                mismatching_path.display(),
                wrong_key,
            ),
        }
    }

    let harness = RepoHarness::create("storage_path_negatives")?;
    let s3 = dummy_storage(&harness.conf.workdir);

    // The workdir itself is not a relish path.
    let workdir_error = storage_path_error(&s3, &harness.conf.workdir);
    assert!(
        workdir_error.contains("Prefix and the path are equal"),
        "Message '{}' does not contain the required string",
        workdir_error
    );

    // A path outside of the workdir.
    let foreign_path = Path::new("somewhere").join("else");
    let foreign_error = storage_path_error(&s3, &foreign_path);
    assert!(
        foreign_error.contains(foreign_path.to_str().unwrap()),
        "Error should mention wrong path"
    );
    assert!(
        foreign_error.contains(harness.conf.workdir.to_str().unwrap()),
        "Error should mention server workdir"
    );
    assert!(
        foreign_error.contains("is not prefixed with"),
        "Message '{}' does not contain a required string",
        foreign_error
    );

    Ok(())
}
|
||||
|
||||
/// `info` parses ids, download destination and the metadata flag for valid S3 keys.
#[test]
fn info_positive() -> anyhow::Result<()> {
    let harness = RepoHarness::create("info_positive")?;
    let s3 = dummy_storage(&harness.conf.workdir);
    let timeline_subpath = relative_timeline_path(&harness)?;

    // A regular (non-metadata) relish key.
    let relish_key = create_s3_key(&timeline_subpath.join("not a metadata"));
    let expected_relish_info = RemoteRelishInfo {
        tenant_id: harness.tenant_id,
        timeline_id: TIMELINE_ID,
        download_destination: relish_key.download_destination(&harness.conf.workdir),
        is_metadata: false,
    };
    assert_eq!(
        expected_relish_info,
        s3.info(&relish_key)
            .expect("For a valid input, valid S3 info should be parsed"),
        "Should be able to parse metadata out of the correctly named remote delta relish"
    );

    // The timeline metadata key.
    let metadata_key = create_s3_key(&timeline_subpath.join(METADATA_FILE_NAME));
    let expected_metadata_info = RemoteRelishInfo {
        tenant_id: harness.tenant_id,
        timeline_id: TIMELINE_ID,
        download_destination: metadata_key.download_destination(&harness.conf.workdir),
        is_metadata: true,
    };
    assert_eq!(
        expected_metadata_info,
        s3.info(&metadata_key)
            .expect("For a valid input, valid S3 info should be parsed"),
        "Should be able to parse metadata out of the correctly named remote metadata file"
    );

    Ok(())
}
|
||||
|
||||
/// `info` must fail, mentioning the offending key, for keys it cannot parse.
#[test]
fn info_negatives() -> anyhow::Result<()> {
    /// Returns the error text of `storage.info`; panics when info is parsed successfully.
    #[track_caller]
    fn storage_info_error(storage: &S3, s3_key: &S3ObjectKey) -> String {
        match storage.info(s3_key) {
            Err(e) => e.to_string(),
            Ok(wrong_info) => panic!(
                "Expected key {:?} to error, but got relish info: {:?}",
                s3_key, wrong_info,
            ),
        }
    }

    let harness = RepoHarness::create("info_negatives")?;
    let s3 = dummy_storage(&harness.conf.workdir);
    let timeline_subpath = relative_timeline_path(&harness)?;

    // A key that does not resemble a timeline path at all.
    let unrelated_key = "wrong_wrong_wrong";
    let unrelated_error = storage_info_error(&s3, &S3ObjectKey(unrelated_key.to_string()));
    assert!(unrelated_error.contains(unrelated_key));

    // Key with a malformed tenant id.
    let bad_tenant_path = custom_tenant_id_path(&timeline_subpath, "wrong_tenant_id")?.join("name");
    let bad_tenant_key = create_s3_key(&bad_tenant_path);
    let bad_tenant_error = storage_info_error(&s3, &bad_tenant_key);
    assert!(bad_tenant_error.contains(&bad_tenant_key.0));

    // Key with a malformed timeline id.
    let bad_timeline_path =
        custom_timeline_id_path(&timeline_subpath, "wrong_timeline_id")?.join("name");
    let bad_timeline_key = create_s3_key(&bad_timeline_path);
    let bad_timeline_error = storage_info_error(&s3, &bad_timeline_key);
    assert!(bad_timeline_error.contains(&bad_timeline_key.0));

    Ok(())
}
|
||||
|
||||
/// Round trip: local path -> S3 key -> download destination yields the initial local path.
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
    let harness = RepoHarness::create("download_destination_matches_original_path")?;
    let original_path = harness.timeline_path(&TIMELINE_ID).join("some name");

    let s3 = dummy_storage(&harness.conf.workdir);

    let object_key = s3.storage_path(&original_path)?;
    let relish_info = s3.info(&object_key)?;

    assert_eq!(
        original_path, relish_info.download_destination,
        "'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
    );

    Ok(())
}
|
||||
|
||||
fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
|
||||
S3 {
|
||||
pageserver_workdir,
|
||||
bucket: Bucket::new(
|
||||
"dummy-bucket",
|
||||
"us-east-1".parse().unwrap(),
|
||||
Credentials::anonymous().unwrap(),
|
||||
)
|
||||
.unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_key(relative_relish_path: &Path) -> S3ObjectKey {
|
||||
S3ObjectKey(
|
||||
relative_relish_path
|
||||
.iter()
|
||||
.fold(String::new(), |mut path_string, segment| {
|
||||
path_string.push(S3_FILE_SEPARATOR);
|
||||
path_string.push_str(segment.to_str().unwrap());
|
||||
path_string
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
1559
pageserver/src/relish_storage/storage_sync.rs
Normal file
1559
pageserver/src/relish_storage/storage_sync.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,57 +0,0 @@
|
||||
use std::time::Duration;
|
||||
use std::{collections::BinaryHeap, sync::Mutex, thread};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::{relish_storage::RelishStorage, PageServerConf};
|
||||
|
||||
// Process-wide queue of pending sync tasks: a mutex-guarded binary heap of `SyncTask`s,
// popped by the thread spawned in `run_storage_sync_thread`.
lazy_static::lazy_static! {
|
||||
static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
|
||||
}
|
||||
|
||||
/// Placeholder for scheduling a timeline upload.
///
/// Currently a no-op: the argument is ignored and the push into `UPLOAD_QUEUE`
/// is commented out.
pub fn schedule_timeline_upload(_local_timeline: ()) {
|
||||
// UPLOAD_QUEUE
|
||||
// .lock()
|
||||
// .unwrap()
|
||||
// .push(SyncTask::Upload(local_timeline))
|
||||
}
|
||||
|
||||
/// Task type stored in `UPLOAD_QUEUE`. It has no variants yet, so no value of it can be constructed;
/// the ordering derives let it be used as the element type of the queue's `BinaryHeap`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum SyncTask {}
|
||||
|
||||
pub fn run_storage_sync_thread<
|
||||
P: std::fmt::Debug,
|
||||
S: 'static + RelishStorage<RelishStoragePath = P>,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
relish_storage: S,
|
||||
max_concurrent_sync: usize,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name("Queue based relish storage sync".to_string())
|
||||
.spawn(move || {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
|
||||
log::debug!("Upload queue length: {}", queue_accessor.len());
|
||||
let next_task = queue_accessor.pop();
|
||||
drop(queue_accessor);
|
||||
match next_task {
|
||||
Some(task) => runtime.block_on(async {
|
||||
// suppress warnings
|
||||
let _ = (config, task, &relish_storage, max_concurrent_sync);
|
||||
todo!("omitted for brevity")
|
||||
}),
|
||||
None => {
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
log::debug!("Queue based relish storage sync thread shut down");
|
||||
Ok(())
|
||||
})?;
|
||||
Ok(Some(handle))
|
||||
}
|
||||
@@ -214,27 +214,114 @@ impl WALRecord {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod repo_harness {
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use crate::{
|
||||
layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
|
||||
walredo::{WalRedoError, WalRedoManager},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use hex_literal::hex;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
// Fixed timeline id, built from a hard-coded 16-byte hex literal, exported by this test-only module.
pub const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
// A second fixed timeline id; it differs from `TIMELINE_ID` only in its first byte.
pub const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
pub fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
pub struct RepoHarness {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_id: ZTenantId,
|
||||
}
|
||||
|
||||
impl RepoHarness {
|
||||
pub fn create(test_name: &'static str) -> Result<Self> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
|
||||
Ok(Self { conf, tenant_id })
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Box<dyn Repository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
Box::new(LayeredRepository::new(
|
||||
self.conf,
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
false,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager;
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} blk {} to get to {}, with {} and {} records",
|
||||
rel,
|
||||
blknum,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Tests that should work the same with any Repository/Timeline implementation.
|
||||
///
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::repo_harness::*;
|
||||
use super::*;
|
||||
use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME};
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use hex_literal::hex;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
use crate::layered_repository::METADATA_FILE_NAME;
|
||||
use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
@@ -250,16 +337,6 @@ mod tests {
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
|
||||
let incremental = timeline.get_current_logical_size();
|
||||
let non_incremental = timeline
|
||||
@@ -271,45 +348,6 @@ mod tests {
|
||||
// An 8192-byte buffer filled with zeroes.
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
// A zero-filled buffer of `SIZEOF_CHECKPOINT` bytes.
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
|
||||
|
||||
struct RepoHarness {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
}
|
||||
|
||||
impl RepoHarness {
|
||||
fn create(test_name: &'static str) -> Result<Self> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
|
||||
Ok(Self { conf, tenant_id })
|
||||
}
|
||||
|
||||
fn load(&self) -> Box<dyn Repository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
Box::new(LayeredRepository::new(
|
||||
self.conf,
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
false,
|
||||
))
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_relsize")?.load();
|
||||
@@ -821,33 +859,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager;
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} blk {} to get to {}, with {} and {} records",
|
||||
rel,
|
||||
blknum,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,8 +123,6 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
|
||||
tenant.state = TenantState::Active;
|
||||
}
|
||||
|
||||
// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
|
||||
// Relevant PR: https://github.com/zenithdb/zenith/pull/686
|
||||
pub fn register_relish_download(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
@@ -138,14 +136,16 @@ pub fn register_relish_download(
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
let mut tenant = m.get_mut(&tenant_id).unwrap();
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
|
||||
state: TenantState::Downloading,
|
||||
repo: None,
|
||||
});
|
||||
tenant.state = TenantState::Downloading;
|
||||
match &tenant.repo {
|
||||
Some(repo) => init_timeline(repo.as_ref(), timeline_id),
|
||||
None => {
|
||||
log::info!("Initialize new repo");
|
||||
}
|
||||
None => log::warn!("Initialize new repo"),
|
||||
}
|
||||
tenant.state = TenantState::Active;
|
||||
}
|
||||
|
||||
// init repo updates Tenant state
|
||||
|
||||
Reference in New Issue
Block a user