mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-17 10:22:56 +00:00
Compare commits
84 Commits
mx_offset_
...
remove-lay
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
db1ea8b430 | ||
|
|
2dad3a6445 | ||
|
|
d36fd4f141 | ||
|
|
a8a6603e66 | ||
|
|
87c82f5151 | ||
|
|
c9a2b1fd10 | ||
|
|
fdfa86b5b0 | ||
|
|
3eb85957df | ||
|
|
6b4a28bf7f | ||
|
|
22d6c1dda6 | ||
|
|
abfac6ef2a | ||
|
|
e51f2be3d0 | ||
|
|
4e4bc8fbde | ||
|
|
dd2a77c2ef | ||
|
|
ef95637c65 | ||
|
|
206b5d2ada | ||
|
|
77fea61fcc | ||
|
|
e3f4c0e4ac | ||
|
|
2302ecda04 | ||
|
|
5257bbe2b9 | ||
|
|
6b61ed5fab | ||
|
|
f28bf70596 | ||
|
|
dc9c33139b | ||
|
|
0260ee23b9 | ||
|
|
c2f5d011c7 | ||
|
|
1022b4b98f | ||
|
|
791eebefe2 | ||
|
|
50b686c3e4 | ||
|
|
39b10696e9 | ||
|
|
264b0ada9f | ||
|
|
78338f7b94 | ||
|
|
0d533ce840 | ||
|
|
978f1879b9 | ||
|
|
c863d679f8 | ||
|
|
28e9eb6539 | ||
|
|
d07a4d02bb | ||
|
|
c61731a31f | ||
|
|
5a55dce282 | ||
|
|
e6de1b0e8c | ||
|
|
58be279be1 | ||
|
|
d1b92e976a | ||
|
|
7d46c7c118 | ||
|
|
d6c1a9aa18 | ||
|
|
49f2eac934 | ||
|
|
5c8387aff1 | ||
|
|
3c4680b718 | ||
|
|
6b7cbec9b3 | ||
|
|
6de9a33c05 | ||
|
|
77f883c95c | ||
|
|
e6557f4f91 | ||
|
|
e8db20eb26 | ||
|
|
7552e2d25f | ||
|
|
dfb4160403 | ||
|
|
f9bdc030e8 | ||
|
|
e4c9b83a39 | ||
|
|
a78c16328e | ||
|
|
eed99b7251 | ||
|
|
a2fbd93e91 | ||
|
|
66f8f686a0 | ||
|
|
919f2b261a | ||
|
|
9d273c840a | ||
|
|
6600e1f896 | ||
|
|
348369414b | ||
|
|
3890acaf7f | ||
|
|
f537a7a873 | ||
|
|
71bc45a21b | ||
|
|
decef74503 | ||
|
|
8aed805933 | ||
|
|
b8488e70a9 | ||
|
|
16fdd104ac | ||
|
|
bb6dbd2f43 | ||
|
|
c4c4558736 | ||
|
|
bfdc09cf4a | ||
|
|
1839ce0545 | ||
|
|
8e04f0455e | ||
|
|
6839773538 | ||
|
|
2a96c4cfcd | ||
|
|
027cf22663 | ||
|
|
f4daa877b5 | ||
|
|
ed28ced3bc | ||
|
|
d7c120574b | ||
|
|
c9188ffa67 | ||
|
|
c631fa1f50 | ||
|
|
795c3ca131 |
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2156,6 +2156,7 @@ dependencies = [
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pin-project-lite",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
|
||||
@@ -15,8 +15,12 @@ use bytes::{BufMut, Bytes, BytesMut};
|
||||
/// A state of a tenant in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum TenantState {
|
||||
/// Tenant is fully operational, its background jobs might be running or not.
|
||||
Active { background_jobs_running: bool },
|
||||
// This tenant is being loaded from local disk
|
||||
Loading,
|
||||
// This tenant is being downloaded from cloud storage.
|
||||
Attaching,
|
||||
/// Tenant is fully operational
|
||||
Active,
|
||||
/// A tenant is recognized by pageserver, but it is being detached or the
|
||||
/// system is being shut down.
|
||||
Paused,
|
||||
@@ -25,10 +29,23 @@ pub enum TenantState {
|
||||
Broken,
|
||||
}
|
||||
|
||||
impl TenantState {
|
||||
pub fn has_in_progress_downloads(&self) -> bool {
|
||||
match self {
|
||||
Self::Loading => true,
|
||||
Self::Attaching => true,
|
||||
Self::Active => false,
|
||||
Self::Paused => false,
|
||||
Self::Broken => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A state of a timeline in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum TimelineState {
|
||||
/// Timeline is fully operational, its background jobs are running.
|
||||
/// Timeline is fully operational. If the containing Tenant is Active, the timeline's
|
||||
/// background jobs are running otherwise they will be launched when the tenant is activated.
|
||||
Active,
|
||||
/// A timeline is recognized by pageserver, but not yet ready to operate.
|
||||
/// The status indicates, that the timeline could eventually go back to Active automatically:
|
||||
@@ -170,6 +187,8 @@ pub struct TimelineInfo {
|
||||
pub latest_gc_cutoff_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_logical_size_non_incremental: Option<u64>,
|
||||
@@ -182,8 +201,6 @@ pub struct TimelineInfo {
|
||||
pub last_received_msg_ts: Option<u128>,
|
||||
pub pg_version: u32,
|
||||
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
pub awaits_download: bool,
|
||||
|
||||
pub state: TimelineState,
|
||||
|
||||
45
libs/utils/src/fs_ext.rs
Normal file
45
libs/utils/src/fs_ext.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
/// Extensions to `std::fs` types.
|
||||
use std::{fs, io, path::Path};
|
||||
|
||||
pub trait PathExt {
|
||||
/// Returns an error if `self` is not a directory.
|
||||
fn is_empty_dir(&self) -> io::Result<bool>;
|
||||
}
|
||||
|
||||
impl<P> PathExt for P
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
fn is_empty_dir(&self) -> io::Result<bool> {
|
||||
Ok(fs::read_dir(self)?.into_iter().next().is_none())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[test]
|
||||
fn is_empty_dir() {
|
||||
use super::PathExt;
|
||||
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let dir_path = dir.path();
|
||||
|
||||
// test positive case
|
||||
assert!(
|
||||
dir_path.is_empty_dir().expect("test failure"),
|
||||
"new tempdir should be empty"
|
||||
);
|
||||
|
||||
// invoke on a file to ensure it returns an error
|
||||
let file_path: PathBuf = dir_path.join("testfile");
|
||||
let f = std::fs::File::create(&file_path).unwrap();
|
||||
drop(f);
|
||||
assert!(file_path.is_empty_dir().is_err());
|
||||
|
||||
// do it again on a path, we know to be nonexistent
|
||||
std::fs::remove_file(&file_path).unwrap();
|
||||
assert!(file_path.is_empty_dir().is_err());
|
||||
}
|
||||
}
|
||||
@@ -48,6 +48,8 @@ pub mod nonblock;
|
||||
// Default signal handling
|
||||
pub mod signals;
|
||||
|
||||
pub mod fs_ext;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
#[macro_export]
|
||||
macro_rules! failpoint_sleep_millis_async {
|
||||
|
||||
@@ -35,6 +35,7 @@ itertools = "0.10.3"
|
||||
nix = "0.25"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13.0"
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
|
||||
@@ -297,7 +297,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
})
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
let remote_index = {
|
||||
{
|
||||
let _rt_guard = BACKGROUND_RUNTIME.enter();
|
||||
tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?
|
||||
};
|
||||
@@ -309,7 +309,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
{
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
||||
|
||||
let router = http::make_router(conf, auth.clone(), remote_index, remote_storage)?;
|
||||
let router = http::make_router(conf, auth.clone(), remote_storage)?;
|
||||
let service =
|
||||
utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
|
||||
@@ -25,7 +25,7 @@ use utils::{
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use crate::{METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
|
||||
|
||||
@@ -397,6 +397,11 @@ impl PageServerConf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(tenant_id)
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
|
||||
@@ -604,13 +604,7 @@ components:
|
||||
id:
|
||||
type: string
|
||||
state:
|
||||
oneOf:
|
||||
- type: string
|
||||
- type: object
|
||||
properties:
|
||||
background_jobs_running:
|
||||
type: boolean
|
||||
|
||||
type: string
|
||||
current_physical_size:
|
||||
type: integer
|
||||
has_in_progress_downloads:
|
||||
|
||||
@@ -3,19 +3,17 @@ use std::sync::Arc;
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::task::JoinError;
|
||||
use tracing::*;
|
||||
|
||||
use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest,
|
||||
LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
|
||||
TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::storage_sync;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||
use crate::tenant::{TenantState, Timeline};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::{config::PageServerConf, tenant_mgr};
|
||||
use utils::{
|
||||
@@ -27,7 +25,7 @@ use utils::{
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
@@ -40,7 +38,6 @@ use crate::CheckpointConfig;
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
}
|
||||
@@ -49,7 +46,6 @@ impl State {
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
@@ -60,7 +56,6 @@ impl State {
|
||||
conf,
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_index,
|
||||
remote_storage,
|
||||
})
|
||||
}
|
||||
@@ -86,11 +81,27 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
}
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
async fn build_timeline_info(
|
||||
state: &State,
|
||||
fn build_timeline_info(
|
||||
tenant_state: TenantState,
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let mut info = build_timeline_info_common(tenant_state, timeline)?;
|
||||
if include_non_incremental_logical_size {
|
||||
info.current_logical_size_non_incremental =
|
||||
Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
|
||||
}
|
||||
if include_non_incremental_physical_size {
|
||||
info.current_physical_size_non_incremental =
|
||||
Some(timeline.get_physical_size_non_incremental()?)
|
||||
}
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
fn build_timeline_info_common(
|
||||
tenant_state: TenantState,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||
@@ -106,22 +117,6 @@ async fn build_timeline_info(
|
||||
}
|
||||
};
|
||||
|
||||
let (remote_consistent_lsn, awaits_download) = if let Some(remote_entry) = state
|
||||
.remote_index
|
||||
.read()
|
||||
.await
|
||||
.timeline_entry(&TenantTimelineId {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
}) {
|
||||
(
|
||||
Some(remote_entry.metadata.disk_consistent_lsn()),
|
||||
remote_entry.awaits_download,
|
||||
)
|
||||
} else {
|
||||
(None, false)
|
||||
};
|
||||
|
||||
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
|
||||
let ancestor_lsn = match timeline.get_ancestor_lsn() {
|
||||
Lsn(0) => None,
|
||||
@@ -136,6 +131,7 @@ async fn build_timeline_info(
|
||||
};
|
||||
let current_physical_size = Some(timeline.get_physical_size());
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
let info = TimelineInfo {
|
||||
tenant_id: timeline.tenant_id,
|
||||
@@ -143,30 +139,25 @@ async fn build_timeline_info(
|
||||
ancestor_timeline_id,
|
||||
ancestor_lsn,
|
||||
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
||||
remote_consistent_lsn,
|
||||
last_record_lsn,
|
||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
current_logical_size,
|
||||
current_physical_size,
|
||||
current_logical_size_non_incremental: if include_non_incremental_logical_size {
|
||||
Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
current_physical_size_non_incremental: if include_non_incremental_physical_size {
|
||||
Some(timeline.get_physical_size_non_incremental()?)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
current_logical_size_non_incremental: None,
|
||||
current_physical_size_non_incremental: None,
|
||||
wal_source_connstr,
|
||||
last_received_msg_lsn,
|
||||
last_received_msg_ts,
|
||||
pg_version: timeline.pg_version,
|
||||
|
||||
remote_consistent_lsn,
|
||||
awaits_download,
|
||||
state,
|
||||
|
||||
// XXX bring back tracking of downloads per timeline, or, introduce
|
||||
// an 'Attaching' state for the timeline and get rid of this field.
|
||||
awaits_download: tenant_state == TenantState::Attaching,
|
||||
|
||||
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
|
||||
// with the control plane.
|
||||
local: LocalTimelineInfo {
|
||||
@@ -176,7 +167,7 @@ async fn build_timeline_info(
|
||||
current_physical_size,
|
||||
},
|
||||
remote: RemoteTimelineInfo {
|
||||
remote_consistent_lsn,
|
||||
remote_consistent_lsn: Some(remote_consistent_lsn),
|
||||
},
|
||||
};
|
||||
Ok(info)
|
||||
@@ -194,34 +185,28 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let new_timeline_id = request_data
|
||||
.new_timeline_id
|
||||
.unwrap_or_else(TimelineId::generate);
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let new_timeline_info = async {
|
||||
match tenant.create_timeline(
|
||||
request_data.new_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_start_lsn,
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION)
|
||||
).await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info(state, &new_timeline, false, false)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
Ok(Some(timeline_info))
|
||||
}
|
||||
Ok(None) => Ok(None), // timeline already exists
|
||||
Err(err) => Err(ApiError::InternalServerError(err)),
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_start_lsn,
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION)
|
||||
)
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(tenant.current_state(), &new_timeline)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
|
||||
Err(err) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.await?;
|
||||
|
||||
Ok(match new_timeline_info {
|
||||
Some(info) => json_response(StatusCode::CREATED, info)?,
|
||||
None => json_response(StatusCode::CONFLICT, ())?,
|
||||
})
|
||||
}
|
||||
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -232,22 +217,21 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let _entered = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
|
||||
let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
|
||||
let (tenant_state, timelines) = {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok(tenant.list_timelines())
|
||||
})?;
|
||||
(tenant.current_state(), tenant.list_timelines())
|
||||
};
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
let timeline_info = build_timeline_info(
|
||||
state,
|
||||
tenant_state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.await
|
||||
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -296,24 +280,25 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline_info = async {
|
||||
let timeline = tokio::task::spawn_blocking(move || {
|
||||
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id, false)
|
||||
let (tenant_state, timeline) = tokio::task::spawn_blocking(move || {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok((
|
||||
tenant.current_state(),
|
||||
tenant.get_timeline(timeline_id, false),
|
||||
))
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
let timeline = timeline.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
state,
|
||||
tenant_state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.await
|
||||
.context("Failed to get local timeline info: {e:#}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -358,117 +343,28 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
info!("Handling tenant attach {tenant_id}");
|
||||
|
||||
tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) {
|
||||
Ok(tenant) => {
|
||||
if tenant.list_timelines().is_empty() {
|
||||
info!("Attaching to tenant {tenant_id} with zero timelines");
|
||||
Ok(())
|
||||
} else {
|
||||
Err(ApiError::Conflict(
|
||||
"Tenant is already present locally".to_owned(),
|
||||
))
|
||||
}
|
||||
}
|
||||
Err(_) => Ok(()),
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
if let Some(tenant_entry) = index_accessor.tenant_entry_mut(&tenant_id) {
|
||||
if tenant_entry.has_in_progress_downloads() {
|
||||
return Err(ApiError::Conflict(
|
||||
"Tenant download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
for (timeline_id, remote_timeline) in tenant_entry.iter_mut() {
|
||||
storage_sync::schedule_layer_download(tenant_id, *timeline_id);
|
||||
remote_timeline.awaits_download = true;
|
||||
}
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
}
|
||||
// no tenant in the index, release the lock to make the potentially lengthy download operation
|
||||
drop(index_accessor);
|
||||
|
||||
// download index parts for every tenant timeline
|
||||
let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await {
|
||||
Ok(Some(remote_timelines)) => remote_timelines,
|
||||
Ok(None) => return Err(ApiError::NotFound(anyhow!("Unknown remote tenant"))),
|
||||
Err(e) => {
|
||||
error!("Failed to retrieve remote tenant data: {:?}", e);
|
||||
return Err(ApiError::NotFound(anyhow!(
|
||||
"Failed to retrieve remote tenant"
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
// recheck that download is not in progress because
|
||||
// we've released the lock to avoid holding it during the download
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
let tenant_entry = match index_accessor.tenant_entry_mut(&tenant_id) {
|
||||
Some(tenant_entry) => {
|
||||
if tenant_entry.has_in_progress_downloads() {
|
||||
return Err(ApiError::Conflict(
|
||||
"Tenant download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
tenant_entry
|
||||
}
|
||||
None => index_accessor.add_tenant_entry(tenant_id),
|
||||
};
|
||||
|
||||
// populate remote index with the data from index part and create directories on the local filesystem
|
||||
for (timeline_id, mut remote_timeline) in remote_timelines {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
// FIXME: distinguish between "Tenant already exists" and other errors
|
||||
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage)
|
||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
remote_timeline.awaits_download = true;
|
||||
tenant_entry.insert(timeline_id, remote_timeline);
|
||||
// schedule actual download
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
} else {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"attach_tenant is possible because pageserver was configured without remote storage"
|
||||
)));
|
||||
}
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
/// Note: is expensive from s3 access perspective,
|
||||
/// for details see comment to `storage_sync::gather_tenant_timelines_index_parts`
|
||||
async fn gather_tenant_timelines_index_parts(
|
||||
state: &State,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Option<Vec<(TimelineId, RemoteTimeline)>>> {
|
||||
let index_parts = match state.remote_storage.as_ref() {
|
||||
Some(storage) => {
|
||||
storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
.with_context(|| format!("Failed to download index parts for tenant {tenant_id}"))?;
|
||||
|
||||
let mut remote_timelines = Vec::with_capacity(index_parts.len());
|
||||
for (timeline_id, index_part) in index_parts {
|
||||
let timeline_path = state.conf.timeline_path(&timeline_id, &tenant_id);
|
||||
let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.with_context(|| {
|
||||
format!("Failed to convert index part into remote timeline for timeline {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
remote_timelines.push((timeline_id, remote_timeline));
|
||||
}
|
||||
Ok(Some(remote_timelines))
|
||||
}
|
||||
|
||||
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::delete_timeline(tenant_id, timeline_id)
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
@@ -477,12 +373,6 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
// it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut remote_index = state.remote_index.write().await;
|
||||
remote_index.remove_timeline_entry(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
@@ -499,22 +389,23 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
// Replace this with better handling once the error type permits it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut remote_index = state.remote_index.write().await;
|
||||
remote_index.remove_tenant_entry(&tenant_id);
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
// clone to avoid holding the lock while awaiting for blocking task
|
||||
let remote_index = state.remote_index.read().await.clone();
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
crate::tenant_mgr::list_tenant_info(&remote_index)
|
||||
tenant_mgr::list_tenants()
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
state: *state,
|
||||
current_physical_size: None,
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>()
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
@@ -526,58 +417,38 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// if tenant is in progress of downloading it can be absent in global tenant map
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false);
|
||||
let tenant_info = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_status_handler", tenant = %tenant_id).entered();
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
|
||||
let index_accessor = remote_index.read().await;
|
||||
let has_in_progress_downloads = index_accessor
|
||||
.tenant_entry(&tenant_id)
|
||||
.map(|t| t.has_in_progress_downloads())
|
||||
.unwrap_or_else(|| {
|
||||
info!("Tenant {tenant_id} not found in remote index");
|
||||
false
|
||||
});
|
||||
|
||||
let (tenant_state, current_physical_size) = match tenant {
|
||||
Ok(tenant) => {
|
||||
let timelines = tenant.list_timelines();
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in timelines {
|
||||
current_physical_size += timeline.get_physical_size();
|
||||
}
|
||||
|
||||
(tenant.current_state(), Some(current_physical_size))
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
current_physical_size += timeline.get_physical_size();
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to get local tenant state: {e:#}");
|
||||
if has_in_progress_downloads {
|
||||
(TenantState::Paused, None)
|
||||
} else {
|
||||
(TenantState::Broken, None)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TenantInfo {
|
||||
let state = tenant.current_state();
|
||||
let tenant_info = TenantInfo {
|
||||
id: tenant_id,
|
||||
state: tenant_state,
|
||||
current_physical_size,
|
||||
has_in_progress_downloads: Some(has_in_progress_downloads),
|
||||
},
|
||||
)
|
||||
state,
|
||||
current_physical_size: Some(current_physical_size),
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
};
|
||||
|
||||
Ok::<_, anyhow::Error>(tenant_info)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
}
|
||||
|
||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be long operation, it currently is not backed by any request coalescing or similar
|
||||
let inputs = tenant
|
||||
@@ -625,8 +496,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
println!("tenant create: {:?}", request_data.trace_read_requests);
|
||||
let remote_index = get_state(&request).remote_index.clone();
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
@@ -696,20 +565,42 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
.map(TenantId::from)
|
||||
.unwrap_or_else(TenantId::generate);
|
||||
|
||||
let new_tenant_id = tokio::task::spawn_blocking(move || {
|
||||
let new_tenant = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
|
||||
let conf = get_config(&request);
|
||||
let state = get_state(&request);
|
||||
|
||||
tenant_mgr::create_tenant(conf, tenant_conf, target_tenant_id, remote_index)
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
tenant_mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
state.remote_storage.clone(),
|
||||
)
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
Ok(match new_tenant_id {
|
||||
Some(id) => json_response(StatusCode::CREATED, TenantCreateResponse(id))?,
|
||||
Ok(match new_tenant {
|
||||
Some(tenant) => {
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
if let res @ Err(_) = tenant.wait_to_become_active().await {
|
||||
// This shouldn't happen because we just created the tenant directory
|
||||
// in tenant_mgr::create_tenant, and there aren't any remote timelines
|
||||
// to load, so, nothing can really fail during load.
|
||||
// Don't do cleanup because we don't know how we got here.
|
||||
// The tenant will likely be in `Broken` state and subsequent
|
||||
// calls will fail.
|
||||
res.context("created tenant failed to become active")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
TenantCreateResponse(tenant.tenant_id()),
|
||||
)?
|
||||
}
|
||||
None => json_response(StatusCode::CONFLICT, ())?,
|
||||
})
|
||||
}
|
||||
@@ -835,22 +726,16 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req)?;
|
||||
let gc_result = wait_task_done
|
||||
.await
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
.context("wait for gc task")
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, result)
|
||||
|
||||
json_response(StatusCode::OK, gc_result)
|
||||
}
|
||||
|
||||
// Run compaction immediately on given timeline.
|
||||
@@ -864,7 +749,10 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
timeline.compact().map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -898,7 +786,6 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
pub fn make_router(
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
@@ -935,8 +822,7 @@ pub fn make_router(
|
||||
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_index, remote_storage)
|
||||
.context("Failed to initialize router state")?,
|
||||
State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
|
||||
))
|
||||
.get("/v1/status", status_handler)
|
||||
.put(
|
||||
|
||||
@@ -4,7 +4,7 @@ pub mod config;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
pub mod metrics;
|
||||
pub(crate) mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
@@ -23,11 +23,9 @@ pub mod walreceiver;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use tracing::info;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
@@ -73,7 +71,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
//
|
||||
// FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
|
||||
// Should it?
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::StorageSync), None, None).await;
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
@@ -108,20 +106,6 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds
|
||||
}
|
||||
}
|
||||
|
||||
/// A newtype to store arbitrary data grouped by tenant and timeline ids.
|
||||
/// One could use [`utils::id::TenantTimelineId`] for grouping, but that would
|
||||
/// not include the cases where a certain tenant has zero timelines.
|
||||
/// This is sometimes important: a tenant could be registered during initial load from FS,
|
||||
/// even if he has no timelines on disk.
|
||||
#[derive(Debug)]
|
||||
pub struct TenantTimelineValues<T>(HashMap<TenantId, HashMap<TimelineId, T>>);
|
||||
|
||||
impl<T> TenantTimelineValues<T> {
|
||||
fn new() -> Self {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
}
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
use metrics::{
|
||||
register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
|
||||
GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge,
|
||||
UIntGaugeVec,
|
||||
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec,
|
||||
register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec,
|
||||
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -200,63 +199,59 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_remote_storage_remaining_sync_items",
|
||||
"Number of storage sync items left in the queue"
|
||||
// remote storage metrics
|
||||
|
||||
pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_remote_upload_queue_unfinished_tasks",
|
||||
"Number of tasks in the upload queue that are not finished yet.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
)
|
||||
.expect("failed to register pageserver remote storage remaining sync items int gauge")
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static IMAGE_SYNC_TIME: Lazy<GaugeVec> = Lazy::new(|| {
|
||||
register_gauge_vec!(
|
||||
"pageserver_remote_storage_image_sync_duration",
|
||||
"Time spent to synchronize (up/download) a whole pageserver image",
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to register per-timeline pageserver image sync time vec")
|
||||
});
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
Download,
|
||||
Delete,
|
||||
}
|
||||
impl RemoteOpKind {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
Self::Upload => "upload",
|
||||
Self::Download => "download",
|
||||
Self::Delete => "delete",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
|
||||
pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"];
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum RemoteOpFileKind {
|
||||
Layer,
|
||||
Index,
|
||||
}
|
||||
impl RemoteOpFileKind {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
Self::Layer => "layer",
|
||||
Self::Index => "index",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub static IMAGE_SYNC_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_storage_image_sync_count",
|
||||
"Number of synchronization operations executed for pageserver images. \
|
||||
Grouped by tenant, timeline, operation_kind and status",
|
||||
&["tenant_id", "timeline_id", "operation_kind", "status"]
|
||||
)
|
||||
.expect("failed to register pageserver image sync count vec")
|
||||
});
|
||||
pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
|
||||
pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"];
|
||||
pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"];
|
||||
|
||||
pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_storage_image_sync_seconds",
|
||||
"Time took to synchronize (download or upload) a whole pageserver image. \
|
||||
Grouped by operation_kind and status",
|
||||
&["operation_kind", "status"],
|
||||
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
|
||||
"pageserver_remote_operation_seconds",
|
||||
"Time spent on remote storage operations. \
|
||||
Grouped by tenant, timeline, operation_kind and status",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
|
||||
)
|
||||
.expect("failed to register pageserver image sync time histogram vec")
|
||||
});
|
||||
|
||||
pub static REMOTE_INDEX_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_storage_remote_index_uploads_total",
|
||||
"Number of remote index uploads",
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to register pageserver remote index upload vec")
|
||||
});
|
||||
|
||||
pub static NO_LAYERS_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_storage_no_layers_uploads_total",
|
||||
"Number of skipped uploads due to no layers",
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to register pageserver no layers upload vec")
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -473,16 +468,90 @@ impl Drop for TimelineMetrics {
|
||||
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in IMAGE_SYNC_OPERATION_KINDS {
|
||||
for status in IMAGE_SYNC_STATUS {
|
||||
let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]);
|
||||
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
for file_kind in REMOTE_OPERATION_FILE_KINDS {
|
||||
for op in REMOTE_OPERATION_KINDS {
|
||||
for status in REMOTE_OPERATION_STATUSES {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
file_kind,
|
||||
op,
|
||||
status,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]);
|
||||
}
|
||||
|
||||
use futures::Future;
|
||||
use pin_project_lite::pin_project;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::Instant;
|
||||
|
||||
/// Wrapper future that measures the time spent by a remote storage operation,
|
||||
/// and records the time and success/failure as a prometheus metric.
|
||||
pub trait MeasureRemoteOp: Sized {
|
||||
fn measure_remote_op(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
file_kind: RemoteOpFileKind,
|
||||
op: RemoteOpKind,
|
||||
) -> MeasuredRemoteOp<Self> {
|
||||
let start = Instant::now();
|
||||
MeasuredRemoteOp {
|
||||
inner: self,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
file_kind,
|
||||
op,
|
||||
start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Sized> MeasureRemoteOp for T {}
|
||||
|
||||
pin_project! {
|
||||
pub struct MeasuredRemoteOp<F>
|
||||
{
|
||||
#[pin]
|
||||
inner: F,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
file_kind: RemoteOpFileKind,
|
||||
op: RemoteOpKind,
|
||||
start: Instant,
|
||||
}
|
||||
}
|
||||
|
||||
impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
type Output = Result<O, E>;
|
||||
|
||||
fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
|
||||
let this = self.project();
|
||||
let poll_result = this.inner.poll(cx);
|
||||
if let Poll::Ready(ref res) = poll_result {
|
||||
let duration = this.start.elapsed();
|
||||
let status = if res.is_ok() { &"success" } else { &"failure" };
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&this.tenant_id.to_string(),
|
||||
&this.timeline_id.to_string(),
|
||||
this.file_kind.as_str(),
|
||||
this.op.as_str(),
|
||||
status,
|
||||
])
|
||||
.unwrap()
|
||||
.observe(duration.as_secs_f64());
|
||||
}
|
||||
poll_result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ use std::net::TcpListener;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::pin;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
@@ -47,7 +48,7 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::profiling::profpoint_start;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::tenant_mgr;
|
||||
use crate::trace::Tracer;
|
||||
use crate::CheckpointConfig;
|
||||
@@ -279,7 +280,7 @@ impl PageServerHandler {
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
// Make request tracer if needed
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id).await?;
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
let connection_id = ConnectionId::generate();
|
||||
let path = tenant
|
||||
@@ -291,7 +292,7 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
// switch client to COPYBOTH
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
@@ -376,7 +377,7 @@ impl PageServerHandler {
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id).await?;
|
||||
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?;
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
@@ -431,7 +432,7 @@ impl PageServerHandler {
|
||||
) -> anyhow::Result<()> {
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
|
||||
ensure!(timeline.get_last_record_lsn() == start_lsn);
|
||||
|
||||
// TODO leave clean state on error. For now you can use detach to clean
|
||||
@@ -624,7 +625,7 @@ impl PageServerHandler {
|
||||
full_backup: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// check that the timeline exists
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
@@ -766,7 +767,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
let timeline_id = TimelineId::from_str(params[1])?;
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
|
||||
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
|
||||
@@ -889,7 +890,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id).await?;
|
||||
pgb.write_message(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||
@@ -933,8 +934,28 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result<Arc<Timeline>> {
|
||||
tenant_mgr::get_tenant(tenant_id, true)
|
||||
/// Get active tenant.
|
||||
///
|
||||
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
|
||||
/// ensures that queries don't fail immediately after pageserver startup, because
|
||||
/// all tenants are still loading.
|
||||
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
|
||||
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
|
||||
Ok(wait_result) => wait_result
|
||||
// no .context(), the error message is good enough and some tests depend on it
|
||||
.map(move |()| tenant),
|
||||
Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
||||
async fn get_active_timeline_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
get_active_tenant_with_timeout(tenant_id)
|
||||
.await
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
}
|
||||
|
||||
|
||||
@@ -189,7 +189,7 @@ impl Value {
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default, Serialize)]
|
||||
#[derive(Default, Serialize, Debug)]
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,90 +1,22 @@
|
||||
//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
//! Helper functions to delete files from remote storage with a RemoteStorage
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tracing::{debug, error, info};
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::storage_sync::{SyncQueue, SyncTask};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use super::{LayersDeletion, SyncData};
|
||||
|
||||
/// Attempts to remove the timleline layers from the remote storage.
|
||||
/// If the task had not adjusted the metadata before, the deletion will fail.
|
||||
pub(super) async fn delete_timeline_layers(
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_queue: &SyncQueue,
|
||||
sync_id: TenantTimelineId,
|
||||
mut delete_data: SyncData<LayersDeletion>,
|
||||
) -> bool {
|
||||
if !delete_data.data.deletion_registered {
|
||||
error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing");
|
||||
delete_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
|
||||
return false;
|
||||
}
|
||||
|
||||
if delete_data.data.layers_to_delete.is_empty() {
|
||||
info!("No layers to delete, skipping");
|
||||
return true;
|
||||
}
|
||||
|
||||
let layers_to_delete = delete_data
|
||||
.data
|
||||
.layers_to_delete
|
||||
.drain()
|
||||
.collect::<Vec<_>>();
|
||||
debug!("Layers to delete: {layers_to_delete:?}");
|
||||
info!("Deleting {} timeline layers", layers_to_delete.len());
|
||||
|
||||
let mut delete_tasks = layers_to_delete
|
||||
.into_iter()
|
||||
.map(|local_layer_path| async {
|
||||
match remove_storage_object(storage, &local_layer_path).await {
|
||||
Ok(()) => Ok(local_layer_path),
|
||||
Err(e) => Err((e, local_layer_path)),
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut errored = false;
|
||||
while let Some(deletion_result) = delete_tasks.next().await {
|
||||
match deletion_result {
|
||||
Ok(local_layer_path) => {
|
||||
debug!(
|
||||
"Successfully deleted layer {} for timeline {sync_id}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
delete_data.data.deleted_layers.insert(local_layer_path);
|
||||
}
|
||||
Err((e, local_layer_path)) => {
|
||||
errored = true;
|
||||
error!(
|
||||
"Failed to delete layer {} for timeline {sync_id}: {e:?}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
delete_data.data.layers_to_delete.insert(local_layer_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if errored {
|
||||
debug!("Reenqueuing failed delete task for timeline {sync_id}");
|
||||
delete_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
|
||||
} else {
|
||||
info!("Successfully deleted all layers");
|
||||
}
|
||||
errored
|
||||
}
|
||||
|
||||
async fn remove_storage_object(
|
||||
pub(super) async fn delete_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
local_layer_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
fail::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!(
|
||||
"Deleting layer from remote storage: {:?}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
let storage_path = storage
|
||||
.remote_object_id(local_layer_path)
|
||||
.with_context(|| {
|
||||
@@ -94,6 +26,9 @@ async fn remove_storage_object(
|
||||
)
|
||||
})?;
|
||||
|
||||
// XXX: If the deletion fails because the object already didn't exist,
|
||||
// it would be good to just issue a warning but consider it success.
|
||||
// https://github.com/neondatabase/neon/issues/2934
|
||||
storage.delete(&storage_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to delete remote layer from storage at '{:?}'",
|
||||
@@ -101,135 +36,3 @@ async fn remove_storage_object(
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::HashSet, num::NonZeroUsize};
|
||||
|
||||
use itertools::Itertools;
|
||||
use tempfile::tempdir;
|
||||
use tokio::fs;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
storage_sync::test_utils::{create_local_timeline, dummy_metadata},
|
||||
tenant::harness::{TenantHarness, TIMELINE_ID},
|
||||
};
|
||||
use remote_storage::{LocalFs, RemoteStorage};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_timeline_negative() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("delete_timeline_negative")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
|
||||
let deleted = delete_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
sync_id,
|
||||
SyncData {
|
||||
retries: 1,
|
||||
data: LayersDeletion {
|
||||
deleted_layers: HashSet::new(),
|
||||
layers_to_delete: HashSet::new(),
|
||||
deletion_registered: false,
|
||||
},
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(
|
||||
!deleted,
|
||||
"Should not start the deletion for task with delete metadata unregistered"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_timeline() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("delete_timeline")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "c", "d"];
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
for (local_path, _metadata) in timeline_upload.layers_to_upload {
|
||||
let remote_path =
|
||||
local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
}
|
||||
fs::copy(&local_path, &remote_path).await?;
|
||||
}
|
||||
assert_eq!(
|
||||
local_storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|remote_path| local_storage.local_path(&remote_path).unwrap())
|
||||
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer_str| layer_str.to_string())
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
"Expect to have all layer files remotely before deletion"
|
||||
);
|
||||
|
||||
let deleted = delete_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
sync_id,
|
||||
SyncData {
|
||||
retries: current_retries,
|
||||
data: LayersDeletion {
|
||||
deleted_layers: HashSet::new(),
|
||||
layers_to_delete: HashSet::from([
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("c"),
|
||||
local_timeline_path.join("something_different"),
|
||||
]),
|
||||
deletion_registered: true,
|
||||
},
|
||||
},
|
||||
)
|
||||
.await;
|
||||
assert!(deleted, "Should be able to delete timeline files");
|
||||
|
||||
assert_eq!(
|
||||
local_storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|remote_path| local_storage.local_path(&remote_path).unwrap())
|
||||
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
vec!["b".to_string(), "d".to_string()],
|
||||
"Expect to have only non-deleted files remotely"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,155 +1,237 @@
|
||||
//! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory.
|
||||
//! Helper functions to download files from remote storage with a RemoteStorage
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
fmt::Debug,
|
||||
mem,
|
||||
path::Path,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{bail, Context};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::storage_sync::index::LayerFileMetadata;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncWriteExt},
|
||||
};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
storage_sync::{index::LayerFileMetadata, SyncTask},
|
||||
TEMP_FILE_SUFFIX,
|
||||
};
|
||||
use utils::{
|
||||
crashsafe::path_with_suffix_extension,
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
};
|
||||
use super::index::IndexPart;
|
||||
use super::RelativePath;
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
LayersDownload, SyncData, SyncQueue,
|
||||
};
|
||||
|
||||
// We collect the timelines that are remotely available for each tenant.
// If we failed to gather all index parts (due to an error), the
// Poisoned variant is returned.
// When the data is received successfully, the Present variant is used.
|
||||
pub enum TenantIndexParts {
|
||||
Poisoned {
|
||||
present: HashMap<TimelineId, IndexPart>,
|
||||
missing: HashSet<TimelineId>,
|
||||
},
|
||||
Present(HashMap<TimelineId, IndexPart>),
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
|
||||
impl TenantIndexParts {
|
||||
fn add_poisoned(&mut self, timeline_id: TimelineId) {
|
||||
match self {
|
||||
TenantIndexParts::Poisoned { missing, .. } => {
|
||||
missing.insert(timeline_id);
|
||||
}
|
||||
TenantIndexParts::Present(present) => {
|
||||
*self = TenantIndexParts::Poisoned {
|
||||
present: mem::take(present),
|
||||
missing: HashSet::from([timeline_id]),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TenantIndexParts {
|
||||
fn default() -> Self {
|
||||
TenantIndexParts::Present(HashMap::default())
|
||||
}
|
||||
}
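To make the Poisoned/Present split described above concrete, here is a hedged sketch of how a caller might summarize a `TenantIndexParts` value (the function name is illustrative and not part of this diff):

// Illustrative only; assumes it lives alongside `TenantIndexParts`.
fn summarize_tenant_parts(parts: &TenantIndexParts) -> String {
    match parts {
        // Every timeline's index_part.json was fetched without errors.
        TenantIndexParts::Present(timelines) => {
            format!("all {} index parts downloaded", timelines.len())
        }
        // At least one fetch failed; the successfully fetched parts are still usable.
        TenantIndexParts::Poisoned { present, missing } => format!(
            "poisoned: {} downloaded, {} missing ({missing:?})",
            present.len(),
            missing.len()
        ),
    }
}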
|
||||
|
||||
pub async fn download_index_parts(
|
||||
///
|
||||
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
|
||||
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
|
||||
///
|
||||
/// Returns the size of the downloaded file.
|
||||
pub async fn download_layer_file<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
keys: HashSet<TenantTimelineId>,
|
||||
) -> HashMap<TenantId, TenantIndexParts> {
|
||||
let mut index_parts: HashMap<TenantId, TenantIndexParts> = HashMap::new();
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
path: &'a RelativePath,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let mut part_downloads = keys
|
||||
.into_iter()
|
||||
.map(|id| async move { (id, download_index_part(conf, storage, id).await) })
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
let local_path = path.to_local_path(&timeline_path);
|
||||
|
||||
while let Some((id, part_upload_result)) = part_downloads.next().await {
|
||||
match part_upload_result {
|
||||
Ok(index_part) => {
|
||||
debug!("Successfully fetched index part for {id}");
|
||||
match index_parts.entry(id.tenant_id).or_default() {
|
||||
TenantIndexParts::Poisoned { present, .. } => {
|
||||
present.insert(id.timeline_id, index_part);
|
||||
}
|
||||
TenantIndexParts::Present(parts) => {
|
||||
parts.insert(id.timeline_id, index_part);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(download_error) => {
|
||||
match download_error {
|
||||
DownloadError::NotFound => {
|
||||
// that's ok: it means that we didn't upload something we have locally, for example
|
||||
}
|
||||
e => {
|
||||
let tenant_parts = index_parts.entry(id.tenant_id).or_default();
|
||||
tenant_parts.add_poisoned(id.timeline_id);
|
||||
error!(
|
||||
"Failed to fetch index part for {id}: {e} poisoning tenant index parts"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
let layer_storage_path = storage.remote_object_id(&local_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
// write(tmp)
|
||||
// fsync(tmp)
|
||||
// rename(tmp, new)
|
||||
// fsync(new)
|
||||
// fsync(parent)
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut download = storage
|
||||
.download(&layer_storage_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
|
||||
)
|
||||
})?;
|
||||
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
// we assume that the `destination_file` is fully written, i.e. there are no pending .write(...).await operations.
// But for additional safety, let's check/wait for any pending operations.
|
||||
destination_file.flush().await.with_context(|| {
|
||||
format!(
|
||||
"failed to flush source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
match layer_metadata.file_size() {
|
||||
Some(expected) if expected != bytes_amount => {
|
||||
anyhow::bail!(
|
||||
"According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
|
||||
temp_file_path.display()
|
||||
);
|
||||
}
|
||||
Some(_) | None => {
|
||||
// matches, or upgrading from an earlier IndexPart version
|
||||
}
|
||||
}
|
||||
|
||||
index_parts
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to fsync source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
bail!("remote-storage-download-pre-rename failpoint triggered")
|
||||
});
|
||||
|
||||
fs::rename(&temp_file_path, &local_path).await?;
|
||||
|
||||
fsync_path(&local_path)
|
||||
.await
|
||||
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?;
|
||||
|
||||
tracing::info!("download complete: {}", local_path.display());
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
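The durable_rename comment inside `download_layer_file` describes a five-step crash-safe write. A standalone sketch of just that sequence, assuming tokio and a hypothetical `durable_write` helper, looks roughly like this:

// Minimal sketch of the durable_rename-style sequence; not code from this diff.
use std::path::Path;
use tokio::{fs, io::AsyncWriteExt};

async fn durable_write(temp: &Path, target: &Path, bytes: &[u8]) -> std::io::Result<()> {
    // write(tmp) + fsync(tmp)
    let mut f = fs::File::create(temp).await?;
    f.write_all(bytes).await?;
    f.sync_all().await?;
    drop(f);
    // rename(tmp, new)
    fs::rename(temp, target).await?;
    // fsync(new)
    fs::File::open(target).await?.sync_all().await?;
    // fsync(parent), so the rename itself survives a crash
    if let Some(parent) = target.parent() {
        fs::File::open(parent).await?.sync_all().await?;
    }
    Ok(())
}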
|
||||
|
||||
/// Note: this function is rather expensive from an S3 access point of view: it executes ceil(N/1000) + N requests.
/// At least one request to obtain the list of tenant timelines (more requests if there are more than 1000 timelines),
/// and then it attempts to download all index files that belong to these timelines.
|
||||
pub async fn gather_tenant_timelines_index_parts(
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
pub fn is_temp_download_file(path: &Path) -> bool {
|
||||
let extension = path.extension().map(|pname| {
|
||||
pname
|
||||
.to_str()
|
||||
.expect("paths passed to this function must be valid Rust strings")
|
||||
});
|
||||
match extension {
|
||||
Some(TEMP_DOWNLOAD_EXTENSION) => true,
|
||||
Some(_) => false,
|
||||
None => false,
|
||||
}
|
||||
}
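A brief illustration of what the predicate above matches; the paths are made up for the example and assume the function and `std::path::Path` are in scope:

fn temp_download_examples() {
    use std::path::Path;

    // A leftover partial download carries the ".temp_download" suffix ...
    assert!(is_temp_download_file(Path::new(
        "timelines/some_timeline/layer_file.temp_download"
    )));
    // ... while a completed layer file (no extension) does not match.
    assert!(!is_temp_download_file(Path::new(
        "timelines/some_timeline/layer_file"
    )));
}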
|
||||
|
||||
/// List timelines of given tenant in remote storage
|
||||
pub async fn list_remote_timelines<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let timelines = storage
|
||||
.list_prefixes(Some(&tenant_storage_path))
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
|
||||
)
|
||||
})?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
anyhow::bail!("no timelines found on the remote storage")
|
||||
}
|
||||
|
||||
let mut timeline_ids = HashSet::new();
|
||||
let mut part_downloads = FuturesUnordered::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: TimelineId = object_name.parse().with_context(|| {
|
||||
format!("failed to parse object name into timeline id '{object_name}'")
|
||||
})?;
|
||||
|
||||
// list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
|
||||
// yet, launch a download task for it.
|
||||
if !timeline_ids.contains(&timeline_id) {
|
||||
timeline_ids.insert(timeline_id);
|
||||
let storage_clone = storage.clone();
|
||||
part_downloads.push(async move {
|
||||
(
|
||||
timeline_id,
|
||||
download_index_part(conf, &storage_clone, tenant_id, timeline_id).await,
|
||||
)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for all the download tasks to complete.
|
||||
let mut timeline_parts = Vec::new();
|
||||
while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
|
||||
let index_part = part_upload_result
|
||||
.with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
|
||||
|
||||
debug!("Successfully fetched index part for timeline {timeline_id}");
|
||||
timeline_parts.push((timeline_id, index_part));
|
||||
}
|
||||
Ok(timeline_parts)
|
||||
}
|
||||
|
||||
pub async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<HashMap<TimelineId, IndexPart>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id)
|
||||
.await
|
||||
.with_context(|| format!("Failed to list timeline sync ids for tenat {tenant_id}"))?;
|
||||
|
||||
match download_index_parts(conf, storage, timeline_sync_ids)
|
||||
.await
|
||||
.remove(&tenant_id)
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))?
|
||||
{
|
||||
TenantIndexParts::Poisoned { missing, .. } => {
|
||||
anyhow::bail!("Failed to download index parts for all timelines. Missing {missing:?}")
|
||||
}
|
||||
TenantIndexParts::Present(parts) => Ok(parts),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieves index data from the remote storage for a given timeline.
|
||||
async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_id: TenantTimelineId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let index_part_path = conf
|
||||
.metadata_path(sync_id.timeline_id, sync_id.tenant_id)
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let mut index_part_download = storage
|
||||
.download_storage_object(None, &index_part_path)
|
||||
.await?;
|
||||
let part_storage_path = storage
|
||||
.remote_object_id(&index_part_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
let mut index_part_download = storage.download(&part_storage_path).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
io::copy(
|
||||
tokio::io::copy(
|
||||
&mut index_part_download.download_stream,
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
@@ -171,525 +253,5 @@ async fn download_index_part(
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let missing_files = index_part.missing_files();
|
||||
if !missing_files.is_empty() {
|
||||
warn!("Found missing layers in index part for timeline {sync_id}: {missing_files:?}");
|
||||
}
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
/// Timeline download result, with extra data, needed for downloading.
|
||||
#[derive(Debug)]
|
||||
pub(super) enum DownloadedTimeline {
|
||||
/// Remote timeline data is either absent or corrupt, no download possible.
|
||||
Abort,
|
||||
/// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
|
||||
/// Initial download failed due to some error, the download task is rescheduled for another retry.
|
||||
FailedAndRescheduled,
|
||||
/// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
|
||||
/// Initial download successful.
|
||||
Successful(SyncData<LayersDownload>),
|
||||
}
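A hedged sketch of how a caller might react to each variant of `DownloadedTimeline`; the function name is illustrative and the arm bodies are intentionally left empty:

fn on_download_finished(result: DownloadedTimeline) {
    match result {
        // Nothing to retry: the remote data is absent or not awaiting download.
        DownloadedTimeline::Abort => {}
        // The failed task has already been re-queued with an incremented retry count.
        DownloadedTimeline::FailedAndRescheduled => {}
        // All layers are on local disk; the caller can now update the local metadata.
        DownloadedTimeline::Successful(_download_data) => {}
    }
}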
|
||||
|
||||
/// Attempts to download all given timeline's layers.
|
||||
/// Timeline files that already exist locally are skipped during the download, but the local metadata file is
|
||||
/// updated in the end, if the remote one contains a newer disk_consistent_lsn.
|
||||
///
|
||||
/// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task.
|
||||
pub(super) async fn download_timeline_layers<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
sync_queue: &'a SyncQueue,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: TenantTimelineId,
|
||||
mut download_data: SyncData<LayersDownload>,
|
||||
) -> DownloadedTimeline {
|
||||
let remote_timeline = match remote_timeline {
|
||||
Some(remote_timeline) => {
|
||||
if !remote_timeline.awaits_download {
|
||||
error!("Timeline with sync id {sync_id} is not awaiting download");
|
||||
return DownloadedTimeline::Abort;
|
||||
}
|
||||
remote_timeline
|
||||
}
|
||||
None => {
|
||||
error!("Timeline with sync id {sync_id} is not present in the remote index");
|
||||
return DownloadedTimeline::Abort;
|
||||
}
|
||||
};
|
||||
|
||||
let download = &mut download_data.data;
|
||||
|
||||
let layers_to_download = remote_timeline
|
||||
.stored_files()
|
||||
.iter()
|
||||
.filter_map(|(layer_path, metadata)| {
|
||||
if !download.layers_to_skip.contains(layer_path) {
|
||||
Some((layer_path.to_owned(), metadata.to_owned()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
debug!("Layers to download: {layers_to_download:?}");
|
||||
info!("Downloading {} timeline layers", layers_to_download.len());
|
||||
|
||||
if layers_to_download.is_empty() {
|
||||
info!("No layers to download after filtering, skipping");
|
||||
return DownloadedTimeline::Successful(download_data);
|
||||
}
|
||||
|
||||
let mut download_tasks = layers_to_download
|
||||
.into_iter()
|
||||
.map(|(layer_destination_path, metadata)| async move {
|
||||
|
||||
match layer_destination_path.metadata() {
|
||||
Ok(m) if m.is_file() => {
|
||||
// the file exists from earlier round when we failed after renaming it as
|
||||
// layer_destination_path
|
||||
let verified = if let Some(expected) = metadata.file_size() {
|
||||
m.len() == expected
|
||||
} else {
|
||||
// behaviour before recording metadata was to accept any existing
|
||||
true
|
||||
};
|
||||
|
||||
if verified {
|
||||
debug!(
|
||||
"Layer already exists locally, skipping download: {}",
|
||||
layer_destination_path.display()
|
||||
);
|
||||
return Ok((layer_destination_path, LayerFileMetadata::new(m.len())))
|
||||
} else {
|
||||
// no need to remove it, it will be overwritten by fs::rename
|
||||
// after successful download
|
||||
warn!("Downloaded layer exists already but layer file metadata mismatches: {}, metadata {:?}", layer_destination_path.display(), metadata);
|
||||
}
|
||||
}
|
||||
Ok(m) => {
|
||||
return Err(anyhow::anyhow!("Downloaded layer destination exists but is not a file: {m:?}, target needs to be removed/archived manually: {layer_destination_path:?}"));
|
||||
}
|
||||
Err(_) => {
|
||||
// behave as the file didn't exist
|
||||
}
|
||||
}
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
// write(tmp)
|
||||
// fsync(tmp)
|
||||
// rename(tmp, new)
|
||||
// fsync(new)
|
||||
// fsync(parent)
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path =
|
||||
path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);
|
||||
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file =
|
||||
fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to initiate the download the layer for {sync_id} into file '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let bytes_amount = io::copy(&mut layer_download.download_stream, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download the layer for {sync_id} into file '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
|
||||
// we assume that the `destination_file` is fully written, i.e. there are no pending .write(...).await operations.
|
||||
// But for additional safety let's check/wait for any pending operations.
|
||||
destination_file.flush().await.with_context(|| {
|
||||
format!(
|
||||
"failed to flush source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
match metadata.file_size() {
|
||||
Some(expected) if expected != bytes_amount => {
|
||||
anyhow::bail!(
|
||||
"According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
|
||||
temp_file_path.display()
|
||||
);
|
||||
},
|
||||
Some(_) | None => {
|
||||
// matches, or upgrading from an earlier IndexPart version
|
||||
}
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to fsync source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
|
||||
});
|
||||
|
||||
fs::rename(&temp_file_path, &layer_destination_path).await?;
|
||||
|
||||
fsync_path(&layer_destination_path).await.with_context(|| {
|
||||
format!(
|
||||
"Cannot fsync layer destination path {}",
|
||||
layer_destination_path.display(),
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok::<_, anyhow::Error>((layer_destination_path, LayerFileMetadata::new(bytes_amount)))
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut errors_happened = false;
|
||||
// keep files we've downloaded to remove them from layers_to_skip if directory fsync fails
|
||||
let mut undo = HashSet::new();
|
||||
while let Some(download_result) = download_tasks.next().await {
|
||||
match download_result {
|
||||
Ok((downloaded_path, metadata)) => {
|
||||
undo.insert(downloaded_path.clone());
|
||||
download.layers_to_skip.insert(downloaded_path.clone());
|
||||
// what if the key existed already? ignore, because then we would have
// downloaded a partial file, and had to retry
|
||||
download.gathered_metadata.insert(downloaded_path, metadata);
|
||||
}
|
||||
Err(e) => {
|
||||
errors_happened = true;
|
||||
error!("Failed to download a layer for timeline {sync_id}: {e:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fsync timeline directory which is a parent directory for downloaded files
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = &sync_id;
|
||||
let timeline_dir = conf.timeline_path(timeline_id, tenant_id);
|
||||
if let Err(e) = fsync_path(&timeline_dir).await {
|
||||
error!(
|
||||
"Cannot fsync parent directory {} error {}",
|
||||
timeline_dir.display(),
|
||||
e
|
||||
);
|
||||
for item in undo {
|
||||
download.layers_to_skip.remove(&item);
|
||||
// intentionally don't clear the gathered_metadata because it exists for fsync_path
|
||||
// failure on parent directory
|
||||
}
|
||||
errors_happened = true;
|
||||
}
|
||||
|
||||
if errors_happened {
|
||||
debug!("Reenqueuing failed download task for timeline {sync_id}");
|
||||
download_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Download(download_data));
|
||||
DownloadedTimeline::FailedAndRescheduled
|
||||
} else {
|
||||
info!("Successfully downloaded all layers");
|
||||
DownloadedTimeline::Successful(download_data)
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_timeline_sync_ids(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_path: &Path,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<HashSet<TenantTimelineId>> {
|
||||
let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let timelines = storage
|
||||
.list_prefixes(Some(&tenant_storage_path))
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
|
||||
)
|
||||
})?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
anyhow::bail!("no timelines found on the remote storage")
|
||||
}
|
||||
|
||||
let mut sync_ids = HashSet::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: TimelineId = object_name.parse().with_context(|| {
|
||||
format!("failed to parse object name into timeline id '{object_name}'")
|
||||
})?;
|
||||
|
||||
sync_ids.insert(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(sync_ids)
|
||||
}
|
||||
|
||||
async fn fsync_path(path: impl AsRef<Path>) -> Result<(), io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
collections::{BTreeSet, HashSet},
|
||||
num::NonZeroUsize,
|
||||
path::PathBuf,
|
||||
};
|
||||
|
||||
use remote_storage::{LocalFs, RemoteStorage};
|
||||
use tempfile::tempdir;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
},
|
||||
tenant::harness::{TenantHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_timeline() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("download_timeline")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"];
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
|
||||
for local_path in timeline_upload.layers_to_upload.keys() {
|
||||
let remote_path =
|
||||
local_storage.resolve_in_storage(&storage.remote_object_id(local_path)?)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
}
|
||||
fs::copy(&local_path, &remote_path).await?;
|
||||
}
|
||||
let mut read_dir = fs::read_dir(&local_timeline_path).await?;
|
||||
while let Some(dir_entry) = read_dir.next_entry().await? {
|
||||
if dir_entry.file_name().to_str() == Some("layer_to_keep_locally") {
|
||||
continue;
|
||||
} else {
|
||||
fs::remove_file(dir_entry.path()).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut remote_timeline = RemoteTimeline::new(metadata.clone());
|
||||
remote_timeline.awaits_download = true;
|
||||
remote_timeline.add_timeline_layers(layer_files.iter().map(|layer| {
|
||||
let layer_path = local_timeline_path.join(layer);
|
||||
|
||||
// this could have also been LayerFileMetadata::default(), but since in this test we
|
||||
// don't do the merge operation done by storage_sync::download_timeline_data, it would
|
||||
// not be merged back to timeline.
|
||||
let metadata_from_upload = timeline_upload
|
||||
.layers_to_upload
|
||||
.get(&layer_path)
|
||||
.expect("layer must exist in previously uploaded paths")
|
||||
.to_owned();
|
||||
(layer_path, metadata_from_upload)
|
||||
}));
|
||||
|
||||
let download_data = match download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
&sync_queue,
|
||||
Some(&remote_timeline),
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
current_retries,
|
||||
LayersDownload::from_skipped_layers(HashSet::from([
|
||||
local_timeline_path.join("layer_to_skip")
|
||||
])),
|
||||
),
|
||||
)
|
||||
.await
|
||||
{
|
||||
DownloadedTimeline::Successful(data) => data,
|
||||
wrong_result => {
|
||||
panic!("Expected a successful download for timeline, but got: {wrong_result:?}")
|
||||
}
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
current_retries, download_data.retries,
|
||||
"On successful download, retries are not expected to change"
|
||||
);
|
||||
assert_eq!(
|
||||
download_data
|
||||
.data
|
||||
.layers_to_skip
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer| local_timeline_path.join(layer))
|
||||
.collect(),
|
||||
"On successful download, layers to skip should contain all downloaded files and present layers that were skipped"
|
||||
);
|
||||
|
||||
let mut downloaded_files = BTreeSet::new();
|
||||
let mut read_dir = fs::read_dir(&local_timeline_path).await?;
|
||||
while let Some(dir_entry) = read_dir.next_entry().await? {
|
||||
downloaded_files.insert(dir_entry.path());
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
downloaded_files,
|
||||
layer_files
|
||||
.iter()
|
||||
.filter(|layer| layer != &&"layer_to_skip")
|
||||
.map(|layer| local_timeline_path.join(layer))
|
||||
.collect(),
|
||||
"On successful download, all layers that were not skipped, should be downloaded"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_timeline_negatives() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("download_timeline_negatives")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
|
||||
let empty_remote_timeline_download = download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
&sync_queue,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
|
||||
)
|
||||
.await;
|
||||
assert!(
|
||||
matches!(empty_remote_timeline_download, DownloadedTimeline::Abort),
|
||||
"Should not allow downloading for empty remote timeline"
|
||||
);
|
||||
|
||||
let not_expecting_download_remote_timeline = RemoteTimeline::new(dummy_metadata(Lsn(5)));
|
||||
assert!(
|
||||
!not_expecting_download_remote_timeline.awaits_download,
|
||||
"Should not expect download for the timeline"
|
||||
);
|
||||
let already_downloading_remote_timeline_download = download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
&sync_queue,
|
||||
Some(¬_expecting_download_remote_timeline),
|
||||
sync_id,
|
||||
SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
|
||||
)
|
||||
.await;
|
||||
assert!(
|
||||
matches!(
|
||||
already_downloading_remote_timeline_download,
|
||||
DownloadedTimeline::Abort,
|
||||
),
|
||||
"Should not allow downloading for remote timeline that does not expect it"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_download_index_part() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_download_index_part")?;
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
HashSet::from([
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?,
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?,
|
||||
]),
|
||||
HashSet::from([RelativePath::new(
|
||||
&local_timeline_path,
|
||||
local_timeline_path.join("three"),
|
||||
)?]),
|
||||
metadata.disk_consistent_lsn(),
|
||||
metadata.to_bytes()?,
|
||||
);
|
||||
|
||||
let local_index_part_path = harness
|
||||
.conf
|
||||
.metadata_path(sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?;
|
||||
let index_part_local_path = PathBuf::from(index_part_remote_id.to_string());
|
||||
fs::create_dir_all(index_part_local_path.parent().unwrap()).await?;
|
||||
fs::write(&index_part_local_path, serde_json::to_vec(&index_part)?).await?;
|
||||
|
||||
let downloaded_index_part = download_index_part(harness.conf, &storage, sync_id).await?;
|
||||
|
||||
assert_eq!(
|
||||
downloaded_index_part, index_part,
|
||||
"Downloaded index part should be the same as the one in storage"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,26 +2,18 @@
|
||||
//! Able to restore itself from the storage index parts that are located in every timeline's remote directory and contain all data about
//! remote timeline layers and their metadata.
|
||||
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Ok};
|
||||
use anyhow::{Context, Ok};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::log::warn;
|
||||
|
||||
use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata};
|
||||
use utils::{
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
|
||||
use super::download::TenantIndexParts;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// A part of the filesystem path, that needs a root to become a path again.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
@@ -30,270 +22,23 @@ pub struct RelativePath(String);
|
||||
|
||||
impl RelativePath {
|
||||
/// Attempts to strip off the base from path, producing a relative path or an error.
|
||||
pub fn new<P: AsRef<Path>>(base: &Path, path: P) -> anyhow::Result<Self> {
|
||||
let path = path.as_ref();
|
||||
let relative = path.strip_prefix(base).with_context(|| {
|
||||
pub fn from_local_path(timeline_path: &Path, path: &Path) -> anyhow::Result<RelativePath> {
|
||||
let relative = path.strip_prefix(timeline_path).with_context(|| {
|
||||
format!(
|
||||
"path '{}' is not relative to base '{}'",
|
||||
path.display(),
|
||||
base.display()
|
||||
timeline_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(RelativePath(relative.to_string_lossy().to_string()))
|
||||
Ok(Self::from_filename(relative))
|
||||
}
|
||||
|
||||
/// Joins the relative path with the base path.
|
||||
fn as_path(&self, base: &Path) -> PathBuf {
|
||||
base.join(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct TenantEntry(HashMap<TimelineId, RemoteTimeline>);
|
||||
|
||||
impl TenantEntry {
|
||||
pub fn has_in_progress_downloads(&self) -> bool {
|
||||
self.values()
|
||||
.any(|remote_timeline| remote_timeline.awaits_download)
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for TenantEntry {
|
||||
type Target = HashMap<TimelineId, RemoteTimeline>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for TenantEntry {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<HashMap<TimelineId, RemoteTimeline>> for TenantEntry {
|
||||
fn from(inner: HashMap<TimelineId, RemoteTimeline>) -> Self {
|
||||
Self(inner)
|
||||
}
|
||||
}
|
||||
|
||||
/// An index to track tenant files that exist on the remote storage.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct RemoteTimelineIndex {
|
||||
entries: HashMap<TenantId, TenantEntry>,
|
||||
}
|
||||
|
||||
/// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
|
||||
#[derive(Default)]
|
||||
pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);
|
||||
|
||||
impl RemoteIndex {
|
||||
pub fn from_parts(
|
||||
conf: &'static PageServerConf,
|
||||
index_parts: HashMap<TenantId, TenantIndexParts>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut entries: HashMap<TenantId, TenantEntry> = HashMap::new();
|
||||
|
||||
for (tenant_id, index_parts) in index_parts {
|
||||
match index_parts {
|
||||
// TODO: should we schedule a retry so it can be recovered? otherwise we can revive it only through detach/attach or pageserver restart
|
||||
TenantIndexParts::Poisoned { missing, ..} => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline(s): {missing:?}"),
|
||||
TenantIndexParts::Present(timelines) => {
|
||||
for (timeline_id, index_part) in timelines {
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
let remote_timeline =
|
||||
RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.context("Failed to restore remote timeline data from index part")?;
|
||||
|
||||
entries
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.insert(timeline_id, remote_timeline);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { entries }))))
|
||||
pub fn from_filename(path: &Path) -> RelativePath {
|
||||
RelativePath(path.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> {
|
||||
self.0.read().await
|
||||
}
|
||||
|
||||
pub async fn write(&self) -> tokio::sync::RwLockWriteGuard<'_, RemoteTimelineIndex> {
|
||||
self.0.write().await
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for RemoteIndex {
|
||||
fn clone(&self) -> Self {
|
||||
Self(Arc::clone(&self.0))
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimelineIndex {
|
||||
pub fn timeline_entry(
|
||||
&self,
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: &TenantTimelineId,
|
||||
) -> Option<&RemoteTimeline> {
|
||||
self.entries.get(tenant_id)?.get(timeline_id)
|
||||
}
|
||||
|
||||
pub fn timeline_entry_mut(
|
||||
&mut self,
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: &TenantTimelineId,
|
||||
) -> Option<&mut RemoteTimeline> {
|
||||
self.entries.get_mut(tenant_id)?.get_mut(timeline_id)
|
||||
}
|
||||
|
||||
pub fn add_timeline_entry(
|
||||
&mut self,
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: TenantTimelineId,
|
||||
entry: RemoteTimeline,
|
||||
) {
|
||||
self.entries
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.insert(timeline_id, entry);
|
||||
}
|
||||
|
||||
pub fn remove_timeline_entry(
|
||||
&mut self,
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: TenantTimelineId,
|
||||
) -> Option<RemoteTimeline> {
|
||||
self.entries
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.remove(&timeline_id)
|
||||
}
|
||||
|
||||
pub fn tenant_entry(&self, tenant_id: &TenantId) -> Option<&TenantEntry> {
|
||||
self.entries.get(tenant_id)
|
||||
}
|
||||
|
||||
pub fn tenant_entry_mut(&mut self, tenant_id: &TenantId) -> Option<&mut TenantEntry> {
|
||||
self.entries.get_mut(tenant_id)
|
||||
}
|
||||
|
||||
pub fn add_tenant_entry(&mut self, tenant_id: TenantId) -> &mut TenantEntry {
|
||||
self.entries.entry(tenant_id).or_default()
|
||||
}
|
||||
|
||||
pub fn remove_tenant_entry(&mut self, tenant_id: &TenantId) -> Option<TenantEntry> {
|
||||
self.entries.remove(tenant_id)
|
||||
}
|
||||
|
||||
pub fn set_awaits_download(
|
||||
&mut self,
|
||||
id: &TenantTimelineId,
|
||||
awaits_download: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
self.timeline_entry_mut(id)
|
||||
.ok_or_else(|| anyhow!("unknown timeline sync {id}"))?
|
||||
.awaits_download = awaits_download;
|
||||
Ok(())
|
||||
}
|
||||
}
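Since `RemoteIndex` is an `Arc<RwLock<RemoteTimelineIndex>>`, callers take a read guard for lookups and a write guard for mutations. A hedged usage sketch (the function name is illustrative, not part of this diff):

async fn example_mark_awaiting_download(
    index: &RemoteIndex,
    id: &TenantTimelineId,
) -> anyhow::Result<()> {
    // Short-lived read guard: check whether the timeline is known at all.
    let known = index.read().await.timeline_entry(id).is_some();
    if known {
        // Separate write guard for the mutation.
        index.write().await.set_awaits_download(id, true)?;
    }
    Ok(())
}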
|
||||
|
||||
/// Restored index part data about the timeline, stored in the remote index.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteTimeline {
|
||||
timeline_layers: HashMap<PathBuf, LayerFileMetadata>,
|
||||
missing_layers: HashMap<PathBuf, LayerFileMetadata>,
|
||||
|
||||
pub metadata: TimelineMetadata,
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
impl RemoteTimeline {
|
||||
pub fn new(metadata: TimelineMetadata) -> Self {
|
||||
Self {
|
||||
timeline_layers: HashMap::default(),
|
||||
missing_layers: HashMap::default(),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_timeline_layers(
|
||||
&mut self,
|
||||
new_layers: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
|
||||
) {
|
||||
self.timeline_layers.extend(new_layers);
|
||||
}
|
||||
|
||||
pub fn add_upload_failures(
|
||||
&mut self,
|
||||
upload_failures: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
|
||||
) {
|
||||
self.missing_layers.extend(upload_failures);
|
||||
}
|
||||
|
||||
pub fn remove_layers(&mut self, layers_to_remove: &HashSet<PathBuf>) {
|
||||
self.timeline_layers
|
||||
.retain(|layer, _| !layers_to_remove.contains(layer));
|
||||
self.missing_layers
|
||||
.retain(|layer, _| !layers_to_remove.contains(layer));
|
||||
}
|
||||
|
||||
/// Lists all layer files in the given remote timeline. Omits the metadata file.
|
||||
pub fn stored_files(&self) -> &HashMap<PathBuf, LayerFileMetadata> {
|
||||
&self.timeline_layers
|
||||
}
|
||||
|
||||
/// Combines metadata gathered or verified during downloading needed layer files to metadata on
|
||||
/// the [`RemoteIndex`], so it can be uploaded later.
|
||||
pub fn merge_metadata_from_downloaded(
|
||||
&mut self,
|
||||
downloaded: &HashMap<PathBuf, LayerFileMetadata>,
|
||||
) {
|
||||
downloaded.iter().for_each(|(path, metadata)| {
|
||||
if let Some(upgraded) = self.timeline_layers.get_mut(path) {
|
||||
upgraded.merge(metadata);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result<Self> {
|
||||
let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?;
|
||||
let default_metadata = &IndexLayerMetadata::default();
|
||||
|
||||
let find_metadata = |key: &RelativePath| -> LayerFileMetadata {
|
||||
index_part
|
||||
.layer_metadata
|
||||
.get(key)
|
||||
.unwrap_or(default_metadata)
|
||||
.into()
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
timeline_layers: index_part
|
||||
.timeline_layers
|
||||
.iter()
|
||||
.map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
|
||||
.collect(),
|
||||
missing_layers: index_part
|
||||
.missing_layers
|
||||
.iter()
|
||||
.map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
|
||||
.collect(),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
})
|
||||
pub fn to_local_path(&self, timeline_path: &Path) -> PathBuf {
|
||||
timeline_path.join(&self.0)
|
||||
}
|
||||
}
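A hedged round-trip sketch for `RelativePath`, assuming the new `from_local_path`/`to_local_path` API shown in this diff; the paths are made up for the example:

fn relative_path_round_trip() -> anyhow::Result<()> {
    let timeline_path = std::path::Path::new("/data/timelines/some_timeline"); // illustrative layout
    let layer = timeline_path.join("some_layer_file");

    // Strip the timeline directory prefix ...
    let relative = RelativePath::from_local_path(timeline_path, &layer)?;
    // ... and joining it back yields the original absolute path.
    assert_eq!(relative.to_local_path(timeline_path), layer);
    Ok(())
}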
|
||||
|
||||
@@ -322,6 +67,10 @@ impl LayerFileMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// This is used to initialize the metadata for remote layers, for which
|
||||
/// the metadata was missing from the index part file.
|
||||
pub const MISSING: Self = LayerFileMetadata { file_size: None };
|
||||
|
||||
pub fn file_size(&self) -> Option<u64> {
|
||||
self.file_size
|
||||
}
|
||||
@@ -335,7 +84,8 @@ impl LayerFileMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// Part of the remote index, corresponding to a certain timeline.
|
||||
/// In-memory representation of an `index_part.json` file
|
||||
///
|
||||
/// Contains the data about all files in the timeline, present remotely and its metadata.
|
||||
///
|
||||
/// This type needs to be backwards and forwards compatible. When changing the fields,
|
||||
@@ -350,13 +100,10 @@ pub struct IndexPart {
|
||||
/// Each of the layers present on remote storage.
|
||||
///
|
||||
/// Additional metadata might exist in `layer_metadata`.
|
||||
timeline_layers: HashSet<RelativePath>,
|
||||
pub timeline_layers: HashSet<RelativePath>,
|
||||
|
||||
/// Currently is not really used in pageserver,
|
||||
/// present to manually keep track of the layer files that pageserver might never retrieve.
|
||||
///
|
||||
/// Such "holes" might appear if any upload task was evicted on an error threshold:
|
||||
/// such a layer will only be rescheduled for upload on pageserver restart.
|
||||
/// FIXME: unused field. This should be removed, but that changes the on-disk format,
|
||||
/// so we need to make sure we're backwards- (and maybe forwards-) compatible
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
|
||||
/// Per layer file metadata, which can be present for a present or missing layer file.
|
||||
@@ -364,10 +111,12 @@ pub struct IndexPart {
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
#[serde(default)]
|
||||
layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
|
||||
pub layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated here for convenience.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
disk_consistent_lsn: Lsn,
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
@@ -379,63 +128,32 @@ impl IndexPart {
|
||||
const LATEST_VERSION: usize = 1;
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn new(
|
||||
timeline_layers: HashSet<RelativePath>,
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
layers_and_metadata: HashMap<RelativePath, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
let mut timeline_layers = HashSet::new();
|
||||
let mut layer_metadata = HashMap::new();
|
||||
|
||||
separate_paths_and_metadata(
|
||||
&layers_and_metadata,
|
||||
&mut timeline_layers,
|
||||
&mut layer_metadata,
|
||||
);
|
||||
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
missing_layers,
|
||||
layer_metadata: HashMap::default(),
|
||||
missing_layers: HashSet::new(),
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn missing_files(&self) -> &HashSet<RelativePath> {
|
||||
&self.missing_layers
|
||||
}
|
||||
|
||||
pub fn from_remote_timeline(
|
||||
timeline_path: &Path,
|
||||
remote_timeline: RemoteTimeline,
|
||||
) -> anyhow::Result<Self> {
|
||||
let metadata_bytes = remote_timeline.metadata.to_bytes()?;
|
||||
|
||||
let mut layer_metadata = HashMap::new();
|
||||
|
||||
let mut missing_layers = HashSet::new();
|
||||
|
||||
separate_paths_and_metadata(
|
||||
timeline_path,
|
||||
&remote_timeline.missing_layers,
|
||||
&mut missing_layers,
|
||||
&mut layer_metadata,
|
||||
)
|
||||
.context("Failed to convert missing layers' paths to relative ones")?;
|
||||
|
||||
let mut timeline_layers = HashSet::new();
|
||||
|
||||
separate_paths_and_metadata(
|
||||
timeline_path,
|
||||
&remote_timeline.timeline_layers,
|
||||
&mut timeline_layers,
|
||||
&mut layer_metadata,
|
||||
)
|
||||
.context("Failed to convert timeline layers' paths to relative ones")?;
|
||||
|
||||
Ok(Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
missing_layers,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(),
|
||||
metadata_bytes,
|
||||
})
|
||||
pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
|
||||
TimelineMetadata::from_bytes(&self.metadata_bytes)
|
||||
}
|
||||
}
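Because `IndexPart` is (de)serialized with serde and `layer_metadata` carries `#[serde(default)]`, older `index_part.json` files without that field still parse. A hedged sketch of reading one back (the byte source and function name are illustrative):

fn load_index_part(bytes: &[u8]) -> anyhow::Result<TimelineMetadata> {
    let index_part: IndexPart = serde_json::from_slice(bytes)?;
    // `layer_metadata` defaults to empty for older on-disk versions, so this
    // also accepts index parts written before that field existed.
    index_part.parse_metadata()
}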
|
||||
|
||||
@@ -454,202 +172,20 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
}
|
||||
|
||||
fn separate_paths_and_metadata(
|
||||
timeline_path: &Path,
|
||||
input: &HashMap<PathBuf, LayerFileMetadata>,
|
||||
input: &HashMap<RelativePath, LayerFileMetadata>,
|
||||
output: &mut HashSet<RelativePath>,
|
||||
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
) {
|
||||
for (path, metadata) in input {
|
||||
let rel_path = RelativePath::new(timeline_path, path)?;
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
|
||||
layer_metadata.insert(rel_path.clone(), metadata);
|
||||
output.insert(rel_path);
|
||||
layer_metadata.insert(path.clone(), metadata);
|
||||
output.insert(path.clone());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
|
||||
#[test]
|
||||
fn index_part_conversion() {
|
||||
let harness = TenantHarness::create("index_part_conversion").unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let metadata = TimelineMetadata::new(
|
||||
Lsn(5).align(),
|
||||
Some(Lsn(4)),
|
||||
None,
|
||||
Lsn(3),
|
||||
Lsn(2),
|
||||
Lsn(1),
|
||||
DEFAULT_PG_VERSION,
|
||||
);
|
||||
let remote_timeline = RemoteTimeline {
|
||||
timeline_layers: HashMap::from([
|
||||
(timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
|
||||
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
|
||||
]),
|
||||
missing_layers: HashMap::from([
|
||||
(timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
|
||||
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
|
||||
]),
|
||||
metadata: metadata.clone(),
|
||||
awaits_download: false,
|
||||
};
|
||||
|
||||
let index_part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline.clone())
|
||||
.expect("Correct remote timeline should be convertible to index part");
|
||||
|
||||
assert_eq!(
|
||||
index_part.timeline_layers.iter().collect::<BTreeSet<_>>(),
|
||||
BTreeSet::from([
|
||||
&RelativePath("layer_1".to_string()),
|
||||
&RelativePath("layer_2".to_string())
|
||||
]),
|
||||
"Index part should have all remote timeline layers after the conversion"
|
||||
);
|
||||
assert_eq!(
|
||||
index_part.missing_layers.iter().collect::<BTreeSet<_>>(),
|
||||
BTreeSet::from([
|
||||
&RelativePath("missing_1".to_string()),
|
||||
&RelativePath("missing_2".to_string())
|
||||
]),
|
||||
"Index part should have all missing remote timeline layers after the conversion"
|
||||
);
|
||||
assert_eq!(
|
||||
index_part.disk_consistent_lsn,
|
||||
metadata.disk_consistent_lsn(),
|
||||
"Index part should have disk consistent lsn from the timeline"
|
||||
);
|
||||
assert_eq!(
|
||||
index_part.metadata_bytes,
|
||||
metadata
|
||||
.to_bytes()
|
||||
.expect("Failed to serialize correct metadata into bytes"),
|
||||
"Index part should have all missing remote timeline layers after the conversion"
|
||||
);
|
||||
|
||||
let restored_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.expect("Correct index part should be convertible to remote timeline");
|
||||
|
||||
let original_metadata = &remote_timeline.metadata;
|
||||
let restored_metadata = &restored_timeline.metadata;
|
||||
// we have to compare the metadata this way, since its header is different after creation and restoration,
|
||||
// but that is now considered ok.
|
||||
assert_eq!(
|
||||
original_metadata.disk_consistent_lsn(),
|
||||
restored_metadata.disk_consistent_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.prev_record_lsn(),
|
||||
restored_metadata.prev_record_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.ancestor_timeline(),
|
||||
restored_metadata.ancestor_timeline(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.ancestor_lsn(),
|
||||
restored_metadata.ancestor_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.latest_gc_cutoff_lsn(),
|
||||
restored_metadata.latest_gc_cutoff_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.initdb_lsn(),
|
||||
restored_metadata.initdb_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
remote_timeline.awaits_download, restored_timeline.awaits_download,
|
||||
"remote timeline -> index part -> remote timeline conversion should not loose download flag"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
remote_timeline
|
||||
.timeline_layers
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
restored_timeline
|
||||
.timeline_layers
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not loose layer data"
|
||||
);
|
||||
assert_eq!(
|
||||
remote_timeline
|
||||
.missing_layers
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
restored_timeline
|
||||
.missing_layers
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not loose missing file data"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn index_part_conversion_negatives() {
|
||||
let harness = TenantHarness::create("index_part_conversion_negatives").unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let metadata = TimelineMetadata::new(
|
||||
Lsn(5).align(),
|
||||
Some(Lsn(4)),
|
||||
None,
|
||||
Lsn(3),
|
||||
Lsn(2),
|
||||
Lsn(1),
|
||||
DEFAULT_PG_VERSION,
|
||||
);
|
||||
|
||||
let conversion_result = IndexPart::from_remote_timeline(
|
||||
&timeline_path,
|
||||
RemoteTimeline {
|
||||
timeline_layers: HashMap::from([
|
||||
(PathBuf::from("bad_path"), LayerFileMetadata::new(1)),
|
||||
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
|
||||
]),
|
||||
missing_layers: HashMap::from([
|
||||
(timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
|
||||
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
|
||||
]),
|
||||
metadata: metadata.clone(),
|
||||
awaits_download: false,
|
||||
},
|
||||
);
|
||||
assert!(conversion_result.is_err(), "Should not be able to convert metadata with layer paths that are not in the timeline directory");
|
||||
|
||||
let conversion_result = IndexPart::from_remote_timeline(
|
||||
&timeline_path,
|
||||
RemoteTimeline {
|
||||
timeline_layers: HashMap::from([
|
||||
(timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
|
||||
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
|
||||
]),
|
||||
missing_layers: HashMap::from([
|
||||
(PathBuf::from("bad_path"), LayerFileMetadata::new(3)),
|
||||
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
|
||||
]),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
},
|
||||
);
|
||||
assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v0_indexpart_is_parsed() {
|
||||
|
||||
@@ -1,36 +1,34 @@
|
||||
//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
|
||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||
|
||||
use std::{fmt::Debug, path::PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use anyhow::{bail, Context};
|
||||
use fail::fail_point;
|
||||
use std::path::Path;
|
||||
use tokio::fs;
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
LayersUpload, SyncData, SyncQueue,
|
||||
};
|
||||
use crate::metrics::NO_LAYERS_UPLOAD;
|
||||
use crate::{config::PageServerConf, storage_sync::SyncTask};
|
||||
use super::index::IndexPart;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::storage_sync::LayerFileMetadata;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part(
|
||||
pub(super) async fn upload_index_part<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_id: TenantTimelineId,
|
||||
index_part: IndexPart,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
index_part: &'a IndexPart,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-upload-index", |_| {
|
||||
bail!("failpoint before-upload-index")
|
||||
});
|
||||
let index_part_bytes = serde_json::to_vec(&index_part)
|
||||
.context("Failed to serialize index part file into bytes")?;
|
||||
let index_part_size = index_part_bytes.len();
|
||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||
|
||||
let index_part_path = conf
|
||||
.metadata_path(sync_id.timeline_id, sync_id.tenant_id)
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
storage
|
||||
.upload_storage_object(
|
||||
@@ -39,441 +37,68 @@ pub(super) async fn upload_index_part(
|
||||
&index_part_path,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to upload index part for '{sync_id}'"))
|
||||
}
|
||||
|
||||
/// Timeline upload result, with extra data, needed for uploading.
|
||||
#[derive(Debug)]
|
||||
pub(super) enum UploadedTimeline {
|
||||
/// Upload failed due to some error, the upload task is rescheduled for another retry.
|
||||
FailedAndRescheduled(anyhow::Error),
|
||||
/// No issues happened during the upload, all task files were put into the remote storage.
|
||||
Successful(SyncData<LayersUpload>),
|
||||
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
|
||||
}
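A hedged usage sketch for the reworked upload_index_part above (not part of the diff; `conf`, `storage`, `tenant_id`, `timeline_id`, and `index_part` are assumed to already be in scope at the call site):
// Hedged sketch, not from the diff: callers now pass the ids separately
// instead of a TenantTimelineId, and lend the IndexPart by reference.
upload_index_part(conf, &storage, tenant_id, timeline_id, &index_part)
    .await
    .context("failed to upload index part")?;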
|
||||
|
||||
/// Attempts to upload given layer files.
|
||||
/// No extra checks for overlapping files are made, and any files that are already present remotely will be overwritten if submitted during the upload.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
pub(super) async fn upload_timeline_layers<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
sync_queue: &SyncQueue,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: TenantTimelineId,
|
||||
mut upload_data: SyncData<LayersUpload>,
|
||||
) -> UploadedTimeline {
|
||||
let upload = &mut upload_data.data;
|
||||
let new_upload_lsn = upload
|
||||
.metadata
|
||||
.as_ref()
|
||||
.map(|meta| meta.disk_consistent_lsn());
|
||||
pub(super) async fn upload_timeline_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
source_path: &Path,
|
||||
known_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-upload-layer", |_| {
|
||||
bail!("failpoint before-upload-layer")
|
||||
});
|
||||
let storage_path = storage.remote_object_id(source_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let already_uploaded_layers = remote_timeline
|
||||
.map(|timeline| {
|
||||
timeline
|
||||
.stored_files()
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<std::collections::HashSet<_>>()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
let source_file = fs::File::open(&source_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a source file for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let layers_to_upload = upload
|
||||
.layers_to_upload
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if !already_uploaded_layers.contains(k) {
|
||||
Some((k.to_owned(), v.to_owned()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the source file metadata for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?
|
||||
.len();
|
||||
|
||||
if layers_to_upload.is_empty() {
|
||||
debug!("No layers to upload after filtering, aborting");
|
||||
NO_LAYERS_UPLOAD
|
||||
.with_label_values(&[
|
||||
&sync_id.tenant_id.to_string(),
|
||||
&sync_id.timeline_id.to_string(),
|
||||
])
|
||||
.inc();
|
||||
return UploadedTimeline::Successful(upload_data);
|
||||
}
|
||||
|
||||
debug!("Layers to upload: {layers_to_upload:?}");
|
||||
info!(
|
||||
"Uploading {} timeline layers, new lsn: {new_upload_lsn:?}",
|
||||
layers_to_upload.len(),
|
||||
);
|
||||
|
||||
let mut upload_tasks = layers_to_upload
|
||||
.into_iter()
|
||||
.map(|(source_path, known_metadata)| async move {
|
||||
let source_file = match fs::File::open(&source_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upen a source file for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
}) {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)),
|
||||
};
|
||||
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the source file metadata for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(UploadError::Other)?
|
||||
.len();
|
||||
|
||||
// FIXME: this looks bad
|
||||
if let Some(metadata_size) = known_metadata.file_size() {
|
||||
if metadata_size != fs_size {
|
||||
return Err(UploadError::Other(anyhow::anyhow!(
|
||||
"File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
// this is a silly state we would like to avoid
|
||||
}
|
||||
|
||||
let fs_size = usize::try_from(fs_size).with_context(|| format!("File {source_path:?} size {fs_size} could not be converted to usize"))
|
||||
.map_err(UploadError::Other)?;
|
||||
|
||||
match storage
|
||||
.upload_storage_object(Box::new(source_file), fs_size, &source_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to upload layer file for {sync_id}"))
|
||||
{
|
||||
Ok(()) => Ok(source_path),
|
||||
Err(e) => Err(UploadError::MissingLocalFile(source_path, e)),
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut errors = Vec::new();
|
||||
while let Some(upload_result) = upload_tasks.next().await {
|
||||
match upload_result {
|
||||
Ok(uploaded_path) => {
|
||||
let metadata = upload
|
||||
.layers_to_upload
|
||||
.remove(&uploaded_path)
|
||||
.expect("metadata should always exist, assuming no double uploads");
|
||||
upload.uploaded_layers.insert(uploaded_path, metadata);
|
||||
}
|
||||
Err(e) => match e {
|
||||
UploadError::Other(e) => {
|
||||
error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
|
||||
errors.push(format!("{e:#}"));
|
||||
}
|
||||
UploadError::MissingLocalFile(source_path, e) => {
|
||||
if source_path.exists() {
|
||||
error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
|
||||
errors.push(format!("{e:#}"));
|
||||
} else {
|
||||
// We have run the upload sync task, but the file we wanted to upload is gone.
|
||||
// This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to
|
||||
// retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and
|
||||
// run compaction/gc tasks, removing redundant files from disk.
|
||||
// It's not good to pause GC/compaction because of those and we would rather skip such uploads.
|
||||
//
|
||||
// Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance).
|
||||
// We don't try to read a more recent version, since it could contain `disk_consistent_lsn` that does not have its upload finished yet.
|
||||
// This will create "missing" layers and make data inconsistent.
|
||||
// Instead, we only update the metadata when it was submitted in an upload task as a checkpoint result.
|
||||
upload.layers_to_upload.remove(&source_path);
|
||||
warn!(
|
||||
"Missing locally a layer file {} scheduled for upload, skipping",
|
||||
source_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
},
|
||||
// FIXME: this looks bad
|
||||
if let Some(metadata_size) = known_metadata.file_size() {
|
||||
if metadata_size != fs_size {
|
||||
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
|
||||
}
|
||||
}
|
||||
|
||||
if errors.is_empty() {
|
||||
info!("Successfully uploaded all layers");
|
||||
UploadedTimeline::Successful(upload_data)
|
||||
} else {
|
||||
debug!("Reenqueuing failed upload task for timeline {sync_id}");
|
||||
upload_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Upload(upload_data));
|
||||
UploadedTimeline::FailedAndRescheduled(anyhow::anyhow!(
|
||||
"Errors appeared during layer uploads: {:?}",
|
||||
errors
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
enum UploadError {
|
||||
MissingLocalFile(PathBuf, anyhow::Error),
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
collections::{BTreeSet, HashSet},
|
||||
num::NonZeroUsize,
|
||||
};
|
||||
|
||||
use remote_storage::{LocalFs, RemoteStorage};
|
||||
use tempfile::tempdir;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
},
|
||||
tenant::harness::{TenantHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::{upload_index_part, *};
|
||||
|
||||
#[tokio::test]
|
||||
async fn regular_layer_upload() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("regular_layer_upload")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a", "b"];
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let mut timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
timeline_upload.metadata = None;
|
||||
|
||||
assert!(
|
||||
local_storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
);
|
||||
|
||||
let upload_result = upload_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(current_retries, timeline_upload.clone()),
|
||||
)
|
||||
.await;
|
||||
|
||||
let upload_data = match upload_result {
|
||||
UploadedTimeline::Successful(upload_data) => upload_data,
|
||||
wrong_result => {
|
||||
panic!("Expected a successful upload for timeline, but got: {wrong_result:?}")
|
||||
}
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
current_retries, upload_data.retries,
|
||||
"On successful upload, retries are not expected to change"
|
||||
);
|
||||
let upload = &upload_data.data;
|
||||
assert!(
|
||||
upload.layers_to_upload.is_empty(),
|
||||
"Successful upload should have no layers left to upload"
|
||||
);
|
||||
assert_eq!(
|
||||
upload
|
||||
.uploaded_layers
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer_file| local_timeline_path.join(layer_file))
|
||||
.collect(),
|
||||
"Successful upload should have all layers uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
upload.metadata, None,
|
||||
"Successful upload without metadata should not have it returned either"
|
||||
);
|
||||
|
||||
let storage_files = local_storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
layer_files.len(),
|
||||
"All layers should be uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
storage_files
|
||||
.into_iter()
|
||||
.map(|storage_path| local_storage.local_path(&storage_path))
|
||||
.collect::<anyhow::Result<BTreeSet<_>>>()?,
|
||||
layer_files
|
||||
.into_iter()
|
||||
.map(|file| local_timeline_path.join(file))
|
||||
.collect(),
|
||||
"Uploaded files should match with the local ones"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario.
|
||||
#[tokio::test]
|
||||
async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("layer_upload_after_local_fs_update")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a1", "b1"];
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let current_retries = 5;
|
||||
let metadata = dummy_metadata(Lsn(0x40));
|
||||
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let layers_to_upload = {
|
||||
let mut layers = layer_files.to_vec();
|
||||
layers.push("layer_to_remove");
|
||||
layers
|
||||
};
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone())
|
||||
.await?;
|
||||
assert!(
|
||||
local_storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
);
|
||||
|
||||
fs::remove_file(local_timeline_path.join("layer_to_remove")).await?;
|
||||
|
||||
let upload_result = upload_timeline_layers(
|
||||
&storage,
|
||||
&sync_queue,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(current_retries, timeline_upload.clone()),
|
||||
)
|
||||
.await;
|
||||
|
||||
let upload_data = match upload_result {
|
||||
UploadedTimeline::Successful(upload_data) => upload_data,
|
||||
wrong_result => panic!(
|
||||
"Expected a successful after local fs upload for timeline, but got: {wrong_result:?}"
|
||||
),
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
current_retries, upload_data.retries,
|
||||
"On successful upload, retries are not expected to change"
|
||||
);
|
||||
let upload = &upload_data.data;
|
||||
assert!(
|
||||
upload.layers_to_upload.is_empty(),
|
||||
"Successful upload should have no layers left to upload, even those that were removed from the local fs"
|
||||
);
|
||||
assert_eq!(
|
||||
upload
|
||||
.uploaded_layers
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer_file| local_timeline_path.join(layer_file))
|
||||
.collect(),
|
||||
"Successful upload should have all layers uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
upload.metadata,
|
||||
Some(metadata),
|
||||
"Successful upload should not change its metadata"
|
||||
);
|
||||
|
||||
let storage_files = local_storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
layer_files.len(),
|
||||
"All layers should be uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
storage_files
|
||||
.into_iter()
|
||||
.map(|storage_path| local_storage.local_path(&storage_path))
|
||||
.collect::<anyhow::Result<BTreeSet<_>>>()?,
|
||||
layer_files
|
||||
.into_iter()
|
||||
.map(|file| local_timeline_path.join(file))
|
||||
.collect(),
|
||||
"Uploaded files should match with the local ones"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_upload_index_part() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_upload_index_part")?;
|
||||
let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = GenericRemoteStorage::new(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let metadata = dummy_metadata(Lsn(0x40));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
HashSet::from([
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?,
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?,
|
||||
]),
|
||||
HashSet::from([RelativePath::new(
|
||||
&local_timeline_path,
|
||||
local_timeline_path.join("three"),
|
||||
)?]),
|
||||
metadata.disk_consistent_lsn(),
|
||||
metadata.to_bytes()?,
|
||||
);
|
||||
|
||||
assert!(
|
||||
local_storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
);
|
||||
upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?;
|
||||
|
||||
let storage_files = local_storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
1,
|
||||
"Should have only the index part file uploaded"
|
||||
);
|
||||
|
||||
let index_part_path = storage_files.first().unwrap();
|
||||
assert_eq!(
|
||||
index_part_path.object_name(),
|
||||
Some(IndexPart::FILE_NAME),
|
||||
"Remote index part should have the correct name"
|
||||
);
|
||||
let remote_index_part: IndexPart = serde_json::from_slice(
|
||||
&fs::read(local_storage.resolve_in_storage(index_part_path)?).await?,
|
||||
)?;
|
||||
assert_eq!(
|
||||
index_part, remote_index_part,
|
||||
"Remote index part should match the local one"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
// this is a silly state we would like to avoid
|
||||
}
|
||||
|
||||
let fs_size = usize::try_from(fs_size).with_context(|| {
|
||||
format!("File {source_path:?} size {fs_size} could not be converted to usize")
|
||||
})?;
|
||||
|
||||
storage
|
||||
.upload(Box::new(source_file), fs_size, &storage_path, None)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload a layer from local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
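A hedged usage sketch for upload_timeline_layer above (not part of the diff; `timeline_dir`, `storage`, the layer file name, and the metadata size are illustrative assumptions):
// Hedged sketch, not from the diff: upload one layer file together with the
// LayerFileMetadata the pageserver recorded for it.
let layer_path = timeline_dir.join("some_layer_file"); // hypothetical file name
let layer_metadata = LayerFileMetadata::new(1024); // assumed to record the expected file size in bytes
upload_timeline_layer(&storage, &layer_path, &layer_metadata).await?;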
|
||||
|
||||
@@ -197,8 +197,8 @@ pub enum TaskKind {
|
||||
// Task that flushes frozen in-memory layers to disk
|
||||
LayerFlushTask,
|
||||
|
||||
// Task that manages the remote upload queue
|
||||
StorageSync,
|
||||
// Task that uploads a file to remote storage
|
||||
RemoteUploadTask,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
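A hedged sketch (not part of the diff) of how the new RemoteUploadTask kind might be spawned through task_mgr, mirroring the GarbageCollector spawn that appears further below; the captured variables are assumed to be owned clones in scope:
// Hedged sketch, not from the diff.
task_mgr::spawn(
    &tokio::runtime::Handle::current(),
    TaskKind::RemoteUploadTask,
    Some(tenant_id),
    Some(timeline_id),
    "remote layer upload",
    false,
    async move {
        upload_timeline_layer(&storage, &layer_path, &layer_metadata).await
    },
);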
|
||||
|
||||
File diff suppressed because it is too large
@@ -30,14 +30,15 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
@@ -191,6 +192,8 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
|
||||
drop_watch: Option<DropNotify>,
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -327,10 +330,13 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner
|
||||
.drop_watch
|
||||
.get_or_insert_with(|| DropNotify::new())
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
@@ -551,6 +557,7 @@ impl DeltaLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -578,6 +585,7 @@ impl DeltaLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -743,6 +751,7 @@ impl DeltaLayerWriterInner {
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
drop_watch: None,
|
||||
}),
|
||||
};
|
||||
|
||||
|
||||
@@ -26,7 +26,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{ImageFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
@@ -34,7 +36,6 @@ use bytes::Bytes;
|
||||
use hex;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
@@ -117,6 +118,8 @@ pub struct ImageLayerInner {
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
|
||||
drop_watch: Option<DropNotify>,
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
@@ -184,10 +187,13 @@ impl Layer for ImageLayer {
|
||||
todo!();
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner
|
||||
.drop_watch
|
||||
.get_or_insert_with(|| DropNotify::new())
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
@@ -351,6 +357,7 @@ impl ImageLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -378,6 +385,7 @@ impl ImageLayer {
|
||||
loaded: false,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -532,6 +540,7 @@ impl ImageLayerWriterInner {
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
drop_watch: None,
|
||||
}),
|
||||
};
|
||||
|
||||
|
||||
@@ -10,9 +10,11 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::walrecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{ensure, Result};
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
@@ -172,8 +174,8 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
/// Nothing to do here. When you drop the last reference to the layer, it will
|
||||
/// be deallocated.
|
||||
fn delete(&self) -> Result<()> {
|
||||
bail!("can't delete an InMemoryLayer")
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
panic!("can't delete an InMemoryLayer")
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
|
||||
@@ -242,7 +242,8 @@ pub fn save_metadata(
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
OpenOptions::new().write(true).create_new(first_save),
|
||||
)?;
|
||||
)
|
||||
.context("open_with_options")?;
|
||||
|
||||
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
|
||||
|
||||
@@ -264,6 +265,26 @@ pub fn save_metadata(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn load_metadata(
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_id: TenantId,
) -> anyhow::Result<TimelineMetadata> {
    let metadata_path = conf.metadata_path(timeline_id, tenant_id);
    let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
        format!(
            "Failed to read metadata bytes from path {}",
            metadata_path.display()
        )
    })?;
    TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
        format!(
            "Failed to parse metadata bytes from path {}",
            metadata_path.display()
        )
    })
}
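A hedged usage sketch for load_metadata above (not part of the diff; `conf`, `timeline_id`, and `tenant_id` are assumed to be in scope):
// Hedged sketch, not from the diff: read the on-disk metadata back and log its LSN.
let metadata = load_metadata(conf, timeline_id, tenant_id)?;
tracing::info!("disk_consistent_lsn = {:?}", metadata.disk_consistent_lsn());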
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -145,9 +145,31 @@ pub trait Layer: Send + Sync {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete(&self) -> Result<()>;
|
||||
fn drop_notify(&self) -> DropNotify;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool) -> Result<()>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
pub struct DropNotify(std::sync::Arc<tokio::sync::Notify>);

impl DropNotify {
    pub fn new() -> Self {
        DropNotify(std::sync::Arc::new(tokio::sync::Notify::new()))
    }

    pub async fn dropped(&self) {
        self.0.notified().await
    }

    pub fn notify_waiters(&self) {
        self.0.notify_waiters();
    }
}

impl Drop for DropNotify {
    fn drop(&mut self) {
        self.0.notify_waiters();
    }
}
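A hedged sketch (not part of the diff) of how a caller might use DropNotify to defer deleting a layer file until the in-memory layer object is gone; the helper name and its usage are hypothetical:
// Hedged sketch, not from the diff. Spawned e.g. as
// tokio::spawn(remove_file_after_drop(layer.drop_notify(), path));
async fn remove_file_after_drop(drop_notify: DropNotify, layer_path: std::path::PathBuf) {
    // Resolves once notify_waiters() fires, e.g. when the owning layer is dropped.
    drop_notify.dropped().await;
    if let Err(e) = tokio::fs::remove_file(&layer_path).await {
        tracing::warn!("failed to remove layer file {}: {e:#}", layer_path.display());
    }
}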
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,10 +1,10 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::collections::hash_map;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -13,18 +13,11 @@ use tracing::*;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::http::models::TenantInfo;
|
||||
use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
|
||||
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::{
|
||||
ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState,
|
||||
};
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{is_temporary, is_uninit_mark, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::{self, path_with_suffix_extension};
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
mod tenants_state {
|
||||
@@ -59,153 +52,112 @@ mod tenants_state {
|
||||
pub fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<RemoteIndex> {
|
||||
) -> anyhow::Result<()> {
|
||||
let _entered = info_span!("init_tenant_mgr").entered();
|
||||
|
||||
let local_tenant_files = local_tenant_timeline_files(conf)
|
||||
.context("Failed to collect local tenant timeline files")?;
|
||||
|
||||
let (remote_index, tenants_to_attach) = if let Some(storage) = remote_storage {
|
||||
let storage_config = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.expect("remote storage without config");
|
||||
let mut broken_tenants = HashMap::new();
|
||||
let mut ready_tenants = HashMap::new();
|
||||
for (tenant_id, tenant_attach_data) in local_tenant_files.into_iter() {
|
||||
match tenant_attach_data {
|
||||
TenantAttachData::Ready(t) => {
|
||||
ready_tenants.insert(tenant_id, t);
|
||||
}
|
||||
TenantAttachData::Broken(e) => {
|
||||
broken_tenants.insert(tenant_id, TenantAttachData::Broken(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
let SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
} = storage_sync::spawn_storage_sync_task(
|
||||
conf,
|
||||
ready_tenants,
|
||||
storage,
|
||||
storage_config.max_concurrent_syncs,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
.context("Failed to spawn the storage sync thread")?;
|
||||
|
||||
let n = local_timeline_init_statuses.0.len();
|
||||
let mut synced_timelines = local_timeline_init_statuses.0.into_iter().fold(
|
||||
HashMap::<TenantId, TenantAttachData>::with_capacity(n),
|
||||
|mut new_values, (tenant_id, old_values)| {
|
||||
let new_timeline_values = new_values
|
||||
.entry(tenant_id)
|
||||
.or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
|
||||
if let TenantAttachData::Ready(t) = new_timeline_values {
|
||||
for (timeline_id, old_value) in old_values {
|
||||
if let LocalTimelineInitStatus::LocallyComplete(metadata) = old_value {
|
||||
t.insert(timeline_id, TimelineLocalFiles::ready(metadata));
|
||||
// Scan local filesystem for attached tenants
|
||||
let mut number_of_tenants = 0;
|
||||
let tenants_dir = conf.tenants_path();
|
||||
for dir_entry in std::fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &dir_entry {
|
||||
Ok(dir_entry) => {
|
||||
let tenant_dir_path = dir_entry.path();
|
||||
if crate::is_temporary(&tenant_dir_path) {
|
||||
info!(
|
||||
"Found temporary tenant directory, removing: {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
} else {
|
||||
match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) {
|
||||
Ok(Some(tenant)) => {
|
||||
tenants_state::write_tenants().insert(tenant.tenant_id(), tenant);
|
||||
number_of_tenants += 1;
|
||||
}
|
||||
Ok(None) => {
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove empty tenant directory '{}': {e:#}",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
dir_entry,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
new_values
|
||||
},
|
||||
);
|
||||
synced_timelines.extend(broken_tenants);
|
||||
|
||||
(remote_index, synced_timelines)
|
||||
} else {
|
||||
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
|
||||
(RemoteIndex::default(), local_tenant_files)
|
||||
};
|
||||
attach_local_tenants(conf, &remote_index, tenants_to_attach);
|
||||
|
||||
Ok(remote_index)
|
||||
}
|
||||
|
||||
/// Reads local files to load tenants and their timelines given into pageserver's memory.
/// Ignores other timelines that might be present for the tenant, but were not passed as a parameter.
/// Attempts to load as many entities as possible: if a certain timeline fails during the load, the tenant is marked as "Broken",
/// and the load continues.
///
/// For a successful tenant attach, it first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully.
/// If either of the conditions fails, the tenant will be added to memory with [`TenantState::Broken`] state; otherwise we start to load its timelines.
/// Alternatively, a tenant is considered loaded successfully if it's already in pageserver's memory (i.e. was loaded before).
///
/// Attach happens on startup and on successful timeline downloads
/// (some subset of timeline files, always including its metadata, after which the new one needs to be registered).
|
||||
pub fn attach_local_tenants(
|
||||
conf: &'static PageServerConf,
|
||||
remote_index: &RemoteIndex,
|
||||
tenants_to_attach: HashMap<TenantId, TenantAttachData>,
|
||||
) {
|
||||
let _entered = info_span!("attach_local_tenants").entered();
|
||||
let number_of_tenants = tenants_to_attach.len();
|
||||
|
||||
for (tenant_id, local_timelines) in tenants_to_attach {
|
||||
let mut tenants_accessor = tenants_state::write_tenants();
|
||||
let tenant = match tenants_accessor.entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(o) => {
|
||||
info!("Tenant {tenant_id} was found in pageserver's memory");
|
||||
Arc::clone(o.get())
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
info!("Tenant {tenant_id} was not found in pageserver's memory, loading it");
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
conf,
|
||||
TenantConfOpt::default(),
|
||||
Arc::new(PostgresRedoManager::new(conf, tenant_id)),
|
||||
tenant_id,
|
||||
remote_index.clone(),
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
match local_timelines {
|
||||
TenantAttachData::Broken(_) => {
|
||||
tenant.set_state(TenantState::Broken);
|
||||
}
|
||||
TenantAttachData::Ready(_) => {
|
||||
match Tenant::load_tenant_config(conf, tenant_id) {
|
||||
Ok(tenant_conf) => {
|
||||
tenant.update_tenant_config(tenant_conf);
|
||||
tenant.activate(false);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
|
||||
tenant.set_state(TenantState::Broken);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
v.insert(Arc::clone(&tenant));
|
||||
tenant
|
||||
}
|
||||
};
|
||||
drop(tenants_accessor);
|
||||
match local_timelines {
|
||||
TenantAttachData::Broken(e) => warn!("{}", e),
|
||||
TenantAttachData::Ready(ref timelines) => {
|
||||
info!("Attaching {} timelines for {tenant_id}", timelines.len());
|
||||
debug!("Timelines to attach: {local_timelines:?}");
|
||||
let has_timelines = !timelines.is_empty();
|
||||
let timelines_to_attach = timelines
|
||||
.iter()
|
||||
.map(|(&k, v)| (k, v.metadata().to_owned()))
|
||||
.collect();
|
||||
match tenant.init_attach_timelines(timelines_to_attach) {
|
||||
Ok(()) => {
|
||||
info!("successfully loaded local timelines for tenant {tenant_id}");
|
||||
tenant.activate(has_timelines);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to attach tenant timelines: {e:?}");
|
||||
tenant.set_state(TenantState::Broken);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// On error, print it, but continue with the other tenants. If we error out
|
||||
// here, the pageserver startup fails altogether, causing outage for *all*
|
||||
// tenants. That seems worse.
|
||||
error!(
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
|
||||
dir_entry,
|
||||
tenants_dir.display(),
|
||||
e,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Processed {number_of_tenants} local tenants during attach")
|
||||
info!("Processed {number_of_tenants} local tenants at startup");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_local_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
if !tenant_path.is_dir() {
|
||||
anyhow::bail!("tenant_path is not a directory: {tenant_path:?}")
|
||||
}
|
||||
|
||||
let is_empty = tenant_path
|
||||
.is_empty_dir()
|
||||
.context("check whether tenant_path is an empty dir")?;
|
||||
if is_empty {
|
||||
info!("skipping empty tenant directory {tenant_path:?}");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
Tenant::spawn_attach(conf, tenant_id, &remote_storage)
|
||||
} else {
|
||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||
Tenant::create_broken_tenant(conf, tenant_id)
|
||||
}
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(conf, tenant_id, remote_storage)
|
||||
};
|
||||
Ok(Some(tenant))
|
||||
}
|
||||
|
||||
///
|
||||
@@ -218,7 +170,7 @@ pub async fn shutdown_all_tenants() {
|
||||
for (_, tenant) in m.drain() {
|
||||
if tenant.is_active() {
|
||||
// updates tenant state, forbidding new GC and compaction iterations from starting
|
||||
tenant.set_state(TenantState::Paused);
|
||||
tenant.set_paused();
|
||||
tenants_to_shut_down.push(tenant)
|
||||
}
|
||||
}
|
||||
@@ -247,156 +199,41 @@ pub async fn shutdown_all_tenants() {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_tenant_files(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
anyhow::ensure!(
|
||||
!target_tenant_directory.exists(),
|
||||
"cannot create new tenant repo: '{tenant_id}' directory already exists",
|
||||
);
|
||||
|
||||
let temporary_tenant_dir =
|
||||
path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX);
|
||||
debug!(
|
||||
"Creating temporary directory structure in {}",
|
||||
temporary_tenant_dir.display()
|
||||
);
|
||||
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
|
||||
format!(
|
||||
"could not create temporary tenant directory {}",
|
||||
temporary_tenant_dir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let creation_result = try_create_target_tenant_dir(
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
&temporary_tenant_dir,
|
||||
&target_tenant_directory,
|
||||
);
|
||||
|
||||
if creation_result.is_err() {
|
||||
error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
|
||||
if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
|
||||
error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
|
||||
} else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
|
||||
error!(
|
||||
"Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
creation_result
|
||||
}
|
||||
|
||||
fn try_create_target_tenant_dir(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
temporary_tenant_dir: &Path,
|
||||
target_tenant_directory: &Path,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let temporary_tenant_timelines_dir = rebase_directory(
|
||||
&conf.timelines_path(&tenant_id),
|
||||
target_tenant_directory,
|
||||
temporary_tenant_dir,
|
||||
)
|
||||
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?;
|
||||
let temporary_tenant_config_path = rebase_directory(
|
||||
&conf.tenant_config_path(tenant_id),
|
||||
target_tenant_directory,
|
||||
temporary_tenant_dir,
|
||||
)
|
||||
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?;
|
||||
|
||||
Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"Failed to write tenant {} config to {}",
|
||||
tenant_id,
|
||||
temporary_tenant_config_path.display()
|
||||
)
|
||||
},
|
||||
)?;
|
||||
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
|
||||
format!(
|
||||
"could not create tenant {} temporary timelines directory {}",
|
||||
tenant_id,
|
||||
temporary_tenant_timelines_dir.display()
|
||||
)
|
||||
})?;
|
||||
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
|
||||
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
|
||||
});
|
||||
|
||||
fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| {
|
||||
format!(
|
||||
"failed to move tenant {} temporary directory {} into the permanent one {}",
|
||||
tenant_id,
|
||||
temporary_tenant_dir.display(),
|
||||
target_tenant_directory.display()
|
||||
)
|
||||
})?;
|
||||
let target_dir_parent = target_tenant_directory.parent().with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant {} dir parent for {}",
|
||||
tenant_id,
|
||||
target_tenant_directory.display()
|
||||
)
|
||||
})?;
|
||||
crashsafe::fsync(target_dir_parent).with_context(|| {
|
||||
format!(
|
||||
"Failed to fsync renamed directory's parent {} for tenant {}",
|
||||
target_dir_parent.display(),
|
||||
tenant_id,
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result<PathBuf> {
|
||||
let relative_path = original_path.strip_prefix(base).with_context(|| {
|
||||
format!(
|
||||
"Failed to strip base prefix '{}' off path '{}'",
|
||||
base.display(),
|
||||
original_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(new_base.join(relative_path))
|
||||
}
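A hedged example of what rebase_directory computes (not part of the diff; the paths and the temporary-directory suffix are illustrative):
// Hedged example, not from the diff: rebase the final tenant config path onto
// the temporary tenant directory (".___temp" is an illustrative suffix).
let tmp_config_path = rebase_directory(
    Path::new("/data/tenants/1234/config"),
    Path::new("/data/tenants/1234"),
    Path::new("/data/tenants/1234.___temp"),
)?;
assert_eq!(tmp_config_path, PathBuf::from("/data/tenants/1234.___temp/config"));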
|
||||
|
||||
pub fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
remote_index: RemoteIndex,
|
||||
) -> anyhow::Result<Option<TenantId>> {
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
debug!("tenant {tenant_id} already exists");
|
||||
Ok(None)
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
create_tenant_files(conf, tenant_conf, tenant_id)?;
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
conf,
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
remote_index,
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
tenant.activate(false);
|
||||
v.insert(tenant);
|
||||
Ok(Some(tenant_id))
|
||||
// Hold the write_tenants() lock, since all of this is local IO.
|
||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
|
||||
let tenant_directory =
|
||||
super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
|
||||
let created_tenant = load_local_tenant(conf, &tenant_directory, remote_storage)?;
|
||||
match created_tenant {
|
||||
None => {
|
||||
// We get None in case the directory is empty.
|
||||
// This shouldn't happen here, because we just created the directory.
|
||||
// So, skip any cleanup work for now, we don't know how we reached this state.
|
||||
anyhow::bail!("we just created the tenant directory, it can't be empty");
|
||||
}
|
||||
Some(tenant) => {
|
||||
anyhow::ensure!(
|
||||
tenant_id == tenant.tenant_id(),
|
||||
"loaded created tenant has unexpected tenant id (expect {} != actual {})",
|
||||
tenant_id,
|
||||
tenant.tenant_id()
|
||||
);
|
||||
v.insert(Arc::clone(&tenant));
|
||||
Ok(Some(tenant))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -420,7 +257,10 @@ pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<
|
||||
.get(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
||||
if active_only && !tenant.is_active() {
|
||||
anyhow::bail!("Tenant {tenant_id} is not active")
|
||||
anyhow::bail!(
|
||||
"Tenant {tenant_id} is not active. Current state: {:?}",
|
||||
tenant.current_state()
|
||||
)
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
@@ -451,9 +291,6 @@ pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> an
|
||||
match get_tenant(tenant_id, true) {
|
||||
Ok(tenant) => {
|
||||
tenant.delete_timeline(timeline_id)?;
|
||||
if tenant.list_timelines().is_empty() {
|
||||
tenant.activate(false);
|
||||
}
|
||||
}
|
||||
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
||||
}
|
||||
@@ -473,7 +310,7 @@ pub async fn detach_tenant(
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
};
|
||||
|
||||
tenant.set_state(TenantState::Paused);
|
||||
tenant.set_paused();
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
|
||||
@@ -497,358 +334,94 @@ pub async fn detach_tenant(
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
|
||||
pub fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
tenants_state::read_tenants()
|
||||
.iter()
|
||||
.map(|(id, tenant)| {
|
||||
let has_in_progress_downloads = remote_index
|
||||
.tenant_entry(id)
|
||||
.map(|entry| entry.has_in_progress_downloads());
|
||||
|
||||
// TODO this is not correct when we might have remote storage sync disabled:
|
||||
// we keep `RemoteTimelineIndex` in memory anyway for simplicity and this error message is printed still
|
||||
if has_in_progress_downloads.is_none() {
|
||||
error!("timeline is not found in remote index while it is present in the tenants registry")
|
||||
}
|
||||
|
||||
TenantInfo {
|
||||
id: *id,
|
||||
state: tenant.current_state(),
|
||||
current_physical_size: None,
|
||||
has_in_progress_downloads,
|
||||
}
|
||||
})
|
||||
.map(|(id, tenant)| (*id, tenant.current_state()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TenantAttachData {
|
||||
Ready(HashMap<TimelineId, TimelineLocalFiles>),
|
||||
Broken(anyhow::Error),
|
||||
}
|
||||
/// Attempts to collect information about all tenants and timelines existing on the local FS.
/// If it finds any, deletes all temporary files and directories created before. Also removes empty directories
/// that may appear due to such removals.
/// Does not fail on particular timeline or tenant collection errors, instead logging them and ignoring the entities.
|
||||
fn local_tenant_timeline_files(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<HashMap<TenantId, TenantAttachData>> {
|
||||
let _entered = info_span!("local_tenant_timeline_files").entered();
|
||||
|
||||
let mut local_tenant_timeline_files = HashMap::new();
|
||||
let tenants_dir = config.tenants_path();
|
||||
for tenants_dir_entry in fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &tenants_dir_entry {
|
||||
Ok(tenants_dir_entry) => {
|
||||
let tenant_dir_path = tenants_dir_entry.path();
|
||||
if is_temporary(&tenant_dir_path) {
|
||||
info!(
|
||||
"Found temporary tenant directory, removing: {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
if let Err(e) = fs::remove_dir_all(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
} else {
|
||||
match collect_timelines_for_tenant(config, &tenant_dir_path) {
|
||||
Ok((tenant_id, TenantAttachData::Broken(e))) => {
|
||||
local_tenant_timeline_files.entry(tenant_id).or_insert(TenantAttachData::Broken(e));
|
||||
},
|
||||
Ok((tenant_id, TenantAttachData::Ready(collected_files))) => {
|
||||
if collected_files.is_empty() {
|
||||
match remove_if_empty(&tenant_dir_path) {
|
||||
Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()),
|
||||
Ok(false) => {
|
||||
// insert empty timeline entry: it has some non-temporary files inside that we cannot remove
|
||||
// so make it obvious to HTTP API callers that something exists there, and try to load the tenant
|
||||
let _ = local_tenant_timeline_files.entry(tenant_id).or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
|
||||
},
|
||||
Err(e) => error!("Failed to remove empty tenant directory: {e:?}"),
|
||||
}
|
||||
} else {
|
||||
match local_tenant_timeline_files.entry(tenant_id) {
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
entry.insert(TenantAttachData::Ready(collected_files));
|
||||
}
|
||||
hash_map::Entry::Occupied(entry) =>{
|
||||
if let TenantAttachData::Ready(old_timelines) = entry.into_mut() {
|
||||
old_timelines.extend(collected_files);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
Err(e) => error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
tenants_dir_entry,
|
||||
e
|
||||
),
|
||||
}
|
||||
/// Execute Attach mgmt API command.
|
||||
///
|
||||
/// Downloading all the tenant data is performed in the background; this merely
|
||||
/// spawns the background task and returns quickly.
|
||||
pub async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(e) => {
|
||||
// Cannot attach a tenant that already exists. The error message depends on
|
||||
// the state it's in.
|
||||
match e.get().current_state() {
|
||||
TenantState::Attaching => {
|
||||
anyhow::bail!("tenant {tenant_id} attach is already in progress")
|
||||
}
|
||||
current_state => {
|
||||
anyhow::bail!("tenant already exists, current state: {current_state:?}")
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
|
||||
tenants_dir_entry,
|
||||
tenants_dir.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
|
||||
v.insert(tenant);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Collected files for {} tenants",
|
||||
local_tenant_timeline_files.len(),
|
||||
);
|
||||
Ok(local_tenant_timeline_files)
|
||||
}
|
||||
|
||||
fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result<bool> {
|
||||
let directory_is_empty = tenant_dir_path
|
||||
.read_dir()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to read directory '{}' contents",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
})?
|
||||
.next()
|
||||
.is_none();
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
if directory_is_empty {
|
||||
fs::remove_dir_all(&tenant_dir_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove empty directory '{}'",
|
||||
tenant_dir_path.display(),
|
||||
)
|
||||
})?;
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
||||
let guard = tenants_state::read_tenants();
|
||||
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
fn collect_timelines_for_tenant(
|
||||
config: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
) -> anyhow::Result<(TenantId, TenantAttachData)> {
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
let timelines_dir = config.timelines_path(&tenant_id);
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
if !timelines_dir.as_path().is_dir() {
|
||||
return Ok((
|
||||
tenant_id,
|
||||
TenantAttachData::Broken(anyhow::anyhow!(
|
||||
"Tenant {} has no timelines directory at {}",
|
||||
tenant_id,
|
||||
timelines_dir.display()
|
||||
)),
|
||||
));
|
||||
}
|
||||
|
||||
let mut tenant_timelines = HashMap::new();
for timelines_dir_entry in fs::read_dir(&timelines_dir)
.with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))?
{
match timelines_dir_entry {
Ok(timelines_dir_entry) => {
let timeline_dir = timelines_dir_entry.path();
if is_temporary(&timeline_dir) {
info!(
"Found temporary timeline directory, removing: {}",
timeline_dir.display()
);
if let Err(e) = fs::remove_dir_all(&timeline_dir) {
error!(
"Failed to remove temporary directory '{}': {:?}",
timeline_dir.display(),
e
);
}
} else if is_uninit_mark(&timeline_dir) {
let timeline_uninit_mark_file = &timeline_dir;
info!(
"Found an uninit mark file {}, removing the timeline and its uninit mark",
timeline_uninit_mark_file.display()
);
let timeline_id = timeline_uninit_mark_file
.file_stem()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
timeline_uninit_mark_file.display()
)
})?;
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
if let Err(e) =
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else {
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline dir name {}",
timeline_dir.display()
)
})?;
let timeline_uninit_mark_file =
config.timeline_uninit_mark_file_path(tenant_id, timeline_id);
if timeline_uninit_mark_file.exists() {
info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark");
if let Err(e) = remove_timeline_and_uninit_mark(
&timeline_dir,
&timeline_uninit_mark_file,
) {
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else {
match collect_timeline_files(&timeline_dir) {
Ok((metadata, timeline_files)) => {
tenant_timelines.insert(
timeline_id,
TimelineLocalFiles::collected(metadata, timeline_files),
);
}
Err(e) => {
error!(
"Failed to process timeline dir contents at '{}', reason: {:?}",
timeline_dir.display(),
e
);
match remove_if_empty(&timeline_dir) {
Ok(true) => info!(
"Removed empty timeline directory {}",
timeline_dir.display()
),
Ok(false) => (),
Err(e) => {
error!("Failed to remove empty timeline directory: {e:?}")
}
}
}
}
}
}
}
Err(e) => {
error!("Failed to list timelines for entry tenant {tenant_id}, reason: {e:?}")
// Run in task_mgr to avoid race with detach operation
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::GarbageCollector,
Some(tenant_id),
Some(timeline_id),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send gc result: {result:?}"),
}
Ok(())
}
}

if tenant_timelines.is_empty() {
// this is normal, we've removed all broken, empty and temporary timeline dirs
// but should allow the tenant to stay functional and allow creating new timelines
// on a restart, we require tenants to have the timelines dir, so leave it on disk
debug!("Tenant {tenant_id} has no timelines loaded");
}

Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
}

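// Cleanup for a half-created timeline: delete the timeline directory first (it may
// already be absent), then its uninit mark file.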
fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
fs::remove_dir_all(&timeline_dir)
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// we can leave the uninit mark without a timeline dir,
// just remove the mark then
Ok(())
} else {
Err(e)
}
})
.with_context(|| {
format!(
"Failed to remove uninit marked timeline directory {}",
timeline_dir.display()
)
})?;
fs::remove_file(&uninit_mark).with_context(|| {
format!(
"Failed to remove timeline uninit mark file {}",
uninit_mark.display()
)
})?;

Ok(())
}

// discover timeline files and extract timeline metadata
// NOTE: ephemeral files are excluded from the list
fn collect_timeline_files(
timeline_dir: &Path,
) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> {
let mut timeline_files = HashMap::new();
let mut timeline_metadata_path = None;

let timeline_dir_entries =
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
for entry in timeline_dir_entries {
let entry_path = entry.context("Failed to list timeline dir entry")?.path();
let metadata = entry_path.metadata()?;

if metadata.is_file() {
if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
timeline_metadata_path = Some(entry_path);
} else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
debug!("skipping ephemeral file {}", entry_path.display());
continue;
} else if is_temporary(&entry_path) {
info!("removing temp timeline file at {}", entry_path.display());
fs::remove_file(&entry_path).with_context(|| {
format!(
"failed to remove temp download file at {}",
entry_path.display()
)
})?;
} else {
let layer_metadata = LayerFileMetadata::new(metadata.len());
timeline_files.insert(entry_path, layer_metadata);
}
}
}

// FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed
// then attach is lost. There would be no retries for that,
// initial collect will fail because there is no metadata.
// We either need to start download if we see empty dir after restart or attach caller should
// be aware of that and retry attach if awaits_download for timeline switched from true to false
// but timeline didn't appear locally.
// Check what happens with remote index in that case.
let timeline_metadata_path = match timeline_metadata_path {
Some(path) => path,
None => anyhow::bail!("No metadata file found in the timeline directory"),
};
let metadata = TimelineMetadata::from_bytes(
&fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
)
.context("Failed to parse timeline metadata file bytes")?;

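// A timeline with no ancestor must have at least one local layer file; otherwise there
// is nothing to reconstruct pages from and the load is treated as broken.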
anyhow::ensure!(
metadata.ancestor_timeline().is_some() || !timeline_files.is_empty(),
"Timeline has no ancestor and no layer files"
);

Ok((metadata, timeline_files))
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);

Ok(wait_task_done)
}

@@ -6,12 +6,27 @@ use std::sync::Arc;
use std::time::Duration;

use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::task_mgr;
use crate::tenant::{Tenant, TenantState};
use crate::tenant_mgr;
use tracing::*;
use utils::id::TenantId;

#[cfg(test)]
pub fn start_background_loops(tenant_id: TenantId) {
// Do not start the background loops.
// Right now, in tests, Tenant is only created by TenantHarness,
// and all tests that use TenantHarness assume that there are
// no background loops that do compaction and GC. If they want it
// to happen, they call the corresponding functions directly.
//
// XXX replace this with a TenantConfigRequest flag that is
// also usable by tests, see https://github.com/neondatabase/neon/issues/2917
}

#[cfg(not(test))]
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
#[cfg(not(test))]
pub fn start_background_loops(tenant_id: TenantId) {
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
@@ -69,7 +84,7 @@ async fn compaction_loop(tenant_id: TenantId) {

// Run compaction
let mut sleep_duration = tenant.get_compaction_period();
if let Err(e) = tenant.compaction_iteration() {
if let Err(e) = tenant.compaction_iteration().await {
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
}
@@ -154,7 +169,7 @@ async fn wait_for_active_tenant(
};

// if the tenant has a proper status already, no need to wait for anything
if tenant.should_run_tasks() {
if tenant.current_state() == TenantState::Active {
ControlFlow::Continue(tenant)
} else {
let mut tenant_state_updates = tenant.subscribe_for_state_updates();
@@ -163,14 +178,12 @@ async fn wait_for_active_tenant(
Ok(()) => {
let new_state = *tenant_state_updates.borrow();
match new_state {
TenantState::Active {
background_jobs_running: true,
} => {
debug!("Tenant state changed to active with background jobs enabled, continuing the task loop");
TenantState::Active => {
debug!("Tenant state changed to active, continuing the task loop");
return ControlFlow::Continue(tenant);
}
state => {
debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}");
debug!("Not running the task loop, tenant is not active: {state:?}");
continue;
}
}

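For reference, the subscribe-and-wait pattern above amounts to watching a state channel until it reports Active. Below is a minimal standalone sketch of that pattern using a tokio watch channel; the names and the State enum are illustrative only, not the pageserver's actual types.

use tokio::sync::watch;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum State {
    Loading,
    Active,
    Broken,
}

// Check the current value first, then await change notifications until the
// state becomes Active; give up if it becomes Broken or the sender is gone.
async fn wait_until_active(mut rx: watch::Receiver<State>) -> Result<(), &'static str> {
    loop {
        match *rx.borrow() {
            State::Active => return Ok(()),
            State::Broken => return Err("tenant will not become active"),
            State::Loading => {}
        }
        if rx.changed().await.is_err() {
            return Err("state channel closed");
        }
    }
}
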
@@ -1091,9 +1091,9 @@ mod tests {
|
||||
Ok(walingest)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_relsize")?.load();
|
||||
#[tokio::test]
|
||||
async fn test_relsize() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_relsize")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
@@ -1219,9 +1219,9 @@ mod tests {
|
||||
|
||||
// Test what happens if we dropped a relation
|
||||
// and then created it again within the same layer.
|
||||
#[test]
|
||||
fn test_drop_extend() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_drop_extend")?.load();
|
||||
#[tokio::test]
|
||||
async fn test_drop_extend() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_drop_extend")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
@@ -1259,9 +1259,9 @@ mod tests {
|
||||
// Test what happens if we truncated a relation
|
||||
// so that one of its segments was dropped
|
||||
// and then extended it again within the same layer.
|
||||
#[test]
|
||||
fn test_truncate_extend() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_truncate_extend")?.load();
|
||||
#[tokio::test]
|
||||
async fn test_truncate_extend() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_truncate_extend")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
@@ -1347,9 +1347,9 @@ mod tests {
|
||||
|
||||
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_large_rel")?.load();
|
||||
#[tokio::test]
|
||||
async fn test_large_rel() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_large_rel")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
|
||||
@@ -870,10 +870,10 @@ mod tests {
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use url::Host;
|
||||
|
||||
#[test]
|
||||
fn no_connection_no_candidate() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn no_connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("no_connection_no_candidate")?;
|
||||
let mut state = dummy_state(&harness);
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
|
||||
@@ -964,7 +964,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("connection_no_candidate")?;
|
||||
let mut state = dummy_state(&harness);
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
let connected_sk_id = NodeId(0);
|
||||
@@ -1058,10 +1058,10 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_connection_candidate() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn no_connection_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("no_connection_candidate")?;
|
||||
let mut state = dummy_state(&harness);
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
state.wal_connection = None;
|
||||
@@ -1176,7 +1176,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("candidate_with_many_connection_failures")?;
|
||||
let mut state = dummy_state(&harness);
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
@@ -1244,7 +1244,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
|
||||
let mut state = dummy_state(&harness);
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1336,7 +1336,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
|
||||
let mut state = dummy_state(&harness);
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1411,7 +1411,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
|
||||
let mut state = dummy_state(&harness);
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let new_lsn = Lsn(100_100).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1490,7 +1490,7 @@ mod tests {
|
||||
|
||||
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
|
||||
|
||||
fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
|
||||
async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
|
||||
WalreceiverState {
|
||||
id: TenantTimelineId {
|
||||
tenant_id: harness.tenant_id,
|
||||
@@ -1498,6 +1498,7 @@ mod tests {
|
||||
},
|
||||
timeline: harness
|
||||
.load()
|
||||
.await
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION)
|
||||
.expect("Failed to create an empty timeline for dummy wal connection manager")
|
||||
.initialize()
|
||||
|
||||
@@ -26,14 +26,13 @@ use crate::{
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::WALRECEIVER_RUNTIME,
|
||||
tenant::{Timeline, WalReceiverInfo},
|
||||
tenant_mgr,
|
||||
walingest::WalIngest,
|
||||
walrecord::DecodedWALRecord,
|
||||
};
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use pq_proto::ReplicationFeedback;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Status of the connection.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -141,10 +140,6 @@ pub async fn handle_walreceiver_connection(
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
//
|
||||
@@ -293,19 +288,8 @@ pub async fn handle_walreceiver_connection(
|
||||
})?;
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
let remote_index = tenant.get_remote_index();
|
||||
let timeline_remote_consistent_lsn = remote_index
|
||||
.read()
|
||||
.await
|
||||
// here we either do not have this timeline in remote index
|
||||
// or there were no checkpoints for it yet
|
||||
.timeline_entry(&TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn())
|
||||
// no checkpoint was uploaded
|
||||
.unwrap_or(Lsn(0));
|
||||
let timeline_remote_consistent_lsn =
|
||||
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let write_lsn = u64::from(last_lsn);
|
||||
|
||||
@@ -15,10 +15,12 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*No timelines to attach received.*",
|
||||
".*Failed to process timeline dir contents.*",
|
||||
".*Failed to load delta layer.*",
|
||||
".*Timeline .* was not found.*",
|
||||
".*could not find data for key.*",
|
||||
".*is not active. Current state: Broken.*",
|
||||
".*will not become active. Current state: Broken.*",
|
||||
".*failed to load metadata.*",
|
||||
".*could not load tenant.*load local timeline.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -77,16 +79,20 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
# But all others are broken
|
||||
|
||||
# First timeline would not get loaded into pageserver due to corrupt metadata file
|
||||
with pytest.raises(Exception, match=f"Timeline {tenant1}/{timeline1} was not found") as err:
|
||||
with pytest.raises(
|
||||
Exception, match=f"Tenant {tenant1} will not become active. Current state: Broken"
|
||||
) as err:
|
||||
pg1.start()
|
||||
log.info(
|
||||
f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
|
||||
)
|
||||
|
||||
# Second timeline has no ancestors, only the metadata file and no layer files
|
||||
# We don't have the remote storage enabled, which means timeline is in an incorrect state,
|
||||
# it's not loaded at all
|
||||
with pytest.raises(Exception, match=f"Timeline {tenant2}/{timeline2} was not found") as err:
|
||||
# Second timeline has no ancestors, only the metadata file and no layer files.
|
||||
# That is checked explicitly in the pageserver, and causes the tenant to be marked
|
||||
# as broken.
|
||||
with pytest.raises(
|
||||
Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken"
|
||||
) as err:
|
||||
pg2.start()
|
||||
log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
|
||||
|
||||
|
||||
@@ -56,9 +56,24 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)

# Stop the page server by force, and restart it
# Restart the server again, but delay the loading of tenants, and test what the
# pageserver does if a compute node connects and sends a request for the tenant
# while it's still in Loading state. (It waits for the loading to finish, and then
# processes the request.)
env.pageserver.stop()
env.pageserver.start()
env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-loading-tenant=return(5000)"})

# Check that it's in Loading state
client = env.pageserver.http_client()
tenant_status = client.tenant_status(env.initial_tenant)
log.info("Tenant status : %s", tenant_status)
assert tenant_status["state"] == "Loading"

# Try to read. This waits until the loading finishes, and then returns normally.
pg_conn = pg.connect()
cur = pg_conn.cursor()
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)


# Test that repeatedly kills and restarts the page server, while the

@@ -2,6 +2,7 @@
|
||||
# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import time
|
||||
from pathlib import Path
|
||||
@@ -13,11 +14,12 @@ from fixtures.neon_fixtures import (
|
||||
RemoteStorageKind,
|
||||
assert_no_in_progress_downloads_for_tenant,
|
||||
available_remote_storages,
|
||||
wait_for_last_flush_lsn,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import query_scalar, wait_until
|
||||
from fixtures.utils import print_gc_result, query_scalar, wait_until
|
||||
|
||||
|
||||
#
|
||||
@@ -63,9 +65,12 @@ def test_remote_storage_backup_and_restore(
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
|
||||
env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*")
|
||||
env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
|
||||
env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*")
|
||||
# FIXME retry downloads without throwing errors
|
||||
env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
|
||||
# we have a bunch of pytest.raises for these below
|
||||
env.pageserver.allowed_errors.append(".*tenant already exists.*")
|
||||
env.pageserver.allowed_errors.append(".*attach is already in progress.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
@@ -77,6 +82,16 @@ def test_remote_storage_backup_and_restore(
|
||||
|
||||
checkpoint_numbers = range(1, 3)
|
||||
|
||||
# On the first iteration, exercise retry code path by making the uploads
|
||||
# fail for the first 3 times
|
||||
action = "3*return->off"
|
||||
pageserver_http.configure_failpoints(
|
||||
[
|
||||
("before-upload-layer", action),
|
||||
("before-upload-index", action),
|
||||
]
|
||||
)
|
||||
|
||||
for checkpoint_number in checkpoint_numbers:
|
||||
with pg.cursor() as cur:
|
||||
cur.execute(
|
||||
@@ -94,6 +109,7 @@ def test_remote_storage_backup_and_restore(
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
log.info(f"waiting for checkpoint {checkpoint_number} upload")
|
||||
|
||||
# wait until pageserver successfully uploaded a checkpoint to remote storage
|
||||
wait_for_upload(client, tenant_id, timeline_id, current_lsn)
|
||||
log.info(f"upload of checkpoint {checkpoint_number} is done")
|
||||
@@ -118,19 +134,24 @@ def test_remote_storage_backup_and_restore(
|
||||
time.sleep(10)
|
||||
|
||||
# assert cannot attach timeline that is scheduled for download
|
||||
with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"):
|
||||
# FIXME implement layer download retries
|
||||
with pytest.raises(Exception, match="tenant already exists, current state: Broken"):
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
tenant_status = client.tenant_status(tenant_id)
|
||||
log.info("Tenant status with active failpoint: %s", tenant_status)
|
||||
assert tenant_status["has_in_progress_downloads"] is True
|
||||
# FIXME implement layer download retries
|
||||
# assert tenant_status["has_in_progress_downloads"] is True
|
||||
|
||||
# trigger temporary download files removal
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
# ensure that an initiated attach operation survives pageserver restart
|
||||
with pytest.raises(
|
||||
Exception, match=r".*(tenant already exists|attach is already in progress).*"
|
||||
):
|
||||
client.tenant_attach(tenant_id)
|
||||
log.info("waiting for timeline redownload")
|
||||
wait_until(
|
||||
number_of_iterations=20,
|
||||
@@ -152,3 +173,229 @@ def test_remote_storage_backup_and_restore(
|
||||
query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};")
|
||||
== f"{data_secret}|{checkpoint_number}"
|
||||
)
|
||||
|
||||
|
||||
# Exercises the upload queue retry code paths.
|
||||
# - Use failpoints to cause all storage ops to fail
|
||||
# - Churn on database to create layer & index uploads, and layer deletions
|
||||
# - Check that these operations are queued up, using the appropriate metrics
|
||||
# - Disable failpoints
|
||||
# - Wait for all uploads to finish
|
||||
# - Verify that remote is consistent and up-to-date (=all retries were done and succeeded)
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_remote_storage_upload_queue_retries(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_storage_backup_and_restore",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# create tenant with config that will deterministically allow
|
||||
# compaction and gc
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# small checkpointing and compaction targets to ensure we generate many operations
|
||||
"checkpoint_distance": f"{32 * 1024}",
|
||||
"compaction_threshold": "1",
|
||||
"compaction_target_size": f"{32 * 1024}",
|
||||
# large horizon to avoid automatic GC (our assert on gc_result below relies on that)
|
||||
"gc_horizon": f"{1024 ** 4}",
|
||||
"gc_period": "1h",
|
||||
# disable PITR so that GC considers just gc_horizon
|
||||
"pitr_interval": "0s",
|
||||
}
|
||||
)
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
pg = env.postgres.create_start("main", tenant_id=tenant_id)
|
||||
|
||||
pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
|
||||
|
||||
def configure_storage_sync_failpoints(action):
|
||||
client.configure_failpoints(
|
||||
[
|
||||
("before-upload-layer", action),
|
||||
("before-upload-index", action),
|
||||
("before-delete-layer", action),
|
||||
]
|
||||
)
|
||||
|
||||
def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
|
||||
# create initial set of layers & upload them with failpoints configured
|
||||
pg.safe_psql_many(
|
||||
[
|
||||
f"""
|
||||
INSERT INTO foo (id, val)
|
||||
SELECT g, '{data}'
|
||||
FROM generate_series(1, 10000) g
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET val = EXCLUDED.val
|
||||
""",
|
||||
# to ensure that GC can actually remove some layers
|
||||
"VACUUM foo",
|
||||
]
|
||||
)
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
|
||||
def get_queued_count(file_kind, op_kind):
|
||||
metrics = client.get_metrics()
|
||||
matches = re.search(
|
||||
f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
|
||||
metrics,
|
||||
re.MULTILINE,
|
||||
)
|
||||
assert matches
|
||||
return int(matches[1])
|
||||
|
||||
# create some layers & wait for uploads to finish
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a")
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("b")
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
|
||||
print_gc_result(gc_result)
|
||||
assert gc_result["layers_removed"] > 0
|
||||
|
||||
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
|
||||
wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
|
||||
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
|
||||
|
||||
# let all future operations queue up
|
||||
configure_storage_sync_failpoints("return")
|
||||
|
||||
# create more churn to generate all upload ops
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c")
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d")
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
|
||||
print_gc_result(gc_result)
|
||||
assert gc_result["layers_removed"] > 0
|
||||
|
||||
# ensure that all operation types that can be in the upload queue have queued up
|
||||
assert get_queued_count(file_kind="layer", op_kind="upload") > 0
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") >= 2
|
||||
assert get_queued_count(file_kind="layer", op_kind="delete") > 0
|
||||
|
||||
# unblock all operations and wait for them to finish
|
||||
configure_storage_sync_failpoints("off")
|
||||
|
||||
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
|
||||
wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
|
||||
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
|
||||
|
||||
# try a restore to verify that the uploads worked
|
||||
# XXX: should vary this test to selectively fail just layer uploads, index uploads, deletions
|
||||
# but how do we validate the result after restore?
|
||||
|
||||
env.pageserver.stop(immediate=True)
|
||||
env.postgres.stop_all()
|
||||
|
||||
dir_to_clear = Path(env.repo_dir) / "tenants"
|
||||
shutil.rmtree(dir_to_clear)
|
||||
os.mkdir(dir_to_clear)
|
||||
|
||||
env.pageserver.start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
def tenant_active():
|
||||
all_states = client.tenant_list()
|
||||
[tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
|
||||
assert tenant["has_in_progress_downloads"] is False
|
||||
assert tenant["state"] == "Active"
|
||||
|
||||
wait_until(30, 1, tenant_active)
|
||||
|
||||
log.info("restarting postgres to validate")
|
||||
pg = env.postgres.create_start("main", tenant_id=tenant_id)
|
||||
with pg.cursor() as cur:
|
||||
assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
|
||||
|
||||
|
||||
# Test that we correctly handle timeline with layers stuck in upload queue
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_storage_backup_and_restore",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# create tenant with config that will deterministically allow
|
||||
# compaction and gc
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# small checkpointing and compaction targets to ensure we generate many operations
|
||||
"checkpoint_distance": f"{64 * 1024}",
|
||||
"compaction_threshold": "1",
|
||||
"compaction_target_size": f"{64 * 1024}",
|
||||
# large horizon to avoid automatic GC (our assert on gc_result below relies on that)
|
||||
"gc_horizon": f"{1024 ** 4}",
|
||||
"gc_period": "1h",
|
||||
# disable PITR so that GC considers just gc_horizon
|
||||
"pitr_interval": "0s",
|
||||
}
|
||||
)
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
pg = env.postgres.create_start("main", tenant_id=tenant_id)
|
||||
|
||||
client.configure_failpoints(("before-upload-layer", "return"))
|
||||
|
||||
pg.safe_psql_many(
|
||||
[
|
||||
"CREATE TABLE foo (x INTEGER)",
|
||||
"INSERT INTO foo SELECT g FROM generate_series(1, 10000) g",
|
||||
]
|
||||
)
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
|
||||
assert timeline_path.exists()
|
||||
assert len(list(timeline_path.glob("*"))) >= 8
|
||||
|
||||
def get_queued_count(file_kind, op_kind):
|
||||
metrics = client.get_metrics()
|
||||
matches = re.search(
|
||||
f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
|
||||
metrics,
|
||||
re.MULTILINE,
|
||||
)
|
||||
assert matches
|
||||
return int(matches[1])
|
||||
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") > 0
|
||||
|
||||
# timeline delete should work despite layer files stuck in upload
|
||||
log.info("sending delete request")
|
||||
client.timeline_delete(tenant_id, timeline_id)
|
||||
|
||||
assert not timeline_path.exists()
|
||||
|
||||
# timeline deletion should kill ongoing uploads
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") == 0
|
||||
|
||||
# Just to be sure, unblock ongoing uploads. If the previous assert was incorrect, or the prometheus metric broken,
|
||||
# this would likely generate some ERROR level log entries that the NeonEnvBuilder would detect
|
||||
client.configure_failpoints(("before-upload-layer", "off"))
|
||||
# XXX force retry, currently we have to wait for exponential backoff
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
# TODO Test that we correctly handle GC of files that are stuck in upload queue.
|
||||
|
||||
@@ -3,8 +3,17 @@ from threading import Thread
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PageserverHttpClient
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PageserverApiException,
|
||||
PageserverHttpClient,
|
||||
RemoteStorageKind,
|
||||
available_remote_storages,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
def do_gc_target(
|
||||
@@ -20,21 +29,11 @@ def do_gc_target(
|
||||
log.info("gc http thread returning")
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="""
|
||||
Commit 'make test_tenant_detach_smoke fail reproducibly' adds failpoint to make this test fail reproducibly.
|
||||
Fix in https://github.com/neondatabase/neon/pull/2851 will come as part of
|
||||
https://github.com/neondatabase/neon/pull/2785 .
|
||||
"""
|
||||
)
|
||||
def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found in the local state")
|
||||
# FIXME: we have a race condition between GC and detach. GC might fail with this
|
||||
# error. Similar to https://github.com/neondatabase/neon/issues/2671
|
||||
env.pageserver.allowed_errors.append(".*InternalServerError\\(No such file or directory.*")
|
||||
env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found")
|
||||
|
||||
# first check for non existing tenant
|
||||
tenant_id = TenantId.generate()
|
||||
@@ -73,7 +72,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
|
||||
|
||||
# Detach while running manual GC.
|
||||
# It should wait for manual GC to finish (right now it doesn't that's why this test fails sometimes)
|
||||
# It should wait for manual GC to finish because it runs in a task associated with the tenant.
|
||||
pageserver_http.configure_failpoints(
|
||||
("gc_iteration_internal_after_getting_gc_timelines", "return(2000)")
|
||||
)
|
||||
@@ -98,3 +97,73 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found"
|
||||
):
|
||||
pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
|
||||
|
||||
|
||||
#
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
|
||||
def test_detach_while_attaching(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_detach_while_attaching",
|
||||
)
|
||||
|
||||
##### First start, insert secret data and upload it to the remote storage
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
|
||||
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
|
||||
|
||||
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
||||
# shared_buffers, otherwise the SELECT after restart will just return answer
|
||||
# from shared_buffers without hitting the page server, which defeats the point
|
||||
# of this test.
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE foo (t text)")
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
"""
|
||||
)
|
||||
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
|
||||
# wait until pageserver receives that data
|
||||
wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
|
||||
|
||||
# run checkpoint manually to be sure that data landed in remote storage
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
log.info("waiting for upload")
|
||||
|
||||
# wait until pageserver successfully uploaded a checkpoint to remote storage
|
||||
wait_for_upload(client, tenant_id, timeline_id, current_lsn)
|
||||
log.info("upload is done")
|
||||
|
||||
# Detach it
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
# And re-attach
|
||||
pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
|
||||
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
|
||||
# Before it has chance to finish, detach it again
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
# is there a better way to assert that failpoint triggered?
|
||||
time.sleep(10)
|
||||
|
||||
# Attach it again. If the GC and compaction loops from the previous attach/detach
|
||||
# cycle are still running, things could get really confusing..
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("SELECT COUNT(*) FROM foo")
|
||||
|
||||
@@ -41,16 +41,10 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
|
||||
for t in timelines:
|
||||
client.timeline_delete(tenant, t)
|
||||
|
||||
def assert_active_without_jobs(tenant):
|
||||
assert get_state(tenant) == {"Active": {"background_jobs_running": False}}
|
||||
|
||||
# Create tenant, start compute
|
||||
tenant, _ = env.neon_cli.create_tenant()
|
||||
env.neon_cli.create_timeline(name, tenant_id=tenant)
|
||||
pg = env.postgres.create_start(name, tenant_id=tenant)
|
||||
assert get_state(tenant) == {
|
||||
"Active": {"background_jobs_running": True}
|
||||
}, "Pageserver should activate a tenant and start background jobs if timelines are loaded"
|
||||
|
||||
# Stop compute
|
||||
pg.stop()
|
||||
@@ -59,7 +53,6 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
|
||||
for tenant_info in client.tenant_list():
|
||||
tenant_id = TenantId(tenant_info["id"])
|
||||
delete_all_timelines(tenant_id)
|
||||
wait_until(10, 0.2, lambda: assert_active_without_jobs(tenant_id))
|
||||
|
||||
# Assert that all tasks finish quickly after tenant is detached
|
||||
assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0
|
||||
|
||||
@@ -217,21 +217,13 @@ def test_pageserver_with_empty_tenants(
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*Tenant .* has no timelines directory.*")
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*could not load tenant.*Failed to list timelines directory.*"
|
||||
)
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
tenant_without_timelines_dir = env.initial_tenant
|
||||
log.info(
|
||||
f"Tenant {tenant_without_timelines_dir} becomes broken: it abnormally looses tenants/ directory and is expected to be completely ignored when pageserver restarts"
|
||||
)
|
||||
shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines")
|
||||
|
||||
tenant_with_empty_timelines_dir = client.tenant_create()
|
||||
log.info(
|
||||
f"Tenant {tenant_with_empty_timelines_dir} gets all of its timelines deleted: still should be functional"
|
||||
)
|
||||
temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir)
|
||||
for temp_timeline in temp_timelines:
|
||||
client.timeline_delete(
|
||||
@@ -250,27 +242,23 @@ def test_pageserver_with_empty_tenants(
|
||||
# Trigger timeline reinitialization after pageserver restart
|
||||
env.postgres.stop_all()
|
||||
env.pageserver.stop()
|
||||
|
||||
tenant_without_timelines_dir = env.initial_tenant
|
||||
shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines")
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
tenants = client.tenant_list()
|
||||
|
||||
assert (
|
||||
len(tenants) == 2
|
||||
), "Pageserver should attach only tenants with empty or not existing timelines/ dir on restart"
|
||||
assert len(tenants) == 2
|
||||
|
||||
[broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)]
|
||||
assert (
|
||||
broken_tenant
|
||||
), f"A broken tenant {tenant_without_timelines_dir} should exists in the tenant list"
|
||||
assert (
|
||||
broken_tenant["state"] == "Broken"
|
||||
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
|
||||
|
||||
[loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
|
||||
assert (
|
||||
loaded_tenant
|
||||
), f"Tenant {tenant_with_empty_timelines_dir} should be loaded as the only one with tenants/ directory"
|
||||
assert loaded_tenant["state"] == {
|
||||
"Active": {"background_jobs_running": False}
|
||||
}, "Empty tenant should be loaded and ready for timeline creation"
|
||||
loaded_tenant["state"] == "Active"
|
||||
), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
|
||||
|
||||
@@ -160,6 +160,15 @@ def test_tenants_attached_after_download(
|
||||
|
||||
##### Stop the pageserver, erase its layer file to force it being downloaded from S3
|
||||
env.postgres.stop_all()
|
||||
|
||||
detail_before = client.timeline_detail(
|
||||
tenant_id, timeline_id, include_non_incremental_physical_size=True
|
||||
)
|
||||
assert (
|
||||
detail_before["current_physical_size_non_incremental"]
|
||||
== detail_before["current_physical_size"]
|
||||
)
|
||||
|
||||
env.pageserver.stop()
|
||||
|
||||
timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
|
||||
@@ -186,11 +195,17 @@ def test_tenants_attached_after_download(
|
||||
assert (
|
||||
len(restored_timelines) == 1
|
||||
), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage"
|
||||
retored_timeline = restored_timelines[0]
|
||||
assert retored_timeline["timeline_id"] == str(
|
||||
restored_timeline = restored_timelines[0]
|
||||
assert restored_timeline["timeline_id"] == str(
|
||||
timeline_id
|
||||
), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
|
||||
|
||||
# Check that the physical size matches after re-downloading
|
||||
detail_after = client.timeline_detail(
|
||||
tenant_id, timeline_id, include_non_incremental_physical_size=True
|
||||
)
|
||||
assert detail_before["current_physical_size"] == detail_after["current_physical_size"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_tenant_upgrades_index_json_from_v0(
|
||||
@@ -339,10 +354,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*Redownloading locally existing .* due to size mismatch.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*Downloaded layer exists already but layer file metadata mismatches.*"
|
||||
".*removing local file .* because it has unexpected length.*"
|
||||
)
|
||||
|
||||
# FIXME: Are these expected?