diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 586ce2a73a..88603d9539 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -163,6 +163,8 @@ pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: TenantId, pub state: TenantState, + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option<bool>, } @@ -191,9 +193,12 @@ pub struct TimelineInfo { #[serde_as(as = "DisplayFromStr")] pub remote_consistent_lsn: Lsn, pub current_logical_size: Option<u64>, // is None when timeline is Unloaded + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option<u64>, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option<u64>, - pub current_physical_size_non_incremental: Option<u64>, + + pub timeline_dir_layer_file_size_sum: Option<u64>, pub wal_source_connstr: Option<String>, #[serde_as(as = "Option<DisplayFromStr>")] @@ -205,6 +210,22 @@ pub state: TimelineState, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct DownloadRemoteLayersTaskInfo { + pub task_id: String, + pub state: DownloadRemoteLayersTaskState, + pub total_layer_count: u64, // stable once `completed` + pub successful_download_count: u64, // stable once `completed` + pub failed_download_count: u64, // stable once `completed` +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum DownloadRemoteLayersTaskState { + Running, + Completed, + ShutDown, +} + pub type ConfigureFailpointsRequest = Vec<FailpointConfig>; /// Information for configuring a single fail point diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 973c3cd3a6..aa87865a8a 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,7 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::tenant::Timeline; +use crate::task_mgr; +use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; @@ -152,23 +153,29 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn)? { + for segno in + with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))? + { self.add_slru_segment(kind, segno)?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { + for ((spcnode, dbnode), has_relmap_file) in + with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))? + { self.add_dbdir(spcnode, dbnode, has_relmap_file)?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? { + for rel in with_ondemand_download_sync(|| { + self.timeline.list_rels(spcnode, dbnode, self.lsn) + })? { self.add_rel(rel)?; } } } - for xid in self.timeline.list_twophase_files(self.lsn)? + for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))?
{ self.add_twophase_file(xid)?; } @@ -185,7 +192,8 @@ where } fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?; + let nblocks = + with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?; // Function that adds relation segment data to archive let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> { @@ -208,7 +216,8 @@ where for blknum in blocks { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?; + .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .no_ondemand_download()?; segment_data.extend_from_slice(&img[..]); } @@ -222,13 +231,16 @@ where // Generate SLRU segment files from repository. // fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; + let nblocks = with_ondemand_download_sync(|| { + self.timeline.get_slru_segment_size(slru, segno, self.lsn) + })?; let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { - let img = self - .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; + let img = with_ondemand_download_sync(|| { + self.timeline + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + })?; if slru == SlruKind::Clog { ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); @@ -260,7 +272,9 @@ where has_relmap_file: bool, ) -> anyhow::Result<()> { let relmap_img = if has_relmap_file { - let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; + let img = with_ondemand_download_sync(|| { + self.timeline.get_relmap_file(spcnode, dbnode, self.lsn) + })?; ensure!(img.len() == 512); Some(img) } else { @@ -295,7 +309,8 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn)? + .list_rels(spcnode, dbnode, self.lsn) + .no_ondemand_download()? .is_empty() { return Ok(()); @@ -327,7 +342,7 @@ where // Extract twophase state files // fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn)?; + let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -361,14 +376,12 @@ where zenith_signal.as_bytes(), )?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) - .context("failed get control bytes")?; + let checkpoint_bytes = + with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn)) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = + with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn)) + .context("failed to get control bytes")?; let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( &pg_control_bytes, @@ -490,3 +503,11 @@ where } } } + +fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T> +where + F: Send + Fn() -> PageReconstructResult<T>, + T: Send, +{ + task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f)) +} diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs index c5da54b8fc..f9d3e8553f 100644 --- a/pageserver/src/billing_metrics.rs +++ b/pageserver/src/billing_metrics.rs @@ -73,10 +73,10 @@ pub enum BillingMetricKind { /// This is an absolute, per-tenant metric.
/// This is the same metric that tenant/tenant_id/size endpoint returns. SyntheticStorageSize, - /// Size of all the files in the tenant's directory on disk on the pageserver. + /// Size of all the layer files in the tenant's directory on disk on the pageserver. /// This is an absolute, per-tenant metric. - /// See also prometheus metric CURRENT_PHYSICAL_SIZE. - PhysicalSize, + /// See also prometheus metric RESIDENT_PHYSICAL_SIZE. + ResidentSize, /// Size of the remote storage (S3) directory. /// This is an absolute, per-tenant metric. RemoteStorageSize, @@ -89,7 +89,7 @@ impl FromStr for BillingMetricKind { match s { "written_size" => Ok(Self::WrittenSize), "synthetic_storage_size" => Ok(Self::SyntheticStorageSize), - "physical_size" => Ok(Self::PhysicalSize), + "resident_size" => Ok(Self::ResidentSize), "remote_storage_size" => Ok(Self::RemoteStorageSize), _ => anyhow::bail!("invalid value \"{s}\" for metric type"), } @@ -101,7 +101,7 @@ impl fmt::Display for BillingMetricKind { f.write_str(match self { BillingMetricKind::WrittenSize => "written_size", BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size", - BillingMetricKind::PhysicalSize => "physical_size", + BillingMetricKind::ResidentSize => "resident_size", BillingMetricKind::RemoteStorageSize => "remote_storage_size", }) } @@ -171,7 +171,7 @@ pub async fn collect_metrics_task( let tenant = tenant_mgr::get_tenant(tenant_id, true).await?; - let mut tenant_physical_size = 0; + let mut tenant_resident_size = 0; // iterate through list of timelines in tenant for timeline in tenant.list_timelines().iter() { @@ -186,27 +186,27 @@ pub async fn collect_metrics_task( timeline_written_size, )); - let timeline_size = timeline.get_physical_size(); - tenant_physical_size += timeline_size; + let timeline_resident_size = timeline.get_resident_physical_size(); + tenant_resident_size += timeline_resident_size; debug!( - "per-timeline current metrics for tenant: {}: timeline {} physical_size={} last_record_lsn {} (as bytes)", - tenant_id, timeline.timeline_id, timeline_size, timeline_written_size) + "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)", + tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size) } let tenant_remote_size = tenant.get_remote_size().await?; debug!( - "collected current metrics for tenant: {}: state={:?} tenant_physical_size={} remote_size={}", - tenant_id, tenant_state, tenant_physical_size, tenant_remote_size + "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}", + tenant_id, tenant_state, tenant_resident_size, tenant_remote_size ); current_metrics.push(( BillingMetricsKey { tenant_id, timeline_id: None, - metric: BillingMetricKind::PhysicalSize, + metric: BillingMetricKind::ResidentSize, }, - tenant_physical_size, + tenant_resident_size, )); current_metrics.push(( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 937a6144b6..6d97f3206e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,7 +12,7 @@ use super::models::{ TimelineCreateRequest, TimelineInfo, }; use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::tenant::Timeline; +use crate::tenant::{with_ondemand_download, Timeline}; use crate::tenant_config::TenantConfOpt; use crate::{config::PageServerConf, tenant_mgr}; use utils::{ @@ -78,25 +78,23 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res } // Helper function to construct a TimelineInfo struct for a 
timeline -fn build_timeline_info( +async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, ) -> anyhow::Result { let mut info = build_timeline_info_common(timeline)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while // we're executing this function, we will outlive the timeline on-disk state. - info.current_logical_size_non_incremental = - Some(timeline.get_current_logical_size_non_incremental( - info.last_record_lsn, - CancellationToken::new(), - )?); - } - if include_non_incremental_physical_size { - info.current_physical_size_non_incremental = - Some(timeline.get_physical_size_non_incremental()?) + info.current_logical_size_non_incremental = Some( + timeline + .get_current_logical_size_non_incremental( + info.last_record_lsn, + CancellationToken::new(), + ) + .await?, + ); } Ok(info) } @@ -128,7 +126,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result) -> anyhow::Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); - let include_non_incremental_physical_size = - query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; let response_data = async { @@ -210,17 +206,16 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { - let timeline_info = build_timeline_info( - &timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") - .map_err(ApiError::InternalServerError)?; + let timeline_info = + build_timeline_info(&timeline, include_non_incremental_logical_size) + .await + .context( + "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", + ) + .map_err(ApiError::InternalServerError)?; response_data.push(timeline_info); } - Ok(response_data) } .instrument(info_span!("timeline_list", tenant = %tenant_id)) @@ -264,8 +259,6 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result(timeline_info) } @@ -308,10 +298,11 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result format!("{lsn}"), LsnForTimestamp::Future(_lsn) => "future".into(), LsnForTimestamp::Past(_lsn) => "past".into(), @@ -433,7 +424,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { - current_physical_size += timeline.get_physical_size(); + current_physical_size += timeline.layer_size_sum().approximate_is_ok(); } let state = tenant.current_state(); @@ -786,6 +777,45 @@ async fn timeline_checkpoint_handler(request: Request) -> Result, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + match 
timeline.spawn_download_all_remote_layers().await { + Ok(st) => json_response(StatusCode::ACCEPTED, st), + Err(st) => json_response(StatusCode::CONFLICT, st), + } +} + +async fn timeline_download_remote_layers_handler_get( + request: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + let info = timeline + .get_download_all_remote_layers_task_info() + .context("task never started since last pageserver process start") + .map_err(ApiError::NotFound)?; + json_response(StatusCode::OK, info) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -870,6 +900,14 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", testing_api!("run timeline checkpoint", timeline_checkpoint_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_post, + ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_get, + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index db83bdb3a1..1684ca3c64 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -187,13 +187,13 @@ fn import_slru( path: &Path, mut reader: Reader, len: usize, -) -> Result<()> { - trace!("importing slru file {}", path.display()); +) -> anyhow::Result<()> { + info!("importing slru file {path:?}"); let mut buf: [u8; 8192] = [0u8; 8192]; let filename = &path .file_name() - .expect("missing slru filename") + .with_context(|| format!("missing slru filename for path {path:?}"))? .to_string_lossy(); let segno = u32::from_str_radix(filename, 16)?; @@ -279,7 +279,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .no_ondemand_download()?; last_lsn = lsn; nrecords += 1; @@ -405,7 +407,9 @@ pub fn import_wal_from_tar( let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ - walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .no_ondemand_download()?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 626d5e99e3..e01eb12b7b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -91,7 +91,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { } } -fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 } else { diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 308f9cd4eb..205ee0ffad 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,13 +84,10 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -// Metrics for determining timeline's physical size. -// A layered timeline's physical is defined as the total size of -// (delta/image) layer files on disk. -static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { +static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "pageserver_current_physical_size", - "Current physical size grouped by timeline", + "pageserver_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem.", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric") @@ -146,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 1.0, // 1 sec ]; -const STORAGE_IO_TIME_OPERATIONS: &[&str] = - &["open", "close", "read", "write", "seek", "fsync", "gc"]; +const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ + "open", "close", "read", "write", "seek", "fsync", "gc", "metadata", +]; const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; @@ -375,7 +373,7 @@ pub struct TimelineMetrics { pub load_layer_map_histo: Histogram, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, - pub current_physical_size_gauge: UIntGauge, + pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -416,7 +414,7 @@ impl TimelineMetrics { let wait_lsn_time_histo = WAIT_LSN_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); let current_logical_size_gauge = CURRENT_LOGICAL_SIZE @@ -442,7 +440,7 @@ impl TimelineMetrics { load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, - current_physical_size_gauge, + resident_physical_size_gauge, current_logical_size_gauge, num_persistent_files_created, persistent_bytes_written, @@ -458,7 +456,7 @@ impl Drop for TimelineMetrics { let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); - let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, 
timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d9c19d04b7..fd4353a421 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -541,7 +541,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; + let exists = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_exists(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, @@ -558,7 +561,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; + let n_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_size(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -575,9 +581,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let total_blocks = - timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; - + let total_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + }) + .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { @@ -603,11 +610,14 @@ impl PageServerHandler { } */ - // FIXME: this profiling now happens at different place than it used to. The - // current profiling is based on a thread-local variable, so it doesn't work - // across awaits - let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; + let page = crate::tenant::with_ondemand_download(|| { + // FIXME: this profiling now happens at different place than it used to. The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); + timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7b4b05ed18..77910bceda 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,11 +6,12 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! 
+use super::tenant::PageReconstructResult; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; -use anyhow::{self, bail, ensure, Context}; +use crate::{repository::*, try_no_ondemand_download}; +use anyhow::Context; use bytes::{Buf, Bytes}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -97,16 +98,18 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, - ) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } - let nblocks = self.get_rel_size(tag, lsn, latest)?; + let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest)); if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, lsn, nblocks ); - return Ok(ZERO_PAGE.clone()); + return PageReconstructResult::Success(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); @@ -120,38 +123,45 @@ impl Timeline { dbnode: Oid, lsn: Lsn, latest: bool, - ) -> anyhow::Result { + ) -> PageReconstructResult { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn)?; + let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn)); for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest)?; + let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest)); total_blocks += n_blocks as usize; } - Ok(total_blocks) + PageReconstructResult::Success(total_blocks) } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_size( + &self, + tag: RelTag, + lsn: Lsn, + latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(nblocks); + return PageReconstructResult::Success(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest)? + && !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest)) { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, // without extending it. Tolerate that by claiming that // any non-existent FSM fork has size 0. - return Ok(0); + return PageReconstructResult::Success(0); } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn)?; + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); let nblocks = buf.get_u32_le(); if latest { @@ -164,25 +174,35 @@ impl Timeline { // associated with most recent value of LSN. self.update_cached_rel_size(tag, lsn, nblocks); } - Ok(nblocks) + PageReconstructResult::Success(nblocks) } /// Does relation exist? 
- pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_exists( + &self, + tag: RelTag, + lsn: Lsn, + _latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } // first try to lookup relation in cache if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(true); + return PageReconstructResult::Success(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); - - Ok(exists) + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Get a list of all existing relations in given tablespace and database. @@ -191,21 +211,25 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, - ) -> anyhow::Result> { + ) -> PageReconstructResult> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); - Ok(rels) + PageReconstructResult::Success(rels) + } + Err(e) => PageReconstructResult::from(e), + } } /// Look up given SLRU page version. 
@@ -215,7 +239,7 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn) } @@ -226,10 +250,10 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn)?; - Ok(buf.get_u32_le()) + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf.get_u32_le()) } /// Get size of an SLRU segment @@ -238,14 +262,18 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.segments.get(&segno).is_some(); - Ok(exists) + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.segments.get(&segno).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Locate LSN, such that all transactions that committed before @@ -258,7 +286,7 @@ impl Timeline { pub fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, - ) -> anyhow::Result { + ) -> PageReconstructResult { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; let max_lsn = self.get_last_record_lsn(); @@ -274,12 +302,12 @@ impl Timeline { // cannot overflow, high and low are both smaller than u64::MAX / 2 let mid = (high + low) / 2; - let cmp = self.is_latest_commit_timestamp_ge_than( + let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than( search_timestamp, Lsn(mid * 8), &mut found_smaller, &mut found_larger, - )?; + )); if cmp { high = mid; @@ -291,15 +319,15 @@ impl Timeline { (false, false) => { // This can happen if no commit records have been processed yet, e.g. // just after importing a cluster. - Ok(LsnForTimestamp::NoData(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn)) } (true, false) => { // Didn't find any commit timestamps larger than the request - Ok(LsnForTimestamp::Future(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn)) } (false, true) => { // Didn't find any commit timestamps smaller than the request - Ok(LsnForTimestamp::Past(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn)) } (true, true) => { // low is the LSN of the first commit record *after* the search_timestamp, @@ -309,7 +337,7 @@ impl Timeline { // Otherwise, if you restore to the returned LSN, the database will // include physical changes from later commits that will be marked // as aborted, and will need to be vacuumed away. - Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8))) } } } @@ -327,12 +355,20 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, - ) -> anyhow::Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? 
{ - let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; + ) -> PageReconstructResult { + for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) { + let nblocks = try_no_ondemand_download!(self.get_slru_segment_size( + SlruKind::Clog, + segno, + probe_lsn + )); for blknum in (0..nblocks).rev() { - let clog_page = - self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; + let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn( + SlruKind::Clog, + segno, + blknum, + probe_lsn + )); if clog_page.len() == BLCKSZ as usize + 8 { let mut timestamp_bytes = [0u8; 8]; @@ -341,61 +377,75 @@ impl Timeline { if timestamp >= search_timestamp { *found_larger = true; - return Ok(true); + return PageReconstructResult::Success(true); } else { *found_smaller = true; } } } } - Ok(false) + PageReconstructResult::Success(false) } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> anyhow::Result> { + pub fn list_slru_segments( + &self, + kind: SlruKind, + lsn: Lsn, + ) -> PageReconstructResult> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; - - Ok(dir.segments) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.segments), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> anyhow::Result { + pub fn get_relmap_file( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + ) -> PageReconstructResult { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> anyhow::Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn)?; - let dir = DbDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn)); - Ok(dir.dbdirs) + match DbDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.dbdirs), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result { + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult { let key = twophase_file_key(xid); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> anyhow::Result> { + pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let dir = TwoPhaseDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn)); - Ok(dir.xids) + match TwoPhaseDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.xids), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_control_file(&self, lsn: Lsn) -> anyhow::Result { + pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult { self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> anyhow::Result { + pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult 
{ self.get(CHECKPOINT_KEY, lsn) } @@ -404,23 +454,26 @@ impl Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. - pub fn get_current_logical_size_non_incremental( + pub async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, cancel: CancellationToken, - ) -> std::result::Result { + ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn)? { + for rel in + crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn)) + .await? + { if cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -433,7 +486,7 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { + pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -441,8 +494,8 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; - let dbdir = DbDirectory::des(&buf)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; + let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); dbs.sort_unstable(); @@ -451,14 +504,15 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn)? + .list_rels(spcnode, dbnode, lsn) + .no_ondemand_download()? 
.iter() .cloned() .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -474,13 +528,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = self.get_download(slrudir_key, lsn).await?; + let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn)?; + let mut buf = self.get_download(segsize_key, lsn).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -492,8 +546,8 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; + let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?; + let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); for xid in xids { @@ -606,7 +660,7 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -633,7 +687,7 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -652,7 +706,7 @@ impl<'a> DatadirModification<'a> { /// Store a relmapper file (pg_filenode.map) in the repository pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); @@ -680,10 +734,10 @@ impl<'a> DatadirModification<'a> { pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { - bail!("twophase file for xid {} already exists", xid); + anyhow::bail!("twophase file for xid {} already exists", xid); } self.put( TWOPHASEDIR_KEY, @@ -707,10 +761,13 @@ impl<'a> DatadirModification<'a> { pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; + let total_blocks = self + .tline + .get_db_size(spcnode, dbnode, req_lsn, true) + .no_ondemand_download()?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, 
dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -734,10 +791,10 @@ impl<'a> DatadirModification<'a> { /// /// 'nblocks' is the initial size. pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -749,12 +806,12 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key)?)? + RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)? }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - bail!("rel {} already exists", rel); + anyhow::bail!("rel {rel} already exists"); } self.put( rel_dir_key, @@ -778,12 +835,16 @@ impl<'a> DatadirModification<'a> { /// Truncate relation pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true)? { + if self + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -804,11 +865,11 @@ impl<'a> DatadirModification<'a> { /// Extend relation /// If new size is smaller, do nothing. pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -825,11 +886,11 @@ impl<'a> DatadirModification<'a> { /// Drop a relation. 
pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -840,7 +901,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -860,11 +921,11 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - bail!("slru segment {:?}/{} already exists", kind, segno); + anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } self.put( dir_key, @@ -899,7 +960,7 @@ impl<'a> DatadirModification<'a> { pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -925,7 +986,7 @@ impl<'a> DatadirModification<'a> { /// This method is used for marking truncated SLRU files pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -1019,7 +1080,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - fn get(&self, key: Key) -> anyhow::Result { + fn get(&self, key: Key) -> PageReconstructResult { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1027,14 +1088,16 @@ impl<'a> DatadirModification<'a> { // value that has been removed, deletion only avoids leaking storage. if let Some(value) = self.pending_updates.get(&key) { if let Value::Image(img) = value { - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { // Currently, we never need to read back a WAL record that we // inserted in the same "transaction". All the metadata updates // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. 
- bail!("unexpected pending WAL record"); + return PageReconstructResult::from(anyhow::anyhow!( + "unexpected pending WAL record" + )); } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); @@ -1400,7 +1463,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { }, key.field6, ), - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } @@ -1426,14 +1489,14 @@ pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber 0x00 => SlruKind::Clog, 0x01 => SlruKind::MultiXactMembers, 0x02 => SlruKind::MultiXactOffsets, - _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), }; let segno = key.field4; let blknum = key.field6; (kind, segno, blknum) } - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 9253b250cd..a2337e8fd6 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -148,31 +148,43 @@ //! following two cases: //! - (1) We had the file locally, deleted it locally, scheduled a remote delete, //! but crashed before it finished remotely. -//! - (2) We never had the file locally because we were still in tenant attach -//! when we crashed. (Similar case for on-demand download in the future.) +//! - (2) We never had the file locally because we haven't on-demand downloaded +//! it yet. //! -//! # Downloads (= Tenant Attach) +//! # Downloads //! //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for -//! downloading files from the remote storage. Downloads are performed immediately, -//! independently of the uploads. +//! downloading files from the remote storage. Downloads are performed immediately +//! against the `RemoteStorage`, independently of the upload queue. //! //! When we attach a tenant, we perform the following steps: //! - create `Tenant` object in `TenantState::Attaching` state -//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s -//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart` -//! - eagerly download all the remote layers using the client's download APIs -//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state. +//! - List timelines that are present in remote storage, and for each: +//! - download their remote [`IndexPart`]s +//! - create `Timeline` struct and a `RemoteTimelineClient` +//! - initialize the client's upload queue with its `IndexPart` +//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart` +//! but not present locally +//! - schedule uploads for layers that are only present locally. +//! - if the remote `IndexPart`'s metadata was newer than the metadata in +//! the local filesystem, write the remote metadata to the local filesystem +//! - After the above is done for each timeline, open the tenant for business by +//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. +//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! -//! Most of the above happens in [`Timeline::reconcile_with_remote`]. +//! 
Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers. //! We keep track of the fact that a client is in `Attaching` state in a marker -//! file on the local disk. -//! However, the distinction is moot for storage sync since we call -//! `reconcile_with_remote` for tenants both with and without the marker file. -//! -//! In the future, downloading will be done on-demand and `reconcile_with_remote` -//! will only be responsible for re-scheduling upload ops after a crash of an -//! `Active` tenant. +//! file on the local disk. This is critical because, when we restart the pageserver, +//! we do not want to do the `List timelines` step for each tenant that has already +//! been successfully attached (for performance & cost reasons). +//! Instead, for a tenant without the attach marker file, we assume that the +//! local state is in sync or ahead of the remote state. This includes the list +//! of all of the tenant's timelines, which is particularly critical to be up-to-date: +//! if there's a timeline on the remote that the pageserver doesn't know about, +//! the GC will not consider its branch point, leading to data loss. +//! So, for a tenant with the attach marker file, we know that we do not yet have +//! persisted all the remote timeline's metadata files locally. To exclude the +//! risk above, we re-run the procedure for such tenants //! //! # Operating Without Remote Storage //! diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs index c81be05981..4256767020 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/storage_sync2/download.rs @@ -180,6 +180,10 @@ pub async fn list_remote_timelines<'a>( let tenant_path = conf.timelines_path(&tenant_id); let tenant_storage_path = conf.remote_path(&tenant_path)?; + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + let timelines = download_retry( || storage.list_prefixes(Some(&tenant_storage_path)), &format!("list prefixes for {tenant_path:?}"), diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index fe3ad1a57d..a1b3ad26b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -35,6 +35,7 @@ #![allow(clippy::declare_interior_mutable_const)] use std::collections::HashMap; +use std::fmt; use std::future::Future; use std::panic::AssertUnwindSafe; use std::sync::atomic::{AtomicU64, Ordering}; @@ -134,8 +135,15 @@ pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create background op runtime") }); +#[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); +impl fmt::Display for PageserverTaskId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Each task that we track is associated with a "task ID". It's just an /// increasing number that we assign. Note that it is different from tokio::task::Id. 
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1); @@ -198,6 +206,9 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, + // Task that downloads a file from remote storage + RemoteDownloadTask, + // task that handles the initial downloading of all tenants InitialLoad, @@ -206,6 +217,9 @@ pub enum TaskKind { // task that handhes metrics collection MetricsCollection, + + // task that drives downloading layers + DownloadAllRemoteLayers, } #[derive(Default)] @@ -437,6 +451,10 @@ pub fn current_task_kind() -> Option { CURRENT_TASK.try_with(|ct| ct.kind).ok() } +pub fn current_task_id() -> Option { + CURRENT_TASK.try_with(|ct| ct.task_id).ok() +} + /// A Future that can be used to check if the current task has been requested to /// shut down. pub async fn shutdown_watcher() { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 799a34fb3b..1240a3b4fb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -81,6 +81,7 @@ pub mod filename; mod image_layer; mod inmemory_layer; pub mod layer_map; +mod remote_layer; pub mod metadata; mod par_fsync; @@ -90,7 +91,7 @@ mod timeline; pub mod size; -pub use timeline::Timeline; +pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline}; // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -2780,9 +2781,18 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); Ok(()) } @@ -2859,15 +2869,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?, "foobar at 0x20" ); @@ -3026,7 +3036,10 @@ mod tests { tenant .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); + assert!(newtline + .get(*TEST_KEY, Lsn(0x25)) + .no_ondemand_download() + .is_ok()); Ok(()) } @@ -3056,7 +3069,7 @@ mod tests { // Check that the data is still accessible on the branch. 
assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50))?, + newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3203,11 +3216,26 @@ mod tests { tline.freeze_and_flush().await?; tline.compact().await?; - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?, + TEST_IMG("foo at 0x30") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?, + TEST_IMG("foo at 0x40") + ); Ok(()) } @@ -3315,7 +3343,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3401,7 +3429,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3476,7 +3504,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn)?, + tline.get(test_key, *lsn).no_ondemand_download()?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index e1006dfe00..5b724b6263 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -39,7 +39,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::{BufWriter, Write}; use std::io::{Seek, SeekFrom}; use std::ops::Range; @@ -183,6 +183,8 @@ pub struct DeltaLayer { pub key_range: Range, pub lsn_range: Range, + pub file_size: u64, + inner: RwLock, } @@ -411,6 +413,10 @@ impl PersistentLayer for DeltaLayer { fs::remove_file(self.path())?; Ok(()) } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl DeltaLayer { @@ -535,6 +541,7 @@ impl DeltaLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &DeltaFileName, + file_size: u64, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), @@ -542,6 +549,7 @@ impl DeltaLayer { tenant_id, key_range: filename.key_range.clone(), lsn_range: filename.lsn_range.clone(), + file_size, inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -554,21 +562,23 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
- pub fn new_for_path(path: &Path, file: F) -> Result - where - F: FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn_range: summary.lsn_range, + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -725,6 +735,10 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + // Note: Because we opened the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -734,6 +748,7 @@ impl DeltaLayerWriterInner { timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index b1dbbfb683..1e129fc01d 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -36,10 +36,11 @@ use bytes::Bytes; use hex; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::Write; use std::io::{Seek, SeekFrom}; use std::ops::Range; +use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard}; use tracing::*; @@ -105,6 +106,7 @@ pub struct ImageLayer { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub key_range: Range, + pub file_size: u64, // This entry contains an image of all pages as of this LSN pub lsn: Lsn, @@ -228,6 +230,10 @@ impl PersistentLayer for ImageLayer { fs::remove_file(self.path())?; Ok(()) } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl ImageLayer { @@ -344,6 +350,7 @@ impl ImageLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &ImageFileName, + file_size: u64, ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), @@ -351,6 +358,7 @@ impl ImageLayer { tenant_id, key_range: filename.key_range.clone(), lsn: filename.lsn, + file_size, inner: RwLock::new(ImageLayerInner { loaded: false, file: None, @@ -363,21 +371,21 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
- pub fn new_for_path(path: &Path, file: F) -> Result - where - F: std::os::unix::prelude::FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; - + let metadata = file + .metadata() + .context("get file metadata to determine size")?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn: summary.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { file: None, loaded: false, @@ -523,6 +531,10 @@ impl ImageLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get metadata to determine file size")?; + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -532,6 +544,7 @@ impl ImageLayerWriterInner { tenant_id: self.tenant_id, key_range: self.key_range.clone(), lsn: self.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/remote_layer.rs b/pageserver/src/tenant/remote_layer.rs new file mode 100644 index 0000000000..affe8ca0a8 --- /dev/null +++ b/pageserver/src/tenant/remote_layer.rs @@ -0,0 +1,212 @@ +//! A RemoteLayer is an in-memory placeholder for a layer file that exists +//! in remote storage. +//! +use crate::config::PageServerConf; +use crate::repository::Key; +use crate::storage_sync::index::LayerFileMetadata; +use crate::tenant::delta_layer::DeltaLayer; +use crate::tenant::filename::{DeltaFileName, ImageFileName}; +use crate::tenant::image_layer::ImageLayer; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use anyhow::{bail, Result}; +use std::ops::Range; +use std::path::PathBuf; +use std::sync::Arc; + +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use super::filename::LayerFileName; +use super::storage_layer::{LayerIter, LayerKeyIter, PersistentLayer}; + +#[derive(Debug)] +pub struct RemoteLayer { + tenantid: TenantId, + timelineid: TimelineId, + key_range: Range, + lsn_range: Range, + + pub file_name: LayerFileName, + + pub layer_metadata: LayerFileMetadata, + + is_delta: bool, + + is_incremental: bool, + + pub(crate) ongoing_download: Arc, +} + +impl Layer for RemoteLayer { + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_state: &mut ValueReconstructState, + ) -> Result { + bail!( + "layer {} needs to be downloaded", + self.filename().file_name() + ); + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + /// debugging function to print out the contents of the layer + fn dump(&self, _verbose: bool) -> Result<()> { + println!( + "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenantid, + self.timelineid, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + Ok(()) + } + + fn short_id(&self) -> String { + self.filename().file_name() + } +} + +impl PersistentLayer for RemoteLayer { + fn get_tenant_id(&self) -> TenantId { + self.tenantid + } + + fn 
get_timeline_id(&self) -> TimelineId { + self.timelineid + } + + fn filename(&self) -> LayerFileName { + if self.is_delta { + DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + } + .into() + } else { + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + } + .into() + } + } + + fn local_path(&self) -> Option { + None + } + + fn iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn key_iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn delete(&self) -> Result<()> { + Ok(()) + } + + fn downcast_remote_layer<'a>(self: Arc) -> Option> { + Some(self) + } + + fn is_remote_layer(&self) -> bool { + true + } + + fn file_size(&self) -> Option { + self.layer_metadata.file_size() + } +} + +impl RemoteLayer { + pub fn new_img( + tenantid: TenantId, + timelineid: TimelineId, + fname: &ImageFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn..(fname.lsn + 1), + is_delta: false, + is_incremental: false, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + pub fn new_delta( + tenantid: TenantId, + timelineid: TimelineId, + fname: &DeltaFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn_range.clone(), + is_delta: true, + is_incremental: true, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + /// Create a Layer struct representing this layer, after it has been downloaded. 
+ pub fn create_downloaded_layer( + &self, + conf: &'static PageServerConf, + file_size: u64, + ) -> Arc { + if self.is_delta { + let fname = DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + }; + Arc::new(DeltaLayer::new( + conf, + self.timelineid, + self.tenantid, + &fname, + file_size, + )) + } else { + let fname = ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + }; + Arc::new(ImageLayer::new( + conf, + self.timelineid, + self.tenantid, + &fname, + file_size, + )) + } + } +} diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 5ce0837562..aa11985cbe 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -97,8 +97,6 @@ pub(super) async fn gather_inputs( // used to determine the `retention_period` for the size model let mut max_cutoff_distance = None; - // this will probably conflict with on-demand downloaded layers, or at least force them all - // to be downloaded for timeline in timelines { let last_record_lsn = timeline.get_last_record_lsn(); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 79eaa96591..8bfac5df8e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,6 +8,7 @@ use anyhow::Result; use bytes::Bytes; use std::ops::Range; use std::path::PathBuf; +use std::sync::Arc; use utils::{ id::{TenantId, TimelineId}, @@ -15,6 +16,8 @@ use utils::{ }; use super::filename::LayerFileName; +use super::remote_layer::RemoteLayer; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -161,4 +164,28 @@ pub trait PersistentLayer: Layer { /// Permanently remove this layer from disk. fn delete(&self) -> Result<()>; + + fn downcast_remote_layer(self: Arc) -> Option> { + None + } + + fn is_remote_layer(&self) -> bool { + false + } + + /// Returns None if the layer file size is not known. + /// + /// Should not change over the lifetime of the layer object because + /// current_physical_size is computed as the som of this value. 
+ fn file_size(&self) -> Option; +} + +pub fn downcast_remote_layer( + layer: &Arc, +) -> Option> { + if layer.is_remote_layer() { + Arc::clone(layer).downcast_remote_layer() + } else { + None + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 61d619a17b..f4288fea36 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3,11 +3,14 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use itertools::Itertools; use once_cell::sync::OnceCell; -use pageserver_api::models::TimelineState; +use pageserver_api::models::{ + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskState, TimelineState, +}; use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; -use tokio::task::spawn_blocking; use tokio_util::sync::CancellationToken; use tracing::*; @@ -22,6 +25,7 @@ use std::time::{Duration, Instant, SystemTime}; use crate::storage_sync::index::IndexPart; use crate::storage_sync::RemoteTimelineClient; +use crate::tenant::remote_layer::RemoteLayer; use crate::tenant::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, @@ -76,7 +80,7 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - _myself: Weak, + myself: Weak, pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -93,10 +97,7 @@ pub struct Timeline { walredo_mgr: Arc, /// Remote storage client. - /// - /// If Some, use it to upload all newly created layers to the remote storage, - /// and keep remote metadata file in sync. In the future, also use it to download - /// layer files on-demand. + /// See [`storage_sync2`] module comment for details. pub remote_client: Option>, // What page versions do we hold in the repository? If we get a @@ -187,6 +188,8 @@ pub struct Timeline { /// Relation size cache pub rel_size_cache: RwLock>, + download_all_remote_layers_task_info: RwLock>, + state: watch::Sender, } @@ -308,12 +311,68 @@ impl LogicalSize { } } +/// Returned by [`Timeline::layer_size_sum`] +pub enum LayerSizeSum { + /// The result is accurate. + Accurate(u64), + // We don't know the layer file size of one or more layers. + // They contribute to the sum with a value of 0. + // Hence, the sum is a lower bound for the actualy layer file size sum. + ApproximateLowerBound(u64), +} + +impl LayerSizeSum { + pub fn approximate_is_ok(self) -> u64 { + match self { + LayerSizeSum::Accurate(v) => v, + LayerSizeSum::ApproximateLowerBound(v) => v, + } + } +} + pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, pub last_received_msg_ts: u128, } +/// Like `?`, but for [`PageReconstructResult`]. +/// Use it to bubble up the `NeedsDownload` and `Error` to the caller. +/// +/// Once `std::ops::Try` is stabilized, we should use it instead of this macro. +#[macro_export] +macro_rules! try_no_ondemand_download { + ($result:expr) => {{ + let result = $result; + match result { + PageReconstructResult::Success(value) => value, + PageReconstructResult::NeedsDownload(timeline, layer) => { + return PageReconstructResult::NeedsDownload(timeline, layer); + } + PageReconstructResult::Error(e) => return PageReconstructResult::Error(e), + } + }}; +} + +/// Replacement for `?` in functions that return [`PageReconstructResult`]. +/// +/// Given an `expr: Result`, use `try_page_reconstruct_result!(expr)` +/// instead of `(expr)?`. +/// If `expr` is `Ok(v)`, the macro evaluates to `v`. 
+/// If `expr` is `Err(e)`, the macro returns `PageReconstructResult::Error(e.into())`. +/// +/// Once `std::ops::Try` is stabilized, we should use it instead of this macro. +#[macro_export] +macro_rules! try_page_reconstruct_result { + ($result:expr) => {{ + let result = $result; + match result { + Ok(v) => v, + Err(e) => return PageReconstructResult::from(e), + } + }}; +} + /// /// Information about how much history needs to be retained, needed by /// Garbage Collection. @@ -343,6 +402,77 @@ pub struct GcInfo { pub pitr_cutoff: Lsn, } +pub enum PageReconstructResult { + Success(T), + /// The given RemoteLayer needs to be downloaded and replaced in the timeline's layer map + /// for the operation to succeed. Use [`Timeline::download_remote_layer`] to do it, then + /// retry the operation that returned this error. + NeedsDownload(Weak, Weak), + Error(PageReconstructError), +} + +/// An error happened in a get() operation. +#[derive(thiserror::Error)] +pub enum PageReconstructError { + #[error(transparent)] + Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + + #[error(transparent)] + WalRedo(#[from] crate::walredo::WalRedoError), +} + +impl std::fmt::Debug for PageReconstructError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Self::Other(err) => err.fmt(f), + Self::WalRedo(err) => err.fmt(f), + } + } +} + +/// This impl makes it so you can substitute the return type +/// `Result` with `PageReconstructResult` in functions +/// and existing `?` will generally continue to work. +/// The reason is that any error type that converts into `PageReconstructError` +/// (such as `anyhow::Error`) can be turned into the `Error` variant via `from()`, +/// which is what [`try_page_reconstruct_result`] relies on. +impl From for PageReconstructResult +where + E: Into, +{ + fn from(e: E) -> Self { + Self::Error(e.into()) + } +} + +impl PageReconstructResult { + /// Treat the need for on-demand download as an error. + /// + /// **Avoid this function in new code** if you can help it, + /// as on-demand download will become the norm in the future, + /// especially once we implement layer file eviction. + /// + /// If you are in an async function, use [`with_ondemand_download`] + /// to do the download right here. + /// + /// If you are in a sync function, change its return type from + /// `Result` to `PageReconstructResult` and bubble up + /// the non-success cases of `PageReconstructResult` to the caller. + /// This gives them a chance to do the download and retry. + /// Consider using [`try_no_ondemand_download`] for convenience. + /// + /// For more background, read the comment on [`with_ondemand_download`]. + pub fn no_ondemand_download(self) -> anyhow::Result { + match self { + PageReconstructResult::Success(value) => Ok(value), + // TODO print more info about the timeline + PageReconstructResult::NeedsDownload(_, _) => anyhow::bail!("Layer needs downloading"), + PageReconstructResult::Error(e) => { + Err(anyhow::Error::new(e).context("Failed to reconstruct the page")) + } + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -370,8 +500,10 @@ impl Timeline { /// the Repository implementation may incorrectly return a value from an ancestor /// branch, for example, or waste a lot of cycles chasing the non-existing key. /// - pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result { - anyhow::ensure!(lsn.is_valid(), "Invalid LSN"); + pub fn get(&self, key: Key, lsn: Lsn) -> PageReconstructResult { + if !lsn.is_valid() { + return PageReconstructResult::from(anyhow!("Invalid LSN")); + } // Check the page cache.
We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image @@ -381,7 +513,7 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => return PageReconstructResult::Success(cached_img), // exact LSN match, return the image Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -396,13 +528,18 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + try_no_ondemand_download!(self.get_reconstruct_data(key, lsn, &mut reconstruct_state)); self.metrics .reconstruct_time_histo .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } + // Like get(), but if a remote layer file is needed, it is downloaded as part of this call. + pub async fn get_download(&self, key: Key, lsn: Lsn) -> anyhow::Result { + with_ondemand_download(|| self.get(key, lsn)).await + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -429,30 +566,27 @@ impl Timeline { } } - /// Get the physical size of the timeline at the latest LSN - pub fn get_physical_size(&self) -> u64 { - self.metrics.current_physical_size_gauge.get() + /// The sum of the file size of all historic layers in the layer map. + /// This method makes no distinction between local and remote layers. + /// Hence, the result **does not represent local filesystem usage**. + pub fn layer_size_sum(&self) -> LayerSizeSum { + let layer_map = self.layers.read().unwrap(); + let mut size = 0; + let mut no_size_cnt = 0; + for l in layer_map.iter_historic_layers() { + let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); + size += l_size; + no_size_cnt += l_no_size; + } + if no_size_cnt == 0 { + LayerSizeSum::Accurate(size) + } else { + LayerSizeSum::ApproximateLowerBound(size) + } } - /// Get the physical size of the timeline at the latest LSN non incrementally - pub fn get_physical_size_non_incremental(&self) -> anyhow::Result { - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - // total size of layer files in the current timeline directory - let mut total_physical_size = 0; - - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_string_lossy(); - - if ImageFileName::parse_str(&fname).is_some() - || DeltaFileName::parse_str(&fname).is_some() - { - total_physical_size += direntry.metadata()?.len(); - } - } - - Ok(total_physical_size) + pub fn get_resident_physical_size(&self) -> u64 { + self.metrics.resident_physical_size_gauge.get() } /// @@ -560,14 +694,18 @@ impl Timeline { // Define partitioning schema if needed - match self.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - ) { + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) + .await + { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". 
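A minimal usage sketch (not part of the patch) of the two entry points introduced above: sync callers turn a needed download into an error via `no_ondemand_download()`, while async callers use `get_download()`, which downloads the missing layer and retries. The function names `read_page_sync` and `read_page_async` are made up for illustration; the imports mirror the crate-internal paths used elsewhere in this diff.

    use bytes::Bytes;
    use utils::lsn::Lsn;
    use crate::repository::Key;
    use crate::tenant::Timeline;

    // Sync context (e.g. the tests above): a layer that exists only in remote
    // storage turns into an error instead of triggering a download.
    fn read_page_sync(tline: &Timeline, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
        tline.get(key, lsn).no_ondemand_download()
    }

    // Async context: a missing layer is downloaded on demand and the read is retried.
    async fn read_page_async(tline: &Timeline, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
        tline.get_download(key, lsn).await
    }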
- let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + let layer_paths_to_upload = + self.create_image_layers(&partitioning, lsn, false).await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -761,7 +899,7 @@ impl Timeline { let mut result = Timeline { conf, tenant_conf, - _myself: myself.clone(), + myself: myself.clone(), timeline_id, tenant_id, pg_version, @@ -817,6 +955,9 @@ impl Timeline { last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(HashMap::new()), + + download_all_remote_layers_task_info: RwLock::new(None), + state, }; result.repartition_threshold = result.get_checkpoint_distance() / 10; @@ -935,11 +1076,18 @@ impl Timeline { continue; } - let layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); + let file_size = direntry_path.metadata()?.len(); + + let layer = ImageLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &imgfilename, + file_size, + ); trace!("found layer {}", layer.path().display()); - total_physical_size += layer.path().metadata()?.len(); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { @@ -959,11 +1107,18 @@ impl Timeline { continue; } - let layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); + let file_size = direntry_path.metadata()?.len(); + + let layer = DeltaLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &deltafilename, + file_size, + ); trace!("found layer {}", layer.path().display()); - total_physical_size += layer.path().metadata()?.len(); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { @@ -997,7 +1152,7 @@ impl Timeline { num_layers, disk_consistent_lsn, total_physical_size ); self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .set(total_physical_size); timer.stop_and_record(); @@ -1005,21 +1160,14 @@ impl Timeline { Ok(()) } - async fn download_missing( + async fn create_remote_layers( &self, index_part: &IndexPart, - remote_client: &RemoteTimelineClient, local_layers: HashMap>, up_to_date_disk_consistent_lsn: Lsn, ) -> anyhow::Result>> { // Are we missing some files that are present in remote storage? - // Download them now. - // TODO Downloading many files this way is not efficient. - // Better to use FuturesUnordered. Maybe keep as is because: - // a) inplace download is a throw-away code, on-demand patch doesnt need that - // b) typical case now is that there is nothing to sync, this downloads a lot - // 1) if there was another pageserver that came and generated new files - // 2) during attach of a timeline with big history which we currently do not do + // Create RemoteLayer instances for them. let mut local_only_layers = local_layers; for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1033,7 +1181,7 @@ impl Timeline { // Is the local layer's size different from the size stored in the // remote index file? // If so, rename_to_backup those files & replace their local layer with - // a RemoteLayer in the laye rmap so that we re-download them on-demand. + // a RemoteLayer in the layer map so that we re-download them on-demand. 
if let Some(local_layer) = local_layer { let local_layer_path = local_layer .local_path() @@ -1058,7 +1206,7 @@ impl Timeline { assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { - self.metrics.current_physical_size_gauge.sub(local_size); + self.metrics.resident_physical_size_gauge.sub(local_size); self.layers.write().unwrap().remove_historic(local_layer); // fall-through to adding the remote layer } @@ -1079,7 +1227,7 @@ impl Timeline { } info!( - "remote layer does not exist locally, downloading it now: {}", + "remote layer does not exist locally, creating remote layer: {}", remote_layer_name.file_name() ); @@ -1093,28 +1241,18 @@ impl Timeline { continue; } - trace!("downloading image file: {remote_layer_name:?}"); - let downloaded_size = remote_client - .download_layer_file(remote_layer_name, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download image layer {remote_layer_name:?}") - })?; - trace!("done"); + let remote_layer = RemoteLayer::new_img( + self.tenant_id, + self.timeline_id, + imgfilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); - let image_layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, imgfilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(image_layer)); - self.metrics - .current_physical_size_gauge - .add(downloaded_size); + self.layers.write().unwrap().insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { - // Create a DeltaLayer struct for each delta file. + // Create a RemoteLayer for the delta file. // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is // OK for a delta layer to have end LSN 101, but if the end LSN @@ -1122,29 +1260,19 @@ impl Timeline { // before crash. if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { warn!( - "found future delta layer {} on timeline {} remote_consistent_lsn is {}", - deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn - ); + "found future delta layer {} on timeline {} remote_consistent_lsn is {}", + deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn + ); continue; } - - trace!("downloading delta file: {remote_layer_name:?}"); - let sz = remote_client - .download_layer_file(remote_layer_name, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download delta layer {remote_layer_name:?}") - })?; - trace!("done"); - - let delta_layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, deltafilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(delta_layer)); - self.metrics.current_physical_size_gauge.add(sz); + let remote_layer = RemoteLayer::new_delta( + self.tenant_id, + self.timeline_id, + deltafilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); + self.layers.write().unwrap().insert_historic(remote_layer); } #[cfg(test)] LayerFileName::Test(_) => unreachable!(), @@ -1154,22 +1282,22 @@ impl Timeline { Ok(local_only_layers) } + /// This function will synchronize local state with what we have in remote storage. /// - /// This function will synchronize local data with what we have in remote storage. - /// 1. It will download missing layer files. - /// 2. It will update local metadata if remote one has greater `disk_consistent_lsn`. 
- /// 3. It will upload files that are missing on the remote - /// 4. It will update index file on the remote accordingly - /// TODO may be a bit cleaner to do things based on populated remote client, - /// and then do things based on its upload_queue.latest_files + /// Steps taken: + /// 1. Initialize upload queue based on `index_part`. + /// 2. Create `RemoteLayer` instances for layers that exist only on the remote. + /// The list of layers on the remote comes from `index_part`. + /// The list of local layers is given by the layer map's `iter_historic_layers()`. + /// So, the layer map must have been loaded already. + /// 3. Schedule upload of local-only layer files (which will then also update the remote + /// IndexPart to include the new layer files). /// - /// This is used during tenant attach. The layer map must have been loaded - /// with local filesystem contents already. - /// - /// The caller should provide IndexPart if it exists on the remote storage. If it's None, - /// we assume that it is missing on the remote storage, which means that we initialized - /// a timeline and then restarted before successful upload was performed + /// Refer to the `storage_sync2` module comment for more context. /// + /// # TODO + /// May be a bit cleaner to do things based on populated remote client, + /// and then do things based on its upload_queue.latest_files. #[instrument(skip(self, index_part, up_to_date_metadata))] pub async fn reconcile_with_remote( &self, @@ -1199,9 +1327,10 @@ impl Timeline { index_part.timeline_layers.len() ); remote_client.init_upload_queue(index_part)?; - - self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn) - .await? + let local_only_filenames = self + .create_remote_layers(index_part, local_layers, disk_consistent_lsn) + .await?; + local_only_filenames } None => { info!("initializing upload queue as empty"); @@ -1323,9 +1452,15 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); - spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn, cancel)) - .await - .context("Failed to spawn calculation result task")? + tokio::task::spawn_blocking(move || { + // Run in a separate thread since this can do a lot of + // synchronous file IO without .await inbetween + // if there are no RemoteLayers that would require downloading. + let h = tokio::runtime::Handle::current(); + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + }) + .await + .context("Failed to spawn calculation result task")? }; let timeline_state_cancellation = async { loop { @@ -1376,7 +1511,7 @@ impl Timeline { /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. 
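As a side note on the initial logical size change above, here is a minimal sketch of the spawn_blocking-plus-block_on bridge it relies on, assuming a tokio multi-thread runtime; `mostly_sync_work` and `run_off_the_executor` are hypothetical names, standing in for an async fn that awaits only rarely (for example, for an on-demand download):

    use anyhow::Context;

    async fn mostly_sync_work() -> anyhow::Result<u64> {
        // Imagine lots of synchronous file IO here, with an occasional .await
        // when a layer first has to be fetched from remote storage.
        Ok(0)
    }

    async fn run_off_the_executor() -> anyhow::Result<u64> {
        let inner = tokio::task::spawn_blocking(move || {
            // spawn_blocking threads run inside the runtime context, so
            // Handle::current() succeeds and block_on is permitted here;
            // calling block_on on a regular async worker thread would panic.
            let handle = tokio::runtime::Handle::current();
            handle.block_on(mostly_sync_work())
        })
        .await
        .context("blocking task panicked or was aborted")?;
        inner
    }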
- pub fn calculate_logical_size( + async fn calculate_logical_size( &self, up_to_lsn: Lsn, cancel: CancellationToken, @@ -1421,7 +1556,9 @@ impl Timeline { } else { self.metrics.logical_size_histo.start_timer() }; - let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn, cancel)?; + let logical_size = self + .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); Ok(logical_size) @@ -1458,7 +1595,7 @@ impl TraversalLayerExt for Arc { match self.local_path() { Some(local_path) => { debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())), - "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary", + "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary", ); format!("{}", local_path.display()) } @@ -1497,7 +1634,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, - ) -> Result<(), PageReconstructError> { + ) -> PageReconstructResult<()> { // Start from the current timeline. let mut timeline_owned; let mut timeline = self; @@ -1524,12 +1661,12 @@ impl Timeline { // The function should have updated 'state' //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); match result { - ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Complete => return PageReconstructResult::Success(()), ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { self.metrics.materialized_page_cache_hit_counter.inc_by(1); - return Ok(()); + return PageReconstructResult::Success(()); } if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid @@ -1562,7 +1699,10 @@ impl Timeline { timeline.ancestor_lsn, cont_lsn ); - let ancestor = timeline.get_ancestor_timeline()?; + let ancestor = match timeline.get_ancestor_timeline() { + Ok(timeline) => timeline, + Err(e) => return PageReconstructResult::from(e), + }; timeline_owned = ancestor; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); @@ -1580,11 +1720,14 @@ impl Timeline { // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, start_lsn); - result = open_layer.get_value_reconstruct_data( + result = match open_layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue; @@ -1595,11 +1738,14 @@ impl Timeline { if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); - result = frozen_layer.get_value_reconstruct_data( + result = match frozen_layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; @@ -1609,12 +1755,24 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + // If it's a remote layer, the caller can do the download and retry. + if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) { + info!("need remote layer {}", layer.traversal_id()); + return PageReconstructResult::NeedsDownload( + Weak::clone(&timeline.myself), + Arc::downgrade(&remote_layer), + ); + } + let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = layer.get_value_reconstruct_data( + result = match layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, layer.traversal_id())); } else if timeline.ancestor_timeline.is_some() { @@ -1840,9 +1998,11 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - let (partitioning, _lsn) = - self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? + let (partitioning, _lsn) = self + .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .await?; + self.create_image_layers(&partitioning, self.initdb_lsn, true) + .await? } else { // normal case, write out a L0 delta layer file. 
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; @@ -1979,7 +2139,7 @@ impl Timeline { // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); - self.metrics.current_physical_size_gauge.add(sz); + self.metrics.resident_physical_size_gauge.add(sz); // update metrics self.metrics.num_persistent_files_created.inc_by(1); self.metrics.persistent_bytes_written.inc_by(sz); @@ -1987,15 +2147,28 @@ impl Timeline { Ok((new_delta_filename, LayerFileMetadata::new(sz))) } - fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> { - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 == Lsn(0) - || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + async fn repartition( + &self, + lsn: Lsn, + partition_size: u64, + ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { - let keyspace = self.collect_keyspace(lsn)?; - let partitioning = keyspace.partition(partition_size); + let partitioning_guard = self.partitioning.lock().unwrap(); + if partitioning_guard.1 != Lsn(0) + && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold + { + // no repartitioning needed + return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + } + } + let keyspace = self.collect_keyspace(lsn).await?; + let partitioning = keyspace.partition(partition_size); + + let mut partitioning_guard = self.partitioning.lock().unwrap(); + if lsn > partitioning_guard.1 { *partitioning_guard = (partitioning, lsn); - return Ok((partitioning_guard.0.clone(), lsn)); + } else { + warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless"); } Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -2041,7 +2214,7 @@ impl Timeline { Ok(false) } - fn create_image_layers( + async fn create_image_layers( &self, partitioning: &KeyPartitioning, lsn: Lsn, @@ -2068,7 +2241,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn) { + let img = match self.get_download(key, lsn).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -2131,7 +2304,9 @@ impl Timeline { layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); - self.metrics.current_physical_size_gauge.add(metadata.len()); + self.metrics + .resident_physical_size_gauge + .add(metadata.len()); layers.insert_historic(Arc::new(l)); } drop(layers); @@ -2443,7 +2618,9 @@ impl Timeline { } // update the timeline's physical size - self.metrics.current_physical_size_gauge.add(metadata.len()); + self.metrics + .resident_physical_size_gauge + .add(metadata.len()); new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); @@ -2456,7 +2633,7 @@ impl Timeline { for l in deltas_to_compact { if let Some(path) = l.local_path() { self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .sub(path.metadata()?.len()); } layer_names_to_delete.push(l.filename()); @@ -2526,7 +2703,10 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp)? { + match self + .find_lsn_for_timestamp(pitr_timestamp) + .no_ondemand_download()? 
+ { LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, LsnForTimestamp::Future(lsn) => { debug!("future({})", lsn); @@ -2743,11 +2923,11 @@ impl Timeline { for doomed_layer in layers_to_remove { if let Some(path) = doomed_layer.local_path() { self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .sub(path.metadata()?.len()); } layer_names_to_delete.push(doomed_layer.filename()); - doomed_layer.delete()?; + doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? layers.remove_historic(doomed_layer); result.layers_removed += 1; } @@ -2778,7 +2958,7 @@ impl Timeline { key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - ) -> anyhow::Result { + ) -> PageReconstructResult { // Perform WAL redo if needed data.records.reverse(); @@ -2790,9 +2970,11 @@ impl Timeline { key, img_lsn ); - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { - bail!("base image for {} at {} not found", key, request_lsn); + PageReconstructResult::from(anyhow!( + "base image for {key} at {request_lsn} not found" + )) } } else { // We need to do WAL redo. @@ -2800,12 +2982,12 @@ impl Timeline { // If we don't have a base image, then the oldest WAL record better initialize // the page if data.img.is_none() && !data.records.first().unwrap().1.will_init() { - bail!( + PageReconstructResult::from(anyhow!( "Base image for {} at {} not found, but got {} WAL records", key, request_lsn, data.records.len() - ); + )) } else { if data.img.is_some() { trace!( @@ -2820,14 +3002,18 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = self + let img = match self .walredo_mgr .request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .context("Failed to reconstruct a page image:")?; + .context("Failed to reconstruct a page image:") + { + Ok(img) => img, + Err(e) => return PageReconstructResult::from(e), + }; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); - cache + if let Err(e) = cache .memorize_materialized_page( self.tenant_id, self.timeline_id, @@ -2835,30 +3021,324 @@ impl Timeline { last_rec_lsn, &img, ) - .context("Materialized page memoization failed")?; + .context("Materialized page memoization failed") + { + return PageReconstructResult::from(e); + } } - Ok(img) + PageReconstructResult::Success(img) } } } + + /// Download a layer file from remote storage and insert it into the layer map. + /// + /// It's safe to call this function for the same layer concurrently. In that case: + /// - If the layer has already been downloaded, `OK(...)` is returned. + /// - If the layer is currently being downloaded, we wait until that download succeeded / failed. + /// - If it succeeded, we return `Ok(...)`. + /// - If it failed, we or another concurrent caller will initiate a new download attempt. + /// + /// Download errors are classified and retried if appropriate by the underlying RemoteTimelineClient function. + /// It has an internal limit for the maximum number of retries and prints appropriate log messages. + /// If we exceed the limit, it returns an error, and this function passes it through. + /// The caller _could_ retry further by themselves by calling this function again, but _should not_ do it. + /// The reason is that they cannot distinguish permanent errors from temporary ones, whereas + /// the underlying RemoteTimelineClient can. + /// + /// There is no internal timeout or slowness detection. 
+ /// If the caller has a deadline or needs a timeout, they can simply stop polling: + /// we're **cancellation-safe** because the download happens in a separate task_mgr task. + /// So, the current download attempt will run to completion even if we stop polling. + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + pub async fn download_remote_layer( + self: Arc, + remote_layer: Arc, + ) -> anyhow::Result<()> { + let permit = match Arc::clone(&remote_layer.ongoing_download) + .acquire_owned() + .await + { + Ok(permit) => permit, + Err(_closed) => { + info!("download of layer has already finished"); + return Ok(()); + } + }; + + let (sender, receiver) = tokio::sync::oneshot::channel(); + // Spawn a task so that download does not outlive timeline when we detach tenant / delete timeline. + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::RemoteDownloadTask, + Some(self.tenant_id), + Some(self.timeline_id), + &format!("download layer {}", remote_layer.short_id()), + false, + async move { + let remote_client = self.remote_client.as_ref().unwrap(); + + // Does retries + exponential back-off internally. + // When this fails, don't layer further retry attempts here. + let result = remote_client + .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata) + .await; + + if let Ok(size) = &result { + // XXX the temp file is still around in Err() case + // and consumes space until we clean up upon pageserver restart. + self.metrics.resident_physical_size_gauge.add(*size); + + // Download complete. Replace the RemoteLayer with the corresponding + // Delta- or ImageLayer in the layer map. + let new_layer = remote_layer.create_downloaded_layer(self.conf, *size); + let mut layers = self.layers.write().unwrap(); + { + let l: Arc = remote_layer.clone(); + layers.remove_historic(l); + } + layers.insert_historic(new_layer); + drop(layers); + + // Now that we've inserted the download into the layer map, + // close the semaphore. This will make other waiters for + // this download return Ok(()). + assert!(!remote_layer.ongoing_download.is_closed()); + remote_layer.ongoing_download.close(); + } else { + // Keep semaphore open. We'll drop the permit at the end of the function. + } + + // Don't treat it as an error if the task that triggered the download + // is no longer interested in the result. + sender.send(result.map(|_sz| ())).ok(); + + // In case we failed and there are other waiters, this will make one + // of them retry the download in a new task. + // XXX: This resets the exponential backoff because it's a new call to + // download_layer file. + drop(permit); + + Ok(()) + }, + ); + + receiver.await.context("download task cancelled")? 
+ } + + pub async fn spawn_download_all_remote_layers( + self: Arc, + ) -> Result { + let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap(); + if let Some(st) = &*status_guard { + match &st.state { + DownloadRemoteLayersTaskState::Running => { + return Err(st.clone()); + } + DownloadRemoteLayersTaskState::ShutDown + | DownloadRemoteLayersTaskState::Completed => { + *status_guard = None; + } + } + } + + let self_clone = Arc::clone(&self); + let task_id = task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::DownloadAllRemoteLayers, + Some(self.tenant_id), + Some(self.timeline_id), + "download all remote layers task", + false, + async move { + self_clone.download_all_remote_layers().await; + let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); + match &mut *status_guard { + None => { + warn!("tasks status is supposed to be Some(), since we are running"); + } + Some(st) => { + let exp_task_id = format!("{}", task_mgr::current_task_id().unwrap()); + if st.task_id != exp_task_id { + warn!("task id changed while we were still running, expecting {} but have {}", exp_task_id, st.task_id); + } else { + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + }; + Ok(()) + } + .instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); + + let initial_info = DownloadRemoteLayersTaskInfo { + task_id: format!("{task_id}"), + state: DownloadRemoteLayersTaskState::Running, + total_layer_count: 0, + successful_download_count: 0, + failed_download_count: 0, + }; + *status_guard = Some(initial_info.clone()); + + Ok(initial_info) + } + + async fn download_all_remote_layers(self: &Arc) { + let mut downloads: FuturesUnordered<_> = { + let layers = self.layers.read().unwrap(); + layers + .iter_historic_layers() + .filter_map(|l| l.downcast_remote_layer()) + .map({ + |l| { + let self_clone = Arc::clone(self); + self_clone.download_remote_layer(l) + } + }) + .collect() + }; + + macro_rules! lock_status { + ($st:ident) => { + let mut st = self.download_all_remote_layers_task_info.write().unwrap(); + let st = st + .as_mut() + .expect("this function is only called after the task has been spawned"); + assert_eq!( + st.task_id, + format!( + "{}", + task_mgr::current_task_id().expect("we run inside a task_mgr task") + ) + ); + let $st = st; + }; + } + + { + lock_status!(st); + st.total_layer_count = downloads.len().try_into().unwrap(); + } + loop { + tokio::select! { + dl = downloads.next() => { + lock_status!(st); + match dl { + None => break, + Some(Ok(())) => { + st.successful_download_count += 1; + }, + Some(Err(e)) => { + error!(error = %e, "layer download failed"); + st.failed_download_count += 1; + } + } + } + _ = task_mgr::shutdown_watcher() => { + // Kind of pointless to watch for shutdowns here, + // as download_remote_layer spawns other task_mgr tasks internally. + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::ShutDown; + } + } + } + { + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + + pub fn get_download_all_remote_layers_task_info(&self) -> Option { + self.download_all_remote_layers_task_info + .read() + .unwrap() + .clone() + } } -/// An error happened in a get() operation. 
-#[derive(thiserror::Error)] -pub enum PageReconstructError { - #[error(transparent)] - Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error - - #[error(transparent)] - WalRedo(#[from] crate::walredo::WalRedoError), -} - -impl std::fmt::Debug for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> { - match self { - PageReconstructError::Other(err) => err.fmt(f), - PageReconstructError::WalRedo(err) => err.fmt(f), +/// Helper function to deal with [`PageReconstructResult`]. +/// +/// Takes a sync closure that returns a [`PageReconstructResult`]. +/// If it is [`PageReconstructResult::NeedsDownload`], +/// do the download and retry the closure. +/// +/// ### Background +/// +/// This is a crutch to make on-demand downloads efficient in +/// our async-sync-async sandwich codebase. Some context: +/// +/// - The code that does the downloads uses async Rust. +/// - The code that initiates download is many levels of sync Rust. +/// - The sync code must wait for the download to finish to +/// make further progress. +/// - The sync code is invoked directly from async functions upstack. +/// +/// Example (there are also much worse ones where the sandwich is taller) +/// +/// async handle_get_page_at_lsn_request page_service.rs +/// sync get_rel_page_at_lsn timeline.rs +/// sync timeline.get timeline.rs +/// sync get_reconstruct_data timeline.rs +/// async download_remote_layer timeline.rs +/// +/// It is not possible to call `Timeline::download_remote_layer().await` within +/// get_reconstruct_data, so instead, we return [`PageReconstructResult::NeedsDownload`] +/// which contains references to the [`Timeline`] and [`RemoteLayer`]. +/// We bubble that error upstack to the async code, which can then call +/// `Timeline::download_remote_layer().await`. +/// That is _efficient_ because tokio can use the same OS thread to do +/// other work while we're waiting for the download. +/// +/// It is a deliberate decision to use a new result type to communicate +/// the need for download instead of adding another variant to [`PageReconstructError`]. +/// The reason is that with the latter approach, any place that does +/// `?` on a `Result` will implicitly ignore the +/// need for download. We want that to be explicit, so that +/// - the code base becomes greppable for places that don't do a download +/// - future code changes will need to explicitly account for on-demand download +/// +/// Alternatives to consider in the future: +/// +/// - Inside `get_reconstruct_data`, we can std::thread::spawn a thread +/// and use it to block_on the download_remote_layer future. +/// That is obviously inefficient as it creates one thread per download. +/// - Convert everything to async. The problem here is that the sync +/// functions are used by many other sync functions. So, the scope +/// creep of such a conversion is tremendous. +/// - Compromise between the two: implement async functions for each sync +/// function. Switch over the hot code paths (GetPage()) to use the +/// async path, so that the hot path doesn't spawn threads. Other code +/// paths would remain sync initially, and get converted to async over time.
+/// +pub async fn with_ondemand_download(mut f: F) -> Result +where + F: Send + FnMut() -> PageReconstructResult, + T: Send, +{ + loop { + let closure_result = f(); + match closure_result { + PageReconstructResult::NeedsDownload(weak_timeline, weak_remote_layer) => { + // if the timeline is gone, it has likely been deleted / tenant detached + let tl = weak_timeline.upgrade().context("timeline is gone")?; + // if the remote layer got removed, retry the function, it might succeed now + let remote_layer = match weak_remote_layer.upgrade() { + None => { + info!("remote layer is gone, retrying closure"); + continue; + } + Some(l) => l, + }; + // Does retries internally + tl.download_remote_layer(remote_layer).await?; + // Download successful, retry the closure + continue; + } + PageReconstructResult::Success(closure_value) => return Ok(closure_value), + PageReconstructResult::Error(e) => { + return Err(anyhow::Error::new(e).context("Failed to reconstruct the page")) + } } } } @@ -2868,7 +3348,7 @@ impl std::fmt::Debug for PageReconstructError { fn layer_traversal_error( msg: String, path: Vec<(ValueReconstructResult, Lsn, TraversalId)>, -) -> Result<(), PageReconstructError> { +) -> PageReconstructResult<()> { // We want the original 'msg' to be the outermost context. The outermost context // is the most high-level information, which also gets propagated to the client. let mut msg_iter = path @@ -2885,7 +3365,7 @@ fn layer_traversal_error( // Append all subsequent traversals, and the error message 'msg', as contexts. let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - Err(PageReconstructError::Other(msg)) + PageReconstructResult::from(msg) } /// Various functions to mutate the timeline. diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 46e4acd50c..fb216123c1 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -12,7 +12,7 @@ //! use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; use once_cell::sync::OnceCell; -use std::fs::{File, OpenOptions}; +use std::fs::{self, File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; @@ -240,6 +240,10 @@ impl VirtualFile { self.with_file("fsync", |file| file.sync_all())? } + pub fn metadata(&self) -> Result { + self.with_file("metadata", |file| file.metadata())? + } + /// Helper function that looks up the underlying File for this VirtualFile, /// opening it and evicting some other File if necessary. It calls 'func' /// with the physical File. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e8a2e99f06..e3453dfe06 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -31,7 +31,10 @@ use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::pgdatadir_mapping::*; +use crate::tenant::PageReconstructResult; use crate::tenant::Timeline; +use crate::try_no_ondemand_download; +use crate::try_page_reconstruct_result as try_prr; use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -52,10 +55,10 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result { + pub fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
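Circling back to the `with_ondemand_download` helper defined above, a short usage sketch; this is essentially what the new `Timeline::get_download` wrapper in this patch does. Note that the closure is re-run from scratch after every successful download, so it should be cheap to retry and free of side effects. The function name `read_page` is made up for illustration:

    use bytes::Bytes;
    use utils::lsn::Lsn;
    use crate::repository::Key;
    use crate::tenant::{with_ondemand_download, Timeline};

    async fn read_page(tline: &Timeline, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
        // If get() hits a RemoteLayer, the helper downloads it via
        // Timeline::download_remote_layer and then calls the closure again.
        with_ondemand_download(|| tline.get(key, lsn)).await
    }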
- let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint).no_ondemand_download()?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -80,10 +83,12 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, - ) -> Result<()> { + ) -> PageReconstructResult<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version) - .context("failed decoding wal record")?; + try_prr!( + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record") + ); let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -98,7 +103,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded)?; + try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded)); } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -106,13 +111,13 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create)?; + try_prr!(self.ingest_xlog_smgr_create(modification, &create)); } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate)?; + try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate)); } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( "handle RM_DBASE_ID for Postgres version {:?}", @@ -125,14 +130,14 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb)?; + try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); } } } else if self.timeline.pg_version == 15 { @@ -148,14 +153,14 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; + try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); } } } @@ -167,38 +172,38 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::Clog, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec)?; + try_prr!(self.ingest_clog_truncate_record(modification, &xlrec)); } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( + try_prr!(self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, - )?; + )); } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED || info == pg_constants::XLOG_XACT_ABORT_PREPARED { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( + try_prr!(self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, - )?; + )); // Remove twophase file. 
see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", @@ -206,9 +211,10 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid)?; + try_prr!(modification.drop_twophase_file(parsed_xact.xid)); } else if info == pg_constants::XLOG_XACT_PREPARE { - modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?; + try_prr!(modification + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))); } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -217,34 +223,34 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::MultiXactOffsets, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::MultiXactMembers, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(modification, &xlrec)?; + try_prr!(self.ingest_multixact_create_record(modification, &xlrec)); } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec)?; + try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec)); } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded)?; + try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded)); } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -258,7 +264,9 @@ impl<'a> WalIngest<'a> { { let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; + let xlog_checkpoint = try_prr!( + CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint") + ); trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", xlog_checkpoint.oldestXid, @@ -279,22 +287,23 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk)?; + try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk)); } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { - let new_checkpoint_bytes = self.checkpoint.encode()?; + let new_checkpoint_bytes = + try_prr!(self.checkpoint.encode().context("encode checkpoint")); - modification.put_checkpoint(new_checkpoint_bytes)?; + try_prr!(modification.put_checkpoint(new_checkpoint_bytes)); self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + try_prr!(modification.commit()); - Ok(()) + PageReconstructResult::Success(()) } fn ingest_decoded_block( @@ -303,7 +312,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, - ) -> Result<()> { + ) -> PageReconstructResult<()> { let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, @@ -323,7 +332,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + && !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)) { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -345,15 +354,20 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; + try_no_ondemand_download!(self.put_rel_page_image( + modification, + rel, + blk.blkno, + image.freeze() + )); } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec)?; + try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec)); } - Ok(()) + PageReconstructResult::Success(()) } fn ingest_heapam_record( @@ -505,7 +519,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, rec: &XlCreateDatabase, - ) -> Result<()> { + ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; let src_db_id = rec.src_db_id; @@ -520,14 +534,16 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn)?; + .list_rels(src_tablespace_id, src_db_id, req_lsn) + .no_ondemand_download()?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .no_ondemand_download()?; modification.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; @@ -536,7 +552,10 @@ impl<'a> WalIngest<'a> { assert_eq!(src_rel.spcnode, src_tablespace_id); assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?; + let nblocks = modification + .tline + .get_rel_size(src_rel, req_lsn, true) + .no_ondemand_download()?; let dst_rel = RelTag { spcnode: tablespace_id, dbnode: db_id, @@ -553,7 +572,8 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, 
blknum, req_lsn, true)?; + .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + .no_ondemand_download()?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; } @@ -657,7 +677,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -713,7 +733,11 @@ impl<'a> WalIngest<'a> { relnode: xnode.relnode, }; let last_lsn = self.timeline.get_last_record_lsn(); - if modification.tline.get_rel_exists(rel, last_lsn, true)? { + if modification + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { self.put_rel_drop(modification, rel)?; } } @@ -725,7 +749,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, xlrec: &XlClogTruncate, - ) -> Result<()> { + ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db @@ -767,7 +791,8 @@ impl<'a> WalIngest<'a> { let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn)? + .list_slru_segments(SlruKind::Clog, req_lsn) + .no_ondemand_download()? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { @@ -923,10 +948,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; - modification.put_rel_page_image(rel, blknum, img)?; - Ok(()) + ) -> PageReconstructResult<()> { + try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum)); + try_prr!(modification.put_rel_page_image(rel, blknum, img)); + PageReconstructResult::Success(()) } fn put_rel_wal_record( @@ -936,7 +961,8 @@ impl<'a> WalIngest<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; + self.handle_rel_extend(modification, rel, blknum) + .no_ondemand_download()?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -946,7 +972,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { modification.put_rel_truncation(rel, nblocks)?; Ok(()) } @@ -956,11 +982,17 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? { + fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { + let nblocks = if !self + .timeline + .get_rel_exists(rel, lsn, true) + .no_ondemand_download()? + { 0 } else { - self.timeline.get_rel_size(rel, lsn, true)? + self.timeline + .get_rel_size(rel, lsn, true) + .no_ondemand_download()? }; Ok(nblocks) } @@ -970,30 +1002,31 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, - ) -> Result<()> { + ) -> PageReconstructResult<()> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? 
{ - // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0)?; - 0 - } else { - self.timeline.get_rel_size(rel, last_lsn, true)? - }; + let old_nblocks = + if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) { + // create it with 0 size initially, the logic below will extend it + try_prr!(modification.put_rel_creation(rel, 0)); + 0 + } else { + try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true)) + }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks)?; + try_prr!(modification.put_rel_extend(rel, new_nblocks)); // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())); } } - Ok(()) + PageReconstructResult::Success(()) } fn put_slru_page_image( @@ -1015,7 +1048,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1027,13 +1060,16 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn)? + .get_slru_segment_exists(kind, segno, last_lsn) + .no_ondemand_download()? { // create it with 0 size initially, the logic below will extend it modification.put_slru_segment_creation(kind, segno, 0)?; 0 } else { - self.timeline.get_slru_segment_size(kind, segno, last_lsn)? + self.timeline + .get_slru_segment_size(kind, segno, last_lsn) + .no_ondemand_download()? }; if new_nblocks > old_nblocks { @@ -1099,58 +1135,103 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A)?; - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .no_ondemand_download()?; m.commit()?; assert_current_logical_size(&*tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); // Check page contents at each LSN assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1161,20 +1242,36 @@ mod tests { assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1182,35 +1279,62 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x68)); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; m.commit()?; - 
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x68), false) + .no_ondemand_download()?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .no_ondemand_download()?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x70), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .no_ondemand_download()?, ZERO_PAGE ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .no_ondemand_download()?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + 1501 + ); for blk in 2..1500 { assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .no_ondemand_download()?, ZERO_PAGE ); } assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1500") ); @@ -1226,12 +1350,24 @@ mod tests { let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .no_ondemand_download()?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); @@ -1239,19 +1375,36 @@ mod tests { m.commit()?; // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false) + .no_ondemand_download()?, + false + ); // FIXME: should fail //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none()); // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .no_ondemand_download()?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1); + assert_eq!( + tline + 
.get_rel_exists(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + 1 + ); Ok(()) } @@ -1270,23 +1423,45 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .no_ondemand_download()?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1298,24 +1473,38 @@ mod tests { m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } // should still see all blocks with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + relsize + ); for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1326,18 +1515,32 @@ mod tests { let mut m = tline.begin_modification(lsn); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .no_ondemand_download()?; } m.commit()?; - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); let data = format!("foo blk {} 
at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1358,14 +1561,18 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); - walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .no_ondemand_download()?; m.commit()?; } assert_current_logical_size(&*tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE + 1 ); @@ -1374,7 +1581,12 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, + RELSEG_SIZE + ); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block @@ -1383,7 +1595,9 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE - 1 ); assert_current_logical_size(&*tline, Lsn(lsn)); @@ -1397,7 +1611,9 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, size as BlockNumber ); diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index a65703bca9..aeb7601af7 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -407,7 +407,7 @@ impl WalreceiverState { .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id)) + .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) }); let now = Utc::now().naive_utc(); diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 5b7e60aa5e..cc318cccc8 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,7 +20,9 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use crate::{ + metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate, +}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -248,9 +250,16 @@ pub async fn handle_walreceiver_connection( // at risk of hitting a deadlock. 
ensure!(lsn.is_aligned()); - walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) - .context("could not ingest record at {lsn}")?; + with_ondemand_download(|| { + walingest.ingest_record( + recdata.clone(), + lsn, + &mut modification, + &mut decoded, + ) + }) + .await + .with_context(|| format!("could not ingest record at {lsn}"))?; fail_point!("walreceiver-after-ingest"); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 38fb9a4247..7581140934 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1,6 +1,7 @@ //! //! Functions for parsing WAL records. //! + use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::pg_constants; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 8ea3f13bf5..d83a74ae14 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -318,14 +318,8 @@ def remote_consistent_lsn( detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: - # No remote information at all. This happens right after creating - # a timeline, before any part of it has been uploaded to remote - # storage yet. - return 0 - else: - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload( diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 5fe6c43528..9236137d19 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -49,7 +49,7 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", - "pageserver_current_physical_size", + "pageserver_resident_physical_size", "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d52ca38447..5b00ebdea7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -26,6 +26,7 @@ import asyncpg import backoff # type: ignore import boto3 import jwt +import prometheus_client import psycopg2 import pytest import requests @@ -41,6 +42,7 @@ from fixtures.utils import ( get_self_dir, subprocess_capture, ) +from prometheus_client.parser import text_string_to_metric_families # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -1204,8 +1206,22 @@ class PageserverHttpClient(requests.Session): # there are no tests for those right now. 
return size - def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") + def timeline_list( + self, + tenant_id: TenantId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + ) -> List[Dict[str, Any]]: + + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -1239,13 +1255,13 @@ class PageserverHttpClient(requests.Session): tenant_id: TenantId, timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, - include_non_incremental_physical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: params["include-non-incremental-logical-size"] = "yes" - if include_non_incremental_physical_size: - params["include-non-incremental-physical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", @@ -1320,11 +1336,88 @@ class PageserverHttpClient(requests.Session): res_json = res.json() assert res_json is None + def timeline_spawn_download_remote_layers( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> dict[str, Any]: + + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_poll_download_remote_layers_status( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + spawn_response: dict[str, Any], + poll_state=None, + ) -> None | dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + + # assumption in this API client here is that nobody else spawns the task + assert res_json["task_id"] == spawn_response["task_id"] + + if poll_state is None or res_json["state"] == poll_state: + return res_json + return None + + def timeline_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + errors_ok=False, + at_least_one_download=True, + ): + res = self.timeline_spawn_download_remote_layers(tenant_id, timeline_id) + while True: + completed = self.timeline_poll_download_remote_layers_status( + tenant_id, timeline_id, res, poll_state="Completed" + ) + if not completed: + time.sleep(0.1) + continue + if not errors_ok: + assert completed["failed_download_count"] == 0 + if at_least_one_download: + assert completed["successful_download_count"] > 0 + return completed + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) return res.text + def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str): + raw = self.get_metrics() + family: List[prometheus_client.Metric] = 
list(text_string_to_metric_families(raw)) + [metric] = [m for m in family if m.name == metric_name] + [sample] = [ + s + for s in metric.samples + if s.labels["tenant_id"] == str(tenant_id) + and s.labels["timeline_id"] == str(timeline_id) + ] + return sample.value + + def get_metric_value(self, name: str) -> Optional[str]: + metrics = self.get_metrics() + relevant = [line for line in metrics.splitlines() if line.startswith(name)] + if len(relevant) == 0: + log.info(f'could not find metric "{name}"') + return None + assert len(relevant) == 1 + return relevant[0].lstrip(name).strip() + @dataclass class PageserverPort: @@ -1622,7 +1715,12 @@ class NeonCli(AbstractNeonCli): pageserver_config_override=self.env.pageserver.config_override, ) - res = self.raw_cli(cmd) + s3_env_vars = None + if self.env.remote_storage is not None and isinstance( + self.env.remote_storage, S3Storage + ): + s3_env_vars = self.env.remote_storage.access_env_vars() + res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) res.check_returncode() return res @@ -2996,13 +3094,55 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def assert_no_in_progress_downloads_for_tenant( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, +def wait_until(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. + """ + last_exception = None + for i in range(number_of_iterations): + try: + res = func() + except Exception as e: + log.info("waiting for %s iteration %s failed", func, i + 1) + last_exception = e + time.sleep(interval) + continue + return res + raise Exception("timed out while waiting for %s" % func) from last_exception + + +def wait_while(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns false, or throws an exception. + """ + for i in range(number_of_iterations): + try: + if not func(): + return + log.info("waiting for %s iteration %s failed", func, i + 1) + time.sleep(interval) + continue + except Exception: + return + raise Exception("timed out while waiting for %s" % func) + + +def assert_tenant_status( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str ): tenant_status = pageserver_http_client.tenant_status(tenant) - assert tenant_status["has_in_progress_downloads"] is False, tenant_status - assert tenant_status["state"] == "Active" + log.info(f"tenant_status: {tenant_status}") + assert tenant_status["state"] == expected_status, tenant_status + + +def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId): + tenants = ps_http.tenant_list() + matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] + assert len(matching) < 2 + if len(matching) == 0: + return None + return matching[0] def remote_consistent_lsn( @@ -3010,14 +3150,15 @@ def remote_consistent_lsn( ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: + if detail["remote_consistent_lsn"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. 
return Lsn(0) - assert isinstance(lsn_str, str) - return Lsn(lsn_str) + else: + lsn_str = detail["remote_consistent_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) def wait_for_upload( @@ -3030,6 +3171,7 @@ def wait_for_upload( for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: + log.info("wait finished") return log.info( "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 71964f622f..05d5788028 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*Failed to load delta layer.*", + ".*Failed to reconstruct the page.*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -87,9 +87,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}" ) - # Second timeline has no ancestors, only the metadata file and no layer files. - # That is checked explicitly in the pageserver, and causes the tenant to be marked - # as broken. + # Second timeline has no ancestors, only the metadata file and no layer files locally, + # and we don't have the remote storage enabled. It is loaded into memory, but getting + # the basebackup from it will fail. with pytest.raises( Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken" ) as err: @@ -97,8 +97,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): log.info(f"As expected, compute startup failed for timeline with missing layers: {err}") # Third timeline will also fail during basebackup, because the layer file is corrupt. + # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) - with pytest.raises(Exception, match="Failed to load delta layer") as err: + with pytest.raises(Exception, match="Failed to reconstruct the page") as err: pg3.start() log.info( f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}" diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 7f86d92962..fa1bf0fbb2 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -37,7 +37,7 @@ def metrics_handler(request: Request) -> Response: checks = { "written_size": lambda value: value > 0, - "physical_size": lambda value: value >= 0, + "resident_size": lambda value: value >= 0, # >= 0 check here is to avoid race condition when we receive metrics before # remote_uploaded is updated "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py new file mode 100644 index 0000000000..352ae4b95c --- /dev/null +++ b/test_runner/regress/test_ondemand_download.py @@ -0,0 +1,437 @@ +# It's possible to run any regular test with the local fs remote storage via +# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
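+# +# The tests below exercise on-demand download of layer files: fetching parts of a large +# relation as queries touch them, fetching historical layers for timetravel queries, and +# the explicit `download_remote_layers` HTTP API.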
+ +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + RemoteStorageKind, + assert_tenant_status, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_sk_commit_lsn_to_reach_remote_storage, + wait_for_upload, + wait_until, +) +from fixtures.types import Lsn +from fixtures.utils import query_scalar + + +def get_num_downloaded_layers(client, tenant_id, timeline_id): + value = client.get_metric_value( + f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}' + ) + if value is None: + return 0 + return int(value) + + +# +# If you have a large relation, check that the pageserver downloads parts of it as +# required by queries. +# +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_ondemand_download_large_rel( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_large_rel", + ) + + ##### First start, insert data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{10 * 1024 ** 2}", # 10 MB + "compaction_threshold": "3", + "compaction_target_size": f"{10 * 1024 ** 2}", # 10 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + # We want to make sure that the data is large enough that the keyspace is partitioned. + num_rows = 1000000 + + with pg.cursor() as cur: + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute( + f"""CREATE TABLE tbl AS SELECT g as id, 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) + cur.execute("CREATE INDEX ON tbl (id)") + cur.execute("VACUUM tbl") + + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info("uploads have finished") + + ##### Stop the first pageserver instance, erase all its data + pg.stop() + env.pageserver.stop() + + # remove all the layer files + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start() + + pg.start() + before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + + # Probe in the middle of the table.
There's a high chance that the beginning + # and end of the table were stored together in the same layer files with data + # from other tables, and with the entry that stores the size of the + # relation, so they are likely already downloaded. But the middle of the + # table should not have been needed by anything yet. + with pg.cursor() as cur: + assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1 + + after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + log.info(f"layers downloaded before {before_downloads} and after {after_downloads}") + assert after_downloads > before_downloads + + +# +# If you have a relation with a long history of updates, the pageserver downloads the layer +# files containing the history as needed by timetravel queries. +# +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_ondemand_download_timetravel( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_timetravel", + ) + + ##### First start, insert data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # Disable background GC & compaction + # We don't want GC, as that would break the assertion about num downloads. + # We don't want background compaction, we force a compaction every time we do an explicit checkpoint. + "gc_period": "0s", + "compaction_period": "0s", + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB + "compaction_threshold": "1", + "image_creation_threshold": "1", + "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + lsns = [] + + table_len = 10000 + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); + INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len}); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + lsns.append((0, current_lsn)) + + for checkpoint_number in range(1, 20): + with pg.cursor() as cur: + cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((checkpoint_number, current_lsn)) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info("uploads have finished") + + ##### Stop the first pageserver instance, erase all its data + env.postgres.stop_all() + 
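# Make sure everything the safekeepers have sent has been ingested and uploaded to remote storage, so that the layer files deleted below can be re-downloaded on demand. +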
wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) + + def get_api_current_physical_size(): + d = client.timeline_detail(tenant_id, timeline_id) + return d["current_physical_size"] + + def get_resident_physical_size(): + return client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + filled_current_physical = get_api_current_physical_size() + log.info(filled_current_physical) + filled_size = get_resident_physical_size() + log.info(filled_size) + assert filled_current_physical == filled_size, "we don't yet do layer eviction" + + env.pageserver.stop() + + # remove all the layer files + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start() + + wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + + # current_physical_size reports sum of layer file sizes, regardless of local or remote + assert filled_current_physical == get_api_current_physical_size() + + num_layers_downloaded = [0] + physical_size = [get_resident_physical_size()] + for (checkpoint_number, lsn) in lsns: + pg_old = env.postgres.create_start( + branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn + ) + with pg_old.cursor() as cur: + # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000 + assert ( + query_scalar( + cur, + f"select count(*) from testtab where checkpoint_number<>{checkpoint_number}", + ) + == 0 + ) + assert ( + query_scalar( + cur, + f"select count(*) from testtab where checkpoint_number={checkpoint_number}", + ) + == table_len + ) + + after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + num_layers_downloaded.append(after_downloads) + log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}") + + # Check that on each query, we need to download at least one more layer file. However in + # practice, thanks to compaction and the fact that some requests need to download + # more history, some points-in-time are covered by earlier downloads already. But + # in broad strokes, as we query more points-in-time, more layers need to be downloaded. + # + # Do a fuzzy check on that, by checking that after each point-in-time, we have downloaded + # more files than we had three iterations ago. 
+ log.info(f"layers downloaded after checkpoint {checkpoint_number}: {after_downloads}") + if len(num_layers_downloaded) > 4: + assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4] + + # Likewise, assert that the physical_size metric grows as layers are downloaded + physical_size.append(get_resident_physical_size()) + log.info(f"physical_size[-1]={physical_size[-1]}") + if len(physical_size) > 4: + assert physical_size[-1] > physical_size[len(physical_size) - 4] + + # current_physical_size reports sum of layer file sizes, regardless of local or remote + assert filled_current_physical == get_api_current_physical_size() + + +# +# Ensure that the `download_remote_layers` API works +# +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_download_remote_layers_api( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_download_remote_layers_api", + ) + + ##### First start, insert data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # Disable background GC & compaction + # We don't want GC, that would break the assertion about num downloads. + # We don't want background compaction, we force a compaction every time we do explicit checkpoint. + "gc_period": "0s", + "compaction_period": "0s", + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB + "compaction_threshold": "1", + "image_creation_threshold": "1", + "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + table_len = 10000 + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); + INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len}); + """ + ) + + env.postgres.stop_all() + + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) + + def get_api_current_physical_size(): + d = client.timeline_detail(tenant_id, timeline_id) + return d["current_physical_size"] + + def get_resident_physical_size(): + return client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + filled_current_physical = get_api_current_physical_size() + log.info(filled_current_physical) + filled_size = get_resident_physical_size() + log.info(filled_size) + assert filled_current_physical == filled_size, "we don't yet do layer eviction" + + env.pageserver.stop() + + # remove all the layer files + # XXX only delete some of the layer files, to show that it really just downloads all the layers + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + # Shut down safekeepers before starting the pageserver. + # If we don't, the tenant's walreceiver handler will trigger the + # the logical size computation task, and that downloads layes, + # which makes our assertions on size fail. 
+ for sk in env.safekeepers: + sk.stop(immediate=True) + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}) + env.pageserver.allowed_errors.extend( + [ + f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint", + f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size", + ] + ) + + wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + + ###### Phase 1: exercise download error code path + assert ( + filled_current_physical == get_api_current_physical_size() + ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" + post_unlink_size = get_resident_physical_size() + log.info(post_unlink_size) + assert ( + post_unlink_size < filled_size + ), "we just deleted layers and didn't cause anything to re-download them yet" + assert filled_size - post_unlink_size > 5 * ( + 1024**2 + ), "we may be downloading some layers as part of tenant activation" + + # issue downloads that we know will fail + info = client.timeline_download_remote_layers( + tenant_id, timeline_id, errors_ok=True, at_least_one_download=False + ) + log.info(f"info={info}") + assert info["state"] == "Completed" + assert info["total_layer_count"] > 0 + assert info["successful_download_count"] == 0 + assert ( + info["failed_download_count"] > 0 + ) # can't assert == total_layer_count because attach + tenant status downloads some layers + assert ( + info["total_layer_count"] + == info["successful_download_count"] + info["failed_download_count"] + ) + assert get_api_current_physical_size() == filled_current_physical + assert ( + get_resident_physical_size() == post_unlink_size + ), "didn't download anything new due to failpoint" + # would be nice to assert that the layers in the layer map are still RemoteLayer + + ##### Retry, this time without failpoints + client.configure_failpoints(("remote-storage-download-pre-rename", "off")) + info = client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False) + log.info(f"info={info}") + + assert info["state"] == "Completed" + assert info["total_layer_count"] > 0 + assert info["successful_download_count"] > 0 + assert info["failed_download_count"] == 0 + assert ( + info["total_layer_count"] + == info["successful_download_count"] + info["failed_download_count"] + ) + + refilled_size = get_resident_physical_size() + log.info(refilled_size) + + assert filled_size == refilled_size, "we redownloaded all the layers" + assert get_api_current_physical_size() == filled_current_physical + + for sk in env.safekeepers: + sk.start() + + # ensure that all the data is back + pg_old = env.postgres.create_start(branch_name="main") + with pg_old.cursor() as cur: + assert query_scalar(cur, "select count(*) from testtab") == table_len diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 94e483cdb5..32c25b2e8c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PageserverApiException, RemoteStorageKind, - assert_no_in_progress_downloads_for_tenant, available_remote_storages, wait_for_last_flush_lsn, wait_for_last_record_lsn, @@ -62,9 +61,9 @@ def test_remote_storage_backup_and_restore( neon_env_builder.pageserver_config_override = 
"test_remote_failures=1" data_id = 1 - data_secret = "very secret secret" + data = "just some data" - ##### First start, insert secret data and upload it to the remote storage + ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() # FIXME: Is this expected? @@ -97,8 +96,8 @@ def test_remote_storage_backup_and_restore( with pg.cursor() as cur: cur.execute( f""" - CREATE TABLE t{checkpoint_number}(id int primary key, secret text); - INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + CREATE TABLE t{checkpoint_number}(id int primary key, data text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data}|{checkpoint_number}'); """ ) current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -133,36 +132,53 @@ def test_remote_storage_backup_and_restore( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - # Introduce failpoint in download - pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) - + # Introduce failpoint in list remote timelines code path to make tenant_attach fail. + # This is before the failures injected by test_remote_failures, so it's a permanent error. + pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) + env.pageserver.allowed_errors.append( + ".*error attaching tenant: storage-sync-list-remote-timelines", + ) + # Attach it. This HTTP request will succeed and launch a + # background task to load the tenant. In that background task, + # listing the remote timelines will fail because of the failpoint, + # and the tenant will be marked as Broken. client.tenant_attach(tenant_id) - - # is there a better way to assert that failpoint triggered? wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) - # assert cannot attach timeline that is scheduled for download - # FIXME implement layer download retries + # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): client.tenant_attach(tenant_id) - tenant_status = client.tenant_status(tenant_id) - log.info("Tenant status with active failpoint: %s", tenant_status) - # FIXME implement layer download retries - # assert tenant_status["has_in_progress_downloads"] is True - - # trigger temporary download files removal + # Restart again, this implicitly clears the failpoint. + # test_remote_failures=1 remains active, though, as it's in the pageserver config. + # This means that any of the remote client operations after restart will exercise the + # retry code path. + # + # The initiated attach operation should survive the restart, and continue from where it was. env.pageserver.stop() + layer_download_failed_regex = ( + r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" + ) + assert not env.pageserver.log_contains( + layer_download_failed_regex + ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" env.pageserver.start() - # ensure that an initiated attach operation survives pageserver restart + # Ensure that the pageserver remembers that the tenant was attaching, by + # trying to attach it again. It should fail. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): client.tenant_attach(tenant_id) - log.info("waiting for timeline redownload") + log.info("waiting for tenant to become active. 
this should be quick with on-demand download")
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
     wait_until(
-        number_of_iterations=20,
+        number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=tenant_active,
     )
 
     detail = client.timeline_detail(tenant_id, timeline_id)
@@ -171,14 +187,18 @@ def test_remote_storage_backup_and_restore(
         Lsn(detail["last_record_lsn"]) >= current_lsn
     ), "current db Lsn should should not be less than the one stored on remote storage"
 
+    log.info("select some data, this will cause layers to be downloaded")
     pg = env.postgres.create_start("main")
     with pg.cursor() as cur:
         for checkpoint_number in checkpoint_numbers:
             assert (
-                query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};")
-                == f"{data_secret}|{checkpoint_number}"
+                query_scalar(cur, f"SELECT data FROM t{checkpoint_number} WHERE id = {data_id};")
+                == f"{data}|{checkpoint_number}"
             )
 
+    log.info("ensure that we needed to retry downloads due to test_remote_failures=1")
+    assert env.pageserver.log_contains(layer_download_failed_regex)
+
 
 # Exercises the upload queue retry code paths.
 # - Use failpoints to cause all storage ops to fail
@@ -338,7 +358,6 @@ def test_remote_storage_upload_queue_retries(
     def tenant_active():
         all_states = client.tenant_list()
         [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["has_in_progress_downloads"] is False
         assert tenant["state"] == "Active"
 
     wait_until(30, 1, tenant_active)
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 081fd0fc2f..1b58937e2a 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -13,12 +13,15 @@ from fixtures.neon_fixtures import (
     PageserverHttpClient,
     PortDistributor,
     Postgres,
-    assert_no_in_progress_downloads_for_tenant,
+    assert_tenant_status,
+    tenant_exists,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
+    wait_while,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until
+from fixtures.utils import query_scalar, start_in_background, subprocess_capture
 
 
 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -406,17 +409,13 @@ def test_tenant_relocation(
     # call to attach timeline to new pageserver
     new_pageserver_http.tenant_attach(tenant_id)
 
-    # check that it shows that download is in progress
+    # wait for tenant to finish attaching
     tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
-    assert tenant_status.get("has_in_progress_downloads"), tenant_status
-
-    # wait until tenant is downloaded
+    assert tenant_status["state"] in ["Attaching", "Active"]
     wait_until(
         number_of_iterations=10,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(
-            new_pageserver_http, tenant_id
-        ),
+        func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"),
     )
 
     check_timeline_attached(
@@ -459,9 +458,15 @@ def test_tenant_relocation(
 
     # detach tenant from old pageserver before we check
     # that all the data is there to be sure that old pageserver
-    # is no longer involved, and if it is, we will see the errors
+    # is no longer involved, and if it is, we will see the error
     pageserver_http.tenant_detach(tenant_id)
 
+    # Wait a little, so that the detach
operation has time to finish. + wait_while( + number_of_iterations=100, + interval=1, + func=lambda: tenant_exists(pageserver_http, tenant_id), + ) post_migration_check(pg_main, 500500, old_local_path_main) post_migration_check(pg_second, 1001000, old_local_path_second) diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index ddae1a67ff..4eba4ce942 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -20,44 +20,48 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): matching = [t for t in all_states if TenantId(t["id"]) == tenant] return get_only_element(matching)["state"] - def get_metric_value(name): - metrics = client.get_metrics() - relevant = [line for line in metrics.splitlines() if line.startswith(name)] - if len(relevant) == 0: - return 0 - line = get_only_element(relevant) - value = line.lstrip(name).strip() - return int(value) - def delete_all_timelines(tenant: TenantId): timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) + def assert_active(tenant): + assert get_state(tenant) == "Active" + # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) + assert ( + get_state(tenant) == "Active" + ), "Pageserver should activate a tenant and start background jobs if timelines are loaded" # Stop compute pg.stop() - # Delete all timelines on all tenants + # Delete all timelines on all tenants. + # + # FIXME: we used to check that the background jobs are stopped when all timelines + # are removed, but we don't stop them anymore. Not sure if this test still makes sense + # or we should just remove it. 
for tenant_info in client.tenant_list(): tenant_id = TenantId(tenant_info["id"]) delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_active(tenant_id)) # Assert that all tasks finish quickly after tenant is detached - assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + assert task_starts is not None + assert int(task_starts) > 0 client.tenant_detach(tenant) client.tenant_detach(env.initial_tenant) def assert_tasks_finish(): - tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') - tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') - tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}') + tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}') log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended - assert tasks_panicked == 0 + assert tasks_panicked is None or int(tasks_panicked) == 0 wait_until(10, 0.2, assert_tasks_finish) diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 4cd74e17e9..6a5b4278da 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -21,7 +21,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, Postgres, RemoteStorageKind, - assert_no_in_progress_downloads_for_tenant, + assert_tenant_status, available_remote_storages, wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, @@ -179,14 +179,6 @@ def test_tenants_attached_after_download( tenant_id, timeline_id, env.safekeepers, env.pageserver ) - detail_before = client.timeline_detail( - tenant_id, timeline_id, include_non_incremental_physical_size=True - ) - assert ( - detail_before["current_physical_size_non_incremental"] - == detail_before["current_physical_size"] - ) - env.pageserver.stop() timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -200,13 +192,16 @@ def test_tenants_attached_after_download( assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}" ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + # FIXME: just starting the pageserver no longer downloads the + # layer files. Do we want to force download, or maybe run some + # queries, or is it enough that it starts up without layer files? 
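+    # One way to force the downloads explicitly, if we decide we want that here, would be
+    # the download_remote_layers endpoint introduced in this change, e.g.
+    #   client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False)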
env.pageserver.start() client = env.pageserver.http_client() wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=lambda: assert_tenant_status(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -218,12 +213,6 @@ def test_tenants_attached_after_download( timeline_id ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" - # Check that the physical size matches after re-downloading - detail_after = client.timeline_detail( - tenant_id, timeline_id, include_non_incremental_physical_size=True - ) - assert detail_before["current_physical_size"] == detail_after["current_physical_size"] - # Check that we had to retry the downloads assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*") @@ -297,7 +286,7 @@ def test_tenant_upgrades_index_json_from_v0( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), ) pg = env.postgres.create_start("main") @@ -404,7 +393,7 @@ def test_tenant_ignores_backup_file( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), ) pg = env.postgres.create_start("main") @@ -484,14 +473,15 @@ def test_tenant_redownloads_truncated_file_on_startup( index_part = local_fs_index_part(env, tenant_id, timeline_id) assert index_part["layer_metadata"][path.name]["file_size"] == expected_size - ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + ## Start the pageserver. It will notice that the file size doesn't match, and + ## rename away the local file. It will be re-downloaded when it's needed. env.pageserver.start() client = env.pageserver.http_client() wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=lambda: assert_tenant_status(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -503,6 +493,10 @@ def test_tenant_redownloads_truncated_file_on_startup( timeline_id ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + # Request non-incremental logical size. Calculating it needs the layer file that + # we corrupted, forcing it to be redownloaded. 
+ client.timeline_detail(tenant_id, timeline_id, include_non_incremental_logical_size=True) + assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded" # the remote side of local_layer_truncated diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 523c946a68..3b41cc5c90 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -20,10 +20,12 @@ from fixtures.neon_fixtures import ( PortDistributor, Postgres, VanillaPostgres, + assert_tenant_status, wait_for_last_flush_lsn, + wait_until, ) from fixtures.types import TenantId, TimelineId -from fixtures.utils import get_timeline_dir_size, wait_until +from fixtures.utils import get_timeline_dir_size def test_timeline_size(neon_simple_env: NeonEnv): @@ -320,7 +322,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env.pageserver.stop() env.pageserver.start() - assert_physical_size(env, env.initial_tenant, new_timeline_id) + # Wait for the tenant to be loaded + client = env.pageserver.http_client() + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"), + ) + + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): @@ -341,7 +353,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): @@ -376,7 +390,9 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): @@ -415,7 +431,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) # The timeline logical and physical sizes are also exposed as prometheus metrics. 
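For reference, a minimal sketch of reading the resident-size gauge through the fixture helper used elsewhere in this change, rather than regex-parsing the metrics text as the next hunk does (the env/tenant/timeline names are illustrative):

    client = env.pageserver.http_client()
    # Scrape /metrics and return the gauge value for this tenant/timeline pair.
    resident_size = client.get_timeline_metric(
        env.initial_tenant, new_timeline_id, "pageserver_resident_physical_size"
    )
    log.info(f"resident_physical_size={resident_size}")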
@@ -448,7 +466,7 @@ def test_timeline_size_metrics( # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() matches = re.search( - f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', + f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', metrics, re.MULTILINE, ) @@ -507,11 +525,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: TimelineId): - res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) - return res["current_physical_size_non_incremental"] + def get_timeline_resident_physical_size(timeline: TimelineId): + sizes = get_physical_size_values(env, tenant, timeline) + assert_physical_size_invariants(sizes) + return sizes.prometheus_resident_physical - timeline_total_size = get_timeline_physical_size(timeline) + timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline) for i in range(10): n_rows = random.randint(100, 1000) @@ -528,22 +547,54 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, tenant, timeline) pageserver_http.timeline_checkpoint(tenant, timeline) - timeline_total_size += get_timeline_physical_size(timeline) + timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) pg.stop() - tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"]) - assert tenant_physical_size == timeline_total_size + # ensure that tenant_status current_physical size reports sum of timeline current_physical_size + tenant_current_physical_size = int( + client.tenant_status(tenant_id=tenant)["current_physical_size"] + ) + assert tenant_current_physical_size == sum( + [tl["current_physical_size"] for tl in client.timeline_list(tenant_id=tenant)] + ) + # since we don't do layer eviction, current_physical_size is identical to resident physical size + assert timeline_total_resident_physical_size == tenant_current_physical_size -def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): - """Check the current physical size returned from timeline API - matches the total physical size of the timeline on disk""" +class TimelinePhysicalSizeValues: + api_current_physical: int + prometheus_resident_physical: int + python_timelinedir_layerfiles_physical: int + + +def get_physical_size_values( + env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId +) -> TimelinePhysicalSizeValues: + res = TimelinePhysicalSizeValues() + client = env.pageserver.http_client() - res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True) + + res.prometheus_resident_physical = client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + detail = client.timeline_detail( + tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True + ) + res.api_current_physical = detail["current_physical_size"] + timeline_path = env.timeline_dir(tenant_id, timeline_id) - assert res["current_physical_size"] == res["current_physical_size_non_incremental"] - assert res["current_physical_size"] == get_timeline_dir_size(timeline_path) + res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path) + + return res + + +def assert_physical_size_invariants(sizes: 
TimelinePhysicalSizeValues):
+    # resident physical size is defined as the sum of the sizes of the layer files in the timeline directory
+    assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
+    # we don't do layer eviction, so all layers are resident
+    assert sizes.api_current_physical == sizes.prometheus_resident_physical
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index d88ed319b5..77ec33f8b0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -585,17 +585,23 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
         if elapsed > wait_lsn_timeout:
             raise RuntimeError("Timed out waiting for WAL redo")
 
-        pageserver_lsn = Lsn(
-            env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"]
-        )
-        lag = last_lsn - pageserver_lsn
+        tenant_status = ps_cli.tenant_status(tenant_id)
+        if tenant_status["state"] == "Loading":
+            log.debug(f"Tenant {tenant_id} is still loading, retrying")
+        else:
+            pageserver_lsn = Lsn(
+                env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[
+                    "last_record_lsn"
+                ]
+            )
+            lag = last_lsn - pageserver_lsn
 
-        if time.time() > last_debug_print + 10 or lag <= 0:
-            last_debug_print = time.time()
-            log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
+            if time.time() > last_debug_print + 10 or lag <= 0:
+                last_debug_print = time.time()
+                log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
 
-        if lag <= 0:
-            break
+            if lag <= 0:
+                break
 
         time.sleep(1)
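The Loading-state check above serves the same purpose as the polling done elsewhere in this change; for comparison, a minimal sketch of an equivalent explicit wait using the fixture helpers (the iteration count is illustrative and not part of the change):

    from fixtures.neon_fixtures import assert_tenant_status, wait_until

    # Poll until the tenant has finished loading and reports Active,
    # then proceed with the replication-lag checks.
    wait_until(
        number_of_iterations=30,
        interval=1,
        func=lambda: assert_tenant_status(ps_cli, tenant_id, "Active"),
    )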