From 7ff591ffbfc8f084e0b1b5cdbed8bd69e008d4c0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 5 Dec 2022 10:20:24 -0500
Subject: [PATCH] On-Demand Download
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code in this change was extracted from #2595 (Heikki’s on-demand
download draft PR).

High-Level Changes
- New RemoteLayer Type
- On-Demand Download As An Effect Of Page Reconstruction
- Breaking Semantics For Physical Size Metrics

There are several follow-up work items planned. Refer to the Epic issue
on GitHub: https://github.com/neondatabase/neon/issues/2029

closes https://github.com/neondatabase/neon/pull/3013

Co-authored-by: Kirill Bulatov
Co-authored-by: Christian Schwarz

New RemoteLayer Type
====================

Instead of downloading all layers during tenant attach, we create a
RemoteLayer instance for each of them and add those to the layer map.

On-Demand Download As An Effect Of Page Reconstruction
======================================================

At the heart of the pageserver is Timeline::get_reconstruct_data(). It
traverses the layer map until it has collected all the data it needs to
produce the page image. Most code in the code base uses it, through
many layers of indirection.

Before this patch, the function would use synchronous filesystem IO to
load data from disk-resident layer files if the data was not cached.
That is not possible with RemoteLayer, because the layer file has not
been downloaded yet. So, we do the download when get_reconstruct_data
encounters the RemoteLayer, i.e., "on demand".

The mechanics of how the download is done are rather involved, because
of the infamous async-sync-async sandwich problem that plagues the
async Rust world. We work around it with the new PageReconstructResult
type, whose introduction causes a good amount of code churn in this
patch. Refer to the block comment on `with_ondemand_download()` for
details; a simplified sketch of the pattern follows at the end of this
message.

Breaking Semantics For Physical Size Metrics
============================================

We rename the prometheus metric pageserver_current_physical_size to
pageserver_resident_physical_size, to reflect what this metric actually
represents with on-demand download. This intentionally BREAKS existing
grafana dashboards and the cost model data pipeline. Breaking is
desirable because the meaning of this metric has changed with on-demand
download. See
https://docs.google.com/document/d/12AFpvKY-7FZdR5a4CaD6Ir_rI3QokdCLSPJ6upHxJBo/edit#
for how we will handle this breakage.

Likewise, in the new billing_metrics, we rename PhysicalSize =>
ResidentSize. It is not yet used anywhere, so this is not a breaking
change.

There is still a field called TimelineInfo::current_physical_size. It
is now the sum of the layer sizes in the layer map, regardless of
whether the layers are local or remote. To compute that sum, we added a
new trait method, PersistentLayer::file_size().

When updating the Python tests, we got rid of
current_physical_size_non_incremental. An earlier commit had already
removed it from the OpenAPI spec, so this is not a breaking change.

test_timeline_size.py has grown additional assertions on the
resident_physical_size metric.
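Appendix: Sketch Of The On-Demand Download Control Flow
========================================================

For reviewers, here is a minimal, hypothetical sketch of the retry loop
that `with_ondemand_download()` implements. The type and function names
below are simplified stand-ins, not the real signatures; refer to the
block comment in timeline.rs for the authoritative description.

    use std::sync::Arc;

    struct RemoteLayer; // stand-in for the real remote-layer type

    enum PageReconstructResult<T> {
        Success(T),
        // The synchronous code path hit a layer that is not resident yet.
        NeedsDownload(Arc<RemoteLayer>),
        Error(anyhow::Error),
    }

    // Placeholder: the real code awaits the remote-storage download task.
    async fn download(_layer: &RemoteLayer) -> anyhow::Result<()> {
        Ok(())
    }

    /// Drive a synchronous, get_reconstruct_data-based closure to
    /// completion, performing any required downloads between retries.
    async fn with_ondemand_download<T, F>(f: F) -> anyhow::Result<T>
    where
        F: Fn() -> PageReconstructResult<T>,
    {
        loop {
            match f() {
                PageReconstructResult::Success(v) => return Ok(v),
                PageReconstructResult::NeedsDownload(layer) => {
                    // Await the download on the async side, then re-run
                    // the closure; it should now find the layer on disk.
                    download(&layer).await?;
                }
                PageReconstructResult::Error(e) => return Err(e),
            }
        }
    }

Callers that cannot tolerate a download (e.g. purely synchronous code
paths) use the no_ondemand_download() / try_no_ondemand_download!
conversions instead, which turn the NeedsDownload case into an error.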
--- libs/pageserver_api/src/models.rs | 23 +- pageserver/src/basebackup.rs | 65 +- pageserver/src/billing_metrics.rs | 28 +- pageserver/src/http/routes.rs | 114 ++- pageserver/src/import_datadir.rs | 14 +- pageserver/src/lib.rs | 2 +- pageserver/src/metrics.rs | 22 +- pageserver/src/page_service.rs | 30 +- pageserver/src/pgdatadir_mapping.rs | 311 ++++--- pageserver/src/storage_sync2.rs | 46 +- pageserver/src/storage_sync2/download.rs | 4 + pageserver/src/task_mgr.rs | 18 + pageserver/src/tenant.rs | 62 +- pageserver/src/tenant/delta_layer.rs | 25 +- pageserver/src/tenant/image_layer.rs | 25 +- pageserver/src/tenant/remote_layer.rs | 212 +++++ pageserver/src/tenant/size.rs | 2 - pageserver/src/tenant/storage_layer.rs | 27 + pageserver/src/tenant/timeline.rs | 832 ++++++++++++++---- pageserver/src/virtual_file.rs | 6 +- pageserver/src/walingest.rs | 470 +++++++--- .../src/walreceiver/connection_manager.rs | 2 +- .../src/walreceiver/walreceiver_connection.rs | 17 +- pageserver/src/walrecord.rs | 1 + scripts/export_import_between_pageservers.py | 10 +- test_runner/fixtures/metrics.py | 2 +- test_runner/fixtures/neon_fixtures.py | 172 +++- test_runner/regress/test_broken_timeline.py | 11 +- test_runner/regress/test_metric_collection.py | 2 +- test_runner/regress/test_ondemand_download.py | 437 +++++++++ test_runner/regress/test_remote_storage.py | 69 +- test_runner/regress/test_tenant_relocation.py | 25 +- test_runner/regress/test_tenant_tasks.py | 34 +- .../test_tenants_with_remote_storage.py | 34 +- test_runner/regress/test_timeline_size.py | 89 +- test_runner/regress/test_wal_acceptor.py | 24 +- 36 files changed, 2556 insertions(+), 711 deletions(-) create mode 100644 pageserver/src/tenant/remote_layer.rs create mode 100644 test_runner/regress/test_ondemand_download.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 586ce2a73a..88603d9539 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -163,6 +163,8 @@ pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: TenantId, pub state: TenantState, + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, } @@ -191,9 +193,12 @@ pub struct TimelineInfo { #[serde_as(as = "DisplayFromStr")] pub remote_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. 
pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, - pub current_physical_size_non_incremental: Option, + + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, #[serde_as(as = "Option")] @@ -205,6 +210,22 @@ pub struct TimelineInfo { pub state: TimelineState, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct DownloadRemoteLayersTaskInfo { + pub task_id: String, + pub state: DownloadRemoteLayersTaskState, + pub total_layer_count: u64, // stable once `completed` + pub successful_download_count: u64, // stable once `completed` + pub failed_download_count: u64, // stable once `completed` +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum DownloadRemoteLayersTaskState { + Running, + Completed, + ShutDown, +} + pub type ConfigureFailpointsRequest = Vec; /// Information for configuring a single fail point diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 973c3cd3a6..aa87865a8a 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,7 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::tenant::Timeline; +use crate::task_mgr; +use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; @@ -152,23 +153,29 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn)? { + for segno in + with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))? + { self.add_slru_segment(kind, segno)?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { + for ((spcnode, dbnode), has_relmap_file) in + with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))? + { self.add_dbdir(spcnode, dbnode, has_relmap_file)?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? { + for rel in with_ondemand_download_sync(|| { + self.timeline.list_rels(spcnode, dbnode, self.lsn) + })? { self.add_rel(rel)?; } } } - for xid in self.timeline.list_twophase_files(self.lsn)? { + for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? { self.add_twophase_file(xid)?; } @@ -185,7 +192,8 @@ where } fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?; + let nblocks = + with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?; // Function that adds relation segment data to archive let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { @@ -208,7 +216,8 @@ where for blknum in blocks { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?; + .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .no_ondemand_download()?; segment_data.extend_from_slice(&img[..]); } @@ -222,13 +231,16 @@ where // Generate SLRU segment files from repository. 
// fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; + let nblocks = with_ondemand_download_sync(|| { + self.timeline.get_slru_segment_size(slru, segno, self.lsn) + })?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { - let img = self - .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; + let img = with_ondemand_download_sync(|| { + self.timeline + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + })?; if slru == SlruKind::Clog { ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); @@ -260,7 +272,9 @@ where has_relmap_file: bool, ) -> anyhow::Result<()> { let relmap_img = if has_relmap_file { - let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; + let img = with_ondemand_download_sync(|| { + self.timeline.get_relmap_file(spcnode, dbnode, self.lsn) + })?; ensure!(img.len() == 512); Some(img) } else { @@ -295,7 +309,8 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn)? + .list_rels(spcnode, dbnode, self.lsn) + .no_ondemand_download()? .is_empty() { return Ok(()); @@ -327,7 +342,7 @@ where // Extract twophase state files // fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn)?; + let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -361,14 +376,12 @@ where zenith_signal.as_bytes(), )?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) - .context("failed get control bytes")?; + let checkpoint_bytes = + with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn)) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = + with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn)) + .context("failed get control bytes")?; let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( &pg_control_bytes, @@ -490,3 +503,11 @@ where } } } + +fn with_ondemand_download_sync(f: F) -> anyhow::Result +where + F: Send + Fn() -> PageReconstructResult, + T: Send, +{ + task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f)) +} diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs index c5da54b8fc..f9d3e8553f 100644 --- a/pageserver/src/billing_metrics.rs +++ b/pageserver/src/billing_metrics.rs @@ -73,10 +73,10 @@ pub enum BillingMetricKind { /// This is an absolute, per-tenant metric. /// This is the same metric that tenant/tenant_id/size endpoint returns. SyntheticStorageSize, - /// Size of all the files in the tenant's directory on disk on the pageserver. + /// Size of all the layer files in the tenant's directory on disk on the pageserver. /// This is an absolute, per-tenant metric. - /// See also prometheus metric CURRENT_PHYSICAL_SIZE. - PhysicalSize, + /// See also prometheus metric RESIDENT_PHYSICAL_SIZE. + ResidentSize, /// Size of the remote storage (S3) directory. /// This is an absolute, per-tenant metric. 
RemoteStorageSize, @@ -89,7 +89,7 @@ impl FromStr for BillingMetricKind { match s { "written_size" => Ok(Self::WrittenSize), "synthetic_storage_size" => Ok(Self::SyntheticStorageSize), - "physical_size" => Ok(Self::PhysicalSize), + "resident_size" => Ok(Self::ResidentSize), "remote_storage_size" => Ok(Self::RemoteStorageSize), _ => anyhow::bail!("invalid value \"{s}\" for metric type"), } @@ -101,7 +101,7 @@ impl fmt::Display for BillingMetricKind { f.write_str(match self { BillingMetricKind::WrittenSize => "written_size", BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size", - BillingMetricKind::PhysicalSize => "physical_size", + BillingMetricKind::ResidentSize => "resident_size", BillingMetricKind::RemoteStorageSize => "remote_storage_size", }) } @@ -171,7 +171,7 @@ pub async fn collect_metrics_task( let tenant = tenant_mgr::get_tenant(tenant_id, true).await?; - let mut tenant_physical_size = 0; + let mut tenant_resident_size = 0; // iterate through list of timelines in tenant for timeline in tenant.list_timelines().iter() { @@ -186,27 +186,27 @@ pub async fn collect_metrics_task( timeline_written_size, )); - let timeline_size = timeline.get_physical_size(); - tenant_physical_size += timeline_size; + let timeline_resident_size = timeline.get_resident_physical_size(); + tenant_resident_size += timeline_resident_size; debug!( - "per-timeline current metrics for tenant: {}: timeline {} physical_size={} last_record_lsn {} (as bytes)", - tenant_id, timeline.timeline_id, timeline_size, timeline_written_size) + "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)", + tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size) } let tenant_remote_size = tenant.get_remote_size().await?; debug!( - "collected current metrics for tenant: {}: state={:?} tenant_physical_size={} remote_size={}", - tenant_id, tenant_state, tenant_physical_size, tenant_remote_size + "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}", + tenant_id, tenant_state, tenant_resident_size, tenant_remote_size ); current_metrics.push(( BillingMetricsKey { tenant_id, timeline_id: None, - metric: BillingMetricKind::PhysicalSize, + metric: BillingMetricKind::ResidentSize, }, - tenant_physical_size, + tenant_resident_size, )); current_metrics.push(( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 937a6144b6..6d97f3206e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,7 +12,7 @@ use super::models::{ TimelineCreateRequest, TimelineInfo, }; use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::tenant::Timeline; +use crate::tenant::{with_ondemand_download, Timeline}; use crate::tenant_config::TenantConfOpt; use crate::{config::PageServerConf, tenant_mgr}; use utils::{ @@ -78,25 +78,23 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res } // Helper function to construct a TimelineInfo struct for a timeline -fn build_timeline_info( +async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, ) -> anyhow::Result { let mut info = build_timeline_info_common(timeline)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while // we're executing this function, we will outlive the timeline on-disk state. 
- info.current_logical_size_non_incremental = - Some(timeline.get_current_logical_size_non_incremental( - info.last_record_lsn, - CancellationToken::new(), - )?); - } - if include_non_incremental_physical_size { - info.current_physical_size_non_incremental = - Some(timeline.get_physical_size_non_incremental()?) + info.current_logical_size_non_incremental = Some( + timeline + .get_current_logical_size_non_incremental( + info.last_record_lsn, + CancellationToken::new(), + ) + .await?, + ); } Ok(info) } @@ -128,7 +126,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result) -> anyhow::Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); - let include_non_incremental_physical_size = - query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; let response_data = async { @@ -210,17 +206,16 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { - let timeline_info = build_timeline_info( - &timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") - .map_err(ApiError::InternalServerError)?; + let timeline_info = + build_timeline_info(&timeline, include_non_incremental_logical_size) + .await + .context( + "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", + ) + .map_err(ApiError::InternalServerError)?; response_data.push(timeline_info); } - Ok(response_data) } .instrument(info_span!("timeline_list", tenant = %tenant_id)) @@ -264,8 +259,6 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result(timeline_info) } @@ -308,10 +298,11 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result format!("{lsn}"), LsnForTimestamp::Future(_lsn) => "future".into(), LsnForTimestamp::Past(_lsn) => "past".into(), @@ -433,7 +424,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { - current_physical_size += timeline.get_physical_size(); + current_physical_size += timeline.layer_size_sum().approximate_is_ok(); } let state = tenant.current_state(); @@ -786,6 +777,45 @@ async fn timeline_checkpoint_handler(request: Request) -> Result, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + match timeline.spawn_download_all_remote_layers().await { + Ok(st) => json_response(StatusCode::ACCEPTED, st), + Err(st) => json_response(StatusCode::CONFLICT, st), + } +} + +async fn timeline_download_remote_layers_handler_get( + request: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, true) + .await + 
.map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + let info = timeline + .get_download_all_remote_layers_task_info() + .context("task never started since last pageserver process start") + .map_err(ApiError::NotFound)?; + json_response(StatusCode::OK, info) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -870,6 +900,14 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", testing_api!("run timeline checkpoint", timeline_checkpoint_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_post, + ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_get, + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index db83bdb3a1..1684ca3c64 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -187,13 +187,13 @@ fn import_slru( path: &Path, mut reader: Reader, len: usize, -) -> Result<()> { - trace!("importing slru file {}", path.display()); +) -> anyhow::Result<()> { + info!("importing slru file {path:?}"); let mut buf: [u8; 8192] = [0u8; 8192]; let filename = &path .file_name() - .expect("missing slru filename") + .with_context(|| format!("missing slru filename for path {path:?}"))? .to_string_lossy(); let segno = u32::from_str_radix(filename, 16)?; @@ -279,7 +279,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .no_ondemand_download()?; last_lsn = lsn; nrecords += 1; @@ -405,7 +407,9 @@ pub fn import_wal_from_tar( let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .no_ondemand_download()?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 626d5e99e3..e01eb12b7b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -91,7 +91,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { } } -fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 } else { diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 308f9cd4eb..205ee0ffad 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,13 +84,10 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -// Metrics for determining timeline's physical size. -// A layered timeline's physical is defined as the total size of -// (delta/image) layer files on disk. 
-static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { +static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "pageserver_current_physical_size", - "Current physical size grouped by timeline", + "pageserver_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem.", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric") @@ -146,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 1.0, // 1 sec ]; -const STORAGE_IO_TIME_OPERATIONS: &[&str] = - &["open", "close", "read", "write", "seek", "fsync", "gc"]; +const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ + "open", "close", "read", "write", "seek", "fsync", "gc", "metadata", +]; const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; @@ -375,7 +373,7 @@ pub struct TimelineMetrics { pub load_layer_map_histo: Histogram, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, - pub current_physical_size_gauge: UIntGauge, + pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -416,7 +414,7 @@ impl TimelineMetrics { let wait_lsn_time_histo = WAIT_LSN_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); let current_logical_size_gauge = CURRENT_LOGICAL_SIZE @@ -442,7 +440,7 @@ impl TimelineMetrics { load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, - current_physical_size_gauge, + resident_physical_size_gauge, current_logical_size_gauge, num_persistent_files_created, persistent_bytes_written, @@ -458,7 +456,7 @@ impl Drop for TimelineMetrics { let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); - let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d9c19d04b7..fd4353a421 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -541,7 +541,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; + let exists = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_exists(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, @@ -558,7 +561,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; + let n_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_size(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -575,9 +581,10 @@ impl PageServerHandler { let lsn = 
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let total_blocks = - timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; - + let total_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + }) + .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { @@ -603,11 +610,14 @@ impl PageServerHandler { } */ - // FIXME: this profiling now happens at different place than it used to. The - // current profiling is based on a thread-local variable, so it doesn't work - // across awaits - let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; + let page = crate::tenant::with_ondemand_download(|| { + // FIXME: this profiling now happens at different place than it used to. The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); + timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7b4b05ed18..77910bceda 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,11 +6,12 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! +use super::tenant::PageReconstructResult; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; -use anyhow::{self, bail, ensure, Context}; +use crate::{repository::*, try_no_ondemand_download}; +use anyhow::Context; use bytes::{Buf, Bytes}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -97,16 +98,18 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, - ) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } - let nblocks = self.get_rel_size(tag, lsn, latest)?; + let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest)); if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, lsn, nblocks ); - return Ok(ZERO_PAGE.clone()); + return PageReconstructResult::Success(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); @@ -120,38 +123,45 @@ impl Timeline { dbnode: Oid, lsn: Lsn, latest: bool, - ) -> anyhow::Result { + ) -> PageReconstructResult { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn)?; + let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn)); for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest)?; + let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest)); total_blocks += n_blocks as usize; } - Ok(total_blocks) + PageReconstructResult::Success(total_blocks) } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_size( + &self, + tag: RelTag, + lsn: 
Lsn, + latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(nblocks); + return PageReconstructResult::Success(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest)? + && !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest)) { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, // without extending it. Tolerate that by claiming that // any non-existent FSM fork has size 0. - return Ok(0); + return PageReconstructResult::Success(0); } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn)?; + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); let nblocks = buf.get_u32_le(); if latest { @@ -164,25 +174,35 @@ impl Timeline { // associated with most recent value of LSN. self.update_cached_rel_size(tag, lsn, nblocks); } - Ok(nblocks) + PageReconstructResult::Success(nblocks) } /// Does relation exist? - pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_exists( + &self, + tag: RelTag, + lsn: Lsn, + _latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } // first try to lookup relation in cache if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(true); + return PageReconstructResult::Success(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); - - Ok(exists) + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Get a list of all existing relations in given tablespace and database. @@ -191,21 +211,25 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, - ) -> anyhow::Result> { + ) -> PageReconstructResult> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); - Ok(rels) + PageReconstructResult::Success(rels) + } + Err(e) => PageReconstructResult::from(e), + } } /// Look up given SLRU page version. 
@@ -215,7 +239,7 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn) } @@ -226,10 +250,10 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn)?; - Ok(buf.get_u32_le()) + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf.get_u32_le()) } /// Get size of an SLRU segment @@ -238,14 +262,18 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.segments.get(&segno).is_some(); - Ok(exists) + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.segments.get(&segno).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Locate LSN, such that all transactions that committed before @@ -258,7 +286,7 @@ impl Timeline { pub fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, - ) -> anyhow::Result { + ) -> PageReconstructResult { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; let max_lsn = self.get_last_record_lsn(); @@ -274,12 +302,12 @@ impl Timeline { // cannot overflow, high and low are both smaller than u64::MAX / 2 let mid = (high + low) / 2; - let cmp = self.is_latest_commit_timestamp_ge_than( + let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than( search_timestamp, Lsn(mid * 8), &mut found_smaller, &mut found_larger, - )?; + )); if cmp { high = mid; @@ -291,15 +319,15 @@ impl Timeline { (false, false) => { // This can happen if no commit records have been processed yet, e.g. // just after importing a cluster. - Ok(LsnForTimestamp::NoData(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn)) } (true, false) => { // Didn't find any commit timestamps larger than the request - Ok(LsnForTimestamp::Future(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn)) } (false, true) => { // Didn't find any commit timestamps smaller than the request - Ok(LsnForTimestamp::Past(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn)) } (true, true) => { // low is the LSN of the first commit record *after* the search_timestamp, @@ -309,7 +337,7 @@ impl Timeline { // Otherwise, if you restore to the returned LSN, the database will // include physical changes from later commits that will be marked // as aborted, and will need to be vacuumed away. - Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8))) } } } @@ -327,12 +355,20 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, - ) -> anyhow::Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? 
{ - let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; + ) -> PageReconstructResult { + for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) { + let nblocks = try_no_ondemand_download!(self.get_slru_segment_size( + SlruKind::Clog, + segno, + probe_lsn + )); for blknum in (0..nblocks).rev() { - let clog_page = - self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; + let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn( + SlruKind::Clog, + segno, + blknum, + probe_lsn + )); if clog_page.len() == BLCKSZ as usize + 8 { let mut timestamp_bytes = [0u8; 8]; @@ -341,61 +377,75 @@ impl Timeline { if timestamp >= search_timestamp { *found_larger = true; - return Ok(true); + return PageReconstructResult::Success(true); } else { *found_smaller = true; } } } } - Ok(false) + PageReconstructResult::Success(false) } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> anyhow::Result> { + pub fn list_slru_segments( + &self, + kind: SlruKind, + lsn: Lsn, + ) -> PageReconstructResult> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; - - Ok(dir.segments) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.segments), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> anyhow::Result { + pub fn get_relmap_file( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + ) -> PageReconstructResult { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> anyhow::Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn)?; - let dir = DbDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn)); - Ok(dir.dbdirs) + match DbDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.dbdirs), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result { + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult { let key = twophase_file_key(xid); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> anyhow::Result> { + pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let dir = TwoPhaseDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn)); - Ok(dir.xids) + match TwoPhaseDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.xids), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_control_file(&self, lsn: Lsn) -> anyhow::Result { + pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult { self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> anyhow::Result { + pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult 
{ self.get(CHECKPOINT_KEY, lsn) } @@ -404,23 +454,26 @@ impl Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. - pub fn get_current_logical_size_non_incremental( + pub async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, cancel: CancellationToken, - ) -> std::result::Result { + ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn)? { + for rel in + crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn)) + .await? + { if cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -433,7 +486,7 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { + pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -441,8 +494,8 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; - let dbdir = DbDirectory::des(&buf)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; + let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); dbs.sort_unstable(); @@ -451,14 +504,15 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn)? + .list_rels(spcnode, dbnode, lsn) + .no_ondemand_download()? 
.iter() .cloned() .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -474,13 +528,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = self.get_download(slrudir_key, lsn).await?; + let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn)?; + let mut buf = self.get_download(segsize_key, lsn).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -492,8 +546,8 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; + let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?; + let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); for xid in xids { @@ -606,7 +660,7 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -633,7 +687,7 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -652,7 +706,7 @@ impl<'a> DatadirModification<'a> { /// Store a relmapper file (pg_filenode.map) in the repository pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); @@ -680,10 +734,10 @@ impl<'a> DatadirModification<'a> { pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { - bail!("twophase file for xid {} already exists", xid); + anyhow::bail!("twophase file for xid {} already exists", xid); } self.put( TWOPHASEDIR_KEY, @@ -707,10 +761,13 @@ impl<'a> DatadirModification<'a> { pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; + let total_blocks = self + .tline + .get_db_size(spcnode, dbnode, req_lsn, true) + .no_ondemand_download()?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, 
dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -734,10 +791,10 @@ impl<'a> DatadirModification<'a> { /// /// 'nblocks' is the initial size. pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -749,12 +806,12 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key)?)? + RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)? }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - bail!("rel {} already exists", rel); + anyhow::bail!("rel {rel} already exists"); } self.put( rel_dir_key, @@ -778,12 +835,16 @@ impl<'a> DatadirModification<'a> { /// Truncate relation pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true)? { + if self + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -804,11 +865,11 @@ impl<'a> DatadirModification<'a> { /// Extend relation /// If new size is smaller, do nothing. pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -825,11 +886,11 @@ impl<'a> DatadirModification<'a> { /// Drop a relation. 
pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -840,7 +901,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -860,11 +921,11 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - bail!("slru segment {:?}/{} already exists", kind, segno); + anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } self.put( dir_key, @@ -899,7 +960,7 @@ impl<'a> DatadirModification<'a> { pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -925,7 +986,7 @@ impl<'a> DatadirModification<'a> { /// This method is used for marking truncated SLRU files pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -1019,7 +1080,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - fn get(&self, key: Key) -> anyhow::Result { + fn get(&self, key: Key) -> PageReconstructResult { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1027,14 +1088,16 @@ impl<'a> DatadirModification<'a> { // value that has been removed, deletion only avoids leaking storage. if let Some(value) = self.pending_updates.get(&key) { if let Value::Image(img) = value { - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { // Currently, we never need to read back a WAL record that we // inserted in the same "transaction". All the metadata updates // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. 
- bail!("unexpected pending WAL record"); + return PageReconstructResult::from(anyhow::anyhow!( + "unexpected pending WAL record" + )); } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); @@ -1400,7 +1463,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { }, key.field6, ), - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } @@ -1426,14 +1489,14 @@ pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber 0x00 => SlruKind::Clog, 0x01 => SlruKind::MultiXactMembers, 0x02 => SlruKind::MultiXactOffsets, - _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), }; let segno = key.field4; let blknum = key.field6; (kind, segno, blknum) } - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 9253b250cd..a2337e8fd6 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -148,31 +148,43 @@ //! following two cases: //! - (1) We had the file locally, deleted it locally, scheduled a remote delete, //! but crashed before it finished remotely. -//! - (2) We never had the file locally because we were still in tenant attach -//! when we crashed. (Similar case for on-demand download in the future.) +//! - (2) We never had the file locally because we haven't on-demand downloaded +//! it yet. //! -//! # Downloads (= Tenant Attach) +//! # Downloads //! //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for -//! downloading files from the remote storage. Downloads are performed immediately, -//! independently of the uploads. +//! downloading files from the remote storage. Downloads are performed immediately +//! against the `RemoteStorage`, independently of the upload queue. //! //! When we attach a tenant, we perform the following steps: //! - create `Tenant` object in `TenantState::Attaching` state -//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s -//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart` -//! - eagerly download all the remote layers using the client's download APIs -//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state. +//! - List timelines that are present in remote storage, and for each: +//! - download their remote [`IndexPart`]s +//! - create `Timeline` struct and a `RemoteTimelineClient` +//! - initialize the client's upload queue with its `IndexPart` +//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart` +//! but not present locally +//! - schedule uploads for layers that are only present locally. +//! - if the remote `IndexPart`'s metadata was newer than the metadata in +//! the local filesystem, write the remote metadata to the local filesystem +//! - After the above is done for each timeline, open the tenant for business by +//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. +//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! -//! Most of the above happens in [`Timeline::reconcile_with_remote`]. +//! 
Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers. //! We keep track of the fact that a client is in `Attaching` state in a marker -//! file on the local disk. -//! However, the distinction is moot for storage sync since we call -//! `reconcile_with_remote` for tenants both with and without the marker file. -//! -//! In the future, downloading will be done on-demand and `reconcile_with_remote` -//! will only be responsible for re-scheduling upload ops after a crash of an -//! `Active` tenant. +//! file on the local disk. This is critical because, when we restart the pageserver, +//! we do not want to do the `List timelines` step for each tenant that has already +//! been successfully attached (for performance & cost reasons). +//! Instead, for a tenant without the attach marker file, we assume that the +//! local state is in sync or ahead of the remote state. This includes the list +//! of all of the tenant's timelines, which is particularly critical to be up-to-date: +//! if there's a timeline on the remote that the pageserver doesn't know about, +//! the GC will not consider its branch point, leading to data loss. +//! So, for a tenant with the attach marker file, we know that we do not yet have +//! persisted all the remote timeline's metadata files locally. To exclude the +//! risk above, we re-run the procedure for such tenants //! //! # Operating Without Remote Storage //! diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs index c81be05981..4256767020 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/storage_sync2/download.rs @@ -180,6 +180,10 @@ pub async fn list_remote_timelines<'a>( let tenant_path = conf.timelines_path(&tenant_id); let tenant_storage_path = conf.remote_path(&tenant_path)?; + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + let timelines = download_retry( || storage.list_prefixes(Some(&tenant_storage_path)), &format!("list prefixes for {tenant_path:?}"), diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index fe3ad1a57d..a1b3ad26b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -35,6 +35,7 @@ #![allow(clippy::declare_interior_mutable_const)] use std::collections::HashMap; +use std::fmt; use std::future::Future; use std::panic::AssertUnwindSafe; use std::sync::atomic::{AtomicU64, Ordering}; @@ -134,8 +135,15 @@ pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create background op runtime") }); +#[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); +impl fmt::Display for PageserverTaskId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Each task that we track is associated with a "task ID". It's just an /// increasing number that we assign. Note that it is different from tokio::task::Id. 
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1); @@ -198,6 +206,9 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, + // Task that downloads a file from remote storage + RemoteDownloadTask, + // task that handles the initial downloading of all tenants InitialLoad, @@ -206,6 +217,9 @@ pub enum TaskKind { // task that handhes metrics collection MetricsCollection, + + // task that drives downloading layers + DownloadAllRemoteLayers, } #[derive(Default)] @@ -437,6 +451,10 @@ pub fn current_task_kind() -> Option { CURRENT_TASK.try_with(|ct| ct.kind).ok() } +pub fn current_task_id() -> Option { + CURRENT_TASK.try_with(|ct| ct.task_id).ok() +} + /// A Future that can be used to check if the current task has been requested to /// shut down. pub async fn shutdown_watcher() { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 799a34fb3b..1240a3b4fb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -81,6 +81,7 @@ pub mod filename; mod image_layer; mod inmemory_layer; pub mod layer_map; +mod remote_layer; pub mod metadata; mod par_fsync; @@ -90,7 +91,7 @@ mod timeline; pub mod size; -pub use timeline::Timeline; +pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline}; // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -2780,9 +2781,18 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); Ok(()) } @@ -2859,15 +2869,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?, "foobar at 0x20" ); @@ -3026,7 +3036,10 @@ mod tests { tenant .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); + assert!(newtline + .get(*TEST_KEY, Lsn(0x25)) + .no_ondemand_download() + .is_ok()); Ok(()) } @@ -3056,7 +3069,7 @@ mod tests { // Check that the data is still accessible on the branch. 
assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50))?, + newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3203,11 +3216,26 @@ mod tests { tline.freeze_and_flush().await?; tline.compact().await?; - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?, + TEST_IMG("foo at 0x30") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?, + TEST_IMG("foo at 0x40") + ); Ok(()) } @@ -3315,7 +3343,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3401,7 +3429,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3476,7 +3504,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn)?, + tline.get(test_key, *lsn).no_ondemand_download()?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index e1006dfe00..5b724b6263 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -39,7 +39,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::{BufWriter, Write}; use std::io::{Seek, SeekFrom}; use std::ops::Range; @@ -183,6 +183,8 @@ pub struct DeltaLayer { pub key_range: Range, pub lsn_range: Range, + pub file_size: u64, + inner: RwLock, } @@ -411,6 +413,10 @@ impl PersistentLayer for DeltaLayer { fs::remove_file(self.path())?; Ok(()) } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl DeltaLayer { @@ -535,6 +541,7 @@ impl DeltaLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &DeltaFileName, + file_size: u64, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), @@ -542,6 +549,7 @@ impl DeltaLayer { tenant_id, key_range: filename.key_range.clone(), lsn_range: filename.lsn_range.clone(), + file_size, inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -554,21 +562,23 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
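+    ///
+    /// A usage sketch (hypothetical caller; `path` points at an existing layer file):
+    /// ```ignore
+    /// let file = std::fs::File::open(&path)?;
+    /// let layer = DeltaLayer::new_for_path(&path, file)?;
+    /// layer.dump(true)?;
+    /// ```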
- pub fn new_for_path(path: &Path, file: F) -> Result - where - F: FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn_range: summary.lsn_range, + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -725,6 +735,10 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + // Note: Because we opened the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -734,6 +748,7 @@ impl DeltaLayerWriterInner { timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index b1dbbfb683..1e129fc01d 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -36,10 +36,11 @@ use bytes::Bytes; use hex; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::Write; use std::io::{Seek, SeekFrom}; use std::ops::Range; +use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard}; use tracing::*; @@ -105,6 +106,7 @@ pub struct ImageLayer { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub key_range: Range, + pub file_size: u64, // This entry contains an image of all pages as of this LSN pub lsn: Lsn, @@ -228,6 +230,10 @@ impl PersistentLayer for ImageLayer { fs::remove_file(self.path())?; Ok(()) } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl ImageLayer { @@ -344,6 +350,7 @@ impl ImageLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &ImageFileName, + file_size: u64, ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), @@ -351,6 +358,7 @@ impl ImageLayer { tenant_id, key_range: filename.key_range.clone(), lsn: filename.lsn, + file_size, inner: RwLock::new(ImageLayerInner { loaded: false, file: None, @@ -363,21 +371,21 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
- pub fn new_for_path(path: &Path, file: F) -> Result - where - F: std::os::unix::prelude::FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; - + let metadata = file + .metadata() + .context("get file metadata to determine size")?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn: summary.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { file: None, loaded: false, @@ -523,6 +531,10 @@ impl ImageLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get metadata to determine file size")?; + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -532,6 +544,7 @@ impl ImageLayerWriterInner { tenant_id: self.tenant_id, key_range: self.key_range.clone(), lsn: self.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/remote_layer.rs b/pageserver/src/tenant/remote_layer.rs new file mode 100644 index 0000000000..affe8ca0a8 --- /dev/null +++ b/pageserver/src/tenant/remote_layer.rs @@ -0,0 +1,212 @@ +//! A RemoteLayer is an in-memory placeholder for a layer file that exists +//! in remote storage. +//! +use crate::config::PageServerConf; +use crate::repository::Key; +use crate::storage_sync::index::LayerFileMetadata; +use crate::tenant::delta_layer::DeltaLayer; +use crate::tenant::filename::{DeltaFileName, ImageFileName}; +use crate::tenant::image_layer::ImageLayer; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use anyhow::{bail, Result}; +use std::ops::Range; +use std::path::PathBuf; +use std::sync::Arc; + +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use super::filename::LayerFileName; +use super::storage_layer::{LayerIter, LayerKeyIter, PersistentLayer}; + +#[derive(Debug)] +pub struct RemoteLayer { + tenantid: TenantId, + timelineid: TimelineId, + key_range: Range, + lsn_range: Range, + + pub file_name: LayerFileName, + + pub layer_metadata: LayerFileMetadata, + + is_delta: bool, + + is_incremental: bool, + + pub(crate) ongoing_download: Arc, +} + +impl Layer for RemoteLayer { + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_state: &mut ValueReconstructState, + ) -> Result { + bail!( + "layer {} needs to be downloaded", + self.filename().file_name() + ); + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + /// debugging function to print out the contents of the layer + fn dump(&self, _verbose: bool) -> Result<()> { + println!( + "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenantid, + self.timelineid, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + Ok(()) + } + + fn short_id(&self) -> String { + self.filename().file_name() + } +} + +impl PersistentLayer for RemoteLayer { + fn get_tenant_id(&self) -> TenantId { + self.tenantid + } + + fn 
get_timeline_id(&self) -> TimelineId { + self.timelineid + } + + fn filename(&self) -> LayerFileName { + if self.is_delta { + DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + } + .into() + } else { + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + } + .into() + } + } + + fn local_path(&self) -> Option { + None + } + + fn iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn key_iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn delete(&self) -> Result<()> { + Ok(()) + } + + fn downcast_remote_layer<'a>(self: Arc) -> Option> { + Some(self) + } + + fn is_remote_layer(&self) -> bool { + true + } + + fn file_size(&self) -> Option { + self.layer_metadata.file_size() + } +} + +impl RemoteLayer { + pub fn new_img( + tenantid: TenantId, + timelineid: TimelineId, + fname: &ImageFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn..(fname.lsn + 1), + is_delta: false, + is_incremental: false, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + pub fn new_delta( + tenantid: TenantId, + timelineid: TimelineId, + fname: &DeltaFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn_range.clone(), + is_delta: true, + is_incremental: true, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + /// Create a Layer struct representing this layer, after it has been downloaded. 
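+    ///
+    /// A sketch of the intended call site (cf. `Timeline::download_remote_layer`;
+    /// `size` stands for the value returned by `RemoteTimelineClient::download_layer_file`):
+    /// ```ignore
+    /// let new_layer = remote_layer.create_downloaded_layer(conf, size);
+    /// layers.remove_historic(remote_layer.clone());
+    /// layers.insert_historic(new_layer);
+    /// ```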
+    pub fn create_downloaded_layer(
+        &self,
+        conf: &'static PageServerConf,
+        file_size: u64,
+    ) -> Arc<dyn PersistentLayer> {
+        if self.is_delta {
+            let fname = DeltaFileName {
+                key_range: self.key_range.clone(),
+                lsn_range: self.lsn_range.clone(),
+            };
+            Arc::new(DeltaLayer::new(
+                conf,
+                self.timelineid,
+                self.tenantid,
+                &fname,
+                file_size,
+            ))
+        } else {
+            let fname = ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn_range.start,
+            };
+            Arc::new(ImageLayer::new(
+                conf,
+                self.timelineid,
+                self.tenantid,
+                &fname,
+                file_size,
+            ))
+        }
+    }
+}
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 5ce0837562..aa11985cbe 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -97,8 +97,6 @@ pub(super) async fn gather_inputs(
     // used to determine the `retention_period` for the size model
     let mut max_cutoff_distance = None;
 
-    // this will probably conflict with on-demand downloaded layers, or at least force them all
-    // to be downloaded
     for timeline in timelines {
         let last_record_lsn = timeline.get_last_record_lsn();
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 79eaa96591..8bfac5df8e 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,6 +8,7 @@ use anyhow::Result;
 use bytes::Bytes;
 use std::ops::Range;
 use std::path::PathBuf;
+use std::sync::Arc;
 
 use utils::{
     id::{TenantId, TimelineId},
@@ -15,6 +16,8 @@ use utils::{
 };
 
 use super::filename::LayerFileName;
+use super::remote_layer::RemoteLayer;
+
 pub fn range_overlaps(a: &Range, b: &Range) -> bool
 where
     T: PartialOrd,
@@ -161,4 +164,28 @@ pub trait PersistentLayer: Layer {
 
     /// Permanently remove this layer from disk.
     fn delete(&self) -> Result<()>;
+
+    fn downcast_remote_layer(self: Arc<Self>) -> Option<Arc<RemoteLayer>> {
+        None
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        false
+    }
+
+    /// Returns None if the layer file size is not known.
+    ///
+    /// Should not change over the lifetime of the layer object because
+    /// current_physical_size is computed as the sum of this value.
+    fn file_size(&self) -> Option<u64>;
+}
+
+pub fn downcast_remote_layer(
+    layer: &Arc<dyn PersistentLayer>,
+) -> Option<Arc<RemoteLayer>> {
+    if layer.is_remote_layer() {
+        Arc::clone(layer).downcast_remote_layer()
+    } else {
+        None
+    }
}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 61d619a17b..f4288fea36 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3,11 +3,14 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::Bytes;
 use fail::fail_point;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
-use pageserver_api::models::TimelineState;
+use pageserver_api::models::{
+    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskState, TimelineState,
+};
 use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
-use tokio::task::spawn_blocking;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 
@@ -22,6 +25,7 @@ use std::time::{Duration, Instant, SystemTime};
 
 use crate::storage_sync::index::IndexPart;
 use crate::storage_sync::RemoteTimelineClient;
+use crate::tenant::remote_layer::RemoteLayer;
 use crate::tenant::{
     delta_layer::{DeltaLayer, DeltaLayerWriter},
     ephemeral_file::is_ephemeral_file,
@@ -76,7 +80,7 @@ pub struct Timeline {
     conf: &'static PageServerConf,
     tenant_conf: Arc>,
 
-    _myself: Weak,
+    myself: Weak,
 
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
@@ -93,10 +97,7 @@ pub struct Timeline {
     walredo_mgr: Arc,
 
     /// Remote storage client.
-    ///
-    /// If Some, use it to upload all newly created layers to the remote storage,
-    /// and keep remote metadata file in sync. In the future, also use it to download
-    /// layer files on-demand.
+    /// See [`storage_sync2`] module comment for details.
     pub remote_client: Option>,
 
     // What page versions do we hold in the repository? If we get a
@@ -187,6 +188,8 @@ pub struct Timeline {
     /// Relation size cache
     pub rel_size_cache: RwLock>,
 
+    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
+
     state: watch::Sender,
 }
@@ -308,12 +311,68 @@ impl LogicalSize {
     }
 }
 
+/// Returned by [`Timeline::layer_size_sum`]
+pub enum LayerSizeSum {
+    /// The result is accurate.
+    Accurate(u64),
+    /// We don't know the layer file size of one or more layers.
+    /// They contribute to the sum with a value of 0.
+    /// Hence, the sum is a lower bound for the actual layer file size sum.
+    ApproximateLowerBound(u64),
+}
+
+impl LayerSizeSum {
+    pub fn approximate_is_ok(self) -> u64 {
+        match self {
+            LayerSizeSum::Accurate(v) => v,
+            LayerSizeSum::ApproximateLowerBound(v) => v,
+        }
+    }
+}
+
 pub struct WalReceiverInfo {
     pub wal_source_connconf: PgConnectionConfig,
     pub last_received_msg_lsn: Lsn,
     pub last_received_msg_ts: u128,
 }
 
+/// Like `?`, but for [`PageReconstructResult`].
+/// Use it to bubble up the `NeedsDownload` and `Error` to the caller.
+///
+/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
+#[macro_export]
+macro_rules! try_no_ondemand_download {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            PageReconstructResult::Success(value) => value,
+            PageReconstructResult::NeedsDownload(timeline, layer) => {
+                return PageReconstructResult::NeedsDownload(timeline, layer);
+            }
+            PageReconstructResult::Error(e) => return PageReconstructResult::Error(e),
+        }
+    }};
+}
+
+/// Replacement for `?` in functions that return [`PageReconstructResult`].
+///
+/// Given an `expr: Result`, use `try_page_reconstruct_result!(expr)`
+/// instead of `(expr)?`.
+/// If `expr` is `Ok(v)`, the macro evaluates to `v`.
+/// If `expr` is `Err(e)`, the macro returns `PageReconstructResult::Error(e.into())`.
+///
+/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
+#[macro_export]
+macro_rules! try_page_reconstruct_result {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            Ok(v) => v,
+            Err(e) => return PageReconstructResult::from(e),
+        }
+    }};
+}
+
 ///
 /// Information about how much history needs to be retained, needed by
 /// Garbage Collection.
@@ -343,6 +402,77 @@ pub struct GcInfo {
     pub pitr_cutoff: Lsn,
 }
 
+pub enum PageReconstructResult<T> {
+    Success(T),
+    /// The given RemoteLayer needs to be downloaded and replaced in the timeline's layer map
+    /// for the operation to succeed. Use [`Timeline::download_remote_layer`] to do it, then
+    /// retry the operation that returned this error.
+    NeedsDownload(Weak<Timeline>, Weak<RemoteLayer>),
+    Error(PageReconstructError),
+}
+
+/// An error happened in a get() operation.
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
+    #[error(transparent)]
+    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
+
+    #[error(transparent)]
+    WalRedo(#[from] crate::walredo::WalRedoError),
+}
+
+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
+/// This impl makes it so you can substitute the return type
+/// `Result<T>` with `PageReconstructResult<T>` in functions,
+/// and existing `?` will generally continue to work.
+/// The reason is the `#[from] anyhow::Error` variant above: any error
+/// type that converts into `anyhow::Error` (or into `PageReconstructError`
+/// directly) also converts into `PageReconstructResult` through this impl,
+/// so existing `?` call sites keep compiling.
+impl<T, E> From<E> for PageReconstructResult<T>
+where
+    E: Into<PageReconstructError>,
+{
+    fn from(e: E) -> Self {
+        Self::Error(e.into())
+    }
+}
+
+impl<T> PageReconstructResult<T> {
+    /// Treat the need for on-demand download as an error.
+    ///
+    /// **Avoid this function in new code** if you can help it,
+    /// as on-demand download will become the norm in the future,
+    /// especially once we implement layer file eviction.
+    ///
+    /// If you are in an async function, use [`with_ondemand_download`]
+    /// to do the download right here.
+    ///
+    /// If you are in a sync function, change its return type from
+    /// `Result` to `PageReconstructResult` and bubble up
+    /// the non-success cases of `PageReconstructResult` to the caller.
+    /// This gives them a chance to do the download and retry.
+    /// Consider using [`try_no_ondemand_download`] for convenience.
+    ///
+    /// For more background, read the comment on [`with_ondemand_download`].
+    pub fn no_ondemand_download(self) -> anyhow::Result<T> {
+        match self {
+            PageReconstructResult::Success(value) => Ok(value),
+            // TODO print more info about the timeline
+            PageReconstructResult::NeedsDownload(_, _) => anyhow::bail!("Layer needs downloading"),
+            PageReconstructResult::Error(e) => {
+                Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
+            }
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -370,8 +500,10 @@ impl Timeline {
     /// the Repository implementation may incorrectly return a value from an ancestor
     /// branch, for example, or waste a lot of cycles chasing the non-existing key.
     ///
-    pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
-        anyhow::ensure!(lsn.is_valid(), "Invalid LSN");
+    pub fn get(&self, key: Key, lsn: Lsn) -> PageReconstructResult<Bytes> {
+        if !lsn.is_valid() {
+            return PageReconstructResult::from(anyhow!("Invalid LSN"));
+        }
 
         // Check the page cache. 
We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image @@ -381,7 +513,7 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => return PageReconstructResult::Success(cached_img), // exact LSN match, return the image Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -396,13 +528,18 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + try_no_ondemand_download!(self.get_reconstruct_data(key, lsn, &mut reconstruct_state)); self.metrics .reconstruct_time_histo .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } + // Like get(), but if a remote layer file is needed, it is downloaded as part of this call. + pub async fn get_download(&self, key: Key, lsn: Lsn) -> anyhow::Result { + with_ondemand_download(|| self.get(key, lsn)).await + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -429,30 +566,27 @@ impl Timeline { } } - /// Get the physical size of the timeline at the latest LSN - pub fn get_physical_size(&self) -> u64 { - self.metrics.current_physical_size_gauge.get() + /// The sum of the file size of all historic layers in the layer map. + /// This method makes no distinction between local and remote layers. + /// Hence, the result **does not represent local filesystem usage**. + pub fn layer_size_sum(&self) -> LayerSizeSum { + let layer_map = self.layers.read().unwrap(); + let mut size = 0; + let mut no_size_cnt = 0; + for l in layer_map.iter_historic_layers() { + let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); + size += l_size; + no_size_cnt += l_no_size; + } + if no_size_cnt == 0 { + LayerSizeSum::Accurate(size) + } else { + LayerSizeSum::ApproximateLowerBound(size) + } } - /// Get the physical size of the timeline at the latest LSN non incrementally - pub fn get_physical_size_non_incremental(&self) -> anyhow::Result { - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - // total size of layer files in the current timeline directory - let mut total_physical_size = 0; - - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_string_lossy(); - - if ImageFileName::parse_str(&fname).is_some() - || DeltaFileName::parse_str(&fname).is_some() - { - total_physical_size += direntry.metadata()?.len(); - } - } - - Ok(total_physical_size) + pub fn get_resident_physical_size(&self) -> u64 { + self.metrics.resident_physical_size_gauge.get() } /// @@ -560,14 +694,18 @@ impl Timeline { // Define partitioning schema if needed - match self.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - ) { + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) + .await + { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". 
- let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + let layer_paths_to_upload = + self.create_image_layers(&partitioning, lsn, false).await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -761,7 +899,7 @@ impl Timeline { let mut result = Timeline { conf, tenant_conf, - _myself: myself.clone(), + myself: myself.clone(), timeline_id, tenant_id, pg_version, @@ -817,6 +955,9 @@ impl Timeline { last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(HashMap::new()), + + download_all_remote_layers_task_info: RwLock::new(None), + state, }; result.repartition_threshold = result.get_checkpoint_distance() / 10; @@ -935,11 +1076,18 @@ impl Timeline { continue; } - let layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); + let file_size = direntry_path.metadata()?.len(); + + let layer = ImageLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &imgfilename, + file_size, + ); trace!("found layer {}", layer.path().display()); - total_physical_size += layer.path().metadata()?.len(); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { @@ -959,11 +1107,18 @@ impl Timeline { continue; } - let layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); + let file_size = direntry_path.metadata()?.len(); + + let layer = DeltaLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &deltafilename, + file_size, + ); trace!("found layer {}", layer.path().display()); - total_physical_size += layer.path().metadata()?.len(); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { @@ -997,7 +1152,7 @@ impl Timeline { num_layers, disk_consistent_lsn, total_physical_size ); self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .set(total_physical_size); timer.stop_and_record(); @@ -1005,21 +1160,14 @@ impl Timeline { Ok(()) } - async fn download_missing( + async fn create_remote_layers( &self, index_part: &IndexPart, - remote_client: &RemoteTimelineClient, local_layers: HashMap>, up_to_date_disk_consistent_lsn: Lsn, ) -> anyhow::Result>> { // Are we missing some files that are present in remote storage? - // Download them now. - // TODO Downloading many files this way is not efficient. - // Better to use FuturesUnordered. Maybe keep as is because: - // a) inplace download is a throw-away code, on-demand patch doesnt need that - // b) typical case now is that there is nothing to sync, this downloads a lot - // 1) if there was another pageserver that came and generated new files - // 2) during attach of a timeline with big history which we currently do not do + // Create RemoteLayer instances for them. let mut local_only_layers = local_layers; for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1033,7 +1181,7 @@ impl Timeline { // Is the local layer's size different from the size stored in the // remote index file? // If so, rename_to_backup those files & replace their local layer with - // a RemoteLayer in the laye rmap so that we re-download them on-demand. + // a RemoteLayer in the layer map so that we re-download them on-demand. 
if let Some(local_layer) = local_layer { let local_layer_path = local_layer .local_path() @@ -1058,7 +1206,7 @@ impl Timeline { assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { - self.metrics.current_physical_size_gauge.sub(local_size); + self.metrics.resident_physical_size_gauge.sub(local_size); self.layers.write().unwrap().remove_historic(local_layer); // fall-through to adding the remote layer } @@ -1079,7 +1227,7 @@ impl Timeline { } info!( - "remote layer does not exist locally, downloading it now: {}", + "remote layer does not exist locally, creating remote layer: {}", remote_layer_name.file_name() ); @@ -1093,28 +1241,18 @@ impl Timeline { continue; } - trace!("downloading image file: {remote_layer_name:?}"); - let downloaded_size = remote_client - .download_layer_file(remote_layer_name, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download image layer {remote_layer_name:?}") - })?; - trace!("done"); + let remote_layer = RemoteLayer::new_img( + self.tenant_id, + self.timeline_id, + imgfilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); - let image_layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, imgfilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(image_layer)); - self.metrics - .current_physical_size_gauge - .add(downloaded_size); + self.layers.write().unwrap().insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { - // Create a DeltaLayer struct for each delta file. + // Create a RemoteLayer for the delta file. // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is // OK for a delta layer to have end LSN 101, but if the end LSN @@ -1122,29 +1260,19 @@ impl Timeline { // before crash. if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { warn!( - "found future delta layer {} on timeline {} remote_consistent_lsn is {}", - deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn - ); + "found future delta layer {} on timeline {} remote_consistent_lsn is {}", + deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn + ); continue; } - - trace!("downloading delta file: {remote_layer_name:?}"); - let sz = remote_client - .download_layer_file(remote_layer_name, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download delta layer {remote_layer_name:?}") - })?; - trace!("done"); - - let delta_layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, deltafilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(delta_layer)); - self.metrics.current_physical_size_gauge.add(sz); + let remote_layer = RemoteLayer::new_delta( + self.tenant_id, + self.timeline_id, + deltafilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); + self.layers.write().unwrap().insert_historic(remote_layer); } #[cfg(test)] LayerFileName::Test(_) => unreachable!(), @@ -1154,22 +1282,22 @@ impl Timeline { Ok(local_only_layers) } + /// This function will synchronize local state with what we have in remote storage. /// - /// This function will synchronize local data with what we have in remote storage. - /// 1. It will download missing layer files. - /// 2. It will update local metadata if remote one has greater `disk_consistent_lsn`. 
-    /// 3. It will upload files that are missing on the remote
-    /// 4. It will update index file on the remote accordingly
-    /// TODO may be a bit cleaner to do things based on populated remote client,
-    /// and then do things based on its upload_queue.latest_files
+    /// Steps taken:
+    /// 1. Initialize upload queue based on `index_part`.
+    /// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
+    ///    The list of layers on the remote comes from `index_part`.
+    ///    The list of local layers is given by the layer map's `iter_historic_layers()`.
+    ///    So, the layer map must have been loaded already.
+    /// 3. Schedule upload of local-only layer files (which will then also update the remote
+    ///    IndexPart to include the new layer files).
     ///
-    /// This is used during tenant attach. The layer map must have been loaded
-    /// with local filesystem contents already.
-    ///
-    /// The caller should provide IndexPart if it exists on the remote storage. If it's None,
-    /// we assume that it is missing on the remote storage, which means that we initialized
-    /// a timeline and then restarted before successful upload was performed
+    /// Refer to the `storage_sync2` module comment for more context.
     ///
+    /// # TODO
+    /// May be a bit cleaner to do things based on populated remote client,
+    /// and then do things based on its upload_queue.latest_files.
     #[instrument(skip(self, index_part, up_to_date_metadata))]
     pub async fn reconcile_with_remote(
         &self,
@@ -1199,9 +1327,10 @@ impl Timeline {
                     index_part.timeline_layers.len()
                 );
                 remote_client.init_upload_queue(index_part)?;
-
-                self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn)
-                    .await?
+                let local_only_filenames = self
+                    .create_remote_layers(index_part, local_layers, disk_consistent_lsn)
+                    .await?;
+                local_only_filenames
             }
             None => {
                 info!("initializing upload queue as empty");
@@ -1323,9 +1452,15 @@ impl Timeline {
         let calculation = async {
             let cancel = cancel.child_token();
-            spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn, cancel))
-                .await
-                .context("Failed to spawn calculation result task")?
+            tokio::task::spawn_blocking(move || {
+                // Run in a separate thread since this can do a lot of
+                // synchronous file IO without .await in between
+                // if there are no RemoteLayers that would require downloading.
+                let h = tokio::runtime::Handle::current();
+                h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel))
+            })
+            .await
+            .context("Failed to spawn calculation result task")?
         };
         let timeline_state_cancellation = async {
             loop {
@@ -1376,7 +1511,7 @@ impl Timeline {
     /// Calculate the logical size of the database at the latest LSN.
     ///
     /// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
- pub fn calculate_logical_size( + async fn calculate_logical_size( &self, up_to_lsn: Lsn, cancel: CancellationToken, @@ -1421,7 +1556,9 @@ impl Timeline { } else { self.metrics.logical_size_histo.start_timer() }; - let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn, cancel)?; + let logical_size = self + .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); Ok(logical_size) @@ -1458,7 +1595,7 @@ impl TraversalLayerExt for Arc { match self.local_path() { Some(local_path) => { debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())), - "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary", + "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary", ); format!("{}", local_path.display()) } @@ -1497,7 +1634,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, - ) -> Result<(), PageReconstructError> { + ) -> PageReconstructResult<()> { // Start from the current timeline. let mut timeline_owned; let mut timeline = self; @@ -1524,12 +1661,12 @@ impl Timeline { // The function should have updated 'state' //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); match result { - ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Complete => return PageReconstructResult::Success(()), ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { self.metrics.materialized_page_cache_hit_counter.inc_by(1); - return Ok(()); + return PageReconstructResult::Success(()); } if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid @@ -1562,7 +1699,10 @@ impl Timeline { timeline.ancestor_lsn, cont_lsn ); - let ancestor = timeline.get_ancestor_timeline()?; + let ancestor = match timeline.get_ancestor_timeline() { + Ok(timeline) => timeline, + Err(e) => return PageReconstructResult::from(e), + }; timeline_owned = ancestor; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); @@ -1580,11 +1720,14 @@ impl Timeline { // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, start_lsn); - result = open_layer.get_value_reconstruct_data( + result = match open_layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue; @@ -1595,11 +1738,14 @@ impl Timeline { if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); - result = frozen_layer.get_value_reconstruct_data( + result = match frozen_layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; @@ -1609,12 +1755,24 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + // If it's a remote layer, the caller can do the download and retry. + if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) { + info!("need remote layer {}", layer.traversal_id()); + return PageReconstructResult::NeedsDownload( + Weak::clone(&timeline.myself), + Arc::downgrade(&remote_layer), + ); + } + let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = layer.get_value_reconstruct_data( + result = match layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, layer.traversal_id())); } else if timeline.ancestor_timeline.is_some() { @@ -1840,9 +1998,11 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - let (partitioning, _lsn) = - self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? + let (partitioning, _lsn) = self + .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .await?; + self.create_image_layers(&partitioning, self.initdb_lsn, true) + .await? } else { // normal case, write out a L0 delta layer file. 
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
@@ -1979,7 +2139,7 @@ impl Timeline {
 
         // update the timeline's physical size
         let sz = new_delta_path.metadata()?.len();
-        self.metrics.current_physical_size_gauge.add(sz);
+        self.metrics.resident_physical_size_gauge.add(sz);
         // update metrics
         self.metrics.num_persistent_files_created.inc_by(1);
         self.metrics.persistent_bytes_written.inc_by(sz);
@@ -1987,15 +2147,28 @@ impl Timeline {
         Ok((new_delta_filename, LayerFileMetadata::new(sz)))
     }
 
-    fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> {
-        let mut partitioning_guard = self.partitioning.lock().unwrap();
-        if partitioning_guard.1 == Lsn(0)
-            || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
+    async fn repartition(
+        &self,
+        lsn: Lsn,
+        partition_size: u64,
+    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
         {
-            let keyspace = self.collect_keyspace(lsn)?;
-            let partitioning = keyspace.partition(partition_size);
+            let partitioning_guard = self.partitioning.lock().unwrap();
+            if partitioning_guard.1 != Lsn(0)
+                && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
+            {
+                // no repartitioning needed
+                return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
+            }
+        }
+        let keyspace = self.collect_keyspace(lsn).await?;
+        let partitioning = keyspace.partition(partition_size);
+
+        let mut partitioning_guard = self.partitioning.lock().unwrap();
+        if lsn > partitioning_guard.1 {
             *partitioning_guard = (partitioning, lsn);
-            return Ok((partitioning_guard.0.clone(), lsn));
+        } else {
+            warn!("Concurrent repartitioning of keyspace. This is unexpected, but probably harmless.");
         }
         Ok((partitioning_guard.0.clone(), partitioning_guard.1))
     }
@@ -2041,7 +2214,7 @@ impl Timeline {
         Ok(false)
     }
 
-    fn create_image_layers(
+    async fn create_image_layers(
         &self,
         partitioning: &KeyPartitioning,
         lsn: Lsn,
@@ -2068,7 +2241,7 @@ impl Timeline {
             for range in &partition.ranges {
                 let mut key = range.start;
                 while key < range.end {
-                    let img = match self.get(key, lsn) {
+                    let img = match self.get_download(key, lsn).await {
                         Ok(img) => img,
                         Err(err) => {
                             // If we fail to reconstruct a VM or FSM page, we can zero the
@@ -2131,7 +2304,9 @@ impl Timeline {
 
             layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
 
-            self.metrics.current_physical_size_gauge.add(metadata.len());
+            self.metrics
+                .resident_physical_size_gauge
+                .add(metadata.len());
             layers.insert_historic(Arc::new(l));
         }
         drop(layers);
@@ -2443,7 +2618,9 @@ impl Timeline {
             }
 
             // update the timeline's physical size
-            self.metrics.current_physical_size_gauge.add(metadata.len());
+            self.metrics
+                .resident_physical_size_gauge
+                .add(metadata.len());
 
             new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
             let x: Arc = Arc::new(l);
@@ -2456,7 +2633,7 @@ impl Timeline {
         for l in deltas_to_compact {
             if let Some(path) = l.local_path() {
                 self.metrics
-                    .current_physical_size_gauge
+                    .resident_physical_size_gauge
                     .sub(path.metadata()?.len());
             }
             layer_names_to_delete.push(l.filename());
@@ -2526,7 +2703,10 @@ impl Timeline {
         if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
             let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
 
-            match self.find_lsn_for_timestamp(pitr_timestamp)? {
+            match self
+                .find_lsn_for_timestamp(pitr_timestamp)
+                .no_ondemand_download()?
+ { LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, LsnForTimestamp::Future(lsn) => { debug!("future({})", lsn); @@ -2743,11 +2923,11 @@ impl Timeline { for doomed_layer in layers_to_remove { if let Some(path) = doomed_layer.local_path() { self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .sub(path.metadata()?.len()); } layer_names_to_delete.push(doomed_layer.filename()); - doomed_layer.delete()?; + doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? layers.remove_historic(doomed_layer); result.layers_removed += 1; } @@ -2778,7 +2958,7 @@ impl Timeline { key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - ) -> anyhow::Result { + ) -> PageReconstructResult { // Perform WAL redo if needed data.records.reverse(); @@ -2790,9 +2970,11 @@ impl Timeline { key, img_lsn ); - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { - bail!("base image for {} at {} not found", key, request_lsn); + PageReconstructResult::from(anyhow!( + "base image for {key} at {request_lsn} not found" + )) } } else { // We need to do WAL redo. @@ -2800,12 +2982,12 @@ impl Timeline { // If we don't have a base image, then the oldest WAL record better initialize // the page if data.img.is_none() && !data.records.first().unwrap().1.will_init() { - bail!( + PageReconstructResult::from(anyhow!( "Base image for {} at {} not found, but got {} WAL records", key, request_lsn, data.records.len() - ); + )) } else { if data.img.is_some() { trace!( @@ -2820,14 +3002,18 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = self + let img = match self .walredo_mgr .request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .context("Failed to reconstruct a page image:")?; + .context("Failed to reconstruct a page image:") + { + Ok(img) => img, + Err(e) => return PageReconstructResult::from(e), + }; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); - cache + if let Err(e) = cache .memorize_materialized_page( self.tenant_id, self.timeline_id, @@ -2835,30 +3021,324 @@ impl Timeline { last_rec_lsn, &img, ) - .context("Materialized page memoization failed")?; + .context("Materialized page memoization failed") + { + return PageReconstructResult::from(e); + } } - Ok(img) + PageReconstructResult::Success(img) } } } + + /// Download a layer file from remote storage and insert it into the layer map. + /// + /// It's safe to call this function for the same layer concurrently. In that case: + /// - If the layer has already been downloaded, `OK(...)` is returned. + /// - If the layer is currently being downloaded, we wait until that download succeeded / failed. + /// - If it succeeded, we return `Ok(...)`. + /// - If it failed, we or another concurrent caller will initiate a new download attempt. + /// + /// Download errors are classified and retried if appropriate by the underlying RemoteTimelineClient function. + /// It has an internal limit for the maximum number of retries and prints appropriate log messages. + /// If we exceed the limit, it returns an error, and this function passes it through. + /// The caller _could_ retry further by themselves by calling this function again, but _should not_ do it. + /// The reason is that they cannot distinguish permanent errors from temporary ones, whereas + /// the underlying RemoteTimelineClient can. + /// + /// There is no internal timeout or slowness detection. 
+ /// If the caller has a deadline or needs a timeout, they can simply stop polling: + /// we're **cancellation-safe** because the download happens in a separate task_mgr task. + /// So, the current download attempt will run to completion even if we stop polling. + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + pub async fn download_remote_layer( + self: Arc, + remote_layer: Arc, + ) -> anyhow::Result<()> { + let permit = match Arc::clone(&remote_layer.ongoing_download) + .acquire_owned() + .await + { + Ok(permit) => permit, + Err(_closed) => { + info!("download of layer has already finished"); + return Ok(()); + } + }; + + let (sender, receiver) = tokio::sync::oneshot::channel(); + // Spawn a task so that download does not outlive timeline when we detach tenant / delete timeline. + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::RemoteDownloadTask, + Some(self.tenant_id), + Some(self.timeline_id), + &format!("download layer {}", remote_layer.short_id()), + false, + async move { + let remote_client = self.remote_client.as_ref().unwrap(); + + // Does retries + exponential back-off internally. + // When this fails, don't layer further retry attempts here. + let result = remote_client + .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata) + .await; + + if let Ok(size) = &result { + // XXX the temp file is still around in Err() case + // and consumes space until we clean up upon pageserver restart. + self.metrics.resident_physical_size_gauge.add(*size); + + // Download complete. Replace the RemoteLayer with the corresponding + // Delta- or ImageLayer in the layer map. + let new_layer = remote_layer.create_downloaded_layer(self.conf, *size); + let mut layers = self.layers.write().unwrap(); + { + let l: Arc = remote_layer.clone(); + layers.remove_historic(l); + } + layers.insert_historic(new_layer); + drop(layers); + + // Now that we've inserted the download into the layer map, + // close the semaphore. This will make other waiters for + // this download return Ok(()). + assert!(!remote_layer.ongoing_download.is_closed()); + remote_layer.ongoing_download.close(); + } else { + // Keep semaphore open. We'll drop the permit at the end of the function. + } + + // Don't treat it as an error if the task that triggered the download + // is no longer interested in the result. + sender.send(result.map(|_sz| ())).ok(); + + // In case we failed and there are other waiters, this will make one + // of them retry the download in a new task. + // XXX: This resets the exponential backoff because it's a new call to + // download_layer file. + drop(permit); + + Ok(()) + }, + ); + + receiver.await.context("download task cancelled")? 
+ } + + pub async fn spawn_download_all_remote_layers( + self: Arc, + ) -> Result { + let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap(); + if let Some(st) = &*status_guard { + match &st.state { + DownloadRemoteLayersTaskState::Running => { + return Err(st.clone()); + } + DownloadRemoteLayersTaskState::ShutDown + | DownloadRemoteLayersTaskState::Completed => { + *status_guard = None; + } + } + } + + let self_clone = Arc::clone(&self); + let task_id = task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::DownloadAllRemoteLayers, + Some(self.tenant_id), + Some(self.timeline_id), + "download all remote layers task", + false, + async move { + self_clone.download_all_remote_layers().await; + let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); + match &mut *status_guard { + None => { + warn!("tasks status is supposed to be Some(), since we are running"); + } + Some(st) => { + let exp_task_id = format!("{}", task_mgr::current_task_id().unwrap()); + if st.task_id != exp_task_id { + warn!("task id changed while we were still running, expecting {} but have {}", exp_task_id, st.task_id); + } else { + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + }; + Ok(()) + } + .instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); + + let initial_info = DownloadRemoteLayersTaskInfo { + task_id: format!("{task_id}"), + state: DownloadRemoteLayersTaskState::Running, + total_layer_count: 0, + successful_download_count: 0, + failed_download_count: 0, + }; + *status_guard = Some(initial_info.clone()); + + Ok(initial_info) + } + + async fn download_all_remote_layers(self: &Arc) { + let mut downloads: FuturesUnordered<_> = { + let layers = self.layers.read().unwrap(); + layers + .iter_historic_layers() + .filter_map(|l| l.downcast_remote_layer()) + .map({ + |l| { + let self_clone = Arc::clone(self); + self_clone.download_remote_layer(l) + } + }) + .collect() + }; + + macro_rules! lock_status { + ($st:ident) => { + let mut st = self.download_all_remote_layers_task_info.write().unwrap(); + let st = st + .as_mut() + .expect("this function is only called after the task has been spawned"); + assert_eq!( + st.task_id, + format!( + "{}", + task_mgr::current_task_id().expect("we run inside a task_mgr task") + ) + ); + let $st = st; + }; + } + + { + lock_status!(st); + st.total_layer_count = downloads.len().try_into().unwrap(); + } + loop { + tokio::select! { + dl = downloads.next() => { + lock_status!(st); + match dl { + None => break, + Some(Ok(())) => { + st.successful_download_count += 1; + }, + Some(Err(e)) => { + error!(error = %e, "layer download failed"); + st.failed_download_count += 1; + } + } + } + _ = task_mgr::shutdown_watcher() => { + // Kind of pointless to watch for shutdowns here, + // as download_remote_layer spawns other task_mgr tasks internally. + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::ShutDown; + } + } + } + { + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + + pub fn get_download_all_remote_layers_task_info(&self) -> Option { + self.download_all_remote_layers_task_info + .read() + .unwrap() + .clone() + } } -/// An error happened in a get() operation. 
-#[derive(thiserror::Error)]
-pub enum PageReconstructError {
-    #[error(transparent)]
-    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
-
-    #[error(transparent)]
-    WalRedo(#[from] crate::walredo::WalRedoError),
-}
-
-impl std::fmt::Debug for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
-        match self {
-            PageReconstructError::Other(err) => err.fmt(f),
-            PageReconstructError::WalRedo(err) => err.fmt(f),
+/// Helper function to deal with [`PageReconstructResult`].
+///
+/// Takes a sync closure that returns a [`PageReconstructResult`].
+/// If it is [`PageReconstructResult::NeedsDownload`],
+/// do the download and retry the closure.
+///
+/// ### Background
+///
+/// This is a crutch to make on-demand downloads efficient in
+/// our async-sync-async sandwich codebase. Some context:
+///
+/// - The code that does the downloads uses async Rust.
+/// - The code that initiates download is many levels of sync Rust.
+/// - The sync code must wait for the download to finish to
+///   make further progress.
+/// - The sync code is invoked directly from async functions upstack.
+///
+/// Example (there are also much worse ones where the sandwich is taller):
+///
+///   async handle_get_page_at_lsn_request    page_service.rs
+///   sync  get_rel_page_at_lsn               timeline.rs
+///   sync  timeline.get                      timeline.rs
+///   sync  get_reconstruct_data              timeline.rs
+///   async download_remote_layer             timeline.rs
+///
+/// It is not possible to call `Timeline::download_remote_layer().await` within
+/// get_reconstruct_data, so instead, we return [`PageReconstructResult::NeedsDownload`]
+/// which contains references to the [`Timeline`] and [`RemoteLayer`].
+/// We bubble that error upstack to the async code, which can then call
+/// `Timeline::download_remote_layer().await`.
+/// That is _efficient_ because tokio can use the same OS thread to do
+/// other work while we're waiting for the download.
+///
+/// It is a deliberate decision to use a new result type to communicate
+/// the need for download instead of adding another variant to [`PageReconstructError`].
+/// The reason is that with the latter approach, any place that does
+/// `?` on a `Result` will implicitly ignore the
+/// need for download. We want that to be explicit, so that
+/// - the code base becomes greppable for places that don't do a download
+/// - future code changes will need to explicitly address on-demand download
+///
+/// Alternatives to consider in the future:
+///
+/// - Inside `get_reconstruct_data`, we can std::thread::spawn a thread
+///   and use it to block_on the download_remote_layer future.
+///   That is obviously inefficient as it creates one thread per download.
+/// - Convert everything to async. The problem here is that the sync
+///   functions are used by many other sync functions. So, the scope
+///   creep of such a conversion is tremendous.
+/// - Compromise between the two: implement async functions for each sync
+///   function. Switch over the hot code paths (GetPage()) to use the
+///   async path, so that the hot path doesn't spawn threads. Other code
+///   paths would remain sync initially, and get converted to async over time.
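+///
+/// # Example
+///
+/// A sketch of typical usage, mirroring [`Timeline::get_download`]:
+///
+/// ```ignore
+/// let page_img = with_ondemand_download(|| timeline.get(key, lsn)).await?;
+/// ```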
+///
+pub async fn with_ondemand_download<F, T>(mut f: F) -> Result<T>
+where
+    F: Send + FnMut() -> PageReconstructResult<T>,
+    T: Send,
+{
+    loop {
+        let closure_result = f();
+        match closure_result {
+            PageReconstructResult::NeedsDownload(weak_timeline, weak_remote_layer) => {
+                // if the timeline is gone, it has likely been deleted / tenant detached
+                let tl = weak_timeline.upgrade().context("timeline is gone")?;
+                // if the remote layer got removed, retry the function, it might succeed now
+                let remote_layer = match weak_remote_layer.upgrade() {
+                    None => {
+                        info!("remote layer is gone, retrying closure");
+                        continue;
+                    }
+                    Some(l) => l,
+                };
+                // Retries internally
+                tl.download_remote_layer(remote_layer).await?;
+                // Download successful, retry the closure
+                continue;
+            }
+            PageReconstructResult::Success(closure_value) => return Ok(closure_value),
+            PageReconstructResult::Error(e) => {
+                return Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
+            }
         }
     }
 }
@@ -2868,7 +3348,7 @@ impl std::fmt::Debug for PageReconstructError {
 fn layer_traversal_error(
     msg: String,
     path: Vec<(ValueReconstructResult, Lsn, TraversalId)>,
-) -> Result<(), PageReconstructError> {
+) -> PageReconstructResult<()> {
     // We want the original 'msg' to be the outermost context. The outermost context
     // is the most high-level information, which also gets propagated to the client.
     let mut msg_iter = path
@@ -2885,7 +3365,7 @@ fn layer_traversal_error(
     // Append all subsequent traversals, and the error message 'msg', as contexts.
     let msg = msg_iter.fold(err, |err, msg| err.context(msg));
 
-    Err(PageReconstructError::Other(msg))
+    PageReconstructResult::from(msg)
 }
 
 /// Various functions to mutate the timeline.
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 46e4acd50c..fb216123c1 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -12,7 +12,7 @@
 //!
 use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
 use once_cell::sync::OnceCell;
-use std::fs::{File, OpenOptions};
+use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
@@ -240,6 +240,10 @@ impl VirtualFile {
         self.with_file("fsync", |file| file.sync_all())?
     }
 
+    pub fn metadata(&self) -> Result<fs::Metadata, Error> {
+        self.with_file("metadata", |file| file.metadata())?
+    }
+
     /// Helper function that looks up the underlying File for this VirtualFile,
     /// opening it and evicting some other File if necessary. It calls 'func'
     /// with the physical File.
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index e8a2e99f06..e3453dfe06 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -31,7 +31,10 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
 
 use crate::pgdatadir_mapping::*;
+use crate::tenant::PageReconstructResult;
 use crate::tenant::Timeline;
+use crate::try_no_ondemand_download;
+use crate::try_page_reconstruct_result as try_prr;
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -52,10 +55,10 @@ pub struct WalIngest<'a> {
 }
 
 impl<'a> WalIngest<'a> {
-    pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result<WalIngest> {
+    pub fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result<WalIngest> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
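+        // NB (assumed semantics; the definitions are not shown in this hunk):
+        // `.no_ondemand_download()` converts a `PageReconstructResult` into an
+        // `anyhow::Result`, treating `NeedsDownload` as an error, since this sync
+        // context cannot await a download. Likewise, `try_prr!` adapts an
+        // `anyhow::Result` into a `PageReconstructResult`, and
+        // `try_no_ondemand_download!` propagates `NeedsDownload` from callees
+        // that already return `PageReconstructResult`.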
- let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint).no_ondemand_download()?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -80,10 +83,12 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, - ) -> Result<()> { + ) -> PageReconstructResult<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version) - .context("failed decoding wal record")?; + try_prr!( + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record") + ); let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -98,7 +103,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded)?; + try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded)); } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -106,13 +111,13 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create)?; + try_prr!(self.ingest_xlog_smgr_create(modification, &create)); } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate)?; + try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate)); } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( "handle RM_DBASE_ID for Postgres version {:?}", @@ -125,14 +130,14 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb)?; + try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); } } } else if self.timeline.pg_version == 15 { @@ -148,14 +153,14 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; + try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); } } } @@ -167,38 +172,38 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::Clog, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec)?; + try_prr!(self.ingest_clog_truncate_record(modification, &xlrec)); } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( + try_prr!(self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, - )?; + )); } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED || info == pg_constants::XLOG_XACT_ABORT_PREPARED { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( + try_prr!(self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, - )?; + )); // Remove twophase file. 
see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", @@ -206,9 +211,10 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid)?; + try_prr!(modification.drop_twophase_file(parsed_xact.xid)); } else if info == pg_constants::XLOG_XACT_PREPARE { - modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?; + try_prr!(modification + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))); } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -217,34 +223,34 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::MultiXactOffsets, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::MultiXactMembers, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(modification, &xlrec)?; + try_prr!(self.ingest_multixact_create_record(modification, &xlrec)); } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec)?; + try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec)); } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded)?; + try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded)); } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -258,7 +264,9 @@ impl<'a> WalIngest<'a> { { let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; + let xlog_checkpoint = try_prr!( + CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint") + ); trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", xlog_checkpoint.oldestXid, @@ -279,22 +287,23 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk)?; + try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk)); } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { - let new_checkpoint_bytes = self.checkpoint.encode()?; + let new_checkpoint_bytes = + try_prr!(self.checkpoint.encode().context("encode checkpoint")); - modification.put_checkpoint(new_checkpoint_bytes)?; + try_prr!(modification.put_checkpoint(new_checkpoint_bytes)); self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + try_prr!(modification.commit()); - Ok(()) + PageReconstructResult::Success(()) } fn ingest_decoded_block( @@ -303,7 +312,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, - ) -> Result<()> { + ) -> PageReconstructResult<()> { let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, @@ -323,7 +332,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + && !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)) { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -345,15 +354,20 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; + try_no_ondemand_download!(self.put_rel_page_image( + modification, + rel, + blk.blkno, + image.freeze() + )); } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec)?; + try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec)); } - Ok(()) + PageReconstructResult::Success(()) } fn ingest_heapam_record( @@ -505,7 +519,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, rec: &XlCreateDatabase, - ) -> Result<()> { + ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; let src_db_id = rec.src_db_id; @@ -520,14 +534,16 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn)?; + .list_rels(src_tablespace_id, src_db_id, req_lsn) + .no_ondemand_download()?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .no_ondemand_download()?; modification.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; @@ -536,7 +552,10 @@ impl<'a> WalIngest<'a> { assert_eq!(src_rel.spcnode, src_tablespace_id); assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?; + let nblocks = modification + .tline + .get_rel_size(src_rel, req_lsn, true) + .no_ondemand_download()?; let dst_rel = RelTag { spcnode: tablespace_id, dbnode: db_id, @@ -553,7 +572,8 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, 
blknum, req_lsn, true)?; + .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + .no_ondemand_download()?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; } @@ -657,7 +677,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -713,7 +733,11 @@ impl<'a> WalIngest<'a> { relnode: xnode.relnode, }; let last_lsn = self.timeline.get_last_record_lsn(); - if modification.tline.get_rel_exists(rel, last_lsn, true)? { + if modification + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { self.put_rel_drop(modification, rel)?; } } @@ -725,7 +749,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, xlrec: &XlClogTruncate, - ) -> Result<()> { + ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db @@ -767,7 +791,8 @@ impl<'a> WalIngest<'a> { let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn)? + .list_slru_segments(SlruKind::Clog, req_lsn) + .no_ondemand_download()? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { @@ -923,10 +948,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; - modification.put_rel_page_image(rel, blknum, img)?; - Ok(()) + ) -> PageReconstructResult<()> { + try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum)); + try_prr!(modification.put_rel_page_image(rel, blknum, img)); + PageReconstructResult::Success(()) } fn put_rel_wal_record( @@ -936,7 +961,8 @@ impl<'a> WalIngest<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; + self.handle_rel_extend(modification, rel, blknum) + .no_ondemand_download()?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -946,7 +972,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { modification.put_rel_truncation(rel, nblocks)?; Ok(()) } @@ -956,11 +982,17 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? { + fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { + let nblocks = if !self + .timeline + .get_rel_exists(rel, lsn, true) + .no_ondemand_download()? + { 0 } else { - self.timeline.get_rel_size(rel, lsn, true)? + self.timeline + .get_rel_size(rel, lsn, true) + .no_ondemand_download()? }; Ok(nblocks) } @@ -970,30 +1002,31 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, - ) -> Result<()> { + ) -> PageReconstructResult<()> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? 
{ - // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0)?; - 0 - } else { - self.timeline.get_rel_size(rel, last_lsn, true)? - }; + let old_nblocks = + if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) { + // create it with 0 size initially, the logic below will extend it + try_prr!(modification.put_rel_creation(rel, 0)); + 0 + } else { + try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true)) + }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks)?; + try_prr!(modification.put_rel_extend(rel, new_nblocks)); // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())); } } - Ok(()) + PageReconstructResult::Success(()) } fn put_slru_page_image( @@ -1015,7 +1048,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1027,13 +1060,16 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn)? + .get_slru_segment_exists(kind, segno, last_lsn) + .no_ondemand_download()? { // create it with 0 size initially, the logic below will extend it modification.put_slru_segment_creation(kind, segno, 0)?; 0 } else { - self.timeline.get_slru_segment_size(kind, segno, last_lsn)? + self.timeline + .get_slru_segment_size(kind, segno, last_lsn) + .no_ondemand_download()? }; if new_nblocks > old_nblocks { @@ -1099,58 +1135,103 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A)?; - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .no_ondemand_download()?; m.commit()?; assert_current_logical_size(&*tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); // Check page contents at each LSN assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1161,20 +1242,36 @@ mod tests { assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1182,35 +1279,62 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x68)); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; m.commit()?; - 
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x68), false) + .no_ondemand_download()?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .no_ondemand_download()?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x70), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .no_ondemand_download()?, ZERO_PAGE ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .no_ondemand_download()?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + 1501 + ); for blk in 2..1500 { assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .no_ondemand_download()?, ZERO_PAGE ); } assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1500") ); @@ -1226,12 +1350,24 @@ mod tests { let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .no_ondemand_download()?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); @@ -1239,19 +1375,36 @@ mod tests { m.commit()?; // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false) + .no_ondemand_download()?, + false + ); // FIXME: should fail //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none()); // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .no_ondemand_download()?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1); + assert_eq!( + tline + 
.get_rel_exists(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + 1 + ); Ok(()) } @@ -1270,23 +1423,45 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .no_ondemand_download()?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1298,24 +1473,38 @@ mod tests { m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } // should still see all blocks with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + relsize + ); for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1326,18 +1515,32 @@ mod tests { let mut m = tline.begin_modification(lsn); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .no_ondemand_download()?; } m.commit()?; - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); let data = format!("foo blk {} 
at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1358,14 +1561,18 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); - walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .no_ondemand_download()?; m.commit()?; } assert_current_logical_size(&*tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE + 1 ); @@ -1374,7 +1581,12 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, + RELSEG_SIZE + ); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block @@ -1383,7 +1595,9 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE - 1 ); assert_current_logical_size(&*tline, Lsn(lsn)); @@ -1397,7 +1611,9 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, size as BlockNumber ); diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index a65703bca9..aeb7601af7 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -407,7 +407,7 @@ impl WalreceiverState { .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id)) + .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) }); let now = Utc::now().naive_utc(); diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 5b7e60aa5e..cc318cccc8 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,7 +20,9 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use crate::{ + metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate, +}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -248,9 +250,16 @@ pub async fn handle_walreceiver_connection( // at risk of hitting a deadlock. 
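+            // NB: `with_ondemand_download` may invoke the closure below multiple
+            // times, retrying after each completed layer download; that is why
+            // `recdata` is cloned on every attempt.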
ensure!(lsn.is_aligned()); - walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) - .context("could not ingest record at {lsn}")?; + with_ondemand_download(|| { + walingest.ingest_record( + recdata.clone(), + lsn, + &mut modification, + &mut decoded, + ) + }) + .await + .with_context(|| format!("could not ingest record at {lsn}"))?; fail_point!("walreceiver-after-ingest"); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 38fb9a4247..7581140934 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1,6 +1,7 @@ //! //! Functions for parsing WAL records. //! + use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::pg_constants; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 8ea3f13bf5..d83a74ae14 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -318,14 +318,8 @@ def remote_consistent_lsn( detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: - # No remote information at all. This happens right after creating - # a timeline, before any part of it has been uploaded to remote - # storage yet. - return 0 - else: - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload( diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 5fe6c43528..9236137d19 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -49,7 +49,7 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", - "pageserver_current_physical_size", + "pageserver_resident_physical_size", "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d52ca38447..5b00ebdea7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -26,6 +26,7 @@ import asyncpg import backoff # type: ignore import boto3 import jwt +import prometheus_client import psycopg2 import pytest import requests @@ -41,6 +42,7 @@ from fixtures.utils import ( get_self_dir, subprocess_capture, ) +from prometheus_client.parser import text_string_to_metric_families # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -1204,8 +1206,22 @@ class PageserverHttpClient(requests.Session): # there are no tests for those right now. 
return size - def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") + def timeline_list( + self, + tenant_id: TenantId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + ) -> List[Dict[str, Any]]: + + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -1239,13 +1255,13 @@ class PageserverHttpClient(requests.Session): tenant_id: TenantId, timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, - include_non_incremental_physical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: params["include-non-incremental-logical-size"] = "yes" - if include_non_incremental_physical_size: - params["include-non-incremental-physical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", @@ -1320,11 +1336,88 @@ class PageserverHttpClient(requests.Session): res_json = res.json() assert res_json is None + def timeline_spawn_download_remote_layers( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> dict[str, Any]: + + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_poll_download_remote_layers_status( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + spawn_response: dict[str, Any], + poll_state=None, + ) -> None | dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + + # assumption in this API client here is that nobody else spawns the task + assert res_json["task_id"] == spawn_response["task_id"] + + if poll_state is None or res_json["state"] == poll_state: + return res_json + return None + + def timeline_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + errors_ok=False, + at_least_one_download=True, + ): + res = self.timeline_spawn_download_remote_layers(tenant_id, timeline_id) + while True: + completed = self.timeline_poll_download_remote_layers_status( + tenant_id, timeline_id, res, poll_state="Completed" + ) + if not completed: + time.sleep(0.1) + continue + if not errors_ok: + assert completed["failed_download_count"] == 0 + if at_least_one_download: + assert completed["successful_download_count"] > 0 + return completed + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) return res.text + def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str): + raw = self.get_metrics() + family: List[prometheus_client.Metric] = 
list(text_string_to_metric_families(raw)) + [metric] = [m for m in family if m.name == metric_name] + [sample] = [ + s + for s in metric.samples + if s.labels["tenant_id"] == str(tenant_id) + and s.labels["timeline_id"] == str(timeline_id) + ] + return sample.value + + def get_metric_value(self, name: str) -> Optional[str]: + metrics = self.get_metrics() + relevant = [line for line in metrics.splitlines() if line.startswith(name)] + if len(relevant) == 0: + log.info(f'could not find metric "{name}"') + return None + assert len(relevant) == 1 + return relevant[0].lstrip(name).strip() + @dataclass class PageserverPort: @@ -1622,7 +1715,12 @@ class NeonCli(AbstractNeonCli): pageserver_config_override=self.env.pageserver.config_override, ) - res = self.raw_cli(cmd) + s3_env_vars = None + if self.env.remote_storage is not None and isinstance( + self.env.remote_storage, S3Storage + ): + s3_env_vars = self.env.remote_storage.access_env_vars() + res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) res.check_returncode() return res @@ -2996,13 +3094,55 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def assert_no_in_progress_downloads_for_tenant( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, +def wait_until(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. + """ + last_exception = None + for i in range(number_of_iterations): + try: + res = func() + except Exception as e: + log.info("waiting for %s iteration %s failed", func, i + 1) + last_exception = e + time.sleep(interval) + continue + return res + raise Exception("timed out while waiting for %s" % func) from last_exception + + +def wait_while(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns false, or throws an exception. + """ + for i in range(number_of_iterations): + try: + if not func(): + return + log.info("waiting for %s iteration %s failed", func, i + 1) + time.sleep(interval) + continue + except Exception: + return + raise Exception("timed out while waiting for %s" % func) + + +def assert_tenant_status( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str ): tenant_status = pageserver_http_client.tenant_status(tenant) - assert tenant_status["has_in_progress_downloads"] is False, tenant_status - assert tenant_status["state"] == "Active" + log.info(f"tenant_status: {tenant_status}") + assert tenant_status["state"] == expected_status, tenant_status + + +def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId): + tenants = ps_http.tenant_list() + matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] + assert len(matching) < 2 + if len(matching) == 0: + return None + return matching[0] def remote_consistent_lsn( @@ -3010,14 +3150,15 @@ def remote_consistent_lsn( ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: + if detail["remote_consistent_lsn"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. 
return Lsn(0) - assert isinstance(lsn_str, str) - return Lsn(lsn_str) + else: + lsn_str = detail["remote_consistent_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) def wait_for_upload( @@ -3030,6 +3171,7 @@ def wait_for_upload( for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: + log.info("wait finished") return log.info( "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 71964f622f..05d5788028 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*Failed to load delta layer.*", + ".*Failed to reconstruct the page.*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -87,9 +87,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}" ) - # Second timeline has no ancestors, only the metadata file and no layer files. - # That is checked explicitly in the pageserver, and causes the tenant to be marked - # as broken. + # Second timeline has no ancestors, only the metadata file and no layer files locally, + # and we don't have the remote storage enabled. It is loaded into memory, but getting + # the basebackup from it will fail. with pytest.raises( Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken" ) as err: @@ -97,8 +97,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): log.info(f"As expected, compute startup failed for timeline with missing layers: {err}") # Third timeline will also fail during basebackup, because the layer file is corrupt. + # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) - with pytest.raises(Exception, match="Failed to load delta layer") as err: + with pytest.raises(Exception, match="Failed to reconstruct the page") as err: pg3.start() log.info( f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}" diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 7f86d92962..fa1bf0fbb2 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -37,7 +37,7 @@ def metrics_handler(request: Request) -> Response: checks = { "written_size": lambda value: value > 0, - "physical_size": lambda value: value >= 0, + "resident_size": lambda value: value >= 0, # >= 0 check here is to avoid race condition when we receive metrics before # remote_uploaded is updated "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py new file mode 100644 index 0000000000..352ae4b95c --- /dev/null +++ b/test_runner/regress/test_ondemand_download.py @@ -0,0 +1,437 @@ +# It's possible to run any regular test with the local fs remote storage via +# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
+
+from pathlib import Path
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    assert_tenant_status,
+    available_remote_storages,
+    wait_for_last_record_lsn,
+    wait_for_sk_commit_lsn_to_reach_remote_storage,
+    wait_for_upload,
+    wait_until,
+)
+from fixtures.types import Lsn
+from fixtures.utils import query_scalar
+
+
+def get_num_downloaded_layers(client, tenant_id, timeline_id):
+    value = client.get_metric_value(
+        f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}'
+    )
+    if value is None:
+        return 0
+    return int(value)
+
+
+#
+# If you have a large relation, check that the pageserver downloads parts of it as
+# required by queries.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_ondemand_download_large_rel(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_large_rel",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable background GC
+            "gc_period": "10 m",
+            "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            "compaction_threshold": "3",
+            "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    # We want to make sure that the data is large enough that the keyspace is partitioned.
+    num_rows = 1000000
+
+    with pg.cursor() as cur:
+        # data loading may take a while, so increase statement timeout
+        cur.execute("SET statement_timeout='300s'")
+        cur.execute(
+            f"""CREATE TABLE tbl AS SELECT g as id, 'long string to consume some space' || g
+        from generate_series(1,{num_rows}) g"""
+        )
+        cur.execute("CREATE INDEX ON tbl (id)")
+        cur.execute("VACUUM tbl")
+
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    # wait until pageserver receives that data
+    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+    # run checkpoint manually to be sure that data landed in remote storage
+    client.timeline_checkpoint(tenant_id, timeline_id)
+
+    # wait until pageserver successfully uploaded a checkpoint to remote storage
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+    log.info("uploads have finished")
+
+    ##### Stop the first pageserver instance, erase all its data
+    pg.stop()
+    env.pageserver.stop()
+
+    # remove all the layer files
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start()
+
+    pg.start()
+    before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+
+    # Probe in the middle of the table. There's a high chance that the beginning
+    # and end of the table were stored together in the same layer files with data
+    # from other tables, and with the entry that stores the size of the
+    # relation, so they are likely already downloaded. But the middle of the
+    # table should not have been needed by anything yet.
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1
+
+    after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    log.info(f"layers downloaded before {before_downloads} and after {after_downloads}")
+    assert after_downloads > before_downloads
+
+
+#
+# If you have a relation with a long history of updates, the pageserver downloads the layer
+# files containing the history as needed by timetravel queries.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_ondemand_download_timetravel(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_timetravel",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC & compaction
+            # We don't want GC, that would break the assertion about num downloads.
+            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1 * 1024 ** 2}",  # 1 MB
+            "compaction_threshold": "1",
+            "image_creation_threshold": "1",
+            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    lsns = []
+
+    table_len = 10000
+    with pg.cursor() as cur:
+        cur.execute(
+            f"""
+            CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
+            INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len});
+            """
+        )
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+    # wait until pageserver receives that data
+    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+    # run checkpoint manually to be sure that data landed in remote storage
+    client.timeline_checkpoint(tenant_id, timeline_id)
+    lsns.append((0, current_lsn))
+
+    for checkpoint_number in range(1, 20):
+        with pg.cursor() as cur:
+            cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}")
+            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+        lsns.append((checkpoint_number, current_lsn))
+
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+        # run checkpoint manually to be sure that data landed in remote storage
+        client.timeline_checkpoint(tenant_id, timeline_id)
+
+    # wait until pageserver successfully uploaded a checkpoint to remote storage
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+    log.info("uploads have finished")
+
+    ##### Stop the first pageserver instance, erase all its data
+    env.postgres.stop_all()
+
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    def get_api_current_physical_size():
+        d = client.timeline_detail(tenant_id, timeline_id)
+        return d["current_physical_size"]
+
+    def get_resident_physical_size():
+        return client.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+
+    filled_current_physical = get_api_current_physical_size()
+    log.info(filled_current_physical)
+    filled_size = get_resident_physical_size()
+    log.info(filled_size)
+    assert filled_current_physical == filled_size, "we don't yet do layer eviction"
+
+    env.pageserver.stop()
+
+    # remove all the layer files
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start()
+
+    wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
+
+    # current_physical_size reports sum of layer file sizes, regardless of local or remote
+    assert filled_current_physical == get_api_current_physical_size()
+
+    num_layers_downloaded = [0]
+    physical_size = [get_resident_physical_size()]
+    for (checkpoint_number, lsn) in lsns:
+        pg_old = env.postgres.create_start(
+            branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn
+        )
+        with pg_old.cursor() as cur:
+            # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000
+            assert (
+                query_scalar(
+                    cur,
+                    f"select count(*) from testtab where checkpoint_number<>{checkpoint_number}",
+                )
+                == 0
+            )
+            assert (
+                query_scalar(
+                    cur,
+                    f"select count(*) from testtab where checkpoint_number={checkpoint_number}",
+                )
+                == table_len
+            )
+
+        after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+        num_layers_downloaded.append(after_downloads)
+        log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}")
+
+        # Check that on each query, we need to download at least one more layer file. However in
+        # practice, thanks to compaction and the fact that some requests need to download
+        # more history, some points-in-time are covered by earlier downloads already. But
+        # in broad strokes, as we query more points-in-time, more layers need to be downloaded.
+        #
+        # Do a fuzzy check on that, by checking that after each point-in-time, we have downloaded
+        # more files than we had three iterations ago.
+        log.info(f"layers downloaded after checkpoint {checkpoint_number}: {after_downloads}")
+        if len(num_layers_downloaded) > 4:
+            assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4]
+
+        # Likewise, assert that the resident_physical_size metric grows as layers
+        # are downloaded
+        physical_size.append(get_resident_physical_size())
+        log.info(f"physical_size[-1]={physical_size[-1]}")
+        if len(physical_size) > 4:
+            assert physical_size[-1] > physical_size[len(physical_size) - 4]
+
+    # current_physical_size reports the sum of layer file sizes, regardless of
+    # whether the layers are local or remote
+    assert filled_current_physical == get_api_current_physical_size()
+
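+# Size semantics relied on by the assertions above and below, in miniature
+# (illustrative numbers only): with layer A resident on local disk (8 MiB) and
+# layer B present only in remote storage (8 MiB),
+#   current_physical_size             counts A and B: 16 MiB
+#   pageserver_resident_physical_size counts only A:   8 MiB
+# Without layer eviction, every layer is resident and the two values coincide.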
+
+#
+# Ensure that the `download_remote_layers` API works
+#
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_download_remote_layers_api(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_download_remote_layers_api",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC & compaction.
+            # We don't want GC, because it would break the assertions about the number of downloads.
+            # We don't want background compaction either; we force a compaction on every explicit checkpoint.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1 * 1024 ** 2}",  # 1 MB
+            "compaction_threshold": "1",
+            "image_creation_threshold": "1",
+            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    table_len = 10000
+    with pg.cursor() as cur:
+        cur.execute(
+            f"""
+            CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
+            INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len});
+            """
+        )
+
+    env.postgres.stop_all()
+
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    def get_api_current_physical_size():
+        d = client.timeline_detail(tenant_id, timeline_id)
+        return d["current_physical_size"]
+
+    def get_resident_physical_size():
+        return client.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+
+    filled_current_physical = get_api_current_physical_size()
+    log.info(filled_current_physical)
+    filled_size = get_resident_physical_size()
+    log.info(filled_size)
+    assert filled_current_physical == filled_size, "we don't yet do layer eviction"
+
+    env.pageserver.stop()
+
+    # remove all the layer files
+    # XXX: it would be better to delete only some of the layer files, to show that
+    # the API really downloads all the layers, not just the ones that are needed
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    # Shut down safekeepers before starting the pageserver.
+    # If we don't, the tenant's walreceiver handler will trigger the
+    # logical size computation task, and that downloads layers,
+    # which makes our assertions on size fail.
+    for sk in env.safekeepers:
+        sk.stop(immediate=True)
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start(
+        extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}
+    )
+    env.pageserver.allowed_errors.extend(
+        [
+            f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint",
+            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
+        ]
+    )
+
+    wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
+
+    ###### Phase 1: exercise the download error code path
+    assert (
+        filled_current_physical == get_api_current_physical_size()
+    ), "current_physical_size is the sum of loaded layer sizes, independent of whether local or remote"
+    post_unlink_size = get_resident_physical_size()
+    log.info(post_unlink_size)
+    assert (
+        post_unlink_size < filled_size
+    ), "we just deleted layers and didn't cause anything to re-download them yet"
+    assert filled_size - post_unlink_size > 5 * (
+        1024**2
+    ), "we may be downloading some layers as part of tenant activation"
+
+    # issue downloads that we know will fail
+    info = client.timeline_download_remote_layers(
+        tenant_id, timeline_id, errors_ok=True, at_least_one_download=False
+    )
+    log.info(f"info={info}")
+    assert info["state"] == "Completed"
+    assert info["total_layer_count"] > 0
+    assert info["successful_download_count"] == 0
+    # can't assert failed_download_count == total_layer_count, because attach and
+    # the tenant status check download some layers
+    assert info["failed_download_count"] > 0
+    assert (
+        info["total_layer_count"]
+        == info["successful_download_count"] + info["failed_download_count"]
+    )
+    assert get_api_current_physical_size() == filled_current_physical
+    assert (
+        get_resident_physical_size() == post_unlink_size
+    ), "didn't download anything new due to failpoint"
+    # would be nice to assert that the layers in the layer map are still RemoteLayer
+
+    ##### Retry, this time without failpoints
+    client.configure_failpoints(("remote-storage-download-pre-rename", "off"))
+    info = client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False)
+    log.info(f"info={info}")
+
+    assert info["state"] == "Completed"
+    assert info["total_layer_count"] > 0
+    assert info["successful_download_count"] > 0
+    assert info["failed_download_count"] == 0
+    assert (
+        info["total_layer_count"]
+        == info["successful_download_count"] + info["failed_download_count"]
+    )
+
+    refilled_size = get_resident_physical_size()
+    log.info(refilled_size)
+
+    assert filled_size == refilled_size, "we re-downloaded all the layers"
+    assert get_api_current_physical_size() == filled_current_physical
+
+    for sk in env.safekeepers:
+        sk.start()
+
+    # ensure that all the data is back
+    pg_old = env.postgres.create_start(branch_name="main")
+    with pg_old.cursor() as cur:
+        assert query_scalar(cur, "select count(*) from testtab") == table_len
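Both tests in this new file drive the failpoint machinery in two ways; for reference, the two calls as they appear above, first arming a failpoint at pageserver startup through the environment ("return" makes the instrumented code path fail), then disarming it at runtime over the HTTP API:

    env.pageserver.start(
        extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}
    )
    client.configure_failpoints(("remote-storage-download-pre-rename", "off"))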
"test_remote_failures=1" data_id = 1 - data_secret = "very secret secret" + data = "just some data" - ##### First start, insert secret data and upload it to the remote storage + ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() # FIXME: Is this expected? @@ -97,8 +96,8 @@ def test_remote_storage_backup_and_restore( with pg.cursor() as cur: cur.execute( f""" - CREATE TABLE t{checkpoint_number}(id int primary key, secret text); - INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + CREATE TABLE t{checkpoint_number}(id int primary key, data text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data}|{checkpoint_number}'); """ ) current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -133,36 +132,53 @@ def test_remote_storage_backup_and_restore( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - # Introduce failpoint in download - pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) - + # Introduce failpoint in list remote timelines code path to make tenant_attach fail. + # This is before the failures injected by test_remote_failures, so it's a permanent error. + pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) + env.pageserver.allowed_errors.append( + ".*error attaching tenant: storage-sync-list-remote-timelines", + ) + # Attach it. This HTTP request will succeed and launch a + # background task to load the tenant. In that background task, + # listing the remote timelines will fail because of the failpoint, + # and the tenant will be marked as Broken. client.tenant_attach(tenant_id) - - # is there a better way to assert that failpoint triggered? wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) - # assert cannot attach timeline that is scheduled for download - # FIXME implement layer download retries + # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): client.tenant_attach(tenant_id) - tenant_status = client.tenant_status(tenant_id) - log.info("Tenant status with active failpoint: %s", tenant_status) - # FIXME implement layer download retries - # assert tenant_status["has_in_progress_downloads"] is True - - # trigger temporary download files removal + # Restart again, this implicitly clears the failpoint. + # test_remote_failures=1 remains active, though, as it's in the pageserver config. + # This means that any of the remote client operations after restart will exercise the + # retry code path. + # + # The initiated attach operation should survive the restart, and continue from where it was. env.pageserver.stop() + layer_download_failed_regex = ( + r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" + ) + assert not env.pageserver.log_contains( + layer_download_failed_regex + ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" env.pageserver.start() - # ensure that an initiated attach operation survives pageserver restart + # Ensure that the pageserver remembers that the tenant was attaching, by + # trying to attach it again. It should fail. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): client.tenant_attach(tenant_id) - log.info("waiting for timeline redownload") + log.info("waiting for tenant to become active. 
 
 # Exercises the upload queue retry code paths.
 # - Use failpoints to cause all storage ops to fail
@@ -338,7 +358,6 @@ def test_remote_storage_upload_queue_retries(
     def tenant_active():
         all_states = client.tenant_list()
         [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["has_in_progress_downloads"] is False
         assert tenant["state"] == "Active"
 
     wait_until(30, 1, tenant_active)
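Several diffs in this patch replace assert_no_in_progress_downloads_for_tenant with assert_tenant_status. The fixture is defined in fixtures/neon_fixtures.py; a plausible minimal shape, shown here only to make the call sites below easier to read:

    def assert_tenant_status(pageserver_http_client, tenant_id, expected_status: str):
        tenant_status = pageserver_http_client.tenant_status(tenant_id)
        assert tenant_status["state"] == expected_status, tenant_status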
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 081fd0fc2f..1b58937e2a 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -13,12 +13,15 @@ from fixtures.neon_fixtures import (
     PageserverHttpClient,
     PortDistributor,
     Postgres,
-    assert_no_in_progress_downloads_for_tenant,
+    assert_tenant_status,
+    tenant_exists,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
+    wait_while,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until
+from fixtures.utils import query_scalar, start_in_background, subprocess_capture
 
 
 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -406,17 +409,13 @@ def test_tenant_relocation(
     # call to attach timeline to new pageserver
     new_pageserver_http.tenant_attach(tenant_id)
 
-    # check that it shows that download is in progress
+    # wait for tenant to finish attaching
     tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
-    assert tenant_status.get("has_in_progress_downloads"), tenant_status
-
-    # wait until tenant is downloaded
+    assert tenant_status["state"] in ["Attaching", "Active"]
     wait_until(
         number_of_iterations=10,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(
-            new_pageserver_http, tenant_id
-        ),
+        func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"),
     )
 
     check_timeline_attached(
@@ -459,9 +458,15 @@ def test_tenant_relocation(
     # detach tenant from old pageserver before we check
     # that all the data is there to be sure that old pageserver
-    # is no longer involved, and if it is, we will see the errors
+    # is no longer involved, and if it is, we will see the error
     pageserver_http.tenant_detach(tenant_id)
 
+    # Wait a little, so that the detach operation has time to finish.
+    wait_while(
+        number_of_iterations=100,
+        interval=1,
+        func=lambda: tenant_exists(pageserver_http, tenant_id),
+    )
 
     post_migration_check(pg_main, 500500, old_local_path_main)
     post_migration_check(pg_second, 1001000, old_local_path_second)
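The relocation test above leans on the wait_until and wait_while polling fixtures. Their real definitions live in the fixtures modules; a minimal sketch of the polling pattern, assuming wait_until retries until func stops raising and wait_while polls until func turns falsy:

    import time

    def wait_until(number_of_iterations: int, interval: float, func):
        last_exc = None
        for _ in range(number_of_iterations):
            try:
                return func()
            except Exception as e:
                last_exc = e
                time.sleep(interval)
        raise last_exc

    def wait_while(number_of_iterations: int, interval: float, func):
        for _ in range(number_of_iterations):
            if not func():
                return
            time.sleep(interval)
        raise RuntimeError("timed out")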
diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py
index ddae1a67ff..4eba4ce942 100644
--- a/test_runner/regress/test_tenant_tasks.py
+++ b/test_runner/regress/test_tenant_tasks.py
@@ -20,44 +20,48 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
         matching = [t for t in all_states if TenantId(t["id"]) == tenant]
         return get_only_element(matching)["state"]
 
-    def get_metric_value(name):
-        metrics = client.get_metrics()
-        relevant = [line for line in metrics.splitlines() if line.startswith(name)]
-        if len(relevant) == 0:
-            return 0
-        line = get_only_element(relevant)
-        value = line.lstrip(name).strip()
-        return int(value)
-
     def delete_all_timelines(tenant: TenantId):
         timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)]
         for t in timelines:
             client.timeline_delete(tenant, t)
 
+    def assert_active(tenant):
+        assert get_state(tenant) == "Active"
+
     # Create tenant, start compute
     tenant, _ = env.neon_cli.create_tenant()
     env.neon_cli.create_timeline(name, tenant_id=tenant)
     pg = env.postgres.create_start(name, tenant_id=tenant)
+    assert (
+        get_state(tenant) == "Active"
+    ), "Pageserver should activate a tenant and start background jobs if timelines are loaded"
 
     # Stop compute
     pg.stop()
 
-    # Delete all timelines on all tenants
+    # Delete all timelines on all tenants.
+    #
+    # FIXME: we used to check that the background jobs are stopped when all timelines
+    # are removed, but we don't stop them anymore. Not sure if this test still makes
+    # sense, or if we should just remove it.
     for tenant_info in client.tenant_list():
         tenant_id = TenantId(tenant_info["id"])
         delete_all_timelines(tenant_id)
+        wait_until(10, 0.2, lambda: assert_active(tenant_id))
 
     # Assert that all tasks finish quickly after tenant is detached
-    assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0
+    task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
+    assert task_starts is not None
+    assert int(task_starts) > 0
     client.tenant_detach(tenant)
     client.tenant_detach(env.initial_tenant)
 
     def assert_tasks_finish():
-        tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}')
-        tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}')
-        tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}')
+        tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
+        tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}')
+        tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}')
         log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}")
         assert tasks_started == tasks_ended
-        assert tasks_panicked == 0
+        assert tasks_panicked is None or int(tasks_panicked) == 0
 
     wait_until(10, 0.2, assert_tasks_finish)
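The switch from the file-local get_metric_value helper to client.get_metric_value changes how a missing metric reads: the old helper returned 0, while the client method evidently returns None for a metric that has not been emitted yet, which is why the panic assertion above spells out `is None or int(...) == 0`. A small wrapper restoring the old default, as a sketch:

    def metric_as_int(client, name: str) -> int:
        # treat an unreported metric as zero, like the removed helper did
        value = client.get_metric_value(name)
        return 0 if value is None else int(value)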
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 4cd74e17e9..6a5b4278da 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -21,7 +21,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     Postgres,
     RemoteStorageKind,
-    assert_no_in_progress_downloads_for_tenant,
+    assert_tenant_status,
     available_remote_storages,
     wait_for_last_record_lsn,
     wait_for_sk_commit_lsn_to_reach_remote_storage,
@@ -179,14 +179,6 @@ def test_tenants_attached_after_download(
         tenant_id, timeline_id, env.safekeepers, env.pageserver
     )
 
-    detail_before = client.timeline_detail(
-        tenant_id, timeline_id, include_non_incremental_physical_size=True
-    )
-    assert (
-        detail_before["current_physical_size_non_incremental"]
-        == detail_before["current_physical_size"]
-    )
-
     env.pageserver.stop()
 
     timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
@@ -200,13 +192,16 @@ def test_tenants_attached_after_download(
     assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}"
 
     ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory
+    # FIXME: just starting the pageserver no longer downloads the
+    # layer files. Do we want to force a download, or maybe run some
+    # queries, or is it enough that it starts up without layer files?
     env.pageserver.start()
     client = env.pageserver.http_client()
 
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=lambda: assert_tenant_status(client, tenant_id, "Active"),
     )
 
     restored_timelines = client.timeline_list(tenant_id)
@@ -218,12 +213,6 @@ def test_tenants_attached_after_download(
         timeline_id
     ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
 
-    # Check that the physical size matches after re-downloading
-    detail_after = client.timeline_detail(
-        tenant_id, timeline_id, include_non_incremental_physical_size=True
-    )
-    assert detail_before["current_physical_size"] == detail_after["current_physical_size"]
-
     # Check that we had to retry the downloads
     assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*")
 
@@ -297,7 +286,7 @@ def test_tenant_upgrades_index_json_from_v0(
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id),
+        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
     )
 
     pg = env.postgres.create_start("main")
@@ -404,7 +393,7 @@ def test_tenant_ignores_backup_file(
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id),
+        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
     )
 
     pg = env.postgres.create_start("main")
@@ -484,14 +473,15 @@ def test_tenant_redownloads_truncated_file_on_startup(
     index_part = local_fs_index_part(env, tenant_id, timeline_id)
     assert index_part["layer_metadata"][path.name]["file_size"] == expected_size
 
-    ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory
+    ## Start the pageserver. It will notice that the file size doesn't match, and
+    ## rename away the local file. It will be re-downloaded when it's needed.
     env.pageserver.start()
     client = env.pageserver.http_client()
 
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=lambda: assert_tenant_status(client, tenant_id, "Active"),
     )
 
     restored_timelines = client.timeline_list(tenant_id)
@@ -503,6 +493,10 @@ def test_tenant_redownloads_truncated_file_on_startup(
         timeline_id
     ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
 
+    # Request the non-incremental logical size. Calculating it needs the layer file
+    # that we corrupted, forcing it to be re-downloaded.
+    client.timeline_detail(tenant_id, timeline_id, include_non_incremental_logical_size=True)
+
     assert os.stat(path).st_size == expected_size, "truncated layer should have been re-downloaded"
 
     # the remote side of local_layer_truncated
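The truncation test above hinges on one comparison: the file size recorded in the remote index_part against the size of the layer file on local disk. The same check in miniature, as a self-contained helper:

    import os
    from pathlib import Path

    def assert_local_layer_matches_index(index_part: dict, path: Path):
        # size recorded in the remote index vs. the local file's actual size
        expected_size = index_part["layer_metadata"][path.name]["file_size"]
        assert os.stat(path).st_size == expected_size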
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 523c946a68..3b41cc5c90 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,10 +20,12 @@ from fixtures.neon_fixtures import (
     PortDistributor,
     Postgres,
     VanillaPostgres,
+    assert_tenant_status,
     wait_for_last_flush_lsn,
+    wait_until,
 )
 from fixtures.types import TenantId, TimelineId
-from fixtures.utils import get_timeline_dir_size, wait_until
+from fixtures.utils import get_timeline_dir_size
 
 
 def test_timeline_size(neon_simple_env: NeonEnv):
@@ -320,7 +322,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
     env.pageserver.stop()
     env.pageserver.start()
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    # Wait for the tenant to be loaded
+    client = env.pageserver.http_client()
+    wait_until(
+        number_of_iterations=5,
+        interval=1,
+        func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"),
+    )
+
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
@@ -341,7 +353,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
     wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
@@ -376,7 +390,9 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
@@ -415,7 +431,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 # The timeline logical and physical sizes are also exposed as prometheus metrics.
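The pageserver metrics endpoint serves the prometheus text format, so a single sample can be pulled out with an anchored regex, as the next hunk does. A standalone illustration with a made-up sample line:

    import re

    metrics = 'pageserver_resident_physical_size{tenant_id="t",timeline_id="tl"} 16777216\n'
    match = re.search(
        r'^pageserver_resident_physical_size\{tenant_id="t",timeline_id="tl"\} (\S+)$',
        metrics,
        re.MULTILINE,
    )
    assert match is not None
    assert int(match.group(1)) == 16 * 1024**2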
@@ -448,7 +466,7 @@ def test_timeline_size_metrics(
     # get the metrics and parse the metric for the current timeline's physical size
     metrics = env.pageserver.http_client().get_metrics()
     matches = re.search(
-        f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
+        f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
         metrics,
         re.MULTILINE,
     )
@@ -507,11 +525,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
 
     tenant, timeline = env.neon_cli.create_tenant()
 
-    def get_timeline_physical_size(timeline: TimelineId):
-        res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True)
-        return res["current_physical_size_non_incremental"]
+    def get_timeline_resident_physical_size(timeline: TimelineId):
+        sizes = get_physical_size_values(env, tenant, timeline)
+        assert_physical_size_invariants(sizes)
+        return sizes.prometheus_resident_physical
 
-    timeline_total_size = get_timeline_physical_size(timeline)
+    timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
 
     for i in range(10):
         n_rows = random.randint(100, 1000)
@@ -528,22 +547,54 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
         wait_for_last_flush_lsn(env, pg, tenant, timeline)
         pageserver_http.timeline_checkpoint(tenant, timeline)
 
-        timeline_total_size += get_timeline_physical_size(timeline)
+        timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
 
     pg.stop()
 
-    tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"])
-    assert tenant_physical_size == timeline_total_size
+    # ensure that the tenant_status current_physical_size reports the sum of the
+    # timelines' current_physical_size
+    tenant_current_physical_size = int(
+        client.tenant_status(tenant_id=tenant)["current_physical_size"]
+    )
+    assert tenant_current_physical_size == sum(
+        [tl["current_physical_size"] for tl in client.timeline_list(tenant_id=tenant)]
+    )
+    # since we don't do layer eviction, current_physical_size is identical to
+    # resident physical size
+    assert timeline_total_resident_physical_size == tenant_current_physical_size
 
 
-def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
-    """Check the current physical size returned from timeline API
-    matches the total physical size of the timeline on disk"""
+class TimelinePhysicalSizeValues:
+    api_current_physical: int
+    prometheus_resident_physical: int
+    python_timelinedir_layerfiles_physical: int
+
+
+def get_physical_size_values(
+    env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId
+) -> TimelinePhysicalSizeValues:
+    res = TimelinePhysicalSizeValues()
+
     client = env.pageserver.http_client()
-    res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True)
+
+    res.prometheus_resident_physical = client.get_timeline_metric(
+        tenant_id, timeline_id, "pageserver_resident_physical_size"
+    )
+
+    detail = client.timeline_detail(
+        tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
+    )
+    res.api_current_physical = detail["current_physical_size"]
+
     timeline_path = env.timeline_dir(tenant_id, timeline_id)
-    assert res["current_physical_size"] == res["current_physical_size_non_incremental"]
-    assert res["current_physical_size"] == get_timeline_dir_size(timeline_path)
+    res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path)
+
+    return res
+
+
+def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
+    # resident physical size is defined as the size of the layer files that are
+    # present in the timeline directory on local disk
+    assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
+    # we don't do layer eviction, so all layers are resident
+    assert sizes.api_current_physical == sizes.prometheus_resident_physical
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index d88ed319b5..77ec33f8b0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -585,17 +585,23 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
         if elapsed > wait_lsn_timeout:
             raise RuntimeError("Timed out waiting for WAL redo")
 
-        pageserver_lsn = Lsn(
-            env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"]
-        )
-        lag = last_lsn - pageserver_lsn
+        tenant_status = ps_cli.tenant_status(tenant_id)
+        if tenant_status["state"] == "Loading":
+            log.debug(f"Tenant {tenant_id} is still loading, retrying")
+        else:
+            pageserver_lsn = Lsn(
+                env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[
+                    "last_record_lsn"
+                ]
+            )
+            lag = last_lsn - pageserver_lsn
 
-        if time.time() > last_debug_print + 10 or lag <= 0:
-            last_debug_print = time.time()
-            log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
+            if time.time() > last_debug_print + 10 or lag <= 0:
+                last_debug_print = time.time()
+                log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
 
-        if lag <= 0:
-            break
+            if lag <= 0:
+                break
 
         time.sleep(1)