diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b2c9a19588..fe24f6330e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -53,7 +53,7 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md - ## Release ${RELEASE_DATE} + ## Storage & Compute release ${RELEASE_DATE} **Please merge this Pull Request using 'Create a merge commit' button** EOF diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5bf3246f34..87fb218245 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -243,12 +243,15 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch +# By default, pgvector Makefile uses `-march=native`. We don't want that, +# because we build the images on different machines than where we run them. +# Pass OPTFLAGS="" to remove it. RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index ccf9108895..9f57f3d507 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -178,6 +178,13 @@ impl PgConnectionConfig { } } +impl fmt::Display for PgConnectionConfig { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // The password is intentionally hidden and not part of this display string. + write!(f, "postgresql://{}:{}", self.host, self.port) + } +} + impl fmt::Debug for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")` diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 220d4ef115..24c1248304 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -29,7 +29,6 @@ use http_types::{StatusCode, Url}; use tokio_util::sync::CancellationToken; use tracing::debug; -use crate::RemoteStorageActivity; use crate::{ error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, @@ -526,10 +525,6 @@ impl RemoteStorage for AzureBlobStorage { // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview Err(TimeTravelError::Unimplemented) } - - fn activity(&self) -> RemoteStorageActivity { - self.concurrency_limiter.activity() - } } pin_project_lite::pin_project! { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index f024021507..708662f20f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -263,17 +263,6 @@ pub trait RemoteStorage: Send + Sync + 'static { done_if_after: SystemTime, cancel: &CancellationToken, ) -> Result<(), TimeTravelError>; - - /// Query how busy we currently are: may be used by callers which wish to politely - /// back off if there are already a lot of operations underway. - fn activity(&self) -> RemoteStorageActivity; -} - -pub struct RemoteStorageActivity { - pub read_available: usize, - pub read_total: usize, - pub write_available: usize, - pub write_total: usize, } /// DownloadStream is sensitive to the timeout and cancellation used with the original @@ -455,15 +444,6 @@ impl GenericRemoteStorage> { } } } - - pub fn activity(&self) -> RemoteStorageActivity { - match self { - Self::LocalFs(s) => s.activity(), - Self::AwsS3(s) => s.activity(), - Self::AzureBlob(s) => s.activity(), - Self::Unreliable(s) => s.activity(), - } - } } impl GenericRemoteStorage { @@ -794,9 +774,6 @@ struct ConcurrencyLimiter { // The helps to ensure we don't exceed the thresholds. write: Arc, read: Arc, - - write_total: usize, - read_total: usize, } impl ConcurrencyLimiter { @@ -825,21 +802,10 @@ impl ConcurrencyLimiter { Arc::clone(self.for_kind(kind)).acquire_owned().await } - fn activity(&self) -> RemoteStorageActivity { - RemoteStorageActivity { - read_available: self.read.available_permits(), - read_total: self.read_total, - write_available: self.write.available_permits(), - write_total: self.write_total, - } - } - fn new(limit: usize) -> ConcurrencyLimiter { Self { read: Arc::new(Semaphore::new(limit)), write: Arc::new(Semaphore::new(limit)), - read_total: limit, - write_total: limit, } } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index f12f6590a3..1f7bcfc982 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity, - TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -605,16 +605,6 @@ impl RemoteStorage for LocalFs { ) -> Result<(), TimeTravelError> { Err(TimeTravelError::Unimplemented) } - - fn activity(&self) -> RemoteStorageActivity { - // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available - RemoteStorageActivity { - read_available: 16, - read_total: 16, - write_available: 16, - write_total: 16, - } - } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0f6772b274..c3d6c75e20 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -47,8 +47,8 @@ use utils::backoff; use super::StorageMetadata; use crate::{ error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, - Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config, - TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, + MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -975,10 +975,6 @@ impl RemoteStorage for S3Bucket { } Ok(()) } - - fn activity(&self) -> RemoteStorageActivity { - self.concurrency_limiter.activity() - } } /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 66522e04ca..c467a2d196 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken; use crate::{ Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - RemoteStorageActivity, StorageMetadata, TimeTravelError, + StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -213,8 +213,4 @@ impl RemoteStorage for UnreliableWrapper { .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await } - - fn activity(&self) -> RemoteStorageActivity { - self.inner.activity() - } } diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index c34176af57..156b99a010 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -135,7 +135,8 @@ impl Gate { let started_at = std::time::Instant::now(); let mut do_close = std::pin::pin!(self.do_close()); - let nag_after = Duration::from_secs(1); + // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms + let nag_after = Duration::from_millis(100); let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else { return; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index a7c8bd5c1f..776c537d03 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -380,8 +380,8 @@ impl interface::CompactionLayer for MockLayer { } fn file_size(&self) -> u64 { match self { - MockLayer::Delta(this) => this.file_size(), - MockLayer::Image(this) => this.file_size(), + MockLayer::Delta(this) => this.file_size, + MockLayer::Image(this) => this.file_size, } } fn short_id(&self) -> String { diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 0d010eb009..2998b5c732 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; use utils::lsn::Lsn; @@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; #[derive(serde::Serialize)] struct Output<'a> { - layer_metadata: &'a HashMap, + layer_metadata: &'a HashMap, disk_consistent_lsn: Lsn, timeline_metadata: &'a TimelineMetadata, } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 7f25e49570..90bd4294bb 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( }); } EvictionLayer::Secondary(layer) => { - let file_size = layer.metadata.file_size(); + let file_size = layer.metadata.file_size; js.spawn(async move { layer @@ -641,7 +641,7 @@ impl EvictionLayer { pub(crate) fn get_file_size(&self) -> u64 { match self { Self::Attached(l) => l.layer_desc().file_size, - Self::Secondary(sl) => sl.metadata.file_size(), + Self::Secondary(sl) => sl.metadata.file_size, } } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7b55e88096..8a061f3ae1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -395,7 +395,7 @@ async fn build_timeline_info_common( let guard = timeline.last_received_wal.lock().unwrap(); if let Some(info) = guard.as_ref() { ( - Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only. + Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only. Some(info.last_received_msg_lsn), Some(info.last_received_msg_ts), ) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d250864fd6..e9651165b1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -260,6 +260,8 @@ async fn page_service_conn_main( socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); let socket = std::pin::pin!(socket); + fail::fail_point!("ps::connection-start::pre-login"); + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler @@ -603,6 +605,7 @@ impl PageServerHandler { }; trace!("query: {copy_data_bytes:?}"); + fail::fail_point!("ps::handle-pagerequest-message"); // Trace request if needed if let Some(t) = tracer.as_mut() { @@ -617,6 +620,7 @@ impl PageServerHandler { let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) @@ -626,6 +630,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) @@ -635,6 +640,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetPage(req) => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); // shard_id is filled in by the handler let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( @@ -645,6 +651,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) @@ -654,6 +661,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetSlruSegment(req) => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); ( self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) @@ -1505,6 +1513,7 @@ where _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { + fail::fail_point!("ps::connection-start::startup-packet"); Ok(()) } @@ -1519,6 +1528,8 @@ where Err(QueryError::SimulatedConnectionError) }); + fail::fail_point!("ps::connection-start::process-query"); + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); let parts = query_string.split_whitespace().collect::>(); diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 24b4e4f3ea..2be8816cef 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -238,10 +238,13 @@ impl BlobWriter { io_buf, Err(Error::new( ErrorKind::Other, - format!("blob too large ({} bytes)", len), + format!("blob too large ({len} bytes)"), )), ); } + if len > 0x0fff_ffff { + tracing::warn!("writing blob above future limit ({len} bytes)"); + } let mut len_buf = (len as u32).to_be_bytes(); len_buf[0] |= 0x80; io_buf.extend_from_slice(&len_buf[..]); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index d3adae6841..23904b9da4 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1192,7 +1192,7 @@ impl RemoteTimelineClient { &self.storage_impl, uploaded.local_path(), &remote_path, - uploaded.metadata().file_size(), + uploaded.metadata().file_size, cancel, ) .await @@ -1573,7 +1573,7 @@ impl RemoteTimelineClient { &self.storage_impl, local_path, &remote_path, - layer_metadata.file_size(), + layer_metadata.file_size, &self.cancel, ) .measure_remote_op( @@ -1768,7 +1768,7 @@ impl RemoteTimelineClient { UploadOp::UploadLayer(_, m) => ( RemoteOpFileKind::Layer, RemoteOpKind::Upload, - RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()), + RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), ), UploadOp::UploadMetadata(_, _) => ( RemoteOpFileKind::Index, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 70c5cae05e..bd75f980e8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -84,7 +84,7 @@ pub async fn download_layer_file<'a>( ) .await?; - let expected = layer_metadata.file_size(); + let expected = layer_metadata.file_size; if expected != bytes_amount { return Err(DownloadError::Other(anyhow!( "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}", diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 032dda7ff3..f5d939c747 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -17,46 +17,6 @@ use pageserver_api::shard::ShardIndex; use utils::lsn::Lsn; -/// Metadata gathered for each of the layer files. -/// -/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which -/// might have less or more metadata depending if upgrading or rolling back an upgrade. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -//#[cfg_attr(test, derive(Default))] -pub struct LayerFileMetadata { - file_size: u64, - - pub(crate) generation: Generation, - - pub(crate) shard: ShardIndex, -} - -impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { - fn from(other: &IndexLayerMetadata) -> Self { - LayerFileMetadata { - file_size: other.file_size, - generation: other.generation, - shard: other.shard, - } - } -} - -impl LayerFileMetadata { - pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self { - LayerFileMetadata { - file_size, - generation, - shard, - } - } - - pub fn file_size(&self) -> u64 { - self.file_size - } -} - -// TODO seems like another part of the remote storage file format -// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 /// In-memory representation of an `index_part.json` file /// /// Contains the data about all files in the timeline, present remotely and its metadata. @@ -77,7 +37,7 @@ pub struct IndexPart { /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated for convenience when reading the serialized structure, but is @@ -127,10 +87,7 @@ impl IndexPart { lineage: Lineage, last_aux_file_policy: Option, ) -> Self { - let layer_metadata = layers_and_metadata - .iter() - .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v))) - .collect(); + let layer_metadata = layers_and_metadata.clone(); Self { version: Self::LATEST_VERSION, @@ -194,9 +151,12 @@ impl From<&UploadQueueInitialized> for IndexPart { } } -/// Serialized form of [`LayerFileMetadata`]. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct IndexLayerMetadata { +/// Metadata gathered for each of the layer files. +/// +/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which +/// might have less or more metadata depending if upgrading or rolling back an upgrade. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct LayerFileMetadata { pub file_size: u64, #[serde(default = "Generation::none")] @@ -208,12 +168,12 @@ pub struct IndexLayerMetadata { pub shard: ShardIndex, } -impl From<&LayerFileMetadata> for IndexLayerMetadata { - fn from(other: &LayerFileMetadata) -> Self { - IndexLayerMetadata { - file_size: other.file_size, - generation: other.generation, - shard: other.shard, +impl LayerFileMetadata { + pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self { + LayerFileMetadata { + file_size, + generation, + shard, } } } @@ -307,12 +267,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, @@ -349,12 +309,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, @@ -392,12 +352,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 2, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, @@ -480,12 +440,12 @@ mod tests { let expected = IndexPart { version: 4, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, @@ -522,12 +482,12 @@ mod tests { let expected = IndexPart { version: 5, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata { file_size: 23289856, generation: Generation::new(1), shard: ShardIndex::unsharded(), }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata { file_size: 1015808, generation: Generation::new(1), shard: ShardIndex::unsharded(), @@ -569,12 +529,12 @@ mod tests { let expected = IndexPart { version: 6, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 870475eb57..0ec1bd649b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -45,10 +45,10 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; -use futures::{Future, StreamExt}; +use futures::Future; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -67,12 +67,6 @@ use super::{ /// download, if the uploader populated it. const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000); -/// Range of concurrency we may use when downloading layers within a timeline. This is independent -/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in -/// `PageServerConf::secondary_download_concurrency` -const MAX_LAYER_CONCURRENCY: usize = 16; -const MIN_LAYER_CONCURRENCY: usize = 1; - pub(super) async fn downloader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, @@ -81,19 +75,18 @@ pub(super) async fn downloader_task( cancel: CancellationToken, root_ctx: RequestContext, ) { - // How many tenants' secondary download operations we will run concurrently - let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency; + let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, root_ctx, }; - let mut scheduler = Scheduler::new(generator, tenant_concurrency); + let mut scheduler = Scheduler::new(generator, concurrency); scheduler .run(command_queue, background_jobs_can_start, cancel) - .instrument(info_span!("secondary_downloads")) + .instrument(info_span!("secondary_download_scheduler")) .await } @@ -414,7 +407,7 @@ impl JobGenerator { - tracing::debug!("Shut down while downloading"); + tracing::info!("Shut down while downloading"); }, Err(UpdateError::Deserialize(e)) => { tracing::error!("Corrupt content while downloading tenant: {e}"); @@ -716,7 +709,7 @@ impl<'a> TenantDownloader<'a> { let mut layer_byte_count: u64 = timeline_state .on_disk_layers .values() - .map(|l| l.metadata.file_size()) + .map(|l| l.metadata.file_size) .sum(); // Remove on-disk layers that are no longer present in heatmap @@ -727,7 +720,7 @@ impl<'a> TenantDownloader<'a> { .get(layer_file_name) .unwrap() .metadata - .file_size(); + .file_size; let local_path = local_layer_path( self.conf, @@ -848,8 +841,6 @@ impl<'a> TenantDownloader<'a> { tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); - let mut download_futs = Vec::new(); - // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. for layer in timeline.layers { @@ -886,9 +877,7 @@ impl<'a> TenantDownloader<'a> { } } - if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) - || on_disk.access_time != layer.access_time - { + if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { // We already have this layer on disk. Update its access time. tracing::debug!( "Access time updated for layer {}: {} -> {}", @@ -924,31 +913,14 @@ impl<'a> TenantDownloader<'a> { } } - download_futs.push(self.download_layer( - tenant_shard_id, - &timeline.timeline_id, - layer, - ctx, - )); - } - - // Break up layer downloads into chunks, so that for each chunk we can re-check how much - // concurrency to use based on activity level of remote storage. - while !download_futs.is_empty() { - let chunk = - download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY)); - - let concurrency = Self::layer_concurrency(self.remote_storage.activity()); - - let mut result_stream = futures::stream::iter(chunk).buffered(concurrency); - let mut result_stream = std::pin::pin!(result_stream); - while let Some(result) = result_stream.next().await { - match result { - Err(e) => return Err(e), - Ok(None) => { - // No error, but we didn't download the layer. Don't mark it touched - } - Ok(Some(layer)) => touched.push(layer), + match self + .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) + .await? + { + Some(layer) => touched.push(layer), + None => { + // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of + // things to consider touched. } } } @@ -979,7 +951,7 @@ impl<'a> TenantDownloader<'a> { tenant_shard_id, &timeline.timeline_id, t.name, - LayerFileMetadata::from(&t.metadata), + t.metadata.clone(), t.access_time, local_path, )); @@ -1013,13 +985,18 @@ impl<'a> TenantDownloader<'a> { ); // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + tracing::info!( + "Starting download of layer {}, size {}", + layer.name, + layer.metadata.file_size + ); let downloaded_bytes = match download_layer_file( self.conf, self.remote_storage, *tenant_shard_id, *timeline_id, &layer.name, - &LayerFileMetadata::from(&layer.metadata), + &layer.metadata, &local_path, &self.secondary_state.cancel, ctx, @@ -1078,19 +1055,6 @@ impl<'a> TenantDownloader<'a> { Ok(Some(layer)) } - - /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage - fn layer_concurrency(activity: RemoteStorageActivity) -> usize { - // When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping - // of our concurrency range to the units available within the remaining 25%. - let clamp_at = (activity.read_total * 3) / 4; - if activity.read_available > clamp_at { - (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at)) - / (activity.read_total - clamp_at) - } else { - MIN_LAYER_CONCURRENCY - } - } } /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline @@ -1180,7 +1144,7 @@ async fn init_timeline_state( tenant_shard_id, &heatmap.timeline_id, name, - LayerFileMetadata::from(&remote_meta.metadata), + remote_meta.metadata.clone(), remote_meta.access_time, file_path, ), @@ -1214,58 +1178,3 @@ async fn init_timeline_state( detail } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn layer_concurrency() { - // Totally idle - assert_eq!( - TenantDownloader::layer_concurrency(RemoteStorageActivity { - read_available: 16, - read_total: 16, - write_available: 16, - write_total: 16 - }), - MAX_LAYER_CONCURRENCY - ); - - // Totally busy - assert_eq!( - TenantDownloader::layer_concurrency(RemoteStorageActivity { - read_available: 0, - read_total: 16, - - write_available: 16, - write_total: 16 - }), - MIN_LAYER_CONCURRENCY - ); - - // Edge of the range at which we interpolate - assert_eq!( - TenantDownloader::layer_concurrency(RemoteStorageActivity { - read_available: 12, - read_total: 16, - - write_available: 16, - write_total: 16 - }), - MIN_LAYER_CONCURRENCY - ); - - // Midpoint of the range in which we interpolate - assert_eq!( - TenantDownloader::layer_concurrency(RemoteStorageActivity { - read_available: 14, - read_total: 16, - - write_available: 16, - write_total: 16 - }), - MAX_LAYER_CONCURRENCY / 2 - ); - } -} diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 2da4a3b9d5..166483ba5d 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,6 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName}; +use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline { #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { pub(super) name: LayerName, - pub(super) metadata: IndexLayerMetadata, + pub(super) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] pub(super) access_time: SystemTime, @@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( name: LayerName, - metadata: IndexLayerMetadata, + metadata: LayerFileMetadata, access_time: SystemTime, ) -> Self { Self { diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index fddced3ead..9c7a9c4234 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task( scheduler .run(command_queue, background_jobs_can_start, cancel) - .instrument(info_span!("heatmap_uploader")) + .instrument(info_span!("heatmap_upload_scheduler")) .await } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 3d042f4513..0ec1c7872a 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -179,6 +179,13 @@ where // Schedule some work, if concurrency limit permits it self.spawn_pending(); + // This message is printed every scheduling iteration as proof of liveness when looking at logs + tracing::info!( + "Status: {} tasks running, {} pending", + self.running.len(), + self.pending.len() + ); + // Between scheduling iterations, we will: // - Drain any complete tasks and spawn pending tasks // - Handle incoming administrative commands @@ -258,7 +265,11 @@ where self.tasks.spawn(fut); - self.running.insert(tenant_shard_id, in_progress); + let replaced = self.running.insert(tenant_shard_id, in_progress); + debug_assert!(replaced.is_none()); + if replaced.is_some() { + tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running") + } } /// For all pending tenants that are elegible for execution, spawn their task. @@ -268,7 +279,9 @@ where while !self.pending.is_empty() && self.running.len() < self.concurrency { // unwrap: loop condition includes !is_empty() let pending = self.pending.pop_front().unwrap(); - self.do_spawn(pending); + if !self.running.contains_key(pending.get_tenant_shard_id()) { + self.do_spawn(pending); + } } } @@ -321,7 +334,8 @@ where let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { - tracing::info!("Command already running, waiting for it"); + tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Command already running, waiting for it"); barrier } else { let running = self.spawn_now(job); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index becd1e7a6d..8394b33f19 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -47,7 +47,7 @@ use hex; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; -use pageserver_api::shard::TenantShardId; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::File; @@ -473,7 +473,7 @@ impl ImageLayerInner { ctx: &RequestContext, ) -> Result<(), GetVectoredError> { let reads = self - .plan_reads(keyspace, ctx) + .plan_reads(keyspace, None, ctx) .await .map_err(GetVectoredError::Other)?; @@ -485,9 +485,15 @@ impl ImageLayerInner { Ok(()) } + /// Traverse the layer's index to build read operations on the overlap of the input keyspace + /// and the keys in this layer. + /// + /// If shard_identity is provided, it will be used to filter keys down to those stored on + /// this shard. async fn plan_reads( &self, keyspace: KeySpace, + shard_identity: Option<&ShardIdentity>, ctx: &RequestContext, ) -> anyhow::Result> { let mut planner = VectoredReadPlanner::new( @@ -507,7 +513,6 @@ impl ImageLayerInner { for range in keyspace.ranges.iter() { let mut range_end_handled = false; - let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); @@ -520,12 +525,22 @@ impl ImageLayerInner { let key = Key::from_slice(&raw_key[..KEY_SIZE]); assert!(key >= range.start); + let flag = if let Some(shard_identity) = shard_identity { + if shard_identity.is_key_disposable(&key) { + BlobFlag::Ignore + } else { + BlobFlag::None + } + } else { + BlobFlag::None + }; + if key >= range.end { planner.handle_range_end(offset); range_end_handled = true; break; } else { - planner.handle(key, self.lsn, offset, BlobFlag::None); + planner.handle(key, self.lsn, offset, flag); } } @@ -538,6 +553,50 @@ impl ImageLayerInner { Ok(planner.finish()) } + /// Given a key range, select the parts of that range that should be retained by the ShardIdentity, + /// then execute vectored GET operations, passing the results of all read keys into the writer. + pub(super) async fn filter( + &self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + // Fragment the range into the regions owned by this ShardIdentity + let plan = self + .plan_reads( + KeySpace { + // If asked for the total key space, plan_reads will give us all the keys in the layer + ranges: vec![Key::MIN..Key::MAX], + }, + Some(shard_identity), + ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut key_count = 0; + for read in plan.into_iter() { + let buf_size = read.size(); + + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; + + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + + key_count += 1; + writer + .put_image(meta.meta.key, img_buf, ctx) + .await + .context(format!("Storing key {}", meta.meta.key))?; + } + } + + Ok(key_count) + } + async fn do_reads_and_update_state( &self, reads: Vec, @@ -650,7 +709,7 @@ impl ImageLayerWriterInner { lsn, }, ); - info!("new image layer {path}"); + trace!("creating image layer {}", path); let mut file = { VirtualFile::open_with_options( &path, @@ -770,7 +829,7 @@ impl ImageLayerWriterInner { // FIXME: why not carry the virtualfile here, it supports renaming? let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; - trace!("created image layer {}", layer.local_path()); + info!("created image layer {}", layer.local_path()); Ok(layer) } @@ -855,3 +914,136 @@ impl Drop for ImageLayerWriter { } } } + +#[cfg(test)] +mod test { + use bytes::Bytes; + use pageserver_api::{ + key::Key, + shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, + }; + use utils::{id::TimelineId, lsn::Lsn}; + + use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + + use super::ImageLayerWriter; + + #[tokio::test] + async fn image_layer_rewrite() { + let harness = TenantHarness::create("test_image_layer_rewrite").unwrap(); + let (tenant, ctx) = harness.load().await; + + // The LSN at which we will create an image layer to filter + let lsn = Lsn(0xdeadbeef0000); + + let timeline_id = TimelineId::generate(); + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let range = input_start..input_end; + + // Build an image layer to filter + let resident = { + let mut writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + let foo_img = Bytes::from_static(&[1, 2, 3, 4]); + let mut key = range.start; + while key < range.end { + writer.put_image(key, foo_img.clone(), &ctx).await.unwrap(); + + key = key.next(); + } + writer.finish(&timeline, &ctx).await.unwrap() + }; + let original_size = resident.metadata().file_size; + + // Filter for various shards: this exercises cases like values at start of key range, end of key + // range, middle of key range. + for shard_number in 0..4 { + let mut filtered_writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity + // to exercise filter() + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardStripeSize(0x8000), + ) + .unwrap(); + + let wrote_keys = resident + .filter(&shard_identity, &mut filtered_writer, &ctx) + .await + .unwrap(); + let replacement = if wrote_keys > 0 { + Some(filtered_writer.finish(&timeline, &ctx).await.unwrap()) + } else { + None + }; + + // This exact size and those below will need updating as/when the layer encoding changes, but + // should be deterministic for a given version of the format, as we used no randomness generating the input. + assert_eq!(original_size, 1597440); + + match shard_number { + 0 => { + // We should have written out just one stripe for our shard identity + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + + // We should have dropped some of the data + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + + // Assert that we dropped ~3/4 of the data. + assert_eq!(replacement.metadata().file_size, 417792); + } + 1 => { + // Shard 1 has no keys in our input range + assert_eq!(wrote_keys, 0x0); + assert!(replacement.is_none()); + } + 2 => { + // Shard 2 has one stripes in the input range + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + assert_eq!(replacement.metadata().file_size, 417792); + } + 3 => { + // Shard 3 has two stripes in the input range + assert_eq!(wrote_keys, 0x10000); + let replacement = replacement.unwrap(); + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + assert_eq!(replacement.metadata().file_size, 811008); + } + _ => unreachable!(), + } + } + } +} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8c64621710..3ac799c69a 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; -use pageserver_api::shard::{ShardIndex, TenantShardId}; +use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; @@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; -use super::image_layer; +use super::image_layer::{self}; use super::{ - AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc, - ValueReconstructResult, ValueReconstructState, ValuesReconstructState, + AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; use utils::generation::Generation; @@ -161,7 +161,7 @@ impl Layer { timeline.tenant_shard_id, timeline.timeline_id, file_name, - metadata.file_size(), + metadata.file_size, ); let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted); @@ -194,7 +194,7 @@ impl Layer { timeline.tenant_shard_id, timeline.timeline_id, file_name, - metadata.file_size(), + metadata.file_size, ); let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident); @@ -227,7 +227,7 @@ impl Layer { timeline .metrics - .resident_physical_size_add(metadata.file_size()); + .resident_physical_size_add(metadata.file_size); ResidentLayer { downloaded, owner } } @@ -1802,16 +1802,15 @@ impl ResidentLayer { use LayerKind::*; let owner = &self.owner.0; - match self.downloaded.get(owner, ctx).await? { Delta(ref d) => { + // this is valid because the DownloadedLayer::kind is a OnceCell, not a + // Mutex, so we cannot go and deinitialize the value with OnceCell::take + // while it's being held. owner .access_stats .record_access(LayerAccessKind::KeyIter, ctx); - // this is valid because the DownloadedLayer::kind is a OnceCell, not a - // Mutex, so we cannot go and deinitialize the value with OnceCell::take - // while it's being held. delta_layer::DeltaLayerInner::load_keys(d, ctx) .await .with_context(|| format!("Layer index is corrupted for {self}")) @@ -1820,6 +1819,23 @@ impl ResidentLayer { } } + /// Read all they keys in this layer which match the ShardIdentity, and write them all to + /// the provided writer. Return the number of keys written. + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] + pub(crate) async fn filter<'a>( + &'a self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + use LayerKind::*; + + match self.downloaded.get(&self.owner.0, ctx).await? { + Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")), + Image(i) => i.filter(shard_identity, writer, ctx).await, + } + } + /// Returns the amount of keys and values written to the writer. pub(crate) async fn copy_delta_prefix( &self, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 881e7f8f3c..1bdbddd95f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -41,6 +41,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ bin_ser::BeSer, + fs_ext, sync::gate::{Gate, GateGuard}, vec_map::VecMap, }; @@ -60,6 +61,7 @@ use std::{ ops::ControlFlow, }; +use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ @@ -88,9 +90,6 @@ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{ - pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata, -}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -1424,7 +1423,7 @@ impl Timeline { let layer_map = guard.layer_map(); let mut size = 0; for l in layer_map.iter_historic_layers() { - size += l.file_size(); + size += l.file_size; } size } @@ -2454,8 +2453,6 @@ impl Timeline { let span = tracing::Span::current(); // Copy to move into the task we're about to spawn - let generation = self.generation; - let shard = self.get_shard_index(); let this = self.myself.upgrade().expect("&self method holds the arc"); let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({ @@ -2469,11 +2466,14 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(layer_file_name, local_path, file_size) => { - discovered_layers.push((layer_file_name, local_path, file_size)); + Discovered::Layer(layer_file_name, local_metadata) => { + discovered_layers.push((layer_file_name, local_metadata)); continue; } - Discovered::IgnoredBackup => { + Discovered::IgnoredBackup(path) => { + std::fs::remove_file(path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Removing .old file"); continue; } Discovered::Unknown(file_name) => { @@ -2499,13 +2499,8 @@ impl Timeline { ); } - let decided = init::reconcile( - discovered_layers, - index_part.as_ref(), - disk_consistent_lsn, - generation, - shard, - ); + let decided = + init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn); let mut loaded_layers = Vec::new(); let mut needs_cleanup = Vec::new(); @@ -2513,21 +2508,6 @@ impl Timeline { for (name, decision) in decided { let decision = match decision { - Ok(UseRemote { local, remote }) => { - // Remote is authoritative, but we may still choose to retain - // the local file if the contents appear to match - if local.metadata.file_size() == remote.file_size() { - // Use the local file, but take the remote metadata so that we pick up - // the correct generation. - UseLocal(LocalLayerFileMetadata { - metadata: remote, - local_path: local.local_path, - }) - } else { - init::cleanup_local_file_for_remote(&local, &remote)?; - UseRemote { local, remote } - } - } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { if let Some(local) = local { @@ -2545,6 +2525,11 @@ impl Timeline { // this file never existed remotely, we will have to do rework continue; } + Err(DismissedLayer::BadMetadata(local)) => { + init::cleanup_local_file_for_remote(&local)?; + // this file never existed remotely, we will have to do rework + continue; + } }; match &name { @@ -2555,14 +2540,12 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(local) => { - total_physical_size += local.metadata.file_size(); - Layer::for_resident(conf, &this, local.local_path, name, local.metadata) + Resident { local, remote } => { + total_physical_size += local.file_size; + Layer::for_resident(conf, &this, local.local_path, name, remote) .drop_eviction_guard() } - Evicted(remote) | UseRemote { remote, .. } => { - Layer::for_evicted(conf, &this, name, remote) - } + Evicted(remote) => Layer::for_evicted(conf, &this, name, remote), }; loaded_layers.push(layer); @@ -3071,7 +3054,7 @@ impl Timeline { HeatMapLayer::new( layer.layer_desc().layer_name(), - (&layer.metadata()).into(), + layer.metadata(), last_activity_ts, ) }); @@ -4347,7 +4330,7 @@ impl Timeline { let delta_file_accessed = reconstruct_state.get_delta_layers_visited(); let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; - info!( + debug!( "generate image layers for metadata keys: trigger_generation={trigger_generation}, \ delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \ total_key_retrieved={total_key_retrieved}" @@ -4725,11 +4708,16 @@ impl Timeline { async fn rewrite_layers( self: &Arc, - replace_layers: Vec<(Layer, ResidentLayer)>, - drop_layers: Vec, + mut replace_layers: Vec<(Layer, ResidentLayer)>, + mut drop_layers: Vec, ) -> anyhow::Result<()> { let mut guard = self.layers.write().await; + // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want + // to avoid double-removing, and avoid rewriting something that was removed. + replace_layers.retain(|(l, _)| guard.contains(l)); + drop_layers.retain(|l| guard.contains(l)); + guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); @@ -5604,26 +5592,6 @@ fn is_send() { _assert_send::>(); } -/// Add a suffix to a layer file's name: .{num}.old -/// Uses the first available num (starts at 0) -fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> { - let filename = path - .file_name() - .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?; - let mut new_path = path.to_owned(); - - for i in 0u32.. { - new_path.set_file_name(format!("{filename}.{i}.old")); - if !new_path.exists() { - std::fs::rename(path, &new_path) - .with_context(|| format!("rename {path:?} to {new_path:?}"))?; - return Ok(()); - } - } - - bail!("couldn't find an unused backup number for {:?}", path) -} - #[cfg(test)] mod tests { use utils::{id::TimelineId, lsn::Lsn}; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 2eff469591..07a12f535a 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,7 +9,10 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use super::layer_manager::LayerManager; -use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline}; +use super::{ + CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, + RecordedDuration, Timeline, +}; use anyhow::{anyhow, Context}; use enumset::EnumSet; @@ -22,14 +25,13 @@ use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; +use crate::page_cache; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole}; +use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; -use crate::tenant::PageReconstructError; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use crate::{page_cache, ZERO_PAGE}; use crate::keyspace::KeySpace; use crate::repository::Key; @@ -174,13 +176,24 @@ impl Timeline { async fn compact_shard_ancestors( self: &Arc, rewrite_max: usize, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut drop_layers = Vec::new(); - let layers_to_rewrite: Vec = Vec::new(); + let mut layers_to_rewrite: Vec = Vec::new(); - // We will use the PITR cutoff as a condition for rewriting layers. - let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr; + // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a + // layer is behind this Lsn, it indicates that the layer is being retained beyond the + // pitr_interval, for example because a branchpoint references it. + // + // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we + // are rewriting layers. + let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn(); + + tracing::info!( + "latest_gc_cutoff: {}, pitr cutoff {}", + *latest_gc_cutoff, + self.gc_info.read().unwrap().cutoffs.pitr + ); let layers = self.layers.read().await; for layer_desc in layers.layer_map().iter_historic_layers() { @@ -239,9 +252,9 @@ impl Timeline { // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually // without incurring the I/O cost of a rewrite. - if layer_desc.get_lsn_range().end >= pitr_cutoff { - debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})", - layer_desc.get_lsn_range().end, pitr_cutoff); + if layer_desc.get_lsn_range().end >= *latest_gc_cutoff { + debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})", + layer_desc.get_lsn_range().end, *latest_gc_cutoff); continue; } @@ -251,13 +264,10 @@ impl Timeline { continue; } - // Only rewrite layers if they would have different remote paths: either they belong to this - // shard but an old generation, or they belonged to another shard. This also implicitly - // guarantees that the layer is persistent in remote storage (as only remote persistent - // layers are carried across shard splits, any local-only layer would be in the current generation) - if layer.metadata().generation == self.generation - && layer.metadata().shard.shard_count == self.shard_identity.count - { + // Only rewrite layers if their generations differ. This guarantees: + // - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one + // - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage + if layer.metadata().generation == self.generation { debug!(%layer, "Skipping rewrite, is not from old generation"); continue; } @@ -270,18 +280,69 @@ impl Timeline { } // Fall through: all our conditions for doing a rewrite passed. - // TODO: implement rewriting - tracing::debug!(%layer, "Would rewrite layer"); + layers_to_rewrite.push(layer); } - // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`] + // Drop read lock on layer map before we start doing time-consuming I/O drop(layers); - // TODO: collect layers to rewrite - let replace_layers = Vec::new(); + let mut replace_image_layers = Vec::new(); + + for layer in layers_to_rewrite { + tracing::info!(layer=%layer, "Rewriting layer after shard split..."); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &layer.layer_desc().key_range, + layer.layer_desc().image_layer_lsn(), + ctx, + ) + .await?; + + // Safety of layer rewrites: + // - We are writing to a different local file path than we are reading from, so the old Layer + // cannot interfere with the new one. + // - In the page cache, contents for a particular VirtualFile are stored with a file_id that + // is different for two layers with the same name (in `ImageLayerInner::new` we always + // acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk + // reading the index from one layer file, and then data blocks from the rewritten layer file. + // - Any readers that have a reference to the old layer will keep it alive until they are done + // with it. If they are trying to promote from remote storage, that will fail, but this is the same + // as for compaction generally: compaction is allowed to delete layers that readers might be trying to use. + // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: + // - GC, which at worst witnesses us "undelete" a layer that they just deleted. + // - ingestion, which only inserts layers, therefore cannot collide with us. + let resident = layer.download_and_keep_resident().await?; + + let keys_written = resident + .filter(&self.shard_identity, &mut image_layer_writer, ctx) + .await?; + + if keys_written > 0 { + let new_layer = image_layer_writer.finish(self, ctx).await?; + tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", + layer.metadata().file_size, + new_layer.metadata().file_size); + + replace_image_layers.push((layer, new_layer)); + } else { + // Drop the old layer. Usually for this case we would already have noticed that + // the layer has no data for us with the ShardedRange check above, but + drop_layers.push(layer); + } + } + + // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded + // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch + // to remote index) and be removed. This is inefficient but safe. + fail::fail_point!("compact-shard-ancestors-localonly"); // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage - self.rewrite_layers(replace_layers, drop_layers).await?; + self.rewrite_layers(replace_image_layers, drop_layers) + .await?; + + fail::fail_point!("compact-shard-ancestors-enqueued"); // We wait for all uploads to complete before finishing this compaction stage. This is not // necessary for correctness, but it simplifies testing, and avoids proceeding with another @@ -289,6 +350,8 @@ impl Timeline { // load. self.remote_client.wait_completion().await?; + fail::fail_point!("compact-shard-ancestors-persistent"); + Ok(()) } @@ -1150,10 +1213,10 @@ impl TimelineAdaptor { lsn: Lsn, key_range: &Range, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), CreateImageLayersError> { let timer = self.timeline.metrics.create_images_time_histo.start_timer(); - let mut image_layer_writer = ImageLayerWriter::new( + let image_layer_writer = ImageLayerWriter::new( self.timeline.conf, self.timeline.timeline_id, self.timeline.tenant_shard_id, @@ -1164,47 +1227,34 @@ impl TimelineAdaptor { .await?; fail_point!("image-layer-writer-fail-before-finish", |_| { - Err(PageReconstructError::Other(anyhow::anyhow!( + Err(CreateImageLayersError::Other(anyhow::anyhow!( "failpoint image-layer-writer-fail-before-finish" ))) }); - let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?; - for range in &keyspace_ranges { - let mut key = range.start; - while key < range.end { - let img = match self.timeline.get(key, lsn, ctx).await { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. That seems better - // than failing repeatedly and getting stuck. - // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. - // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. - if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { - warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err(err); - } - } - }; - image_layer_writer.put_image(key, img, ctx).await?; - key = key.next(); - } - } - let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?; - self.new_images.push(image_layer); + let keyspace = KeySpace { + ranges: self.get_keyspace(key_range, lsn, ctx).await?, + }; + // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly + let start = Key::MIN; + let ImageLayerCreationOutcome { + image, + next_start_key: _, + } = self + .timeline + .create_image_layer_for_rel_blocks( + &keyspace, + image_layer_writer, + lsn, + ctx, + key_range.clone(), + start, + ) + .await?; + + if let Some(image_layer) = image { + self.new_images.push(image_layer); + } timer.stop_and_record(); diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index feadc79e5e..5bc67c7133 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -7,19 +7,20 @@ use crate::{ index::{IndexPart, LayerFileMetadata}, }, storage_layer::LayerName, - Generation, }, }; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::shard::ShardIndex; -use std::{collections::HashMap, str::FromStr}; +use std::{ + collections::{hash_map, HashMap}, + str::FromStr, +}; use utils::lsn::Lsn; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about - Layer(LayerName, Utf8PathBuf, u64), + Layer(LayerName, LocalLayerFileMetadata), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -27,7 +28,7 @@ pub(super) enum Discovered { /// Temporary on-demand download files, should be removed TemporaryDownload(String), /// Backup file from previously future layers - IgnoredBackup, + IgnoredBackup(Utf8PathBuf), /// Unrecognized, warn about these Unknown(String), } @@ -43,12 +44,15 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); - Discovered::Layer(file_name, direntry.path().to_owned(), file_size) + Discovered::Layer( + file_name, + LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size), + ) } Err(_) => { if file_name.ends_with(".old") { // ignore these - Discovered::IgnoredBackup + Discovered::IgnoredBackup(direntry.path().to_owned()) } else if remote_timeline_client::is_temp_download_file(direntry.path()) { Discovered::TemporaryDownload(file_name) } else if is_ephemeral_file(&file_name) { @@ -71,37 +75,32 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self { Self { local_path, - metadata: LayerFileMetadata::new(file_size, generation, shard), + file_size, } } } -/// Decision on what to do with a layer file after considering its local and remote metadata. +/// For a layer that is present in remote metadata, this type describes how to handle +/// it during startup: it is either Resident (and we have some metadata about a local file), +/// or it is Evicted (and we only have remote metadata). #[derive(Clone, Debug)] pub(super) enum Decision { /// The layer is not present locally. Evicted(LayerFileMetadata), - /// The layer is present locally, but local metadata does not match remote; we must - /// delete it and treat it as evicted. - UseRemote { + /// The layer is present locally, and metadata matches: we may hook up this layer to the + /// existing file in local storage. + Resident { local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, - /// The layer is present locally, and metadata matches. - UseLocal(LocalLayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -117,77 +116,81 @@ pub(super) enum DismissedLayer { /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. LocalOnly(LocalLayerFileMetadata), + + /// The layer exists in remote storage but the local layer's metadata (e.g. file size) + /// does not match it + BadMetadata(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. pub(super) fn reconcile( - discovered: Vec<(LayerName, Utf8PathBuf, u64)>, + local_layers: Vec<(LayerName, LocalLayerFileMetadata)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, - generation: Generation, - shard: ShardIndex, ) -> Vec<(LayerName, Result)> { - use Decision::*; + let Some(index_part) = index_part else { + // If we have no remote metadata, no local layer files are considered valid to load + return local_layers + .into_iter() + .map(|(layer_name, local_metadata)| { + (layer_name, Err(DismissedLayer::LocalOnly(local_metadata))) + }) + .collect(); + }; - // name => (local_metadata, remote_metadata) - type Collected = - HashMap, Option)>; + let mut result = Vec::new(); - let mut discovered = discovered - .into_iter() - .map(|(layer_name, local_path, file_size)| { - ( - layer_name, - // The generation and shard here will be corrected to match IndexPart in the merge below, unless - // it is not in IndexPart, in which case using our current generation makes sense - // because it will be uploaded in this generation. - ( - Some(LocalLayerFileMetadata::new( - local_path, file_size, generation, shard, - )), - None, - ), - ) - }) - .collect::(); + let mut remote_layers = HashMap::new(); - // merge any index_part information, when available + // Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise + // construct DismissedLayers to get rid of them. + for (layer_name, local_metadata) in local_layers { + let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else { + result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))); + continue; + }; + + if remote_metadata.file_size != local_metadata.file_size { + result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata)))); + continue; + } + + remote_layers.insert( + layer_name, + Decision::Resident { + local: local_metadata, + remote: remote_metadata.clone(), + }, + ); + } + + // Construct Decision for layers that were not found locally index_part - .as_ref() - .map(|ip| ip.layer_metadata.iter()) - .into_iter() - .flatten() - .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata))) + .layer_metadata + .iter() .for_each(|(name, metadata)| { - if let Some(existing) = discovered.get_mut(name) { - existing.1 = Some(metadata); - } else { - discovered.insert(name.to_owned(), (None, Some(metadata))); + if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) { + entry.insert(Decision::Evicted(metadata.clone())); } }); - discovered - .into_iter() - .map(|(name, (local, remote))| { - let decision = if name.is_in_future(disk_consistent_lsn) { - Err(DismissedLayer::Future { local }) - } else { - match (local, remote) { - (Some(local), Some(remote)) if local.metadata != remote => { - Ok(UseRemote { local, remote }) - } - (Some(x), Some(_)) => Ok(UseLocal(x)), - (None, Some(x)) => Ok(Evicted(x)), - (Some(x), None) => Err(DismissedLayer::LocalOnly(x)), - (None, None) => { - unreachable!("there must not be any non-local non-remote files") - } - } - }; + // For layers that were found in authoritative remote metadata, apply a final check that they are within + // the disk_consistent_lsn. + result.extend(remote_layers.into_iter().map(|(name, decision)| { + if name.is_in_future(disk_consistent_lsn) { + match decision { + Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })), + Decision::Resident { + local, + remote: _remote, + } => (name, Err(DismissedLayer::Future { local: Some(local) })), + } + } else { + (name, Ok(decision)) + } + })); - (name, decision) - }) - .collect::>() + result } pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { @@ -196,25 +199,15 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}")) } -pub(super) fn cleanup_local_file_for_remote( - local: &LocalLayerFileMetadata, - remote: &LayerFileMetadata, -) -> anyhow::Result<()> { - let local_size = local.metadata.file_size(); - let remote_size = remote.file_size(); +pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> { + let local_size = local.file_size; let path = &local.local_path; - let file_name = path.file_name().expect("must be file path"); - tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); - if let Err(err) = crate::tenant::timeline::rename_to_backup(path) { - assert!( - path.exists(), - "we would leave the local_layer without a file if this does not hold: {path}", - ); - Err(err) - } else { - Ok(()) - } + tracing::warn!( + "removing local file {file_name:?} because it has unexpected length {local_size};" + ); + + std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}")) } pub(super) fn cleanup_future_layer( @@ -236,8 +229,8 @@ pub(super) fn cleanup_local_only_file( ) -> anyhow::Result<()> { let kind = name.kind(); tracing::info!( - "found local-only {kind} layer {name}, metadata {:?}", - local.metadata + "found local-only {kind} layer {name} size {}", + local.file_size ); std::fs::remove_file(&local.local_path)?; Ok(()) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 248420e632..884b71df75 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -212,13 +212,34 @@ impl LayerManager { &mut self, rewrite_layers: &[(Layer, ResidentLayer)], drop_layers: &[Layer], - _metrics: &TimelineMetrics, + metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); + for (old_layer, new_layer) in rewrite_layers { + debug_assert_eq!( + old_layer.layer_desc().key_range, + new_layer.layer_desc().key_range + ); + debug_assert_eq!( + old_layer.layer_desc().lsn_range, + new_layer.layer_desc().lsn_range + ); - // TODO: implement rewrites (currently this code path only used for drops) - assert!(rewrite_layers.is_empty()); + // Safety: we may never rewrite the same file in-place. Callers are responsible + // for ensuring that they only rewrite layers after something changes the path, + // such as an increment in the generation number. + assert_ne!(old_layer.local_path(), new_layer.local_path()); + Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr); + + Self::insert_historic_layer( + new_layer.as_ref().clone(), + &mut updates, + &mut self.layer_fmgr, + ); + + metrics.record_new_file_metrics(new_layer.layer_desc().file_size); + } for l in drop_layers { Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index c0cc8f3124..02f87303d1 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -213,10 +213,7 @@ impl UploadQueue { let mut files = HashMap::with_capacity(index_part.layer_metadata.len()); for (layer_name, layer_metadata) in &index_part.layer_metadata { - files.insert( - layer_name.to_owned(), - LayerFileMetadata::from(layer_metadata), - ); + files.insert(layer_name.to_owned(), layer_metadata.clone()); } info!( @@ -322,9 +319,7 @@ impl std::fmt::Display for UploadOp { write!( f, "UploadLayer({}, size={:?}, gen={:?})", - layer, - metadata.file_size(), - metadata.generation + layer, metadata.file_size, metadata.generation ) } UploadOp::UploadMetadata(_, lsn) => { diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f5ce2caff3..a9c8d59c3a 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -51,7 +51,6 @@ int flush_every_n_requests = 8; int neon_protocol_version = 2; -static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; static int stripe_size; @@ -95,18 +94,44 @@ static shmem_startup_hook_type prev_shmem_startup_hook; static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; +typedef enum PSConnectionState { + PS_Disconnected, /* no connection yet */ + PS_Connecting_Startup, /* connection starting up */ + PS_Connecting_PageStream, /* negotiating pagestream */ + PS_Connected, /* connected, pagestream established */ +} PSConnectionState; + /* This backend's per-shard connections */ typedef struct { - PGconn *conn; + TimestampTz last_connect_time; /* read-only debug value */ + TimestampTz last_reconnect_time; + uint32 delay_us; + int n_reconnect_attempts; /*--- - * WaitEventSet containing: - * - WL_SOCKET_READABLE on 'conn' - * - WL_LATCH_SET on MyLatch, and - * - WL_EXIT_ON_PM_DEATH. + * Pageserver connection state, i.e. + * disconnected: conn == NULL, wes == NULL; + * conn_startup: connection initiated, waiting for connection establishing + * conn_ps: PageStream query sent, waiting for confirmation + * connected: PageStream established */ - WaitEventSet *wes; + PSConnectionState state; + PGconn *conn; + /*--- + * WaitEventSet containing: + * - WL_SOCKET_READABLE on 'conn' + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ + WaitEventSet *wes_read; + /*--- + * WaitEventSet containing: + * - WL_SOCKET_WRITABLE on 'conn' + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ + WaitEventSet *wes_write; } PageServer; static PageServer page_servers[MAX_SHARDS]; @@ -303,119 +328,269 @@ get_shard_number(BufferTag *tag) return hash % n_shards; } +static inline void +CLEANUP_AND_DISCONNECT(PageServer *shard) +{ + if (shard->wes_read) + { + FreeWaitEventSet(shard->wes_read); + shard->wes_read = NULL; + } + if (shard->wes_write) + { + FreeWaitEventSet(shard->wes_write); + shard->wes_write = NULL; + } + if (shard->conn) + { + PQfinish(shard->conn); + shard->conn = NULL; + } + + shard->state = PS_Disconnected; +} + +/* + * Connect to a pageserver, or continue to try to connect if we're yet to + * complete the connection (e.g. due to receiving an earlier cancellation + * during connection start). + * Returns true if successfully connected; false if the connection failed. + * + * Throws errors in unrecoverable situations, or when this backend's query + * is canceled. + */ static bool pageserver_connect(shardno_t shard_no, int elevel) { - char *query; - int ret; - const char *keywords[3]; - const char *values[3]; - int n; - PGconn *conn; - WaitEventSet *wes; + PageServer *shard = &page_servers[shard_no]; char connstr[MAX_PAGESERVER_CONNSTRING_SIZE]; - static TimestampTz last_connect_time = 0; - static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; - TimestampTz now; - uint64_t us_since_last_connect; - bool broke_from_loop = false; - - Assert(page_servers[shard_no].conn == NULL); - /* * Get the connection string for this shard. If the shard map has been * updated since we last looked, this will also disconnect any existing * pageserver connections as a side effect. + * Note that connstr is used both during connection start, and when we + * log the successful connection. */ load_shard_map(shard_no, connstr, NULL); - now = GetCurrentTimestamp(); - us_since_last_connect = now - last_connect_time; - if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC) + switch (shard->state) { - pg_usleep(delay_us); - delay_us *= 2; - } - else + case PS_Disconnected: { - delay_us = MIN_RECONNECT_INTERVAL_USEC; - } + const char *keywords[3]; + const char *values[3]; + int n_pgsql_params; + TimestampTz now; + int64 us_since_last_attempt; - /* - * Connect using the connection string we got from the - * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment - * variable was set, use that as the password. - * - * The connection options are parsed in the order they're given, so when - * we set the password before the connection string, the connection string - * can override the password from the env variable. Seems useful, although - * we don't currently use that capability anywhere. - */ - n = 0; - if (neon_auth_token) - { - keywords[n] = "password"; - values[n] = neon_auth_token; - n++; + /* Make sure we start with a clean slate */ + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected"); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); + shard->last_reconnect_time = now; + + /* + * If we did other tasks between reconnect attempts, then we won't + * need to wait as long as a full delay. + */ + if (us_since_last_attempt < shard->delay_us) + { + pg_usleep(shard->delay_us - us_since_last_attempt); + } + + /* update the delay metric */ + shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + + /* + * Connect using the connection string we got from the + * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment + * variable was set, use that as the password. + * + * The connection options are parsed in the order they're given, so when + * we set the password before the connection string, the connection string + * can override the password from the env variable. Seems useful, although + * we don't currently use that capability anywhere. + */ + keywords[0] = "dbname"; + values[0] = connstr; + n_pgsql_params = 1; + + if (neon_auth_token) + { + keywords[1] = "password"; + values[1] = neon_auth_token; + n_pgsql_params++; + } + + keywords[n_pgsql_params] = NULL; + values[n_pgsql_params] = NULL; + + shard->conn = PQconnectStartParams(keywords, values, 1); + if (!shard->conn) + { + neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); + return false; + } + + shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); + + shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, + PQsocket(shard->conn), + NULL, NULL); + + shard->state = PS_Connecting_Startup; + /* fallthrough */ } - keywords[n] = "dbname"; - values[n] = connstr; - n++; - keywords[n] = NULL; - values[n] = NULL; - n++; - conn = PQconnectdbParams(keywords, values, 1); - last_connect_time = GetCurrentTimestamp(); - - if (PQstatus(conn) == CONNECTION_BAD) + case PS_Connecting_Startup: { - char *msg = pchomp(PQerrorMessage(conn)); + char *pagestream_query; + int ps_send_query_ret; + bool connected = false; - PQfinish(conn); + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); - ereport(elevel, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), - errdetail_internal("%s", msg))); - pfree(msg); - return false; - } - switch (neon_protocol_version) - { + do + { + WaitEvent event; + int poll_result = PQconnectPoll(shard->conn); + + switch (poll_result) + { + default: /* unknown/unused states are handled as a failed connection */ + case PGRES_POLLING_FAILED: + { + char *pqerr = PQerrorMessage(shard->conn); + char *msg = NULL; + neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED"); + + if (pqerr) + msg = pchomp(pqerr); + + CLEANUP_AND_DISCONNECT(shard); + + if (msg) + { + neon_shard_log(shard_no, elevel, + "could not connect to pageserver: %s", + msg); + pfree(msg); + } + else + neon_shard_log(shard_no, elevel, + "could not connect to pageserver"); + + return false; + } + case PGRES_POLLING_READING: + /* Sleep until there's something to do */ + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_WRITING: + /* Sleep until there's something to do */ + (void) WaitEventSetWait(shard->wes_write, -1L, &event, 1, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_OK: + neon_shard_log(shard_no, DEBUG5, "POLLING_OK"); + connected = true; + break; + } + } + while (!connected); + + /* No more polling needed; connection succeeded */ + shard->last_connect_time = GetCurrentTimestamp(); + + switch (neon_protocol_version) + { case 2: - query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; case 1: - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); break; default: elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); - } - ret = PQsendQuery(conn, query); - pfree(query); - if (ret != 1) - { - PQfinish(conn); - neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); - return false; - } + } - wes = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL); + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); - PG_TRY(); + CLEANUP_AND_DISCONNECT(shard); + + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query); + pfree(pagestream_query); + if (ps_send_query_ret != 1) + { + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); + return false; + } + + shard->state = PS_Connecting_PageStream; + /* fallthrough */ + } + case PS_Connecting_PageStream: { - while (PQisBusy(conn)) + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream"); + + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + while (PQisBusy(shard->conn)) { WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -423,40 +598,37 @@ pageserver_connect(shardno_t shard_no, int elevel) /* Data available in socket? */ if (event.events & WL_SOCKET_READABLE) { - if (!PQconsumeInput(conn)) + if (!PQconsumeInput(shard->conn)) { - char *msg = pchomp(PQerrorMessage(conn)); - - PQfinish(conn); - FreeWaitEventSet(wes); + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", msg); - /* Returning from inside PG_TRY is bad, so we break/return later */ - broke_from_loop = true; - break; + pfree(msg); + return false; } } } - } - PG_CATCH(); - { - PQfinish(conn); - FreeWaitEventSet(wes); - PG_RE_THROW(); - } - PG_END_TRY(); - if (broke_from_loop) - { - return false; + shard->state = PS_Connected; + /* fallthrough */ } + case PS_Connected: + /* + * We successfully connected. Future connections to this PageServer + * will do fast retries again, with exponential backoff. + */ + shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); - page_servers[shard_no].conn = conn; - page_servers[shard_no].wes = wes; - - return true; + neon_shard_log(shard_no, DEBUG5, "Connection state: Connected"); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); + return true; + default: + neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); + } + /* This shouldn't be hit */ + Assert(false); } /* @@ -476,7 +648,7 @@ retry: WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -502,7 +674,8 @@ retry: /* * Reset prefetch and drop connection to the shard. - * It also drops connection to all other shards involved in prefetch. + * It also drops connection to all other shards involved in prefetch, through + * prefetch_on_ps_disconnect(). */ static void pageserver_disconnect(shardno_t shard_no) @@ -512,9 +685,6 @@ pageserver_disconnect(shardno_t shard_no) * whole prefetch queue, even for other pageservers. It should not * cause big problems, because connection loss is supposed to be a * rare event. - * - * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, - * because prefetch request may be registered before connection is established. */ prefetch_on_ps_disconnect(); @@ -527,37 +697,36 @@ pageserver_disconnect(shardno_t shard_no) static void pageserver_disconnect_shard(shardno_t shard_no) { + PageServer *shard = &page_servers[shard_no]; /* * If anything goes wrong while we were sending a request, it's not clear * what state the connection is in. For example, if we sent the request * but didn't receive a response yet, we might receive the response some * time later after we have already sent a new unrelated request. Close * the connection to avoid getting confused. + * Similarly, even when we're in PS_DISCONNECTED, we may have junk to + * clean up: It is possible that we encountered an error allocating any + * of the wait event sets or the psql connection, or failed when we tried + * to attach wait events to the WaitEventSets. */ - if (page_servers[shard_no].conn) - { - neon_shard_log(shard_no, LOG, "dropping connection to page server due to error"); - PQfinish(page_servers[shard_no].conn); - page_servers[shard_no].conn = NULL; - } - if (page_servers[shard_no].wes != NULL) - { - FreeWaitEventSet(page_servers[shard_no].wes); - page_servers[shard_no].wes = NULL; - } + CLEANUP_AND_DISCONNECT(shard); + + shard->state = PS_Disconnected; } static bool pageserver_send(shardno_t shard_no, NeonRequest *request) { StringInfoData req_buff; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn; /* If the connection was lost for some reason, reconnect */ - if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD) + if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD) { neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection"); pageserver_disconnect(shard_no); + pageserver_conn = NULL; } req_buff = nm_pack_request(request); @@ -571,17 +740,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) * https://github.com/neondatabase/neon/issues/1138 So try to reestablish * connection in case of failure. */ - if (!page_servers[shard_no].conn) + if (shard->state != PS_Connected) { - while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) + while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { HandleMainLoopInterrupts(); - n_reconnect_attempts += 1; + shard->n_reconnect_attempts += 1; } - n_reconnect_attempts = 0; + shard->n_reconnect_attempts = 0; + } else { + Assert(shard->conn != NULL); } - pageserver_conn = page_servers[shard_no].conn; + pageserver_conn = shard->conn; /* * Send request. @@ -590,13 +761,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) * should use async mode and check for interrupts while waiting. In * practice, our requests are small enough to always fit in the output and * TCP buffer. + * + * Note that this also will fail when the connection is in the + * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this + * point, but on the grand scheme of things it's only a small issue. */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); - neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg); + neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg); pfree(msg); pfree(req_buff.data); return false; @@ -611,6 +786,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg); pfree(msg); } + return true; } @@ -619,58 +795,68 @@ pageserver_receive(shardno_t shard_no) { StringInfoData resp_buff; NeonResponse *resp; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; + /* read response */ + int rc; - if (!pageserver_conn) - return NULL; - - PG_TRY(); + if (shard->state != PS_Connected) { - /* read response */ - int rc; + neon_shard_log(shard_no, LOG, + "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x", + shard->state); + return NULL; + } - rc = call_PQgetCopyData(shard_no, &resp_buff.data); - if (rc >= 0) + Assert(pageserver_conn); + + rc = call_PQgetCopyData(shard_no, &resp_buff.data); + if (rc >= 0) + { + /* call_PQgetCopyData handles rc == 0 */ + Assert(rc > 0); + + PG_TRY(); { resp_buff.len = rc; resp_buff.cursor = 0; resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = nm_to_string((NeonMessage *) resp); - - neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); - pfree(msg); - } } - else if (rc == -1) + PG_CATCH(); { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn))); + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response"); pageserver_disconnect(shard_no); - resp = NULL; + PG_RE_THROW(); } - else if (rc == -2) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + PG_END_TRY(); - pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg); - } - else + if (message_level_is_interesting(PageStoreTrace)) { - pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc); + char *msg = nm_to_string((NeonMessage *) resp); + + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); + pfree(msg); } } - PG_CATCH(); + else if (rc == -1) { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception"); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); pageserver_disconnect(shard_no); - PG_RE_THROW(); + resp = NULL; + } + else if (rc == -2) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + } + else + { + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); } - PG_END_TRY(); return (NeonResponse *) resp; } @@ -681,7 +867,7 @@ pageserver_flush(shardno_t shard_no) { PGconn *pageserver_conn = page_servers[shard_no].conn; - if (!pageserver_conn) + if (page_servers[shard_no].state != PS_Connected) { neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected"); } @@ -697,6 +883,7 @@ pageserver_flush(shardno_t shard_no) return false; } } + return true; } @@ -891,5 +1078,7 @@ pg_init_libpagestore(void) dbsize_hook = neon_dbsize; } + memset(page_servers, 0, sizeof(page_servers)); + lfc_init(); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 41546eae85..ac505fe6fb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -94,6 +94,10 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; +#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \ + neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ + ##__VA_ARGS__) + page_server_api *page_server; /* unlogged relation build states */ @@ -526,6 +530,8 @@ prefetch_flush_requests(void) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * NOTE: callers should make sure they can handle query cancellations in this + * function's call path. */ static bool prefetch_wait_for(uint64 ring_index) @@ -561,6 +567,8 @@ prefetch_wait_for(uint64 ring_index) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * + * NOTE: this does IO, and can get canceled out-of-line. */ static bool prefetch_read(PrefetchRequest *slot) @@ -572,6 +580,14 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu", + slot->status, (size_t) (void *) slot->response, + slot->my_ring_index, MyPState->ring_receive); + old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(slot->shard_no); MemoryContextSwitchTo(old); @@ -589,6 +605,11 @@ prefetch_read(PrefetchRequest *slot) } else { + neon_shard_log(slot->shard_no, WARNING, + "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + slot->my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), + slot->buftag.forkNum, slot->buftag.blockNum); return false; } } @@ -603,6 +624,7 @@ void prefetch_on_ps_disconnect(void) { MyPState->ring_flush = MyPState->ring_unused; + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -625,6 +647,7 @@ prefetch_on_ps_disconnect(void) slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + prefetch_set_unused(ring_index); } } @@ -691,6 +714,8 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; + uint64 mySlotNo = slot->my_ring_index; + NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, /* lsn and not_modified_since are filled in below */ @@ -699,6 +724,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns .blkno = slot->buftag.blockNum, }; + Assert(mySlotNo == MyPState->ring_unused); + if (force_request_lsns) slot->request_lsns = *force_request_lsns; else @@ -711,7 +738,11 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); - while (!page_server->send(slot->shard_no, (NeonRequest *) &request)); + while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) + { + Assert(mySlotNo == MyPState->ring_unused); + /* loop */ + } /* update prefetch state */ MyPState->n_requests_inflight += 1; @@ -722,7 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns /* update slot state */ slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -894,6 +924,10 @@ Retry: return ring_index; } +/* + * Note: this function can get canceled and use a long jump to the next catch + * context. Take care. + */ static NeonResponse * page_server_request(void const *req) { @@ -925,19 +959,38 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || + ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) { shard_no = 0; } do { - while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no)); - consume_prefetch_responses(); - resp = page_server->receive(shard_no); - } while (resp == NULL); - return resp; + PG_TRY(); + { + while (!page_server->send(shard_no, (NeonRequest *) req) + || !page_server->flush(shard_no)) + { + /* do nothing */ + } + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } + PG_CATCH(); + { + /* + * Cancellation in this code needs to be handled better at some + * point, but this currently seems fine for now. + */ + page_server->disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + } while (resp == NULL); + + return resp; } @@ -1905,7 +1958,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); return exists; @@ -2357,7 +2412,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. */ - Retry: +Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) @@ -2443,7 +2498,9 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag); + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } /* buffer was used, clean up for later reuse */ @@ -2714,7 +2771,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); } update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); @@ -2767,7 +2826,9 @@ neon_dbsize(Oid dbNode) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", @@ -3106,7 +3167,9 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", + T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index fb16b76567..e1674049a6 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,6 +9,7 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; @@ -65,6 +66,8 @@ async fn main() -> anyhow::Result<()> { let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + let args = cli().get_matches(); let destination: String = args.get_one::("dest").unwrap().parse()?; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 61d6d60dbe..7d3153a3c1 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -51,9 +51,10 @@ impl AsyncWrite for WebSocketRw { ) -> Poll> { let this = self.project(); let mut stream = this.stream; - this.send.put(buf); ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; + + this.send.put(buf); match stream.as_mut().start_send(Frame::binary(this.send.split())) { Ok(()) => Poll::Ready(Ok(buf.len())), Err(e) => Poll::Ready(Err(io_error(e))), diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index dd64a0a98f..134afa53da 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::{types::ObjectIdentifier, Client}; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; use utils::generation::Generation; @@ -208,7 +208,7 @@ impl TenantObjectListing { &mut self, timeline_id: TimelineId, layer_file: &LayerName, - metadata: &IndexLayerMetadata, + metadata: &LayerFileMetadata, ) -> bool { let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { return false; diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs index a24a1e92ae..450b337235 100644 --- a/s3_scrubber/src/tenant_snapshot.rs +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -11,7 +11,7 @@ use async_stream::stream; use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; @@ -49,8 +49,8 @@ impl SnapshotDownloader { &self, ttid: TenantShardTimelineId, layer_name: LayerName, - layer_metadata: IndexLayerMetadata, - ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> { + layer_metadata: LayerFileMetadata, + ) -> anyhow::Result<(LayerName, LayerFileMetadata)> { // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use // different layer names (remote-style has the generation suffix) let local_path = self.output_path.join(format!( @@ -110,7 +110,7 @@ impl SnapshotDownloader { async fn download_layers( &self, ttid: TenantShardTimelineId, - layers: Vec<(LayerName, IndexLayerMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let layer_count = layers.len(); tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); @@ -161,10 +161,7 @@ impl SnapshotDownloader { ttid: TenantShardTimelineId, index_part: Box, index_part_generation: Generation, - ancestor_layers: &mut HashMap< - TenantShardTimelineId, - HashMap, - >, + ancestor_layers: &mut HashMap>, ) -> anyhow::Result<()> { let index_bytes = serde_json::to_string(&index_part).unwrap(); @@ -234,7 +231,7 @@ impl SnapshotDownloader { // happen if this tenant has been split at some point) let mut ancestor_layers: HashMap< TenantShardTimelineId, - HashMap, + HashMap, > = Default::default(); for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index e671d4f36a..4b1481a397 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -827,10 +827,10 @@ where /// Persist control file if there is something to save and enough time /// passed after the last save. - pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> { + pub async fn maybe_persist_inmem_control_file(&mut self) -> Result { const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL { - return Ok(()); + return Ok(false); } let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn || self.state.inmem.backup_lsn > self.state.backup_lsn @@ -840,7 +840,7 @@ where self.state.flush().await?; trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); } - Ok(()) + Ok(need_persist) } /// Handle request to append WAL. diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 89c157d514..0cc6153373 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -104,11 +104,16 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; pub struct WriteGuardSharedState<'a> { tli: Arc, guard: RwLockWriteGuard<'a, SharedState>, + skip_update: bool, } impl<'a> WriteGuardSharedState<'a> { fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { - WriteGuardSharedState { tli, guard } + WriteGuardSharedState { + tli, + guard, + skip_update: false, + } } } @@ -149,10 +154,12 @@ impl<'a> Drop for WriteGuardSharedState<'a> { } }); - // send notification about shared state update - self.tli.shared_state_version_tx.send_modify(|old| { - *old += 1; - }); + if !self.skip_update { + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); + } } } @@ -802,7 +809,11 @@ impl Timeline { // update last_removed_segno let mut shared_state = self.write_shared_state().await; - shared_state.last_removed_segno = horizon_segno; + if shared_state.last_removed_segno != horizon_segno { + shared_state.last_removed_segno = horizon_segno; + } else { + shared_state.skip_update = true; + } Ok(()) } @@ -811,11 +822,10 @@ impl Timeline { /// to date so that storage nodes restart doesn't cause many pageserver -> /// safekeeper reconnections. pub async fn maybe_persist_control_file(self: &Arc) -> Result<()> { - self.write_shared_state() - .await - .sk - .maybe_persist_inmem_control_file() - .await + let mut guard = self.write_shared_state().await; + let changed = guard.sk.maybe_persist_inmem_control_file().await?; + guard.skip_update = !changed; + Ok(()) } /// Gather timeline data for metrics. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 796ae7217b..36aa18f1f9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2667,7 +2667,9 @@ class NeonPageserver(PgProtocol, LogUtils): tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) - def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + def list_layers( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ) -> list[Path]: """ Inspect local storage on a pageserver to discover which layer files are present. diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index fa6e4eaafd..ad8bbe2021 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -70,7 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", - ".*Compaction failed.*, retrying in .*: timeline shutting down.*", + ".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*", # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 22bb43c580..89e116df28 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -541,11 +541,22 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str left_list, right_list = map(build_hash_list, [left, right]) - try: - assert len(left_list) == len(right_list) + assert len(left_list) == len( + right_list + ), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}" - for left_tuple, right_tuple in zip(left_list, right_list): - assert left_tuple == right_tuple - finally: - elapsed = time.time() - started_at - log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") + mismatching = set() + + for left_tuple, right_tuple in zip(left_list, right_list): + left_path, left_hash = left_tuple + right_path, right_hash = right_tuple + assert ( + left_path == right_path + ), f"file count matched, expected these to be same paths: {left_path}, {right_path}" + if left_hash != right_hash: + mismatching.add(left_path) + + assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}" + + elapsed = time.time() - started_at + log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 8c60b454d8..1d193b8999 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -17,9 +17,13 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - # eviction might be the first one after an attach to access the layers - env.pageserver.allowed_errors.append( - ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction" + env.pageserver.allowed_errors.extend( + [ + # eviction might be the first one after an attach to access the layers + ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", + # detach can happen before we get to validate the generation number + ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", + ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) return env diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 62229ebfe7..ac27a4cf36 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -163,7 +163,7 @@ def test_import_from_pageserver_small( num_rows = 3000 lsn = _generate_data(num_rows, endpoint) - _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir) + _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) @pytest.mark.timeout(1800) @@ -193,9 +193,7 @@ def test_import_from_pageserver_multisegment( log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB - tar_output_file = _import( - num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir - ) + tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) # Check if the backup data contains multiple segment files cnt_seg_files = 0 @@ -235,7 +233,6 @@ def _import( env: NeonEnv, pg_bin: PgBin, timeline: TimelineId, - pg_distrib_dir: Path, test_output_dir: Path, ) -> Path: """Test importing backup data to the pageserver. @@ -295,7 +292,7 @@ def _import( wait_for_upload(client, tenant, timeline, lsn) # Check it worked - endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) + endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn) assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py new file mode 100644 index 0000000000..bad2e5865e --- /dev/null +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -0,0 +1,282 @@ +from contextlib import closing +from typing import Set + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver +from fixtures.pageserver.http import PageserverHttpClient +from psycopg2.errors import QueryCanceled + +CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", + "ps::handle-pagerequest-message::exists", + "ps::handle-pagerequest-message::nblocks", + "ps::handle-pagerequest-message::getpage", + "ps::handle-pagerequest-message::dbsize", + # We don't yet have a good way to on-demand guarantee the download of an + # SLRU segment, so that's disabled for now. + # "ps::handle-pagerequest-message::slrusegment", +} + +PG_PS_START_FAILPOINTS = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", +} +SMGR_EXISTS = "ps::handle-pagerequest-message::exists" +SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks" +SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage" +SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize" + +""" +Test that we can handle connection delays and cancellations at various +unfortunate connection startup and request states. +""" + + +def test_cancellations(neon_simple_env: NeonEnv): + env = neon_simple_env + ps = env.pageserver + ps_http = ps.http_client() + ps_http.is_testing_enabled_or_skip() + + env.neon_cli.create_branch("test_config", "empty") + + # We don't want to have any racy behaviour with autovacuum IOs + ep = env.endpoints.create_start( + "test_config", + config_lines=[ + "autovacuum = off", + "shared_buffers = 128MB", + ], + ) + + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + CREATE TABLE test1 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1, 1024::bigint) p(id); + """ + ) + cur.execute( + """ + CREATE TABLE test2 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1025, 2048::bigint) p(id); + """ + ) + cur.execute( + """ + VACUUM (ANALYZE, FREEZE) test1, test2; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_buffercache; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_prewarm; + """ + ) + + # data preparation is now complete, with 2 disjoint tables that aren't + # preloaded into any caches. + + ep.stop() + + for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS: + connect_works_correctly(failpoint, ep, ps, ps_http) + + +ENABLED_FAILPOINTS: Set[str] = set() + + +def connect_works_correctly( + failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient +): + log.debug("Starting work on %s", failpoint) + # All queries we use should finish (incl. IO) within 500ms, + # including all their IO. + # This allows us to use `SET statement_timeout` to let the query + # timeout system cancel queries, rather than us having to go + # through the most annoying effort of manual query cancellation + # in psycopg2. + options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1" + + ep.start() + + def fp_enable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "pause"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint} + log.info( + 'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2 + ) + + def fp_disable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "off"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint} + log.info( + 'Disabled failpoint "%s", current_active=%s', + failpoint, + ENABLED_FAILPOINTS, + stacklevel=2, + ) + + def check_buffers(cur): + cur.execute( + """ + SELECT n.nspname AS nspname + , c.relname AS relname + , count(*) AS count + FROM pg_buffercache b + JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database()) + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid) + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10 + """ + ) + return cur.fetchone() + + def exec_may_cancel(query, cursor, result, cancels): + if cancels: + with pytest.raises(QueryCanceled): + cursor.execute(query) + assert cursor.fetchone() == result + else: + cursor.execute(query) + assert cursor.fetchone() == result + + fp_disable() + + # Warm caches required for new connections, so that they can run without + # requiring catalog reads. + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + SELECT 1; + """ + ) + assert cur.fetchone() == (1,) + + assert check_buffers(cur) is None + # Ensure all caches required for connection start are correctly + # filled, so that we don't have any "accidents" in this test run + # caused by changes in connection startup plans that require + # requests to the PageServer. + cur.execute( + """ + select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0)) + from pg_class c + where c.oid < 16384 AND c.relkind IN ('i', 'r'); + """ + ) + assert cur.fetchone() == ([True],) + + # Enable failpoint + fp_enable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + cur.execute("SHOW statement_timeout;") + assert cur.fetchone() == ("500ms",) + assert check_buffers(cur) is None + exec_may_cancel( + """ + SELECT min(id) FROM test1; + """, + cur, + (1,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + # Do a select on the data, putting some buffers into the prefetch + # queue. + cur.execute( + """ + SELECT count(id) FROM (select * from test1 LIMIT 256) a; + """ + ) + assert cur.fetchone() == (256,) + + ps.stop() + ps.start() + fp_enable() + + exec_may_cancel( + """ + SELECT COUNT(id) FROM test1; + """, + cur, + (1024,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}), + ) + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + exec_may_cancel( + """ + SELECT COUNT(id) FROM test2; + """, + cur, + (1024,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + fp_enable() + + exec_may_cancel( + """ + SELECT 0 < pg_database_size(CURRENT_DATABASE()); + """, + cur, + (True,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}), + ) + + fp_disable() + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376) + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1, 1024, 524800) + + ep.stop() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bbb1ad0c6d..545ba05b17 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -177,7 +177,16 @@ def test_sharding_split_unsharded( env.storage_controller.consistency_check() -def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "failpoint", + [ + None, + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + "compact-shard-ancestors-persistent", + ], +) +def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]): """ Test that after a split, we clean up parent layer data in the child shards via compaction. """ @@ -196,6 +205,11 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): "image_layer_creation_check_threshold": "0", } + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + "max_unavailable": "300s" + } + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -213,6 +227,10 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): # Split one shard into two shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + # Let all shards move into their stable locations, so that during subsequent steps we + # don't have reconciles in progress (simpler to reason about what messages we expect in logs) + env.storage_controller.reconcile_until_idle() + # Check we got the shard IDs we expected assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None @@ -237,6 +255,90 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): # Compaction shouldn't make anything unreadable workload.validate() + # Force a generation increase: layer rewrites are a long-term thing and only happen after + # the generation has increased. + env.pageserver.stop() + env.pageserver.start() + + # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"}) + env.storage_controller.reconcile_until_idle() + + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior + # across restarts, as we will have local layer files that temporarily disagree with the remote metadata + # for the same local layer file name. + if failpoint is not None: + ps.http_client().configure_failpoints((failpoint, "exit")) + + # Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten) + # Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively. + ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0) + + # We will compare stats before + after compaction + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + + # Invoke compaction: this should rewrite layers that are behind the pitr horizon + try: + ps.http_client().timeline_compact(shard, timeline_id) + except requests.ConnectionError as e: + if failpoint is None: + raise e + else: + log.info(f"Compaction failed (failpoint={failpoint}): {e}") + + if failpoint in ( + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + ): + # If we left local files that don't match remote metadata, we expect warnings on next startup + env.pageserver.allowed_errors.append( + ".*removing local file .+ because it has unexpected length.*" + ) + + # Post-failpoint: we check that the pageserver comes back online happily. + env.pageserver.running = False + env.pageserver.start() + else: + assert failpoint is None # We shouldn't reach success path if a failpoint was set + + detail_after = ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because layers are smaller + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Validate size statistics + for shard in shards: + ps = env.get_tenant_pageserver(shard) + timeline_info = ps.http_client().timeline_detail(shard, timeline_id) + reported_size = timeline_info["current_physical_size"] + layer_paths = ps.list_layers(shard, timeline_id) + measured_size = 0 + for p in layer_paths: + abs_path = ps.timeline_dir(shard, timeline_id) / p + measured_size += os.stat(abs_path).st_size + + log.info( + f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)" + ) + + if failpoint in ( + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + ): + # If we injected a failure between local rewrite and remote upload, then after + # restart we may end up with neither version of the file on local disk (the new file + # is cleaned up because it doesn't matchc remote metadata). So local size isn't + # necessarily going to match remote physical size. + continue + + assert measured_size == reported_size + + # Compaction shouldn't make anything unreadable + workload.validate() + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder, diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 0f9d56e466..73a24c42d6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -194,7 +194,7 @@ files: - metric_name: pg_stats_userdb type: gauge - help: 'Stats for the oldest non-system db' + help: 'Stats for several oldest non-system dbs' key_labels: - datname value_label: kind @@ -205,9 +205,8 @@ files: - inserted - updated - deleted - # We export stats for only one non-system database. Without this limit + # We export stats for 10 non-system database. Without this limit # it is too easy to abuse the system by creating lots of databases. - # We can try lifting this limit in the future after we understand the needs better. query: | select pg_database_size(datname) as db_size, deadlocks, tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, @@ -218,7 +217,7 @@ files: from pg_database where datname <> 'postgres' and not datistemplate order by oid - limit 1 + limit 10 ); - metric_name: max_cluster_size @@ -320,7 +319,7 @@ files: - metric_name: wal_is_lost type: gauge - help: 'Whether or not the replication slot\'s wal_status is lost' + help: 'Whether or not the replication slot wal_status is lost' key_labels: - slot_name values: [wal_status_is_lost]