diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 44e13a6c73..ae330d8a20 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -67,7 +67,7 @@ RUN apt update && \ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ + cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / @@ -95,7 +95,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && \ cd build && \ - cmake .. && \ + cmake -DCMAKE_BUILD_TYPE=Release .. && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control @@ -355,7 +355,7 @@ RUN apt-get update && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ - ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \ + ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ @@ -410,7 +410,7 @@ RUN apt-get update && \ mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ mkdir build && \ cd build && \ - cmake .. && \ + cmake -DCMAKE_BUILD_TYPE=Release .. && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control @@ -432,6 +432,54 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control +######################################################################################### +# +# Layer "rdkit-pg-build" +# compile rdkit extension +# +######################################################################################### +FROM build-deps AS rdkit-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN apt-get update && \ + apt-get install -y \ + cmake \ + libboost-iostreams1.74-dev \ + libboost-regex1.74-dev \ + libboost-serialization1.74-dev \ + libboost-system1.74-dev \ + libeigen3-dev \ + libfreetype6-dev + +ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" +RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \ + echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \ + mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \ + cmake \ + -D RDK_BUILD_CAIRO_SUPPORT=OFF \ + -D RDK_BUILD_INCHI_SUPPORT=ON \ + -D RDK_BUILD_AVALON_SUPPORT=ON \ + -D RDK_BUILD_PYTHON_WRAPPERS=OFF \ + -D RDK_BUILD_DESCRIPTORS3D=OFF \ + -D RDK_BUILD_FREESASA_SUPPORT=OFF \ + -D RDK_BUILD_COORDGEN_SUPPORT=ON \ + -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \ + -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \ + -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \ + -D RDK_USE_URF=OFF \ + -D RDK_BUILD_PGSQL=ON \ + -D RDK_PGSQL_STATIC=ON \ + -D PostgreSQL_CONFIG=pg_config \ + -D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \ + -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \ + -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \ + -D RDK_INSTALL_INTREE=OFF \ + -D CMAKE_BUILD_TYPE=Release \ + . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control + ######################################################################################### # # Layer "rust extensions" @@ -564,6 +612,7 @@ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -637,14 +686,19 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS # libxml2, libxslt1.1 for xml2 # libzstd1 for zstd +# libboost*, libfreetype6, and zlib1g for rdkit RUN apt update && \ apt install --no-install-recommends -y \ gdb \ - locales \ libicu67 \ liblz4-1 \ libreadline8 \ + libboost-iostreams1.74.0 \ + libboost-regex1.74.0 \ + libboost-serialization1.74.0 \ + libboost-system1.74.0 \ libossp-uuid16 \ + libfreetype6 \ libgeos-c1v5 \ libgdal28 \ libproj19 \ @@ -654,7 +708,9 @@ RUN apt update && \ libxslt1.1 \ libzstd1 \ libcurl4-openssl-dev \ - procps && \ + locales \ + procps \ + zlib1g && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 617b330704..94cebf93de 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -370,11 +370,6 @@ impl ComputeNode { // 'Close' connection drop(client); - info!( - "finished configuration of compute for project {}", - spec.cluster.cluster_id.as_deref().unwrap_or("None") - ); - Ok(()) } @@ -427,22 +422,22 @@ impl ComputeNode { #[instrument(skip(self))] pub fn start_compute(&self) -> Result { let compute_state = self.state.lock().unwrap().clone(); - let spec = compute_state.pspec.as_ref().expect("spec must be set"); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), - spec.spec.operation_uuid.as_deref().unwrap_or("None"), - spec.tenant_id, - spec.timeline_id, + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), + pspec.spec.operation_uuid.as_deref().unwrap_or("None"), + pspec.tenant_id, + pspec.timeline_id, ); self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres(spec.storage_auth_token.clone())?; + let pg = self.start_postgres(pspec.storage_auth_token.clone())?; - if spec.spec.mode == ComputeMode::Primary { + if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { self.apply_config(&compute_state)?; } @@ -462,6 +457,11 @@ impl ComputeNode { } self.set_status(ComputeStatus::Running); + info!( + "finished configuration of compute for project {}", + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") + ); + Ok(pg) } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b28315a35d..d3131ac476 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -450,6 +450,7 @@ impl Endpoint { // Create spec file let spec = ComputeSpec { + skip_pg_catalog_updates: false, format_version: 1.0, operation_uuid: None, cluster: Cluster { diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 4014774a7e..c2ad30f86f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -27,6 +27,12 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, + /// An optinal hint that can be passed to speed up startup time if we know + /// that no pg catalog mutations (like role creation, database creation, + /// extension creation) need to be done on the actual database to start. + #[serde(default)] // Default false + pub skip_pg_catalog_updates: bool, + // Information needed to connect to the storage layer. // // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed. diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e0cc3ca543..ac1f8a357e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -111,6 +111,8 @@ pub trait RemoteStorage: Send + Sync + 'static { ) -> Result; async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; + + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; } pub struct Download { @@ -223,6 +225,14 @@ impl GenericRemoteStorage { Self::Unreliable(s) => s.delete(path).await, } } + + pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete_objects(paths).await, + Self::AwsS3(s) => s.delete_objects(paths).await, + Self::Unreliable(s) => s.delete_objects(paths).await, + } + } } impl GenericRemoteStorage { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index c73e647845..59304c2481 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -320,6 +320,13 @@ impl RemoteStorage for LocalFs { .await .map_err(|e| anyhow::anyhow!(e))?) } + + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + for path in paths { + self.delete(path).await? + } + Ok(()) + } } fn storage_metadata_path(original_path: &Path) -> PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0be8c72fe0..38e1bf00f8 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -17,6 +17,7 @@ use aws_sdk_s3::{ error::SdkError, operation::get_object::GetObjectError, primitives::ByteStream, + types::{Delete, ObjectIdentifier}, Client, }; use aws_smithy_http::body::SdkBody; @@ -81,12 +82,24 @@ pub(super) mod metrics { .inc(); } + pub fn inc_delete_objects(count: u64) { + S3_REQUESTS_COUNT + .with_label_values(&["delete_object"]) + .inc_by(count); + } + pub fn inc_delete_object_fail() { S3_REQUESTS_FAIL_COUNT .with_label_values(&["delete_object"]) .inc(); } + pub fn inc_delete_objects_fail(count: u64) { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["delete_object"]) + .inc_by(count); + } + pub fn inc_list_objects() { S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); } @@ -396,6 +409,34 @@ impl RemoteStorage for S3Bucket { }) .await } + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 delete")?; + + let mut delete_objects = Vec::with_capacity(paths.len()); + for path in paths { + let obj_id = ObjectIdentifier::builder() + .set_key(Some(self.relative_path_to_s3_object(path))) + .build(); + delete_objects.push(obj_id); + } + + metrics::inc_delete_objects(paths.len() as u64); + self.client + .delete_objects() + .bucket(self.bucket_name.clone()) + .delete(Delete::builder().set_objects(Some(delete_objects)).build()) + .send() + .await + .map_err(|e| { + metrics::inc_delete_objects_fail(paths.len() as u64); + e + })?; + Ok(()) + } async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { let _guard = self diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index cb40859831..2f341bb29d 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -119,4 +119,11 @@ impl RemoteStorage for UnreliableWrapper { self.attempt(RemoteOp::Delete(path.clone()))?; self.inner.delete(path).await } + + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + for path in paths { + self.delete(path).await? + } + Ok(()) + } } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 48ed8f686c..5f52b0754c 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -107,6 +107,37 @@ async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result Ok(()) } +#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledS3::Enabled(ctx) => ctx, + MaybeEnabledS3::Disabled => return Ok(()), + }; + + let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,))) + .with_context(|| "RemotePath conversion")?; + + let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,))) + .with_context(|| "RemotePath conversion")?; + + let data1 = "remote blob data1".as_bytes(); + let data1_len = data1.len(); + let data2 = "remote blob data2".as_bytes(); + let data2_len = data2.len(); + ctx.client + .upload(std::io::Cursor::new(data1), data1_len, &path1, None) + .await?; + + ctx.client + .upload(std::io::Cursor::new(data2), data2_len, &path2, None) + .await?; + + ctx.client.delete_objects(&[path1, path2]).await?; + + Ok(()) +} + fn ensure_logging_ready() { LOGGING_DONE.get_or_init(|| { utils::logging::init( diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 7cb96d9094..33241dbdf7 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,19 +1,18 @@ use crate::auth::{Claims, JwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; -use anyhow::{anyhow, Context}; +use anyhow::Context; use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::http::HeaderValue; use hyper::Method; -use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; +use hyper::{header::CONTENT_TYPE, Body, Request, Response}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; -use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService}; +use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use tokio::task::JoinError; use tracing::{self, debug, info, info_span, warn, Instrument}; use std::future::Future; -use std::net::TcpListener; use std::str::FromStr; static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { @@ -348,40 +347,6 @@ pub fn check_permission_with( } } -/// -/// Start listening for HTTP requests on given socket. -/// -/// 'shutdown_future' can be used to stop. If the Future becomes -/// ready, we stop listening for new requests, and the function returns. -/// -pub fn serve_thread_main( - router_builder: RouterBuilder, - listener: TcpListener, - shutdown_future: S, -) -> anyhow::Result<()> -where - S: Future + Send + Sync, -{ - info!("Starting an HTTP endpoint at {}", listener.local_addr()?); - - // Create a Service from the router above to handle incoming requests. - let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap(); - - // Enter a single-threaded tokio runtime bound to the current thread - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - let _guard = runtime.enter(); - - let server = Server::from_tcp(listener)? - .serve(service) - .with_graceful_shutdown(shutdown_future); - - runtime.block_on(server)?; - - Ok(()) -} #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 936de35eb9..9ad0124a80 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir( { pg_control = Some(control_file); } - modification.flush()?; + modification.flush().await?; } } // We're done importing all the data files. - modification.commit()?; + modification.commit().await?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar( // We found the pg_control file. pg_control = Some(res); } - modification.flush()?; + modification.flush().await?; } tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); @@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar( // sanity check: ensure that pg_control is loaded let _pg_control = pg_control.context("pg_control file not found")?; - modification.commit()?; + modification.commit().await?; Ok(()) } @@ -594,7 +594,7 @@ async fn import_file( // zenith.signal is not necessarily the last file, that we handle // but it is ok to call `finish_write()`, because final `modification.commit()` // will update lsn once more to the final one. - let writer = modification.tline.writer(); + let writer = modification.tline.writer().await; writer.finish_write(prev_lsn); debug!("imported zenith signal {}", prev_lsn); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 186209dfcf..51cac43f50 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -699,6 +699,20 @@ impl<'a> DatadirModification<'a> { Ok(()) } + #[cfg(test)] + pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> { + self.init_empty()?; + self.put_control_file(bytes::Bytes::from_static( + b"control_file contents do not matter", + )) + .context("put_control_file")?; + self.put_checkpoint(bytes::Bytes::from_static( + b"checkpoint_file contents do not matter", + )) + .context("put_checkpoint_file")?; + Ok(()) + } + /// Put a new page version that can be constructed from a WAL record /// /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the @@ -1108,7 +1122,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self) -> anyhow::Result<()> { + pub async fn flush(&mut self) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -1116,7 +1130,7 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer(); + let writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. let mut result: anyhow::Result<()> = Ok(()); @@ -1143,8 +1157,8 @@ impl<'a> DatadirModification<'a> { /// underlying timeline. /// All the modifications in this atomic update are stamped by the specified LSN. /// - pub fn commit(&mut self) -> anyhow::Result<()> { - let writer = self.tline.writer(); + pub async fn commit(&mut self) -> anyhow::Result<()> { + let writer = self.tline.writer().await; let lsn = self.lsn; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1593,20 +1607,6 @@ fn is_slru_block_key(key: Key) -> bool { && key.field6 != 0xffffffff // and not SlruSegSize } -#[cfg(test)] -pub fn create_test_timeline( - tenant: &crate::tenant::Tenant, - timeline_id: utils::id::TimelineId, - pg_version: u32, - ctx: &RequestContext, -) -> anyhow::Result> { - let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?; - let mut m = tline.begin_modification(Lsn(8)); - m.init_empty()?; - m.commit()?; - Ok(tline) -} - #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d23f1cb96f..32390c06cf 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -11,7 +11,7 @@ //! parent timeline, and the last LSN that has been written to disk. //! -use anyhow::{bail, Context}; +use anyhow::{bail, ensure, Context}; use futures::FutureExt; use pageserver_api::models::TimelineState; use remote_storage::DownloadError; @@ -86,6 +86,7 @@ pub mod block_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; +pub mod manifest; pub mod metadata; mod par_fsync; @@ -185,18 +186,13 @@ struct TimelineUninitMark { } impl UninitializedTimeline<'_> { - /// Ensures timeline data is valid, loads it into pageserver's memory and removes - /// uninit mark file on success. + /// Finish timeline creation: insert it into the Tenant's timelines map and remove the + /// uninit mark file. /// /// This function launches the flush loop if not already done. /// /// The caller is responsible for activating the timeline (function `.activate()`). - fn initialize_with_lock( - mut self, - _ctx: &RequestContext, - timelines: &mut HashMap>, - load_layer_map: bool, - ) -> anyhow::Result> { + fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; @@ -204,25 +200,19 @@ impl UninitializedTimeline<'_> { format!("No timeline for initalization found for {tenant_id}/{timeline_id}") })?; + // Check that the caller initialized disk_consistent_lsn let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn(); - // TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least - // ensure!(new_disk_consistent_lsn.is_valid(), - // "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn and cannot be initialized"); + ensure!( + new_disk_consistent_lsn.is_valid(), + "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn" + ); + let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { Entry::Occupied(_) => anyhow::bail!( "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map" ), Entry::Vacant(v) => { - if load_layer_map { - new_timeline - .load_layer_map(new_disk_consistent_lsn) - .with_context(|| { - format!( - "Failed to load layermap for timeline {tenant_id}/{timeline_id}" - ) - })?; - } uninit_mark.remove_uninit_mark().with_context(|| { format!( "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" @@ -251,9 +241,10 @@ impl UninitializedTimeline<'_> { .await .context("Failed to import basebackup")?; + // Flush the new layer files to disk, before we make the timeline as available to + // the outside world. + // // Flush loop needs to be spawned in order to be able to flush. - // We want to run proper checkpoint before we mark timeline as available to outside world - // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock raw_timeline.maybe_spawn_flush_loop(); fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -265,10 +256,9 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - // Initialize without loading the layer map. We started with an empty layer map, and already - // updated it for the layers that we created during the import. - let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - let tl = self.initialize_with_lock(ctx, &mut timelines, false)?; + // All the data has been imported. Insert the Timeline into the tenant's timelines + // map and remove the uninit mark file. + let tl = self.finish_creation()?; tl.activate(broker_client, None, ctx); Ok(tl) } @@ -311,15 +301,6 @@ fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { } impl TimelineUninitMark { - /// Useful for initializing timelines, existing on disk after the restart. - pub fn dummy() -> Self { - Self { - uninit_mark_deleted: true, - uninit_mark_path: PathBuf::new(), - timeline_path: PathBuf::new(), - } - } - fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self { Self { uninit_mark_deleted: false, @@ -513,7 +494,7 @@ impl Tenant { ancestor: Option>, first_save: bool, init_order: Option<&InitializationOrder>, - ctx: &RequestContext, + _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -524,54 +505,38 @@ impl Tenant { .context("merge_local_remote_metadata")? .to_owned(); - let timeline = { + let timeline = self.create_timeline_struct( + timeline_id, + up_to_date_metadata, + ancestor.clone(), + remote_client, + init_order, + )?; + let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn(); + anyhow::ensure!( + new_disk_consistent_lsn.is_valid(), + "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn" + ); + timeline + .load_layer_map(new_disk_consistent_lsn) + .with_context(|| { + format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") + })?; + + { // avoiding holding it across awaits let mut timelines_accessor = self.timelines.lock().unwrap(); - if timelines_accessor.contains_key(&timeline_id) { - anyhow::bail!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); - } - - let dummy_timeline = self.create_timeline_data( - timeline_id, - up_to_date_metadata, - ancestor.clone(), - remote_client, - init_order, - )?; - - let timeline = UninitializedTimeline { - owning_tenant: self, - timeline_id, - raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())), - }; - // Do not start walreceiver here. We do need loaded layer map for reconcile_with_remote - // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver - // will ingest data which may require looking at the layers which are not yet available locally - match timeline.initialize_with_lock(ctx, &mut timelines_accessor, true) { - Ok(new_timeline) => new_timeline, - Err(e) => { - error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}"); - // FIXME using None is a hack, it wont hurt, just ugly. - // Ideally initialize_with_lock error should return timeline in the error - // Or return ownership of itself completely so somethin like into_broken - // can be called directly on Uninitielized timeline - // also leades to redundant .clone - let broken_timeline = self - .create_timeline_data( - timeline_id, - up_to_date_metadata, - ancestor.clone(), - None, - None, - ) - .with_context(|| { - format!("creating broken timeline data for {tenant_id}/{timeline_id}") - })?; - broken_timeline.set_broken(e.to_string()); - timelines_accessor.insert(timeline_id, broken_timeline); - return Err(e); + match timelines_accessor.entry(timeline_id) { + Entry::Occupied(_) => { + // The uninit mark file acts as a lock that prevents another task from + // initializing the timeline at the same time. + unreachable!( + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); + } + Entry::Vacant(v) => { + v.insert(Arc::clone(&timeline)); + timeline.maybe_spawn_flush_loop(); } } }; @@ -1163,14 +1128,14 @@ impl Tenant { .init_upload_queue_stopped_to_continue_deletion(&index_part)?; let timeline = self - .create_timeline_data( + .create_timeline_struct( timeline_id, &local_metadata, ancestor, Some(remote_client), init_order, ) - .context("create_timeline_data")?; + .context("create_timeline_struct")?; let guard = Arc::clone(&timeline.delete_lock).lock_owned().await; @@ -1267,6 +1232,18 @@ impl Tenant { /// This is used to create the initial 'main' timeline during bootstrapping, /// or when importing a new base backup. The caller is expected to load an /// initial image of the datadir to the new timeline after this. + /// + /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0)) + /// and the timeline will fail to load at a restart. + /// + /// That's why we add an uninit mark file, and wrap it together witht the Timeline + /// in-memory object into UninitializedTimeline. + /// Once the caller is done setting up the timeline, they should call + /// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark. + /// + /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the + /// minimum amount of keys required to get a writable timeline. + /// (Without it, `put` might fail due to `repartition` failing.) pub fn create_empty_timeline( &self, new_timeline_id: TimelineId, @@ -1284,6 +1261,8 @@ impl Tenant { drop(timelines); let new_metadata = TimelineMetadata::new( + // Initialize disk_consistent LSN to 0, The caller must import some data to + // make it valid, before calling finish_creation() Lsn(0), None, None, @@ -1292,11 +1271,11 @@ impl Tenant { initdb_lsn, pg_version, ); - self.prepare_timeline( + self.prepare_new_timeline( new_timeline_id, &new_metadata, timeline_uninit_mark, - true, + initdb_lsn, None, ) } @@ -1307,7 +1286,7 @@ impl Tenant { // This makes the various functions which anyhow::ensure! for Active state work in tests. // Our current tests don't need the background loops. #[cfg(test)] - pub fn create_test_timeline( + pub async fn create_test_timeline( &self, new_timeline_id: TimelineId, initdb_lsn: Lsn, @@ -1315,8 +1294,24 @@ impl Tenant { ctx: &RequestContext, ) -> anyhow::Result> { let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?; - let mut timelines = self.timelines.lock().unwrap(); - let tl = uninit_tl.initialize_with_lock(ctx, &mut timelines, true)?; + let tline = uninit_tl.raw_timeline().expect("we just created it"); + assert_eq!(tline.get_last_record_lsn(), Lsn(0)); + + // Setup minimum keys required for the timeline to be usable. + let mut modification = tline.begin_modification(initdb_lsn); + modification + .init_empty_test_timeline() + .context("init_empty_test_timeline")?; + modification + .commit() + .await + .context("commit init_empty_test_timeline modification")?; + + // Flush to disk so that uninit_tl's check for valid disk_consistent_lsn passes. + tline.maybe_spawn_flush_loop(); + tline.freeze_and_flush().await.context("freeze_and_flush")?; + + let tl = uninit_tl.finish_creation()?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); Ok(tl) @@ -2248,7 +2243,12 @@ impl Tenant { } } - fn create_timeline_data( + /// Helper function to create a new Timeline struct. + /// + /// The returned Timeline is in Loading state. The caller is responsible for + /// initializing any on-disk state, and for inserting the Timeline to the 'timelines' + /// map. + fn create_timeline_struct( &self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, @@ -2668,7 +2668,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - ctx: &RequestContext, + _ctx: &RequestContext, ) -> anyhow::Result> { let src_id = src_timeline.timeline_id; @@ -2753,17 +2753,15 @@ impl Tenant { src_timeline.pg_version, ); - let new_timeline = { - let mut timelines = self.timelines.lock().unwrap(); - self.prepare_timeline( - dst_id, - &metadata, - timeline_uninit_mark, - false, - Some(Arc::clone(src_timeline)), - )? - .initialize_with_lock(ctx, &mut timelines, true)? - }; + let uninitialized_timeline = self.prepare_new_timeline( + dst_id, + &metadata, + timeline_uninit_mark, + start_lsn + 1, + Some(Arc::clone(src_timeline)), + )?; + + let new_timeline = uninitialized_timeline.finish_creation()?; // Root timeline gets its layers during creation and uploads them along with the metadata. // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. @@ -2839,8 +2837,13 @@ impl Tenant { pgdata_lsn, pg_version, ); - let raw_timeline = - self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?; + let raw_timeline = self.prepare_new_timeline( + timeline_id, + &new_metadata, + timeline_uninit_mark, + pgdata_lsn, + None, + )?; let tenant_id = raw_timeline.owning_tenant.tenant_id; let unfinished_timeline = raw_timeline.raw_timeline()?; @@ -2856,10 +2859,10 @@ impl Tenant { format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") })?; - // Flush the new layer files to disk, before we mark the timeline as available to + // Flush the new layer files to disk, before we make the timeline as available to // the outside world. // - // Thus spawn flush loop manually and skip flush_loop setup in initialize_with_lock + // Flush loop needs to be spawned in order to be able to flush. unfinished_timeline.maybe_spawn_flush_loop(); fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -2875,12 +2878,8 @@ impl Tenant { ) })?; - // Initialize the timeline without loading the layer map, because we already updated the layer - // map above, when we imported the datadir. - let timeline = { - let mut timelines = self.timelines.lock().unwrap(); - raw_timeline.initialize_with_lock(ctx, &mut timelines, false)? - }; + // All done! + let timeline = raw_timeline.finish_creation()?; info!( "created root timeline {} timeline.lsn {}", @@ -2891,14 +2890,18 @@ impl Tenant { Ok(timeline) } - /// Creates intermediate timeline structure and its files, without loading it into memory. - /// It's up to the caller to import the necesary data and import the timeline into memory. - fn prepare_timeline( + /// Creates intermediate timeline structure and its files. + /// + /// An empty layer map is initialized, and new data and WAL can be imported starting + /// at 'disk_consistent_lsn'. After any initial data has been imported, call + /// `finish_creation` to insert the Timeline into the timelines map and to remove the + /// uninit mark file. + fn prepare_new_timeline( &self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, uninit_mark: TimelineUninitMark, - init_layers: bool, + start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result { let tenant_id = self.tenant_id; @@ -2916,33 +2919,27 @@ impl Tenant { None }; - match self.create_timeline_files( - &uninit_mark.timeline_path, - new_timeline_id, - new_metadata, - ancestor, - remote_client, - ) { - Ok(new_timeline) => { - if init_layers { - new_timeline.layers.write().unwrap().next_open_layer_at = - Some(new_timeline.initdb_lsn); - } - debug!( - "Successfully created initial files for timeline {tenant_id}/{new_timeline_id}" - ); - Ok(UninitializedTimeline { - owning_tenant: self, - timeline_id: new_timeline_id, - raw_timeline: Some((new_timeline, uninit_mark)), - }) - } - Err(e) => { - error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); - cleanup_timeline_directory(uninit_mark); - Err(e) - } + let timeline_struct = self + .create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None) + .context("Failed to create timeline data structure")?; + + timeline_struct.init_empty_layer_map(start_lsn); + + if let Err(e) = + self.create_timeline_files(&uninit_mark.timeline_path, new_timeline_id, new_metadata) + { + error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); + cleanup_timeline_directory(uninit_mark); + return Err(e); } + + debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}"); + + Ok(UninitializedTimeline { + owning_tenant: self, + timeline_id: new_timeline_id, + raw_timeline: Some((timeline_struct, uninit_mark)), + }) } fn create_timeline_files( @@ -2950,13 +2947,8 @@ impl Tenant { timeline_path: &Path, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - ancestor: Option>, - remote_client: Option, - ) -> anyhow::Result> { - let timeline_data = self - .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client, None) - .context("Failed to create timeline data structure")?; - crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; + ) -> anyhow::Result<()> { + crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?; fail::fail_point!("after-timeline-uninit-mark-creation", |_| { anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); @@ -2970,8 +2962,7 @@ impl Tenant { true, ) .context("Failed to create timeline metadata")?; - - Ok(timeline_data) + Ok(()) } /// Attempts to create an uninit mark file for the timeline initialization. @@ -3586,14 +3577,16 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -3619,9 +3612,11 @@ mod tests { let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? .load() .await; - let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -3650,8 +3645,10 @@ mod tests { use std::str::from_utf8; let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let writer = tline.writer(); + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + let writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); @@ -3677,7 +3674,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let new_writer = newtline.writer(); + let new_writer = newtline.writer().await; new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; new_writer.finish_write(Lsn(0x40)); @@ -3704,7 +3701,7 @@ mod tests { let mut lsn = start_lsn; #[allow(non_snake_case)] { - let writer = tline.writer(); + let writer = tline.writer().await; // Create a relation on the timeline writer.put( *TEST_KEY, @@ -3723,7 +3720,7 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( *TEST_KEY, lsn, @@ -3747,7 +3744,9 @@ mod tests { TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load() .await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -3784,8 +3783,9 @@ mod tests { .load() .await; - let tline = - tenant.create_test_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx) + .await?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) @@ -3834,7 +3834,9 @@ mod tests { TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? .load() .await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant @@ -3882,7 +3884,9 @@ mod tests { TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? .load() .await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant @@ -3905,7 +3909,9 @@ mod tests { TenantHarness::create("test_parent_keeps_data_forever_after_branching")? .load() .await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant @@ -3937,8 +3943,9 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; { let (tenant, ctx) = harness.load().await; - let tline = - tenant.create_test_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx) + .await?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; } @@ -3957,8 +3964,9 @@ mod tests { // create two timelines { let (tenant, ctx) = harness.load().await; - let tline = - tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; @@ -3995,7 +4003,9 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let (tenant, ctx) = harness.load().await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; drop(tline); drop(tenant); @@ -4033,9 +4043,11 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); @@ -4043,7 +4055,7 @@ mod tests { tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -4051,7 +4063,7 @@ mod tests { tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; writer.finish_write(Lsn(0x30)); drop(writer); @@ -4059,7 +4071,7 @@ mod tests { tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; writer.finish_write(Lsn(0x40)); drop(writer); @@ -4098,7 +4110,9 @@ mod tests { #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; let mut lsn = Lsn(0x10); @@ -4109,7 +4123,7 @@ mod tests { for _ in 0..50 { for _ in 0..10000 { test_key.field6 = blknum; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -4140,7 +4154,9 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; - let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; const NUM_KEYS: usize = 1000; @@ -4152,12 +4168,12 @@ mod tests { // a read sees the latest page version. let mut updated = [Lsn(0); NUM_KEYS]; - let mut lsn = Lsn(0); + let mut lsn = Lsn(0x10); #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -4175,7 +4191,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -4213,8 +4229,9 @@ mod tests { let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? .load() .await; - let mut tline = - tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let mut tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; const NUM_KEYS: usize = 1000; @@ -4226,12 +4243,12 @@ mod tests { // a read sees the latest page version. let mut updated = [Lsn(0); NUM_KEYS]; - let mut lsn = Lsn(0); + let mut lsn = Lsn(0x10); #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -4257,7 +4274,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -4296,8 +4313,9 @@ mod tests { let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? .load() .await; - let mut tline = - tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let mut tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -4306,7 +4324,7 @@ mod tests { // Track page mutation lsns across different timelines. let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; - let mut lsn = Lsn(0); + let mut lsn = Lsn(0x10); #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { @@ -4322,7 +4340,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -4352,6 +4370,74 @@ mod tests { } Ok(()) } + + #[tokio::test] + async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> { + let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")? + .load() + .await; + + let initdb_lsn = Lsn(0x20); + let utline = + tenant.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)?; + let tline = utline.raw_timeline().unwrap(); + + // Spawn flush loop now so that we can set the `expect_initdb_optimization` + tline.maybe_spawn_flush_loop(); + + // Make sure the timeline has the minimum set of required keys for operation. + // The only operation you can always do on an empty timeline is to `put` new data. + // Except if you `put` at `initdb_lsn`. + // In that case, there's an optimization to directly create image layers instead of delta layers. + // It uses `repartition()`, which assumes some keys to be present. + // Let's make sure the test timeline can handle that case. + { + let mut state = tline.flush_loop_state.lock().unwrap(); + assert_eq!( + timeline::FlushLoopState::Running { + expect_initdb_optimization: false, + initdb_optimization_count: 0, + }, + *state + ); + *state = timeline::FlushLoopState::Running { + expect_initdb_optimization: true, + initdb_optimization_count: 0, + }; + } + + // Make writes at the initdb_lsn. When we flush it below, it should be handled by the optimization. + // As explained above, the optimization requires some keys to be present. + // As per `create_empty_timeline` documentation, use init_empty to set them. + // This is what `create_test_timeline` does, by the way. + let mut modification = tline.begin_modification(initdb_lsn); + modification + .init_empty_test_timeline() + .context("init_empty_test_timeline")?; + modification + .commit() + .await + .context("commit init_empty_test_timeline modification")?; + + // Do the flush. The flush code will check the expectations that we set above. + tline.freeze_and_flush().await?; + + // assert freeze_and_flush exercised the initdb optimization + { + let state = tline.flush_loop_state.lock().unwrap(); + let + timeline::FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + } = *state else { + panic!("unexpected state: {:?}", *state); + }; + assert!(expect_initdb_optimization); + assert!(initdb_optimization_count > 0); + } + + Ok(()) + } } #[cfg(not(debug_assertions))] diff --git a/pageserver/src/tenant/manifest.rs b/pageserver/src/tenant/manifest.rs new file mode 100644 index 0000000000..745437dfbd --- /dev/null +++ b/pageserver/src/tenant/manifest.rs @@ -0,0 +1,325 @@ +//! This module contains the encoding and decoding of the local manifest file. +//! +//! MANIFEST is a write-ahead log which is stored locally to each timeline. It +//! records the state of the storage engine. It contains a snapshot of the +//! state and all operations proceeding that snapshot. The file begins with a +//! header recording MANIFEST version number. After that, it contains a snapshot. +//! The snapshot is followed by a list of operations. Each operation is a list +//! of records. Each record is either an addition or a removal of a layer. +//! +//! With MANIFEST, we can: +//! +//! 1. recover state quickly by reading the file, potentially boosting the +//! startup speed. +//! 2. ensure all operations are atomic and avoid corruption, solving issues +//! like redundant image layer and preparing us for future compaction +//! strategies. +//! +//! There is also a format for storing all layer files on S3, called +//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which +//! records all operations as logs, and therefore we can easily replay the +//! operations when recovering from crash, while ensuring those operations +//! are atomic upon restart. +//! +//! Currently, this is not used in the system. Future refactors will ensure +//! the storage state will be recorded in this file, and the system can be +//! recovered from this file. This is tracked in +//! https://github.com/neondatabase/neon/issues/4418 + +use std::io::{self, Read, Write}; + +use crate::virtual_file::VirtualFile; +use anyhow::Result; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use crc32c::crc32c; +use serde::{Deserialize, Serialize}; +use tracing::log::warn; +use utils::lsn::Lsn; + +use super::storage_layer::PersistentLayerDesc; + +pub struct Manifest { + file: VirtualFile, +} + +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] +pub struct Snapshot { + pub layers: Vec, +} + +/// serde by default encode this in tagged enum, and therefore it will be something +/// like `{ "AddLayer": { ... } }`. +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] +pub enum Record { + AddLayer(PersistentLayerDesc), + RemoveLayer(PersistentLayerDesc), +} + +/// `echo neon.manifest | sha1sum` and take the leading 8 bytes. +const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c; +const MANIFEST_VERSION: u64 = 1; + +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] +pub struct ManifestHeader { + magic_number: u64, + version: u64, +} + +const MANIFEST_HEADER_LEN: usize = 16; + +impl ManifestHeader { + fn encode(&self) -> BytesMut { + let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN); + buf.put_u64(self.magic_number); + buf.put_u64(self.version); + buf + } + + fn decode(mut buf: &[u8]) -> Self { + assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header"); + Self { + magic_number: buf.get_u64(), + version: buf.get_u64(), + } + } +} + +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] +pub enum Operation { + /// A snapshot of the current state. + /// + /// Lsn field represents the LSN that is persisted to disk for this snapshot. + Snapshot(Snapshot, Lsn), + /// An atomic operation that changes the state. + /// + /// Lsn field represents the LSN that is persisted to disk after the operation is done. + /// This will only change when new L0 is flushed to the disk. + Operation(Vec, Lsn), +} + +struct RecordHeader { + size: u32, + checksum: u32, +} + +const RECORD_HEADER_LEN: usize = 8; + +impl RecordHeader { + fn encode(&self) -> BytesMut { + let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN); + buf.put_u32(self.size); + buf.put_u32(self.checksum); + buf + } + + fn decode(mut buf: &[u8]) -> Self { + assert!(buf.len() == RECORD_HEADER_LEN, "invalid header"); + Self { + size: buf.get_u32(), + checksum: buf.get_u32(), + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum ManifestLoadError { + #[error("manifest header is corrupted")] + CorruptedManifestHeader, + #[error("unsupported manifest version: got {0}, expected {1}")] + UnsupportedVersion(u64, u64), + #[error("error when decoding record: {0}")] + DecodeRecord(serde_json::Error), + #[error("I/O error: {0}")] + Io(io::Error), +} + +#[must_use = "Should check if the manifest is partially corrupted"] +pub struct ManifestPartiallyCorrupted(bool); + +impl Manifest { + /// Create a new manifest by writing the manifest header and a snapshot record to the given file. + pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result { + let mut manifest = Self { file }; + manifest.append_manifest_header(ManifestHeader { + magic_number: MANIFEST_MAGIC_NUMBER, + version: MANIFEST_VERSION, + })?; + manifest.append_operation(Operation::Snapshot(snapshot, lsn))?; + Ok(manifest) + } + + /// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted, + /// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and + /// backup the current one. + pub fn load( + mut file: VirtualFile, + ) -> Result<(Self, Vec, ManifestPartiallyCorrupted), ManifestLoadError> { + let mut buf = vec![]; + file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?; + + // Read manifest header + let mut buf = Bytes::from(buf); + if buf.remaining() < MANIFEST_HEADER_LEN { + return Err(ManifestLoadError::CorruptedManifestHeader); + } + let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]); + buf.advance(MANIFEST_HEADER_LEN); + if header.version != MANIFEST_VERSION { + return Err(ManifestLoadError::UnsupportedVersion( + header.version, + MANIFEST_VERSION, + )); + } + + // Read operations + let mut operations = Vec::new(); + let corrupted = loop { + if buf.remaining() == 0 { + break false; + } + if buf.remaining() < RECORD_HEADER_LEN { + warn!("incomplete header when decoding manifest, could be corrupted"); + break true; + } + let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]); + let size = size as usize; + buf.advance(RECORD_HEADER_LEN); + if buf.remaining() < size { + warn!("incomplete data when decoding manifest, could be corrupted"); + break true; + } + let data = &buf[..size]; + if crc32c(data) != checksum { + warn!("checksum mismatch when decoding manifest, could be corrupted"); + break true; + } + // if the following decode fails, we cannot use the manifest or safely ignore any record. + operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?); + buf.advance(size); + }; + Ok(( + Self { file }, + operations, + ManifestPartiallyCorrupted(corrupted), + )) + } + + fn append_data(&mut self, data: &[u8]) -> Result<()> { + if data.len() >= u32::MAX as usize { + panic!("data too large"); + } + let header = RecordHeader { + size: data.len() as u32, + checksum: crc32c(data), + }; + let header = header.encode(); + self.file.write_all(&header)?; + self.file.write_all(data)?; + self.file.sync_all()?; + Ok(()) + } + + fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> { + let encoded = header.encode(); + self.file.write_all(&encoded)?; + Ok(()) + } + + /// Add an operation to the manifest. The operation will be appended to the end of the file, + /// and the file will fsync. + pub fn append_operation(&mut self, operation: Operation) -> Result<()> { + let encoded = Vec::from(serde_json::to_string(&operation)?); + self.append_data(&encoded) + } +} + +#[cfg(test)] +mod tests { + use std::fs::OpenOptions; + + use crate::repository::Key; + + use super::*; + + #[test] + fn test_read_manifest() { + let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest"); + std::fs::create_dir_all(&testdir).unwrap(); + let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap(); + let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233)); + let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333)); + let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333)); + let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333)); + + // Write a manifest with a snapshot and some operations + let snapshot = Snapshot { + layers: vec![layer1, layer2], + }; + let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap(); + manifest + .append_operation(Operation::Operation( + vec![Record::AddLayer(layer3.clone())], + Lsn::from(1), + )) + .unwrap(); + drop(manifest); + + // Open the second time and write + let file = VirtualFile::open_with_options( + &testdir.join("MANIFEST"), + OpenOptions::new() + .read(true) + .write(true) + .create_new(false) + .truncate(false), + ) + .unwrap(); + let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap(); + assert!(!corrupted.0); + assert_eq!(operations.len(), 2); + assert_eq!( + &operations[0], + &Operation::Snapshot(snapshot.clone(), Lsn::from(0)) + ); + assert_eq!( + &operations[1], + &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1)) + ); + manifest + .append_operation(Operation::Operation( + vec![ + Record::RemoveLayer(layer3.clone()), + Record::AddLayer(layer4.clone()), + ], + Lsn::from(2), + )) + .unwrap(); + drop(manifest); + + // Open the third time and verify + let file = VirtualFile::open_with_options( + &testdir.join("MANIFEST"), + OpenOptions::new() + .read(true) + .write(true) + .create_new(false) + .truncate(false), + ) + .unwrap(); + let (_manifest, operations, corrupted) = Manifest::load(file).unwrap(); + assert!(!corrupted.0); + assert_eq!(operations.len(), 3); + assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0))); + assert_eq!( + &operations[1], + &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1)) + ); + assert_eq!( + &operations[2], + &Operation::Operation( + vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)], + Lsn::from(2) + ) + ); + } +} diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 2936e7a4af..2c84c59dcb 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1392,7 +1392,12 @@ mod tests { let harness = TenantHarness::create(test_name)?; let (tenant, ctx) = runtime.block_on(harness.load()); // create an empty timeline directory - let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = runtime.block_on(tenant.create_test_timeline( + TIMELINE_ID, + Lsn(8), + DEFAULT_PG_VERSION, + &ctx, + ))?; let remote_fs_dir = harness.conf.workdir.join("remote_fs"); std::fs::create_dir_all(remote_fs_dir)?; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 624fe8dac4..6e14663121 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; +use once_cell::sync::OnceCell; use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -46,7 +47,6 @@ use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; -use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tracing::*; use utils::{ @@ -184,7 +184,7 @@ pub struct DeltaLayer { access_stats: LayerAccessStats, - inner: RwLock, + inner: OnceCell, } impl std::fmt::Debug for DeltaLayer { @@ -201,21 +201,17 @@ impl std::fmt::Debug for DeltaLayer { } pub struct DeltaLayerInner { - /// If false, the fields below have not been loaded into memory yet. - loaded: bool, - // values copied from summary index_start_blk: u32, index_root_blk: u32, - /// Reader object for reading blocks from the file. (None if not loaded yet) - file: Option>, + /// Reader object for reading blocks from the file. + file: FileBlockReader, } impl std::fmt::Debug for DeltaLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("DeltaLayerInner") - .field("loaded", &self.loaded) .field("index_start_blk", &self.index_start_blk) .field("index_root_blk", &self.index_root_blk) .finish() @@ -246,7 +242,7 @@ impl Layer for DeltaLayer { inner.index_start_blk, inner.index_root_blk ); - let file = inner.file.as_ref().unwrap(); + let file = &inner.file; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( inner.index_start_blk, inner.index_root_blk, @@ -315,7 +311,7 @@ impl Layer for DeltaLayer { let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?; // Scan the page versions backwards, starting from `lsn`. - let file = inner.file.as_ref().unwrap(); + let file = &inner.file; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( inner.index_start_blk, inner.index_root_blk, @@ -500,51 +496,22 @@ impl DeltaLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result> { + fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> { self.access_stats .record_access(access_kind, ctx.task_kind()); - loop { - // Quick exit if already loaded - let inner = self.inner.read().unwrap(); - if inner.loaded { - return Ok(inner); - } - - // Need to open the file and load the metadata. Upgrade our lock to - // a write lock. (Or rather, release and re-lock in write mode.) - drop(inner); - let inner = self.inner.write().unwrap(); - if !inner.loaded { - self.load_inner(inner).with_context(|| { - format!("Failed to load delta layer {}", self.path().display()) - })?; - } else { - // Another thread loaded it while we were not holding the lock. - } - - // We now have the file open and loaded. There's no function to do - // that in the std library RwLock, so we have to release and re-lock - // in read mode. (To be precise, the lock guard was moved in the - // above call to `load_inner`, so it's already been released). And - // while we do that, another thread could unload again, so we have - // to re-check and retry if that happens. - } + // Quick exit if already loaded + self.inner + .get_or_try_init(|| self.load_inner()) + .with_context(|| format!("Failed to load delta layer {}", self.path().display())) } - fn load_inner(&self, mut inner: RwLockWriteGuard) -> Result<()> { + fn load_inner(&self) -> Result { let path = self.path(); - // Open the file if it's not open already. - if inner.file.is_none() { - let file = VirtualFile::open(&path) - .with_context(|| format!("Failed to open file '{}'", path.display()))?; - inner.file = Some(FileBlockReader::new(file)); - } - let file = inner.file.as_mut().unwrap(); + let file = VirtualFile::open(&path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + let file = FileBlockReader::new(file); + let summary_blk = file.read_blk(0)?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; @@ -571,13 +538,13 @@ impl DeltaLayer { } } - inner.index_start_blk = actual_summary.index_start_blk; - inner.index_root_blk = actual_summary.index_root_blk; - debug!("loaded from {}", &path.display()); - inner.loaded = true; - Ok(()) + Ok(DeltaLayerInner { + file, + index_start_blk: actual_summary.index_start_blk, + index_root_blk: actual_summary.index_root_blk, + }) } /// Create a DeltaLayer struct representing an existing file on disk. @@ -599,12 +566,7 @@ impl DeltaLayer { file_size, ), access_stats, - inner: RwLock::new(DeltaLayerInner { - loaded: false, - file: None, - index_start_blk: 0, - index_root_blk: 0, - }), + inner: once_cell::sync::OnceCell::new(), } } @@ -631,12 +593,7 @@ impl DeltaLayer { metadata.len(), ), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), - inner: RwLock::new(DeltaLayerInner { - loaded: false, - file: None, - index_start_blk: 0, - index_root_blk: 0, - }), + inner: once_cell::sync::OnceCell::new(), }) } @@ -800,12 +757,7 @@ impl DeltaLayerWriterInner { metadata.len(), ), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), - inner: RwLock::new(DeltaLayerInner { - loaded: false, - file: None, - index_start_blk, - index_root_blk, - }), + inner: once_cell::sync::OnceCell::new(), }; // fsync the file @@ -940,13 +892,13 @@ struct DeltaValueIter<'a> { reader: BlockCursor>, } -struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>); +struct Adapter<'a>(&'a DeltaLayerInner); impl<'a> BlockReader for Adapter<'a> { type BlockLease = PageReadGuard<'static>; fn read_blk(&self, blknum: u32) -> Result { - self.0.file.as_ref().unwrap().read_blk(blknum) + self.0.file.read_blk(blknum) } } @@ -959,8 +911,8 @@ impl<'a> Iterator for DeltaValueIter<'a> { } impl<'a> DeltaValueIter<'a> { - fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { - let file = inner.file.as_ref().unwrap(); + fn new(inner: &'a DeltaLayerInner) -> Result { + let file = &inner.file; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( inner.index_start_blk, inner.index_root_blk, @@ -1033,8 +985,8 @@ impl Iterator for DeltaKeyIter { } impl<'a> DeltaKeyIter { - fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { - let file = inner.file.as_ref().unwrap(); + fn new(inner: &'a DeltaLayerInner) -> Result { + let file = &inner.file; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( inner.index_start_blk, inner.index_root_blk, @@ -1074,3 +1026,21 @@ impl<'a> DeltaKeyIter { Ok(iter) } } + +#[cfg(test)] +mod test { + use super::DeltaKeyIter; + use super::DeltaLayer; + use super::DeltaValueIter; + + // We will soon need the iters to be send in the compaction code. + // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883 + // Cf https://github.com/neondatabase/neon/issues/4471 + #[test] + fn is_send() { + fn assert_send() {} + assert_send::(); + assert_send::(); + assert_send::(); + } +} diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index d1cef70253..5ed548909e 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -9,10 +9,12 @@ use crate::{context::RequestContext, repository::Key}; use super::{DeltaFileName, ImageFileName, LayerFileName}; +use serde::{Deserialize, Serialize}; + /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct PersistentLayerDesc { pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -50,6 +52,19 @@ impl PersistentLayerDesc { self.filename().file_name() } + #[cfg(test)] + pub fn new_test(key_range: Range) -> Self { + Self { + tenant_id: TenantId::generate(), + timeline_id: TimelineId::generate(), + key_range, + lsn_range: Lsn(0)..Lsn(1), + is_delta: false, + is_incremental: false, + file_size: 0, + } + } + pub fn new_img( tenant_id: TenantId, timeline_id: TimelineId, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 71f83bf127..b8a7cdacb7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -28,7 +28,7 @@ use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; use std::pin::pin; use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; +use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use crate::context::{DownloadBehavior, RequestContext}; @@ -84,9 +84,14 @@ use super::remote_timeline_client::RemoteTimelineClient; use super::storage_layer::{DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] -enum FlushLoopState { +pub(super) enum FlushLoopState { NotStarted, - Running, + Running { + #[cfg(test)] + expect_initdb_optimization: bool, + #[cfg(test)] + initdb_optimization_count: usize, + }, Exited, } @@ -180,10 +185,10 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. - write_lock: Mutex<()>, + write_lock: tokio::sync::Mutex<()>, /// Used to avoid multiple `flush_loop` tasks running - flush_loop_state: Mutex, + pub(super) flush_loop_state: Mutex, /// layer_flush_start_tx can be used to wake up the layer-flushing task. /// The value is a counter, incremented every time a new flush cycle is requested. @@ -684,7 +689,7 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { - self.freeze_inmem_layer(false); + self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait().await } @@ -863,10 +868,10 @@ impl Timeline { } /// Mutate the timeline with a [`TimelineWriter`]. - pub fn writer(&self) -> TimelineWriter<'_> { + pub async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().unwrap(), + _write_guard: self.write_lock.lock().await, } } @@ -900,37 +905,39 @@ impl Timeline { /// /// Also flush after a period of time without new data -- it helps /// safekeepers to regard pageserver as caught up and suspend activity. - pub fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { + pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { let last_lsn = self.get_last_record_lsn(); - let layers = self.layers.read().unwrap(); - if let Some(open_layer) = &layers.open_layer { - let open_layer_size = open_layer.size()?; - drop(layers); - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. - if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); + let open_layer_size = { + let layers = self.layers.read().unwrap(); + let Some(open_layer) = layers.open_layer.as_ref() else { + return Ok(()); + }; + open_layer.size()? + }; + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); + let distance = last_lsn.widening_sub(last_freeze_at); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. + if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) + { + info!( + "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", + distance, + open_layer_size, + last_freeze_ts.elapsed() + ); - self.freeze_inmem_layer(true); - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); + self.freeze_inmem_layer(true).await; + self.last_freeze_at.store(last_lsn); + *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - // Wake up the layer flusher - self.flush_frozen_layers(); - } + // Wake up the layer flusher + self.flush_frozen_layers(); } Ok(()) } @@ -1445,7 +1452,7 @@ impl Timeline { layer_flush_start_tx, layer_flush_done_tx, - write_lock: Mutex::new(()), + write_lock: tokio::sync::Mutex::new(()), layer_removal_cs: Default::default(), gc_info: std::sync::RwLock::new(GcInfo { @@ -1497,7 +1504,7 @@ impl Timeline { let mut flush_loop_state = self.flush_loop_state.lock().unwrap(); match *flush_loop_state { FlushLoopState::NotStarted => (), - FlushLoopState::Running => { + FlushLoopState::Running { .. } => { info!( "skipping attempt to start flush_loop twice {}/{}", self.tenant_id, self.timeline_id @@ -1517,6 +1524,12 @@ impl Timeline { let self_clone = Arc::clone(self); info!("spawning flush loop"); + *flush_loop_state = FlushLoopState::Running { + #[cfg(test)] + expect_initdb_optimization: false, + #[cfg(test)] + initdb_optimization_count: 0, + }; task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, @@ -1528,14 +1541,12 @@ impl Timeline { let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert_eq!(*flush_loop_state, FlushLoopState::Running); + assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..})); *flush_loop_state = FlushLoopState::Exited; Ok(()) } .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) ); - - *flush_loop_state = FlushLoopState::Running; } /// Creates and starts the wal receiver. @@ -1583,9 +1594,16 @@ impl Timeline { )); } + /// + /// Initialize with an empty layer map. Used when creating a new timeline. + /// + pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) { + let mut layers = self.layers.write().unwrap(); + layers.next_open_layer_at = Some(Lsn(start_lsn.0)); + } + /// /// Scan the timeline directory to populate the layer map. - /// Returns all timeline-related files that were found and loaded. /// pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { let mut layers = self.layers.write().unwrap(); @@ -2385,10 +2403,17 @@ impl Timeline { } ValueReconstructResult::Missing => { return Err(layer_traversal_error( - format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn - ), + if cfg!(test) { + format!( + "could not find data for key {} at LSN {}, for request at LSN {}\n{}", + key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), + ) + } else { + format!( + "could not find data for key {} at LSN {}, for request at LSN {}", + key, cont_lsn, request_lsn + ) + }, traversal_path, )); } @@ -2644,16 +2669,21 @@ impl Timeline { let last_record_lsn = self.get_last_record_lsn(); ensure!( lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}", lsn, last_record_lsn, + std::backtrace::Backtrace::force_capture(), ); // Do we have a layer open for writing already? let layer; if let Some(open_layer) = &layers.open_layer { if open_layer.get_lsn_range().start > lsn { - bail!("unexpected open layer in the future"); + bail!( + "unexpected open layer in the future: open layers starts at {}, write lsn {}", + open_layer.get_lsn_range().start, + lsn + ); } layer = Arc::clone(open_layer); @@ -2702,13 +2732,13 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - fn freeze_inmem_layer(&self, write_lock_held: bool) { + async fn freeze_inmem_layer(&self, write_lock_held: bool) { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. let _write_guard = if write_lock_held { None } else { - Some(self.write_lock.lock().unwrap()) + Some(self.write_lock.lock().await) }; let mut layers = self.layers.write().unwrap(); if let Some(open_layer) = &layers.open_layer { @@ -2781,7 +2811,7 @@ impl Timeline { let mut my_flush_request = 0; let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; - if flush_loop_state != FlushLoopState::Running { + if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") } @@ -2831,6 +2861,18 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + #[cfg(test)] + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + initdb_optimization_count, + .. + } => { + *initdb_optimization_count += 1; + } + } // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not // require downloading anything during initial import. let (partitioning, _lsn) = self @@ -2839,6 +2881,18 @@ impl Timeline { self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) .await? } else { + #[cfg(test)] + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + expect_initdb_optimization, + .. + } => { + assert!(!*expect_initdb_optimization, "expected initdb optimization"); + } + } // normal case, write out a L0 delta layer file. let this = self.clone(); let frozen_layer = frozen_layer.clone(); @@ -4541,7 +4595,7 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon // but will cause large code changes. pub struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: MutexGuard<'a, ()>, + _write_guard: tokio::sync::MutexGuard<'a, ()>, } impl Deref for TimelineWriter<'_> { @@ -4582,6 +4636,14 @@ impl<'a> TimelineWriter<'a> { } } +// We need TimelineWriter to be send in upcoming conversion of +// Timeline::layers to tokio::sync::RwLock. +#[test] +fn is_send() { + fn _assert_send() {} + _assert_send::>(); +} + /// Add a suffix to a layer file's name: .{num}.old /// Uses the first available num (starts at 0) fn rename_to_backup(path: &Path) -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index a5d0af32fe..83dfc5f598 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1324,7 +1324,8 @@ mod tests { async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState { let (tenant, ctx) = harness.load().await; let timeline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) + .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx) + .await .expect("Failed to create an empty timeline for dummy wal connection manager"); ConnectionManagerState { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index a16afe2b3c..1c1fe87305 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -304,12 +304,15 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline.check_checkpoint_distance().with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; + timeline + .check_checkpoint_distance() + .await + .with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 4b8e6aa515..68cf2a4645 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> { // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + modification.commit().await?; Ok(()) } @@ -1171,7 +1171,6 @@ impl<'a> WalIngest<'a> { #[cfg(test)] mod tests { use super::*; - use crate::pgdatadir_mapping::create_test_timeline; use crate::tenant::harness::*; use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; @@ -1200,7 +1199,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file - m.commit()?; + m.commit().await?; let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; Ok(walingest) @@ -1209,7 +1208,9 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await?; let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1217,22 +1218,22 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; - m.commit()?; + m.commit().await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) .await?; - m.commit()?; + m.commit().await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) .await?; - m.commit()?; + m.commit().await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) .await?; - m.commit()?; + m.commit().await?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1318,7 +1319,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation @@ -1360,7 +1361,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) @@ -1373,7 +1374,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) @@ -1398,7 +1399,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) @@ -1428,14 +1429,16 @@ mod tests { #[tokio::test] async fn test_drop_extend() -> Result<()> { let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await?; let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; - m.commit()?; + m.commit().await?; // Check that rel exists and size is correct assert_eq!( @@ -1454,7 +1457,7 @@ mod tests { // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; - m.commit()?; + m.commit().await?; // Check that rel is not visible anymore assert_eq!( @@ -1472,7 +1475,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) .await?; - m.commit()?; + m.commit().await?; // Check that rel exists and size is correct assert_eq!( @@ -1497,7 +1500,9 @@ mod tests { #[tokio::test] async fn test_truncate_extend() -> Result<()> { let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await?; let mut walingest = init_walingest_test(&tline, &ctx).await?; // Create a 20 MB relation (the size is arbitrary) @@ -1509,7 +1514,7 @@ mod tests { .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } - m.commit()?; + m.commit().await?; // The relation was created at LSN 20, not visible at LSN 1 yet. assert_eq!( @@ -1554,7 +1559,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx) .await?; - m.commit()?; + m.commit().await?; // Check reported size and contents after truncation assert_eq!( @@ -1603,7 +1608,7 @@ mod tests { .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } - m.commit()?; + m.commit().await?; assert_eq!( tline @@ -1637,7 +1642,9 @@ mod tests { #[tokio::test] async fn test_large_rel() -> Result<()> { let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await?; let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut lsn = 0x10; @@ -1648,7 +1655,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; - m.commit()?; + m.commit().await?; } assert_current_logical_size(&tline, Lsn(lsn)); @@ -1664,7 +1671,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE @@ -1677,7 +1684,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE - 1 @@ -1693,7 +1700,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, size as BlockNumber diff --git a/poetry.lock b/poetry.lock index f544eb8d5c..8dc45f68b8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "aiohttp" @@ -855,35 +855,31 @@ files = [ [[package]] name = "cryptography" -version = "39.0.1" +version = "41.0.0" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"}, - {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"}, - {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"}, - {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"}, - {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"}, - {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"}, - {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"}, - {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"}, - {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"}, - {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"}, - {file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"}, + {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"}, + {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"}, + {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"}, + {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"}, + {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"}, + {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"}, + {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"}, + {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"}, + {file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"}, + {file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"}, + {file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"}, + {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"}, + {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"}, + {file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"}, + {file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"}, + {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"}, + {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"}, + {file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"}, + {file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"}, ] [package.dependencies] @@ -892,12 +888,12 @@ cffi = ">=1.12" [package.extras] docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] -pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"] -sdist = ["setuptools-rust (>=0.11.4)"] +nox = ["nox"] +pep8test = ["black", "check-sdist", "mypy", "ruff"] +sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"] +test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] -tox = ["tox"] [[package]] name = "docker" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index fecbb8bd41..0625538bf3 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -3,15 +3,19 @@ // use anyhow::{bail, Context, Result}; use clap::Parser; +use futures::future::BoxFuture; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; use remote_storage::RemoteStorageConfig; +use tokio::runtime::Handle; +use tokio::signal::unix::{signal, SignalKind}; +use tokio::task::JoinError; use toml_edit::Document; -use utils::signals::ShutdownSignals; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::thread; use std::time::Duration; use storage_broker::Uri; use tokio::sync::mpsc; @@ -20,22 +24,21 @@ use tracing::*; use utils::pid_file; use metrics::set_build_info_metric; -use safekeeper::broker; -use safekeeper::control_file; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::http; -use safekeeper::remove_wal; -use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; +use safekeeper::{broker, WAL_SERVICE_RUNTIME}; +use safekeeper::{control_file, BROKER_RUNTIME}; +use safekeeper::{http, WAL_REMOVER_RUNTIME}; +use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME}; +use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::JwtAuth; use utils::{ - http::endpoint, id::NodeId, logging::{self, LogFormat}, project_git_version, @@ -104,10 +107,6 @@ struct Args { /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)] max_offloader_lag: u64, - /// Number of threads for wal backup runtime, by default number of cores - /// available to the system. - #[arg(long)] - wal_backup_threads: Option, /// Number of max parallel WAL segments to be offloaded to remote storage. #[arg(long, default_value = "5")] wal_backup_parallel_jobs: usize, @@ -121,9 +120,14 @@ struct Args { /// Format for logging, either 'plain' or 'json'. #[arg(long, default_value = "plain")] log_format: String, + /// Run everything in single threaded current thread runtime, might be + /// useful for debugging. + #[arg(long)] + current_thread_runtime: bool, } -fn main() -> anyhow::Result<()> { +#[tokio::main(flavor = "current_thread")] +async fn main() -> anyhow::Result<()> { let args = Args::parse(); if let Some(addr) = args.dump_control_file { @@ -183,10 +187,10 @@ fn main() -> anyhow::Result<()> { heartbeat_timeout: args.heartbeat_timeout, remote_storage: args.remote_storage, max_offloader_lag_bytes: args.max_offloader_lag, - backup_runtime_threads: args.wal_backup_threads, wal_backup_enabled: !args.disable_wal_backup, backup_parallel_jobs: args.wal_backup_parallel_jobs, auth, + current_thread_runtime: args.current_thread_runtime, }; // initialize sentry if SENTRY_DSN is provided @@ -194,10 +198,14 @@ fn main() -> anyhow::Result<()> { Some(GIT_VERSION.into()), &[("node_id", &conf.my_id.to_string())], ); - start_safekeeper(conf) + start_safekeeper(conf).await } -fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { +/// Result of joining any of main tasks: upper error means task failed to +/// complete, e.g. panicked, inner is error produced by task itself. +type JoinTaskRes = Result, JoinError>; + +async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { // Prevent running multiple safekeepers on the same directory let lock_file_path = conf.workdir.join(PID_FILE_NAME); let lock_file = @@ -208,14 +216,18 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { // we need to release the lock file only when the current process is gone std::mem::forget(lock_file); - let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_http_addr, e); + info!("starting safekeeper WAL service on {}", conf.listen_pg_addr); + let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { + error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); e })?; - info!("starting safekeeper on {}", conf.listen_pg_addr); - let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); + info!( + "starting safekeeper HTTP service on {}", + conf.listen_http_addr + ); + let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { + error!("failed to bind to address {}: {}", conf.listen_http_addr, e); e })?; @@ -224,71 +236,88 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); // Load all timelines from disk to memory. GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?; - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("http_endpoint_thread".into()) - .spawn(|| { - let router = http::make_router(conf_); - endpoint::serve_thread_main( - router, - http_listener, - std::future::pending(), // never shut down - ) - .unwrap(); - })?, - ); - - let conf_cloned = conf.clone(); - let safekeeper_thread = thread::Builder::new() - .name("WAL service thread".into()) - .spawn(|| wal_service::thread_main(conf_cloned, pg_listener)) - .unwrap(); - - threads.push(safekeeper_thread); + // Keep handles to main tasks to die if any of them disappears. + let mut tasks_handles: FuturesUnordered> = + FuturesUnordered::new(); let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("broker thread".into()) - .spawn(|| { - broker::thread_main(conf_); - })?, - ); + // Run everything in current thread rt, if asked. + if conf.current_thread_runtime { + info!("running in current thread runtime"); + } + let current_thread_rt = conf + .current_thread_runtime + .then(|| Handle::try_current().expect("no runtime in main")); + let wal_service_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) + .spawn(wal_service::task_main(conf_, pg_listener)) + // wrap with task name for error reporting + .map(|res| ("WAL service main".to_owned(), res)); + tasks_handles.push(Box::pin(wal_service_handle)); let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("WAL removal thread".into()) - .spawn(|| { - remove_wal::thread_main(conf_); - })?, - ); + let http_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| HTTP_RUNTIME.handle()) + .spawn(http::task_main(conf_, http_listener)) + .map(|res| ("HTTP service main".to_owned(), res)); + tasks_handles.push(Box::pin(http_handle)); - threads.push( - thread::Builder::new() - .name("WAL backup launcher thread".into()) - .spawn(move || { - wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx); - })?, - ); + let conf_ = conf.clone(); + let broker_task_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| BROKER_RUNTIME.handle()) + .spawn(broker::task_main(conf_).instrument(info_span!("broker"))) + .map(|res| ("broker main".to_owned(), res)); + tasks_handles.push(Box::pin(broker_task_handle)); + + let conf_ = conf.clone(); + let wal_remover_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle()) + .spawn(remove_wal::task_main(conf_)) + .map(|res| ("WAL remover".to_owned(), res)); + tasks_handles.push(Box::pin(wal_remover_handle)); + + let conf_ = conf.clone(); + let wal_backup_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle()) + .spawn(wal_backup::wal_backup_launcher_task_main( + conf_, + wal_backup_launcher_rx, + )) + .map(|res| ("WAL backup launcher".to_owned(), res)); + tasks_handles.push(Box::pin(wal_backup_handle)); set_build_info_metric(GIT_VERSION); - // TODO: put more thoughts into handling of failed threads - // We should catch & die if they are in trouble. - // On any shutdown signal, log receival and exit. Additionally, handling - // SIGQUIT prevents coredump. - ShutdownSignals::handle(|signal| { - info!("received {}, terminating", signal.name()); - std::process::exit(0); - }) + // TODO: update tokio-stream, convert to real async Stream with + // SignalStream, map it to obtain missing signal name, combine streams into + // single stream we can easily sit on. + let mut sigquit_stream = signal(SignalKind::quit())?; + let mut sigint_stream = signal(SignalKind::interrupt())?; + let mut sigterm_stream = signal(SignalKind::terminate())?; + + tokio::select! { + Some((task_name, res)) = tasks_handles.next()=> { + error!("{} task failed: {:?}, exiting", task_name, res); + std::process::exit(1); + } + // On any shutdown signal, log receival and exit. Additionally, handling + // SIGQUIT prevents coredump. + _ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"), + _ = sigint_stream.recv() => info!("received SIGINT, terminating"), + _ = sigterm_stream.recv() => info!("received SIGTERM, terminating") + + }; + std::process::exit(0); } /// Determine safekeeper id. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 48c56ee58f..2b1db2714b 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -8,7 +8,7 @@ use anyhow::Error; use anyhow::Result; use storage_broker::parse_proto_ttid; -use storage_broker::proto::broker_service_client::BrokerServiceClient; + use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::Request; @@ -16,7 +16,7 @@ use storage_broker::Request; use std::time::Duration; use std::time::Instant; use tokio::task::JoinHandle; -use tokio::{runtime, time::sleep}; +use tokio::time::sleep; use tracing::*; use crate::metrics::BROKER_ITERATION_TIMELINES; @@ -29,23 +29,10 @@ use crate::SafeKeeperConf; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; -pub fn thread_main(conf: SafeKeeperConf) { - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - let _enter = info_span!("broker").entered(); - info!("started, broker endpoint {:?}", conf.broker_endpoint); - - runtime.block_on(async { - main_loop(conf).await; - }); -} - /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { - let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?; + let mut client = + storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); let outbound = async_stream::stream! { @@ -55,20 +42,27 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. let now = Instant::now(); - let mut active_tlis = GlobalTimelines::get_all(); - active_tlis.retain(|tli| tli.is_active()); - for tli in &active_tlis { - let sk_info = tli.get_safekeeper_info(&conf); + let all_tlis = GlobalTimelines::get_all(); + let mut n_pushed_tlis = 0; + for tli in &all_tlis { + // filtering alternative futures::stream::iter(all_tlis) + // .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::>().await; + // doesn't look better, and I'm not sure how to do that without collect. + if !tli.is_active().await { + continue; + } + let sk_info = tli.get_safekeeper_info(&conf).await; yield sk_info; BROKER_PUSHED_UPDATES.inc(); + n_pushed_tlis += 1; } let elapsed = now.elapsed(); BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64()); - BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64); + BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64); if elapsed > push_interval / 2 { - info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); + info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed); } sleep(push_interval).await; @@ -125,10 +119,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { bail!("end of stream"); } -async fn main_loop(conf: SafeKeeperConf) { +pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { + info!("started, broker endpoint {:?}", conf.broker_endpoint); + let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; + // Selecting on JoinHandles requires some squats; is there a better way to // reap tasks individually? diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index b1b0c032d7..6c4ad24323 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,9 +2,10 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use tokio::fs::{self, File}; +use tokio::io::AsyncWriteExt; -use std::fs::{self, File, OpenOptions}; -use std::io::{Read, Write}; +use std::io::Read; use std::ops::Deref; use std::path::{Path, PathBuf}; use std::time::Instant; @@ -26,9 +27,10 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. +#[async_trait::async_trait] pub trait Storage: Deref { /// Persist safekeeper state on disk and update internal state. - fn persist(&mut self, s: &SafeKeeperState) -> Result<()>; + async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>; /// Timestamp of last persist. fn last_persist_at(&self) -> Instant; @@ -82,7 +84,7 @@ impl FileStorage { /// Check the magic/version in the on-disk data and deserialize it, if possible. fn deser_sk_state(buf: &mut &[u8]) -> Result { // Read the version independent part - let magic = buf.read_u32::()?; + let magic = ReadBytesExt::read_u32::(buf)?; if magic != SK_MAGIC { bail!( "bad control file magic: {:X}, expected {:X}", @@ -90,7 +92,7 @@ impl FileStorage { SK_MAGIC ); } - let version = buf.read_u32::()?; + let version = ReadBytesExt::read_u32::(buf)?; if version == SK_FORMAT_VERSION { let res = SafeKeeperState::des(buf)?; return Ok(res); @@ -110,7 +112,7 @@ impl FileStorage { /// Read in the control file. pub fn load_control_file>(control_file_path: P) -> Result { - let mut control_file = OpenOptions::new() + let mut control_file = std::fs::OpenOptions::new() .read(true) .write(true) .open(&control_file_path) @@ -159,30 +161,31 @@ impl Deref for FileStorage { } } +#[async_trait::async_trait] impl Storage for FileStorage { /// persists state durably to underlying storage /// for description see https://lwn.net/Articles/457667/ - fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { + async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); // write data to safekeeper.control.partial let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL); - let mut control_partial = File::create(&control_partial_path).with_context(|| { + let mut control_partial = File::create(&control_partial_path).await.with_context(|| { format!( "failed to create partial control file at: {}", &control_partial_path.display() ) })?; let mut buf: Vec = Vec::new(); - buf.write_u32::(SK_MAGIC)?; - buf.write_u32::(SK_FORMAT_VERSION)?; + WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; + WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; s.ser_into(&mut buf)?; // calculate checksum before resize let checksum = crc32c::crc32c(&buf); buf.extend_from_slice(&checksum.to_le_bytes()); - control_partial.write_all(&buf).with_context(|| { + control_partial.write_all(&buf).await.with_context(|| { format!( "failed to write safekeeper state into control file at: {}", control_partial_path.display() @@ -191,7 +194,7 @@ impl Storage for FileStorage { // fsync the file if !self.conf.no_sync { - control_partial.sync_all().with_context(|| { + control_partial.sync_all().await.with_context(|| { format!( "failed to sync partial control file at {}", control_partial_path.display() @@ -202,21 +205,22 @@ impl Storage for FileStorage { let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); // rename should be atomic - fs::rename(&control_partial_path, &control_path)?; + fs::rename(&control_partial_path, &control_path).await?; // this sync is not required by any standard but postgres does this (see durable_rename) if !self.conf.no_sync { - File::open(&control_path) - .and_then(|f| f.sync_all()) - .with_context(|| { - format!( - "failed to sync control file at: {}", - &control_path.display() - ) - })?; + let new_f = File::open(&control_path).await?; + new_f.sync_all().await.with_context(|| { + format!( + "failed to sync control file at: {}", + &control_path.display() + ) + })?; // fsync the directory (linux specific) - File::open(&self.timeline_dir) - .and_then(|f| f.sync_all()) + let tli_dir = File::open(&self.timeline_dir).await?; + tli_dir + .sync_all() + .await .context("failed to sync control file directory")?; } @@ -236,7 +240,6 @@ mod test { use super::*; use crate::{safekeeper::SafeKeeperState, SafeKeeperConf}; use anyhow::Result; - use std::fs; use utils::{id::TenantTimelineId, lsn::Lsn}; fn stub_conf() -> SafeKeeperConf { @@ -247,59 +250,75 @@ mod test { } } - fn load_from_control_file( + async fn load_from_control_file( conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)) + .await + .expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, FileStorage::load_control_file_conf(conf, ttid)?, )) } - fn create( + async fn create( conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)) + .await + .expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); let storage = FileStorage::create_new(ttid, conf, state.clone())?; Ok((storage, state)) } - #[test] - fn test_read_write_safekeeper_state() { + #[tokio::test] + async fn test_read_write_safekeeper_state() { let conf = stub_conf(); let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state"); + let (mut storage, mut state) = + create(&conf, &ttid).await.expect("failed to create state"); // change something state.commit_lsn = Lsn(42); - storage.persist(&state).expect("failed to persist state"); + storage + .persist(&state) + .await + .expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state"); + let (_, state) = load_from_control_file(&conf, &ttid) + .await + .expect("failed to read state"); assert_eq!(state.commit_lsn, Lsn(42)); } - #[test] - fn test_safekeeper_state_checksum_mismatch() { + #[tokio::test] + async fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state"); + let (mut storage, mut state) = + create(&conf, &ttid).await.expect("failed to read state"); // change something state.commit_lsn = Lsn(42); - storage.persist(&state).expect("failed to persist state"); + storage + .persist(&state) + .await + .expect("failed to persist state"); } let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); - let mut data = fs::read(&control_path).unwrap(); + let mut data = fs::read(&control_path).await.unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation - fs::write(&control_path, &data).expect("failed to write control file"); + fs::write(&control_path, &data) + .await + .expect("failed to write control file"); - match load_from_control_file(&conf, &ttid) { + match load_from_control_file(&conf, &ttid).await { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index f711c4429d..387b577a13 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -121,7 +121,7 @@ pub struct FileInfo { } /// Build debug dump response, using the provided [`Args`] filters. -pub fn build(args: Args) -> Result { +pub async fn build(args: Args) -> Result { let start_time = Utc::now(); let timelines_count = GlobalTimelines::timelines_count(); @@ -155,7 +155,7 @@ pub fn build(args: Args) -> Result { } let control_file = if args.dump_control_file { - let mut state = tli.get_state().1; + let mut state = tli.get_state().await.1; if !args.dump_term_history { state.acceptor_state.term_history = TermHistory(vec![]); } @@ -165,7 +165,7 @@ pub fn build(args: Args) -> Result { }; let memory = if args.dump_memory { - Some(tli.memory_dump()) + Some(tli.memory_dump().await) } else { None }; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 7d25ced449..1367d5eebb 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -256,14 +256,14 @@ impl SafekeeperPostgresHandler { let lsn = if self.is_walproposer_recovery() { // walproposer should get all local WAL until flush_lsn - tli.get_flush_lsn() + tli.get_flush_lsn().await } else { // other clients shouldn't get any uncommitted WAL - tli.get_state().0.commit_lsn + tli.get_state().await.0.commit_lsn } .to_string(); - let sysid = tli.get_state().1.server.system_id.to_string(); + let sysid = tli.get_state().await.1.server.system_id.to_string(); let lsn_bytes = lsn.as_bytes(); let tli = PG_TLI.to_string(); let tli_bytes = tli.as_bytes(); diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 1831470007..2a9570595f 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -2,3 +2,18 @@ pub mod routes; pub use routes::make_router; pub use safekeeper_api::models; + +use crate::SafeKeeperConf; + +pub async fn task_main( + conf: SafeKeeperConf, + http_listener: std::net::TcpListener, +) -> anyhow::Result<()> { + let router = make_router(conf) + .build() + .map_err(|err| anyhow::anyhow!(err))?; + let service = utils::http::RouterService::new(router).unwrap(); + let server = hyper::Server::from_tcp(http_listener)?; + server.serve(service).await?; + Ok(()) // unreachable +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a498d868af..5cd0973ad6 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -13,7 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::fs::File; use tokio::io::AsyncReadExt; -use tokio::task::JoinError; +use utils::http::endpoint::request_span; use crate::safekeeper::ServerInfo; use crate::safekeeper::Term; @@ -116,8 +116,8 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result timeline_id, }; - let resp = tokio::task::spawn_blocking(move || { - debug_dump::build(args).map_err(ApiError::InternalServerError) - }) - .await - .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + let resp = debug_dump::build(args) + .await + .map_err(ApiError::InternalServerError)?; // TODO: use streaming response json_response(StatusCode::OK, resp) @@ -386,29 +379,32 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder router .data(Arc::new(conf)) .data(auth) - .get("/v1/status", status_handler) + .get("/v1/status", |r| request_span(r, status_handler)) // Will be used in the future instead of implicit timeline creation - .post("/v1/tenant/timeline", timeline_create_handler) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_status_handler, - ) - .delete( - "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_delete_force_handler, - ) - .delete("/v1/tenant/:tenant_id", tenant_delete_force_handler) - .post("/v1/pull_timeline", timeline_pull_handler) + .post("/v1/tenant/timeline", |r| { + request_span(r, timeline_create_handler) + }) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + request_span(r, timeline_status_handler) + }) + .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + request_span(r, timeline_delete_force_handler) + }) + .delete("/v1/tenant/:tenant_id", |r| { + request_span(r, tenant_delete_force_handler) + }) + .post("/v1/pull_timeline", |r| { + request_span(r, timeline_pull_handler) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", - timeline_files_handler, + |r| request_span(r, timeline_files_handler), ) // for tests - .post( - "/v1/record_safekeeper_info/:tenant_id/:timeline_id", - record_safekeeper_info, - ) - .get("/v1/debug_dump", dump_debug_handler) + .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { + request_span(r, record_safekeeper_info) + }) + .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) } #[cfg(test)] diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index dc9188723e..14d0cc3653 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -73,12 +73,12 @@ pub async fn handle_json_ctrl( // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { - send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?; + send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?; } - let inserted_wal = append_logical_message(&tli, append_request)?; + let inserted_wal = append_logical_message(&tli, append_request).await?; let response = AppendResult { - state: tli.get_state().1, + state: tli.get_state().await.1, inserted_wal, }; let response_data = serde_json::to_vec(&response) @@ -114,9 +114,9 @@ async fn prepare_safekeeper( .await } -fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { +async fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { // add new term to existing history - let history = tli.get_state().1.acceptor_state.term_history; + let history = tli.get_state().await.1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); let mut history_entries = history.0; history_entries.push(TermSwitchEntry { term, lsn }); @@ -129,7 +129,7 @@ fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::R timeline_start_lsn: lsn, }); - tli.process_msg(&proposer_elected_request)?; + tli.process_msg(&proposer_elected_request).await?; Ok(()) } @@ -142,12 +142,12 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. -pub fn append_logical_message( +pub async fn append_logical_message( tli: &Arc, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = tli.get_state().1; + let sk_state = tli.get_state().await.1; let begin_lsn = msg.begin_lsn; let end_lsn = begin_lsn + wal_data.len() as u64; @@ -171,7 +171,7 @@ pub fn append_logical_message( wal_data: Bytes::from(wal_data), }); - let response = tli.process_msg(&append_request)?; + let response = tli.process_msg(&append_request).await?; let append_response = match response { Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 22d6d57e19..b8e1101369 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,4 +1,6 @@ +use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; +use tokio::runtime::Runtime; use std::path::PathBuf; use std::time::Duration; @@ -36,7 +38,6 @@ pub mod defaults { DEFAULT_PG_LISTEN_PORT, }; - pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); } @@ -60,10 +61,10 @@ pub struct SafeKeeperConf { pub heartbeat_timeout: Duration, pub remote_storage: Option, pub max_offloader_lag_bytes: u64, - pub backup_runtime_threads: Option, pub backup_parallel_jobs: usize, pub wal_backup_enabled: bool, pub auth: Option>, + pub current_thread_runtime: bool, } impl SafeKeeperConf { @@ -92,12 +93,64 @@ impl SafeKeeperConf { .parse() .expect("failed to parse default broker endpoint"), broker_keepalive_interval: Duration::from_secs(5), - backup_runtime_threads: None, wal_backup_enabled: true, backup_parallel_jobs: 1, auth: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, + current_thread_runtime: false, } } } + +// Tokio runtimes. +pub static WAL_SERVICE_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("WAL service worker") + .enable_all() + .build() + .expect("Failed to create WAL service runtime") +}); + +pub static HTTP_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("HTTP worker") + .enable_all() + .build() + .expect("Failed to create WAL service runtime") +}); + +pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("broker worker") + .worker_threads(2) // there are only 2 tasks, having more threads doesn't make sense + .enable_all() + .build() + .expect("Failed to create broker runtime") +}); + +pub static WAL_REMOVER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("WAL remover") + .worker_threads(1) + .enable_all() + .build() + .expect("Failed to create broker runtime") +}); + +pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("WAL backup worker") + .enable_all() + .build() + .expect("Failed to create WAL backup runtime") +}); + +pub static METRICS_SHIFTER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("metric shifter") + .worker_threads(1) + .enable_all() + .build() + .expect("Failed to create broker runtime") +}); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 235a88501d..0711beb290 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -7,6 +7,7 @@ use std::{ use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; use anyhow::Result; +use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, @@ -292,14 +293,17 @@ impl WalStorageMetrics { } } -/// Accepts a closure that returns a result, and returns the duration of the closure. -pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { +/// Accepts async function that returns empty anyhow result, and returns the duration of its execution. +pub async fn time_io_closure>( + closure: impl Future>, +) -> Result { let start = std::time::Instant::now(); - closure()?; + closure.await.map_err(|e| e.into())?; Ok(start.elapsed().as_secs_f64()) } /// Metrics for a single timeline. +#[derive(Clone)] pub struct FullTimelineInfo { pub ttid: TenantTimelineId, pub ps_feedback: PageserverFeedback, @@ -575,13 +579,19 @@ impl Collector for TimelineCollector { let timelines = GlobalTimelines::get_all(); let timelines_count = timelines.len(); - for arc_tli in timelines { - let tli = arc_tli.info_for_metrics(); - if tli.is_none() { - continue; - } - let tli = tli.unwrap(); + // Prometheus Collector is sync, and data is stored under async lock. To + // bridge the gap with a crutch, collect data in spawned thread with + // local tokio runtime. + let infos = std::thread::spawn(|| { + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .expect("failed to create rt"); + rt.block_on(collect_timeline_metrics()) + }) + .join() + .expect("collect_timeline_metrics thread panicked"); + for tli in &infos { let tenant_id = tli.ttid.tenant_id.to_string(); let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; @@ -682,3 +692,15 @@ impl Collector for TimelineCollector { mfs } } + +async fn collect_timeline_metrics() -> Vec { + let mut res = vec![]; + let timelines = GlobalTimelines::get_all(); + + for tli in timelines { + if let Some(info) = tli.info_for_metrics().await { + res.push(info); + } + } + res +} diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 344b760fd3..61ba37efaa 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -231,7 +231,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result info!( "Loaded timeline {}, flush_lsn={}", ttid, - tli.get_flush_lsn() + tli.get_flush_lsn().await ); Ok(Response { diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 195470e3ca..a5e99c5f0a 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -18,15 +18,14 @@ use postgres_backend::QueryError; use pq_proto::BeMessage; use std::net::SocketAddr; use std::sync::Arc; -use std::thread; -use std::thread::JoinHandle; use tokio::io::AsyncRead; use tokio::io::AsyncWrite; use tokio::sync::mpsc::channel; use tokio::sync::mpsc::error::TryRecvError; use tokio::sync::mpsc::Receiver; use tokio::sync::mpsc::Sender; -use tokio::task::spawn_blocking; +use tokio::task; +use tokio::task::JoinHandle; use tokio::time::Duration; use tokio::time::Instant; use tracing::*; @@ -97,7 +96,7 @@ impl SafekeeperPostgresHandler { Err(res.expect_err("no error with WalAcceptor not spawn")) } Some(handle) => { - let wal_acceptor_res = handle.join(); + let wal_acceptor_res = handle.await; // If there was any network error, return it. res?; @@ -107,7 +106,7 @@ impl SafekeeperPostgresHandler { Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))), Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!( - "WalAcceptor thread panicked", + "WalAcceptor task panicked", ))), } } @@ -154,10 +153,12 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { } }; - *self.acceptor_handle = Some( - WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id) - .context("spawn WalAcceptor thread")?, - ); + *self.acceptor_handle = Some(WalAcceptor::spawn( + tli.clone(), + msg_rx, + reply_tx, + self.conn_id, + )); // Forward all messages to WalAcceptor read_network_loop(self.pgb_reader, msg_tx, next_msg).await @@ -226,28 +227,19 @@ impl WalAcceptor { msg_rx: Receiver, reply_tx: Sender, conn_id: ConnectionId, - ) -> anyhow::Result>> { - let thread_name = format!("WAL acceptor {}", tli.ttid); - thread::Builder::new() - .name(thread_name) - .spawn(move || -> anyhow::Result<()> { - let mut wa = WalAcceptor { - tli, - msg_rx, - reply_tx, - }; + ) -> JoinHandle> { + task::spawn(async move { + let mut wa = WalAcceptor { + tli, + msg_rx, + reply_tx, + }; - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - let span_ttid = wa.tli.ttid; // satisfy borrow checker - runtime.block_on( - wa.run() - .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)), - ) - }) - .map_err(anyhow::Error::from) + let span_ttid = wa.tli.ttid; // satisfy borrow checker + wa.run() + .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)) + .await + }) } /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; @@ -281,7 +273,7 @@ impl WalAcceptor { while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg { let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - if let Some(reply) = self.tli.process_msg(&noflush_msg)? { + if let Some(reply) = self.tli.process_msg(&noflush_msg).await? { if self.reply_tx.send(reply).await.is_err() { return Ok(()); // chan closed, streaming terminated } @@ -300,10 +292,12 @@ impl WalAcceptor { } // flush all written WAL to the disk - self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)? + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await? } else { // process message other than AppendRequest - self.tli.process_msg(&next_msg)? + self.tli.process_msg(&next_msg).await? }; if let Some(reply) = reply_msg { @@ -326,8 +320,8 @@ impl Drop for ComputeConnectionGuard { let tli = self.timeline.clone(); // tokio forbids to call blocking_send inside the runtime, and see // comments in on_compute_disconnect why we call blocking_send. - spawn_blocking(move || { - if let Err(e) = tli.on_compute_disconnect() { + tokio::spawn(async move { + if let Err(e) = tli.on_compute_disconnect().await { error!("failed to unregister compute connection: {}", e); } }); diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index ad9d655fae..3306f0b63a 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -1,29 +1,36 @@ //! Thread removing old WAL. -use std::{thread, time::Duration}; +use std::time::Duration; +use tokio::time::sleep; use tracing::*; use crate::{GlobalTimelines, SafeKeeperConf}; -pub fn thread_main(conf: SafeKeeperConf) { +pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { let wal_removal_interval = Duration::from_millis(5000); loop { let tlis = GlobalTimelines::get_all(); for tli in &tlis { - if !tli.is_active() { + if !tli.is_active().await { continue; } let ttid = tli.ttid; - let _enter = - info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered(); - if let Err(e) = tli.maybe_pesist_control_file() { + if let Err(e) = tli + .maybe_persist_control_file() + .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id)) + .await + { warn!("failed to persist control file: {e}"); } - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { - warn!("failed to remove WAL: {}", e); + if let Err(e) = tli + .remove_old_wal(conf.wal_backup_enabled) + .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id)) + .await + { + error!("failed to remove WAL: {}", e); } } - thread::sleep(wal_removal_interval) + sleep(wal_removal_interval).await; } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 7378ccb994..d0b14a1282 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -568,25 +568,27 @@ where /// Process message from proposer and possibly form reply. Concurrent /// callers must exclude each other. - pub fn process_msg( + pub async fn process_msg( &mut self, msg: &ProposerAcceptorMessage, ) -> Result> { match msg { - ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg), - ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg), - ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg), - ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg, true), - ProposerAcceptorMessage::NoFlushAppendRequest(msg) => { - self.handle_append_request(msg, false) + ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await, + ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await, + ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await, + ProposerAcceptorMessage::AppendRequest(msg) => { + self.handle_append_request(msg, true).await } - ProposerAcceptorMessage::FlushWAL => self.handle_flush(), + ProposerAcceptorMessage::NoFlushAppendRequest(msg) => { + self.handle_append_request(msg, false).await + } + ProposerAcceptorMessage::FlushWAL => self.handle_flush().await, } } /// Handle initial message from proposer: check its sanity and send my /// current term. - fn handle_greeting( + async fn handle_greeting( &mut self, msg: &ProposerGreeting, ) -> Result> { @@ -649,7 +651,7 @@ where if msg.pg_version != UNKNOWN_SERVER_VERSION { state.server.pg_version = msg.pg_version; } - self.state.persist(&state)?; + self.state.persist(&state).await?; } info!( @@ -664,7 +666,7 @@ where } /// Give vote for the given term, if we haven't done that previously. - fn handle_vote_request( + async fn handle_vote_request( &mut self, msg: &VoteRequest, ) -> Result> { @@ -678,7 +680,7 @@ where // handle_elected instead. Currently not a big deal, as proposer is the // only source of WAL; with peer2peer recovery it would be more // important. - self.wal_store.flush_wal()?; + self.wal_store.flush_wal().await?; // initialize with refusal let mut resp = VoteResponse { term: self.state.acceptor_state.term, @@ -692,7 +694,7 @@ where let mut state = self.state.clone(); state.acceptor_state.term = msg.term; // persist vote before sending it out - self.state.persist(&state)?; + self.state.persist(&state).await?; resp.term = self.state.acceptor_state.term; resp.vote_given = true as u64; @@ -715,12 +717,15 @@ where ar } - fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { + async fn handle_elected( + &mut self, + msg: &ProposerElected, + ) -> Result> { info!("received ProposerElected {:?}", msg); if self.state.acceptor_state.term < msg.term { let mut state = self.state.clone(); state.acceptor_state.term = msg.term; - self.state.persist(&state)?; + self.state.persist(&state).await?; } // If our term is higher, ignore the message (next feedback will inform the compute) @@ -750,7 +755,7 @@ where // intersection of our history and history from msg // truncate wal, update the LSNs - self.wal_store.truncate_wal(msg.start_streaming_at)?; + self.wal_store.truncate_wal(msg.start_streaming_at).await?; // and now adopt term history from proposer { @@ -784,7 +789,7 @@ where self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); - self.persist_control_file(state)?; + self.persist_control_file(state).await?; } info!("start receiving WAL since {:?}", msg.start_streaming_at); @@ -796,7 +801,7 @@ where /// /// Note: it is assumed that 'WAL we have is from the right term' check has /// already been done outside. - fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> { + async fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> { // Both peers and walproposer communicate this value, we might already // have a fresher (higher) version. candidate = max(candidate, self.inmem.commit_lsn); @@ -818,29 +823,32 @@ where // that we receive new epoch_start_lsn, and we still need to sync // control file in this case. if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn { - self.persist_control_file(self.state.clone())?; + self.persist_control_file(self.state.clone()).await?; } Ok(()) } /// Persist control file to disk, called only after timeline creation (bootstrap). - pub fn persist(&mut self) -> Result<()> { - self.persist_control_file(self.state.clone()) + pub async fn persist(&mut self) -> Result<()> { + self.persist_control_file(self.state.clone()).await } /// Persist in-memory state to the disk, taking other data from state. - fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { + async fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; state.proposer_uuid = self.inmem.proposer_uuid; - self.state.persist(&state) + self.state.persist(&state).await } /// Persist control file if there is something to save and enough time /// passed after the last save. - pub fn maybe_persist_control_file(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> { + pub async fn maybe_persist_control_file( + &mut self, + inmem_remote_consistent_lsn: Lsn, + ) -> Result<()> { const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); if self.state.last_persist_at().elapsed() < CF_SAVE_INTERVAL { return Ok(()); @@ -852,7 +860,7 @@ where if need_persist { let mut state = self.state.clone(); state.remote_consistent_lsn = inmem_remote_consistent_lsn; - self.persist_control_file(state)?; + self.persist_control_file(state).await?; trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); } Ok(()) @@ -860,7 +868,7 @@ where /// Handle request to append WAL. #[allow(clippy::comparison_chain)] - fn handle_append_request( + async fn handle_append_request( &mut self, msg: &AppendRequest, require_flush: bool, @@ -883,17 +891,19 @@ where // do the job if !msg.wal_data.is_empty() { - self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; + self.wal_store + .write_wal(msg.h.begin_lsn, &msg.wal_data) + .await?; } // flush wal to the disk, if required if require_flush { - self.wal_store.flush_wal()?; + self.wal_store.flush_wal().await?; } // Update commit_lsn. if msg.h.commit_lsn != Lsn(0) { - self.update_commit_lsn(msg.h.commit_lsn)?; + self.update_commit_lsn(msg.h.commit_lsn).await?; } // Value calculated by walproposer can always lag: // - safekeepers can forget inmem value and send to proposer lower @@ -909,7 +919,7 @@ where if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) < self.inmem.peer_horizon_lsn { - self.persist_control_file(self.state.clone())?; + self.persist_control_file(self.state.clone()).await?; } trace!( @@ -931,15 +941,15 @@ where } /// Flush WAL to disk. Return AppendResponse with latest LSNs. - fn handle_flush(&mut self) -> Result> { - self.wal_store.flush_wal()?; + async fn handle_flush(&mut self) -> Result> { + self.wal_store.flush_wal().await?; Ok(Some(AcceptorProposerMessage::AppendResponse( self.append_response(), ))) } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { + pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { let mut sync_control_file = false; if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { @@ -947,7 +957,7 @@ where // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. if sk_info.last_log_term == self.get_epoch() { - self.update_commit_lsn(Lsn(sk_info.commit_lsn))?; + self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } @@ -973,7 +983,7 @@ where // Note: we could make remote_consistent_lsn update in cf common by // storing Arc to walsenders in Safekeeper. state.remote_consistent_lsn = new_remote_consistent_lsn; - self.persist_control_file(state)?; + self.persist_control_file(state).await?; } Ok(()) } @@ -997,6 +1007,7 @@ where #[cfg(test)] mod tests { + use futures::future::BoxFuture; use postgres_ffi::WAL_SEGMENT_SIZE; use super::*; @@ -1008,8 +1019,9 @@ mod tests { persisted_state: SafeKeeperState, } + #[async_trait::async_trait] impl control_file::Storage for InMemoryState { - fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { + async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { self.persisted_state = s.clone(); Ok(()) } @@ -1039,27 +1051,28 @@ mod tests { lsn: Lsn, } + #[async_trait::async_trait] impl wal_storage::Storage for DummyWalStore { fn flush_lsn(&self) -> Lsn { self.lsn } - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) } - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { self.lsn = end_pos; Ok(()) } - fn flush_wal(&mut self) -> Result<()> { + async fn flush_wal(&mut self) -> Result<()> { Ok(()) } - fn remove_up_to(&self) -> Box Result<()>> { - Box::new(move |_segno_up_to: XLogSegNo| Ok(())) + fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> { + Box::pin(async { Ok(()) }) } fn get_metrics(&self) -> crate::metrics::WalStorageMetrics { @@ -1067,8 +1080,8 @@ mod tests { } } - #[test] - fn test_voting() { + #[tokio::test] + async fn test_voting() { let storage = InMemoryState { persisted_state: test_sk_state(), }; @@ -1077,7 +1090,7 @@ mod tests { // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); - let mut vote_resp = sk.process_msg(&vote_request); + let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), r => panic!("unexpected response: {:?}", r), @@ -1092,15 +1105,15 @@ mod tests { sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok - vote_resp = sk.process_msg(&vote_request); + vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), r => panic!("unexpected response: {:?}", r), } } - #[test] - fn test_epoch_switch() { + #[tokio::test] + async fn test_epoch_switch() { let storage = InMemoryState { persisted_state: test_sk_state(), }; @@ -1132,10 +1145,13 @@ mod tests { timeline_start_lsn: Lsn(0), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) + .await .unwrap(); // check that AppendRequest before epochStartLsn doesn't switch epoch - let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)); + let resp = sk + .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await; assert!(resp.is_ok()); assert_eq!(sk.get_epoch(), 0); @@ -1146,9 +1162,11 @@ mod tests { h: ar_hdr, wal_data: Bytes::from_static(b"b"), }; - let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)); + let resp = sk + .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await; assert!(resp.is_ok()); - sk.wal_store.truncate_wal(Lsn(3)).unwrap(); // imitate the complete record at 3 %) + sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) assert_eq!(sk.get_epoch(), 1); } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index fb420cba64..abca0a86b1 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -396,7 +396,7 @@ impl SafekeeperPostgresHandler { // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. let stop_pos: Option = if self.is_walproposer_recovery() { - let wal_end = tli.get_flush_lsn(); + let wal_end = tli.get_flush_lsn().await; Some(wal_end) } else { None @@ -418,7 +418,7 @@ impl SafekeeperPostgresHandler { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse).await?; - let (_, persisted_state) = tli.get_state(); + let (_, persisted_state) = tli.get_state().await; let wal_reader = WalReader::new( self.conf.workdir.clone(), self.conf.timeline_dir(&tli.ttid), @@ -562,7 +562,7 @@ impl WalSender<'_, IO> { .walsenders .get_ws_remote_consistent_lsn(self.ws_guard.id) { - if self.tli.should_walsender_stop(remote_consistent_lsn) { + if self.tli.should_walsender_stop(remote_consistent_lsn).await { // Terminate if there is nothing more to send. return Err(CopyStreamHandlerEnd::ServerInitiated(format!( "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 941f8dae54..52c3e8d4be 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -2,12 +2,13 @@ //! to glue together SafeKeeper and all other background services. use anyhow::{anyhow, bail, Result}; -use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; +use tokio::fs; use std::cmp::max; use std::path::PathBuf; use std::sync::Arc; +use tokio::sync::{Mutex, MutexGuard}; use tokio::{ sync::{mpsc::Sender, watch}, time::Instant, @@ -286,8 +287,9 @@ pub struct Timeline { commit_lsn_watch_tx: watch::Sender, commit_lsn_watch_rx: watch::Receiver, - /// Safekeeper and other state, that should remain consistent and synchronized - /// with the disk. + /// Safekeeper and other state, that should remain consistent and + /// synchronized with the disk. This is tokio mutex as we write WAL to disk + /// while holding it, ensuring that consensus checks are in order. mutex: Mutex, walsenders: Arc, @@ -361,8 +363,8 @@ impl Timeline { /// /// Bootstrap is transactional, so if it fails, created files will be deleted, /// and state on disk should remain unchanged. - pub fn bootstrap(&self, shared_state: &mut MutexGuard) -> Result<()> { - match std::fs::metadata(&self.timeline_dir) { + pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> { + match fs::metadata(&self.timeline_dir).await { Ok(_) => { // Timeline directory exists on disk, we should leave state unchanged // and return error. @@ -375,53 +377,51 @@ impl Timeline { } // Create timeline directory. - std::fs::create_dir_all(&self.timeline_dir)?; + fs::create_dir_all(&self.timeline_dir).await?; // Write timeline to disk and TODO: start background tasks. - match || -> Result<()> { - shared_state.sk.persist()?; - // TODO: add more initialization steps here - self.update_status(shared_state); - Ok(()) - }() { - Ok(_) => Ok(()), - Err(e) => { - // Bootstrap failed, cancel timeline and remove timeline directory. - self.cancel(shared_state); + if let Err(e) = shared_state.sk.persist().await { + // Bootstrap failed, cancel timeline and remove timeline directory. + self.cancel(shared_state); - if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) { - warn!( - "failed to remove timeline {} directory after bootstrap failure: {}", - self.ttid, fs_err - ); - } - - Err(e) + if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await { + warn!( + "failed to remove timeline {} directory after bootstrap failure: {}", + self.ttid, fs_err + ); } + + return Err(e); } + + // TODO: add more initialization steps here + self.update_status(shared_state); + Ok(()) } /// Delete timeline from disk completely, by removing timeline directory. Background /// timeline activities will stop eventually. - pub fn delete_from_disk( + pub async fn delete_from_disk( &self, - shared_state: &mut MutexGuard, + shared_state: &mut MutexGuard<'_, SharedState>, ) -> Result<(bool, bool)> { let was_active = shared_state.active; self.cancel(shared_state); - let dir_existed = delete_dir(&self.timeline_dir)?; + let dir_existed = delete_dir(&self.timeline_dir).await?; Ok((dir_existed, was_active)) } /// Cancel timeline to prevent further usage. Background tasks will stop /// eventually after receiving cancellation signal. - fn cancel(&self, shared_state: &mut MutexGuard) { + /// + /// Note that we can't notify backup launcher here while holding + /// shared_state lock, as this is a potential deadlock: caller is + /// responsible for that. Generally we should probably make WAL backup tasks + /// to shut down on their own, checking once in a while whether it is the + /// time. + fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) { info!("timeline {} is cancelled", self.ttid); let _ = self.cancellation_tx.send(true); - let res = self.wal_backup_launcher_tx.blocking_send(self.ttid); - if let Err(e) = res { - error!("Failed to send stop signal to wal_backup_launcher: {}", e); - } // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.wal_store.close(); @@ -433,8 +433,8 @@ impl Timeline { } /// Take a writing mutual exclusive lock on timeline shared_state. - pub fn write_shared_state(&self) -> MutexGuard { - self.mutex.lock() + pub async fn write_shared_state(&self) -> MutexGuard { + self.mutex.lock().await } fn update_status(&self, shared_state: &mut SharedState) -> bool { @@ -450,7 +450,7 @@ impl Timeline { let is_wal_backup_action_pending: bool; { - let mut shared_state = self.write_shared_state(); + let mut shared_state = self.write_shared_state().await; shared_state.num_computes += 1; is_wal_backup_action_pending = self.update_status(&mut shared_state); } @@ -464,22 +464,17 @@ impl Timeline { /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. - pub fn on_compute_disconnect(&self) -> Result<()> { + pub async fn on_compute_disconnect(&self) -> Result<()> { let is_wal_backup_action_pending: bool; { - let mut shared_state = self.write_shared_state(); + let mut shared_state = self.write_shared_state().await; shared_state.num_computes -= 1; is_wal_backup_action_pending = self.update_status(&mut shared_state); } // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { // Can fail only if channel to a static thread got closed, which is not normal at all. - // - // Note: this is blocking_send because on_compute_disconnect is called in Drop, there is - // no async Drop and we use current thread runtimes. With current thread rt spawning - // task in drop impl is racy, as thread along with runtime might finish before the task. - // This should be switched send.await when/if we go to full async. - self.wal_backup_launcher_tx.blocking_send(self.ttid)?; + self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } @@ -489,11 +484,11 @@ impl Timeline { /// computes. While there might be nothing to stream already, we learn about /// remote_consistent_lsn update through replication feedback, and we want /// to stop pushing to the broker if pageserver is fully caughtup. - pub fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { + pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { if self.is_cancelled() { return true; } - let shared_state = self.write_shared_state(); + let shared_state = self.write_shared_state().await; if shared_state.num_computes == 0 { return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn; @@ -503,12 +498,12 @@ impl Timeline { /// Returns whether s3 offloading is required and sets current status as /// matching it. - pub fn wal_backup_attend(&self) -> bool { + pub async fn wal_backup_attend(&self) -> bool { if self.is_cancelled() { return false; } - self.write_shared_state().wal_backup_attend() + self.write_shared_state().await.wal_backup_attend() } /// Returns commit_lsn watch channel. @@ -517,7 +512,7 @@ impl Timeline { } /// Pass arrived message to the safekeeper. - pub fn process_msg( + pub async fn process_msg( &self, msg: &ProposerAcceptorMessage, ) -> Result> { @@ -528,8 +523,8 @@ impl Timeline { let mut rmsg: Option; let commit_lsn: Lsn; { - let mut shared_state = self.write_shared_state(); - rmsg = shared_state.sk.process_msg(msg)?; + let mut shared_state = self.write_shared_state().await; + rmsg = shared_state.sk.process_msg(msg).await?; // if this is AppendResponse, fill in proper pageserver and hot // standby feedback. @@ -546,37 +541,37 @@ impl Timeline { } /// Returns wal_seg_size. - pub fn get_wal_seg_size(&self) -> usize { - self.write_shared_state().get_wal_seg_size() + pub async fn get_wal_seg_size(&self) -> usize { + self.write_shared_state().await.get_wal_seg_size() } /// Returns true only if the timeline is loaded and active. - pub fn is_active(&self) -> bool { + pub async fn is_active(&self) -> bool { if self.is_cancelled() { return false; } - self.write_shared_state().active + self.write_shared_state().await.active } /// Returns state of the timeline. - pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { - let state = self.write_shared_state(); + pub async fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { + let state = self.write_shared_state().await; (state.sk.inmem.clone(), state.sk.state.clone()) } /// Returns latest backup_lsn. - pub fn get_wal_backup_lsn(&self) -> Lsn { - self.write_shared_state().sk.inmem.backup_lsn + pub async fn get_wal_backup_lsn(&self) -> Lsn { + self.write_shared_state().await.sk.inmem.backup_lsn } /// Sets backup_lsn to the given value. - pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } - let mut state = self.write_shared_state(); + let mut state = self.write_shared_state().await; state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn); // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. @@ -584,8 +579,8 @@ impl Timeline { } /// Get safekeeper info for broadcasting to broker and other peers. - pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { - let shared_state = self.write_shared_state(); + pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { + let shared_state = self.write_shared_state().await; shared_state.get_safekeeper_info( &self.ttid, conf, @@ -604,8 +599,8 @@ impl Timeline { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { - let mut shared_state = self.write_shared_state(); - shared_state.sk.record_safekeeper_info(&sk_info)?; + let mut shared_state = self.write_shared_state().await; + shared_state.sk.record_safekeeper_info(&sk_info).await?; let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); is_wal_backup_action_pending = self.update_status(&mut shared_state); @@ -622,8 +617,8 @@ impl Timeline { /// Get our latest view of alive peers status on the timeline. /// We pass our own info through the broker as well, so when we don't have connection /// to the broker returned vec is empty. - pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { - let shared_state = self.write_shared_state(); + pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { + let shared_state = self.write_shared_state().await; let now = Instant::now(); shared_state .peers_info @@ -640,34 +635,34 @@ impl Timeline { } /// Returns flush_lsn. - pub fn get_flush_lsn(&self) -> Lsn { - self.write_shared_state().sk.wal_store.flush_lsn() + pub async fn get_flush_lsn(&self) -> Lsn { + self.write_shared_state().await.sk.wal_store.flush_lsn() } /// Delete WAL segments from disk that are no longer needed. This is determined /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. - pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } let horizon_segno: XLogSegNo; - let remover: Box Result<(), anyhow::Error>>; - { - let shared_state = self.write_shared_state(); + let remover = { + let shared_state = self.write_shared_state().await; horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); - remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { - return Ok(()); + return Ok(()); // nothing to do } + let remover = shared_state.sk.wal_store.remove_up_to(horizon_segno - 1); // release the lock before removing - } + remover + }; // delete old WAL files - remover(horizon_segno - 1)?; + remover.await?; // update last_removed_segno - let mut shared_state = self.write_shared_state(); + let mut shared_state = self.write_shared_state().await; shared_state.last_removed_segno = horizon_segno; Ok(()) } @@ -676,22 +671,24 @@ impl Timeline { /// passed after the last save. This helps to keep remote_consistent_lsn up /// to date so that storage nodes restart doesn't cause many pageserver -> /// safekeeper reconnections. - pub fn maybe_pesist_control_file(&self) -> Result<()> { + pub async fn maybe_persist_control_file(&self) -> Result<()> { let remote_consistent_lsn = self.walsenders.get_remote_consistent_lsn(); self.write_shared_state() + .await .sk .maybe_persist_control_file(remote_consistent_lsn) + .await } - /// Returns full timeline info, required for the metrics. If the timeline is - /// not active, returns None instead. - pub fn info_for_metrics(&self) -> Option { + /// Gather timeline data for metrics. If the timeline is not active, returns + /// None, we do not collect these. + pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { return None; } let ps_feedback = self.walsenders.get_ps_feedback(); - let state = self.write_shared_state(); + let state = self.write_shared_state().await; if state.active { Some(FullTimelineInfo { ttid: self.ttid, @@ -713,8 +710,8 @@ impl Timeline { } /// Returns in-memory timeline state to build a full debug dump. - pub fn memory_dump(&self) -> debug_dump::Memory { - let state = self.write_shared_state(); + pub async fn memory_dump(&self) -> debug_dump::Memory { + let state = self.write_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = state.sk.wal_store.internal_state(); @@ -738,8 +735,8 @@ impl Timeline { } /// Deletes directory and it's contents. Returns false if directory does not exist. -fn delete_dir(path: &PathBuf) -> Result { - match std::fs::remove_dir_all(path) { +async fn delete_dir(path: &PathBuf) -> Result { + match fs::remove_dir_all(path).await { Ok(_) => Ok(true), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), Err(e) => Err(e.into()), diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 41809794dc..f2d5df8744 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -113,9 +113,17 @@ impl GlobalTimelines { Ok(()) } - /// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any. + /// Loads all timelines for the given tenant to memory. Returns fs::read_dir + /// errors if any. + /// + /// Note: This function (and all reading/loading below) is sync because + /// timelines are loaded while holding GlobalTimelinesState lock. Which is + /// fine as this is called only from single threaded main runtime on boot, + /// but clippy complains anyway, and suppressing that isn't trivial as async + /// is the keyword, ha. That only other user is pull_timeline.rs for which + /// being blocked is not that bad, and we can do spawn_blocking. fn load_tenant_timelines( - state: &mut MutexGuard, + state: &mut MutexGuard<'_, GlobalTimelinesState>, tenant_id: TenantId, ) -> Result<()> { let timelines_dir = state.get_conf().tenant_dir(&tenant_id); @@ -220,7 +228,7 @@ impl GlobalTimelines { // Take a lock and finish the initialization holding this mutex. No other threads // can interfere with creation after we will insert timeline into the map. { - let mut shared_state = timeline.write_shared_state(); + let mut shared_state = timeline.write_shared_state().await; // We can get a race condition here in case of concurrent create calls, but only // in theory. create() will return valid timeline on the next try. @@ -232,7 +240,7 @@ impl GlobalTimelines { // Write the new timeline to the disk and start background workers. // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. - if let Err(e) = timeline.bootstrap(&mut shared_state) { + if let Err(e) = timeline.bootstrap(&mut shared_state).await { // Note: the most likely reason for bootstrap failure is that the timeline // directory already exists on disk. This happens when timeline is corrupted // and wasn't loaded from disk on startup because of that. We want to preserve @@ -294,15 +302,16 @@ impl GlobalTimelines { } /// Cancels timeline, then deletes the corresponding data directory. - pub fn delete_force(ttid: &TenantTimelineId) -> Result { + pub async fn delete_force(ttid: &TenantTimelineId) -> Result { let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); match tli_res { Ok(timeline) => { // Take a lock and finish the deletion holding this mutex. - let mut shared_state = timeline.write_shared_state(); + let mut shared_state = timeline.write_shared_state().await; info!("deleting timeline {}", ttid); - let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?; + let (dir_existed, was_active) = + timeline.delete_from_disk(&mut shared_state).await?; // Remove timeline from the map. // FIXME: re-enable it once we fix the issue with recreation of deleted timelines @@ -335,7 +344,7 @@ impl GlobalTimelines { /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are /// created simultaneously. In that case the function will return error and the caller should /// retry tenant deletion again later. - pub fn delete_force_all_for_tenant( + pub async fn delete_force_all_for_tenant( tenant_id: &TenantId, ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); @@ -345,7 +354,7 @@ impl GlobalTimelines { let mut deleted = HashMap::new(); for tli in &to_delete { - match Self::delete_force(&tli.ttid) { + match Self::delete_force(&tli.ttid).await { Ok(result) => { deleted.insert(tli.ttid, result); } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 4d341a7ef8..eae3f3fe86 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -17,7 +17,6 @@ use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::fs::File; -use tokio::runtime::Builder; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; @@ -36,30 +35,16 @@ use once_cell::sync::OnceCell; const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; -pub fn wal_backup_launcher_thread_main( - conf: SafeKeeperConf, - wal_backup_launcher_rx: Receiver, -) { - let mut builder = Builder::new_multi_thread(); - if let Some(num_threads) = conf.backup_runtime_threads { - builder.worker_threads(num_threads); - } - let rt = builder - .enable_all() - .build() - .expect("failed to create wal backup runtime"); - - rt.block_on(async { - wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await; - }); -} - /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. -fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { - GlobalTimelines::get(ttid) - .ok() - .filter(|tli| tli.wal_backup_attend()) +async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { + match GlobalTimelines::get(ttid).ok() { + Some(tli) => { + tli.wal_backup_attend().await; + Some(tli) + } + None => None, + } } struct WalBackupTaskHandle { @@ -143,8 +128,8 @@ async fn update_task( ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry, ) { - let alive_peers = entry.timeline.get_peers(conf); - let wal_backup_lsn = entry.timeline.get_wal_backup_lsn(); + let alive_peers = entry.timeline.get_peers(conf).await; + let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await; let (offloader, election_dbg_str) = determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); let elected_me = Some(conf.my_id) == offloader; @@ -183,10 +168,10 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup /// tasks. Having this in separate task simplifies locking, allows to reap /// panics and separate elections from offloading itself. -async fn wal_backup_launcher_main_loop( +pub async fn wal_backup_launcher_task_main( conf: SafeKeeperConf, mut wal_backup_launcher_rx: Receiver, -) { +) -> anyhow::Result<()> { info!( "WAL backup launcher started, remote config {:?}", conf.remote_storage @@ -214,7 +199,7 @@ async fn wal_backup_launcher_main_loop( if conf.remote_storage.is_none() || !conf.wal_backup_enabled { continue; /* just drain the channel and do nothing */ } - let timeline = is_wal_backup_required(ttid); + let timeline = is_wal_backup_required(ttid).await; // do we need to do anything at all? if timeline.is_some() != tasks.contains_key(&ttid) { if let Some(timeline) = timeline { @@ -269,7 +254,7 @@ async fn backup_task_main( let tli = res.unwrap(); let mut wb = WalBackupTask { - wal_seg_size: tli.get_wal_seg_size(), + wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), timeline: tli, timeline_dir, @@ -326,7 +311,7 @@ impl WalBackupTask { continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ } // Perhaps peers advanced the position, check shmem value. - backup_lsn = self.timeline.get_wal_backup_lsn(); + backup_lsn = self.timeline.get_wal_backup_lsn().await; if backup_lsn.segment_number(self.wal_seg_size) >= commit_lsn.segment_number(self.wal_seg_size) { @@ -402,6 +387,7 @@ pub async fn backup_lsn_range( let new_backup_lsn = segment.end_lsn; timeline .set_wal_backup_lsn(new_backup_lsn) + .await .context("setting wal_backup_lsn")?; *backup_lsn = new_backup_lsn; } else { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index fb0d77a9f2..406132b2b0 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,7 +4,7 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, thread, time::Duration}; +use std::{future, time::Duration}; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; use tracing::*; @@ -16,104 +16,82 @@ use crate::SafeKeeperConf; use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. -pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .context("create runtime") - // todo catch error in main thread - .expect("failed to create runtime"); +pub async fn task_main( + conf: SafeKeeperConf, + pg_listener: std::net::TcpListener, +) -> anyhow::Result<()> { + // Tokio's from_std won't do this for us, per its comment. + pg_listener.set_nonblocking(true)?; - runtime - .block_on(async move { - // Tokio's from_std won't do this for us, per its comment. - pg_listener.set_nonblocking(true)?; - let listener = tokio::net::TcpListener::from_std(pg_listener)?; - let mut connection_count: ConnectionCount = 0; + let listener = tokio::net::TcpListener::from_std(pg_listener)?; + let mut connection_count: ConnectionCount = 0; - loop { - match listener.accept().await { - Ok((socket, peer_addr)) => { - debug!("accepted connection from {}", peer_addr); - let conf = conf.clone(); - let conn_id = issue_connection_id(&mut connection_count); + loop { + let (socket, peer_addr) = listener.accept().await.context("accept")?; + debug!("accepted connection from {}", peer_addr); + let conf = conf.clone(); + let conn_id = issue_connection_id(&mut connection_count); - let _ = thread::Builder::new() - .name("WAL service thread".into()) - .spawn(move || { - if let Err(err) = handle_socket(socket, conf, conn_id) { - error!("connection handler exited: {}", err); - } - }) - .unwrap(); - } - Err(e) => error!("Failed to accept connection: {}", e), - } + tokio::spawn(async move { + if let Err(err) = handle_socket(socket, conf, conn_id) + .instrument(info_span!("", cid = %conn_id)) + .await + { + error!("connection handler exited: {}", err); } - #[allow(unreachable_code)] // hint compiler the closure return type - Ok::<(), anyhow::Error>(()) - }) - .expect("listener failed") + }); + } } -/// This is run by `thread_main` above, inside a background thread. +/// This is run by `task_main` above, inside a background thread. /// -fn handle_socket( +async fn handle_socket( socket: TcpStream, conf: SafeKeeperConf, conn_id: ConnectionId, ) -> Result<(), QueryError> { - let _enter = info_span!("", cid = %conn_id).entered(); - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - socket.set_nodelay(true)?; let peer_addr = socket.peer_addr()?; - // TimeoutReader wants async runtime during creation. - runtime.block_on(async move { - // Set timeout on reading from the socket. It prevents hanged up connection - // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by - // default, and tokio doesn't provide ability to set it out of the box. - let mut socket = TimeoutReader::new(socket); - let wal_service_timeout = Duration::from_secs(60 * 10); - socket.set_timeout(Some(wal_service_timeout)); - // pin! is here because TimeoutReader (due to storing sleep future inside) - // is not Unpin, and all pgbackend/framed/tokio dependencies require stream - // to be Unpin. Which is reasonable, as indeed something like TimeoutReader - // shouldn't be moved. - tokio::pin!(socket); + // Set timeout on reading from the socket. It prevents hanged up connection + // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by + // default, and tokio doesn't provide ability to set it out of the box. + let mut socket = TimeoutReader::new(socket); + let wal_service_timeout = Duration::from_secs(60 * 10); + socket.set_timeout(Some(wal_service_timeout)); + // pin! is here because TimeoutReader (due to storing sleep future inside) + // is not Unpin, and all pgbackend/framed/tokio dependencies require stream + // to be Unpin. Which is reasonable, as indeed something like TimeoutReader + // shouldn't be moved. + tokio::pin!(socket); - let traffic_metrics = TrafficMetrics::new(); - if let Some(current_az) = conf.availability_zone.as_deref() { - traffic_metrics.set_sk_az(current_az); - } + let traffic_metrics = TrafficMetrics::new(); + if let Some(current_az) = conf.availability_zone.as_deref() { + traffic_metrics.set_sk_az(current_az); + } - let socket = MeasuredStream::new( - socket, - |cnt| { - traffic_metrics.observe_read(cnt); - }, - |cnt| { - traffic_metrics.observe_write(cnt); - }, - ); + let socket = MeasuredStream::new( + socket, + |cnt| { + traffic_metrics.observe_read(cnt); + }, + |cnt| { + traffic_metrics.observe_write(cnt); + }, + ); - let auth_type = match conf.auth { - None => AuthType::Trust, - Some(_) => AuthType::NeonJWT, - }; - let mut conn_handler = - SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - // libpq protocol between safekeeper and walproposer / pageserver - // We don't use shutdown. - pgbackend - .run(&mut conn_handler, future::pending::<()>) - .await - }) + let auth_type = match conf.auth { + None => AuthType::Trust, + Some(_) => AuthType::NeonJWT, + }; + let mut conn_handler = + SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); + let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + // libpq protocol between safekeeper and walproposer / pageserver + // We don't use shutdown. + pgbackend + .run(&mut conn_handler, future::pending::<()>) + .await } /// Unique WAL service connection ids are logged in spans for observability. diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 644c956fc1..687e1ba6b6 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,54 +8,47 @@ //! Note that last file has `.partial` suffix, that's different from postgres. use anyhow::{bail, Context, Result}; -use remote_storage::RemotePath; - -use std::io::{self, Seek, SeekFrom}; -use std::pin::Pin; -use tokio::io::AsyncRead; - +use bytes::Bytes; +use futures::future::BoxFuture; use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; use postgres_ffi::{XLogSegNo, PG_TLI}; +use remote_storage::RemotePath; use std::cmp::{max, min}; - -use bytes::Bytes; -use std::fs::{self, remove_file, File, OpenOptions}; -use std::io::Write; +use std::io::{self, SeekFrom}; use std::path::{Path, PathBuf}; - +use std::pin::Pin; +use tokio::fs::{self, remove_file, File, OpenOptions}; +use tokio::io::{AsyncRead, AsyncWriteExt}; +use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; -use utils::{id::TenantTimelineId, lsn::Lsn}; - use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::safekeeper::SafeKeeperState; - use crate::wal_backup::read_object; use crate::SafeKeeperConf; +use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; use postgres_ffi::XLOG_BLCKSZ; - -use postgres_ffi::waldecoder::WalStreamDecoder; - use pq_proto::SystemId; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; +use utils::{id::TenantTimelineId, lsn::Lsn}; +#[async_trait::async_trait] pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; /// Write piece of WAL from buf to disk, but not necessarily sync it. - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; /// Truncate WAL at specified LSN, which must be the end of WAL record. - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; + async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; /// Durably store WAL on disk, up to the last written WAL record. - fn flush_wal(&mut self) -> Result<()>; + async fn flush_wal(&mut self) -> Result<()>; - /// Remove all segments <= given segno. Returns closure as we want to do - /// that without timeline lock. - fn remove_up_to(&self) -> Box Result<()>>; + /// Remove all segments <= given segno. Returns function doing that as we + /// want to perform it without timeline lock. + fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>>; /// Release resources associated with the storage -- technically, close FDs. /// Currently we don't remove timelines until restart (#3146), so need to @@ -105,6 +98,22 @@ pub struct PhysicalStorage { /// - points to write_lsn, so no seek is needed for writing /// - doesn't point to the end of the segment file: Option, + + /// When false, we have just initialized storage using the LSN from find_end_of_wal(). + /// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular, + /// there can be a case with unexpected .partial file. + /// + /// Imagine the following: + /// - 000000010000000000000001 + /// - it was fully written, but the last record is split between 2 segments + /// - after restart, find_end_of_wal() returned 0/1FFFFF0, which is in the end of this segment + /// - write_lsn, write_record_lsn and flush_record_lsn were initialized to 0/1FFFFF0 + /// - 000000010000000000000002.partial + /// - it has only 1 byte written, which is not enough to make a full WAL record + /// + /// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal(). + /// This flag will be set to true after the first truncate_wal() call. + is_truncated_after_restart: bool, } impl PhysicalStorage { @@ -164,6 +173,7 @@ impl PhysicalStorage { flush_record_lsn: flush_lsn, decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), file: None, + is_truncated_after_restart: false, }) } @@ -178,33 +188,37 @@ impl PhysicalStorage { } /// Call fdatasync if config requires so. - fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { + async fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?); + .observe_flush_seconds(time_io_closure(file.sync_data()).await?); } Ok(()) } /// Call fsync if config requires so. - fn fsync_file(&mut self, file: &mut File) -> Result<()> { + async fn fsync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?); + .observe_flush_seconds(time_io_closure(file.sync_all()).await?); } Ok(()) } /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. - fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { + async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; // Try to open already completed segment - if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { + if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await { Ok((file, false)) - } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) { + } else if let Ok(file) = OpenOptions::new() + .write(true) + .open(&wal_file_partial_path) + .await + { // Try to open existing partial file Ok((file, true)) } else { @@ -213,35 +227,36 @@ impl PhysicalStorage { .create(true) .write(true) .open(&wal_file_partial_path) + .await .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; - write_zeroes(&mut file, self.wal_seg_size)?; - self.fsync_file(&mut file)?; + write_zeroes(&mut file, self.wal_seg_size).await?; + self.fsync_file(&mut file).await?; Ok((file, true)) } } /// Write WAL bytes, which are known to be located in a single WAL segment. - fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { + async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { let mut file = if let Some(file) = self.file.take() { file } else { - let (mut file, is_partial) = self.open_or_create(segno)?; + let (mut file, is_partial) = self.open_or_create(segno).await?; assert!(is_partial, "unexpected write into non-partial segment file"); - file.seek(SeekFrom::Start(xlogoff as u64))?; + file.seek(SeekFrom::Start(xlogoff as u64)).await?; file }; - file.write_all(buf)?; + file.write_all(buf).await?; if xlogoff + buf.len() == self.wal_seg_size { // If we reached the end of a WAL segment, flush and close it. - self.fdatasync_file(&mut file)?; + self.fdatasync_file(&mut file).await?; // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(wal_file_partial_path, wal_file_path)?; + fs::rename(wal_file_partial_path, wal_file_path).await?; } else { // otherwise, file can be reused later self.file = Some(file); @@ -255,11 +270,11 @@ impl PhysicalStorage { /// be flushed separately later. /// /// Updates `write_lsn`. - fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { + async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { if self.write_lsn != pos { // need to flush the file before discarding it if let Some(mut file) = self.file.take() { - self.fdatasync_file(&mut file)?; + self.fdatasync_file(&mut file).await?; } self.write_lsn = pos; @@ -277,7 +292,8 @@ impl PhysicalStorage { buf.len() }; - self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?; + self.write_in_segment(segno, xlogoff, &buf[..bytes_write]) + .await?; self.write_lsn += bytes_write as u64; buf = &buf[bytes_write..]; } @@ -286,6 +302,7 @@ impl PhysicalStorage { } } +#[async_trait::async_trait] impl Storage for PhysicalStorage { /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { @@ -293,7 +310,7 @@ impl Storage for PhysicalStorage { } /// Write WAL to disk. - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. // If we need to move the pointer, use truncate_wal() instead. if self.write_lsn > startpos { @@ -311,7 +328,7 @@ impl Storage for PhysicalStorage { ); } - let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?; + let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?; // WAL is written, updating write metrics self.metrics.observe_write_seconds(write_seconds); self.metrics.observe_write_bytes(buf.len()); @@ -340,14 +357,14 @@ impl Storage for PhysicalStorage { Ok(()) } - fn flush_wal(&mut self) -> Result<()> { + async fn flush_wal(&mut self) -> Result<()> { if self.flush_record_lsn == self.write_record_lsn { // no need to do extra flush return Ok(()); } if let Some(mut unflushed_file) = self.file.take() { - self.fdatasync_file(&mut unflushed_file)?; + self.fdatasync_file(&mut unflushed_file).await?; self.file = Some(unflushed_file); } else { // We have unflushed data (write_lsn != flush_lsn), but no file. @@ -369,7 +386,7 @@ impl Storage for PhysicalStorage { /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { // Streaming must not create a hole, so truncate cannot be called on non-written lsn if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { bail!( @@ -381,47 +398,51 @@ impl Storage for PhysicalStorage { // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on // disk (this happens on each connect). - if end_pos == self.write_lsn { + if self.is_truncated_after_restart + && end_pos == self.write_lsn + && end_pos == self.flush_record_lsn + { return Ok(()); } // Close previously opened file, if any if let Some(mut unflushed_file) = self.file.take() { - self.fdatasync_file(&mut unflushed_file)?; + self.fdatasync_file(&mut unflushed_file).await?; } let xlogoff = end_pos.segment_offset(self.wal_seg_size); let segno = end_pos.segment_number(self.wal_seg_size); // Remove all segments after the given LSN. - remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?; + remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?; - let (mut file, is_partial) = self.open_or_create(segno)?; + let (mut file, is_partial) = self.open_or_create(segno).await?; // Fill end with zeroes - file.seek(SeekFrom::Start(xlogoff as u64))?; - write_zeroes(&mut file, self.wal_seg_size - xlogoff)?; - self.fdatasync_file(&mut file)?; + file.seek(SeekFrom::Start(xlogoff as u64)).await?; + write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?; + self.fdatasync_file(&mut file).await?; if !is_partial { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(wal_file_path, wal_file_partial_path)?; + fs::rename(wal_file_path, wal_file_partial_path).await?; } // Update LSNs self.write_lsn = end_pos; self.write_record_lsn = end_pos; self.flush_record_lsn = end_pos; + self.is_truncated_after_restart = true; Ok(()) } - fn remove_up_to(&self) -> Box Result<()>> { + fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> { let timeline_dir = self.timeline_dir.clone(); let wal_seg_size = self.wal_seg_size; - Box::new(move |segno_up_to: XLogSegNo| { - remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) + Box::pin(async move { + remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to).await }) } @@ -436,7 +457,7 @@ impl Storage for PhysicalStorage { } /// Remove all WAL segments in timeline_dir that match the given predicate. -fn remove_segments_from_disk( +async fn remove_segments_from_disk( timeline_dir: &Path, wal_seg_size: usize, remove_predicate: impl Fn(XLogSegNo) -> bool, @@ -445,8 +466,8 @@ fn remove_segments_from_disk( let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; - for entry in fs::read_dir(timeline_dir)? { - let entry = entry?; + let mut entries = fs::read_dir(timeline_dir).await?; + while let Some(entry) = entries.next_entry().await? { let entry_path = entry.path(); let fname = entry_path.file_name().unwrap(); @@ -457,7 +478,7 @@ fn remove_segments_from_disk( } let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); if remove_predicate(segno) { - remove_file(entry_path)?; + remove_file(entry_path).await?; n_removed += 1; min_removed = min(min_removed, segno); max_removed = max(max_removed, segno); @@ -689,12 +710,12 @@ impl WalReader { const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; /// Helper for filling file with zeroes. -fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { +async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { while count >= XLOG_BLCKSZ { - file.write_all(ZERO_BLOCK)?; + file.write_all(ZERO_BLOCK).await?; count -= XLOG_BLCKSZ; } - file.write_all(&ZERO_BLOCK[0..count])?; + file.write_all(&ZERO_BLOCK[0..count]).await?; Ok(()) } diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 4bc561449d..3f6fa35cbe 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -32,6 +32,7 @@ pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms"; +pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000); // BrokerServiceClient charged with tonic provided Channel transport; helps to // avoid depending on tonic directly in user crates. @@ -58,7 +59,8 @@ where } tonic_endpoint = tonic_endpoint .http2_keep_alive_interval(keepalive_interval) - .keep_alive_while_idle(true); + .keep_alive_while_idle(true) + .connect_timeout(DEFAULT_CONNECT_TIMEOUT); // keep_alive_timeout is 20s by default on both client and server side let channel = tonic_endpoint.connect_lazy(); Ok(BrokerClientChannel::new(channel)) diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index fa2e058491..9c45088d62 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -1,10 +1,63 @@ from contextlib import closing import pytest -from fixtures.benchmark_fixture import NeonBenchmarker +import requests +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.neon_fixtures import NeonEnvBuilder +# Just start and measure duration. +# +# This test runs pretty quickly and can be informative when used in combination +# with emulated network delay. Some useful delay commands: +# +# 1. Add 2msec delay to all localhost traffic +# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` +# +# 2. Test that it works (you should see 4ms ping) +# `ping localhost` +# +# 3. Revert back to normal +# `sudo tc qdisc del dev lo root netem` +# +# NOTE this test might not represent the real startup time because the basebackup +# for a large database might be larger if there's a lof of transaction metadata, +# or safekeepers might need more syncing, or there might be more operations to +# apply during config step, like more users, databases, or extensions. By default +# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this +# test we only load neon. +def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_startup") + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. + for i in range(2): + # Start + with zenbenchmark.record_duration(f"{i}_start_and_select"): + endpoint = env.endpoints.create_start("test_startup") + endpoint.safe_psql("select 1;") + + # Get metrics + metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + durations = { + "wait_for_spec_ms": f"{i}_wait_for_spec", + "sync_safekeepers_ms": f"{i}_sync_safekeepers", + "basebackup_ms": f"{i}_basebackup", + "config_ms": f"{i}_config", + "total_startup_ms": f"{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + # Stop so we can restart + endpoint.stop() + + # This test sometimes runs for longer than the global 5 minute timeout. @pytest.mark.timeout(600) def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index ca19dc3fd0..24c5b42b5a 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -163,7 +163,6 @@ def test_forward_params_to_client(static_proxy: NeonProxy): assert conn.get_parameter_status(name) == value -@pytest.mark.timeout(5) def test_close_on_connections_exit(static_proxy: NeonProxy): # Open two connections, send SIGTERM, then ensure that proxy doesn't exit # until after connections close.