From aa72a22661ca841868ab11b21137c47e6c07c3e3 Mon Sep 17 00:00:00 2001
From: vipvap <91739071+vipvap@users.noreply.github.com>
Date: Mon, 8 Jan 2024 09:26:27 +0000
Subject: [PATCH] Release 2024-01-08 (#6286)

Release 2024-01-08
---
 .github/workflows/build_and_test.yml | 8 +-
 Cargo.lock | 3 +
 compute_tools/src/monitor.rs | 25 +-
 control_plane/src/pageserver.rs | 7 +
 control_plane/src/tenant_migration.rs | 21 +-
 docs/sourcetree.md | 8 +-
 libs/pageserver_api/src/key.rs | 2 +-
 libs/pageserver_api/src/keyspace.rs | 3 +
 libs/pageserver_api/src/shard.rs | 22 +-
 libs/postgres_backend/src/lib.rs | 23 +-
 libs/remote_storage/src/azure_blob.rs | 6 +
 libs/remote_storage/src/lib.rs | 13 +
 libs/remote_storage/src/local_fs.rs | 14 +
 libs/remote_storage/src/s3_bucket.rs | 32 +
 libs/remote_storage/src/s3_bucket/metrics.rs | 8 +-
 libs/remote_storage/src/simulate_failures.rs | 7 +
 libs/safekeeper_api/src/models.rs | 6 +
 libs/utils/src/http/error.rs | 7 +
 libs/utils/src/lib.rs | 2 +
 libs/utils/src/sync/gate.rs | 6 +
 libs/utils/src/yielding_loop.rs | 35 +
 libs/walproposer/src/walproposer.rs | 2 +-
 pageserver/benches/bench_walredo.rs | 5 +-
 pageserver/client/src/mgmt_api.rs | 14 +-
 pageserver/client/src/page_service.rs | 9 +-
 .../pagebench/src/cmd/getpage_latest_lsn.rs | 38 +-
 pageserver/src/basebackup.rs | 15 +-
 pageserver/src/config.rs | 58 +-
 pageserver/src/http/routes.rs | 36 +
 pageserver/src/import_datadir.rs | 9 +-
 pageserver/src/lib.rs | 4 +
 pageserver/src/metrics.rs | 96 ++-
 pageserver/src/page_service.rs | 240 ++++--
 pageserver/src/pgdatadir_mapping.rs | 275 ++++--
 pageserver/src/task_mgr.rs | 5 +-
 pageserver/src/tenant.rs | 17 +-
 pageserver/src/tenant/config.rs | 2 +
 pageserver/src/tenant/delete.rs | 2 +-
 pageserver/src/tenant/mgr.rs | 289 ++++---
 .../src/tenant/remote_timeline_client.rs | 28 +-
 pageserver/src/tenant/secondary.rs | 142 +++-
 pageserver/src/tenant/secondary/downloader.rs | 801 ++++++++++++++++++
 .../src/tenant/secondary/heatmap_uploader.rs | 502 ++++------
 pageserver/src/tenant/secondary/scheduler.rs | 361 ++++++++
 .../tenant/storage_layer/inmemory_layer.rs | 43 +-
 pageserver/src/tenant/storage_layer/layer.rs | 1 +
 pageserver/src/tenant/tasks.rs | 2 +
 pageserver/src/tenant/timeline.rs | 173 ++--
 pageserver/src/tenant/timeline/walreceiver.rs | 1 +
 .../walreceiver/connection_manager.rs | 3 +
 .../walreceiver/walreceiver_connection.rs | 36 +-
 pageserver/src/walingest.rs | 272 +++---
 pageserver/src/walredo.rs | 64 +-
 pgxn/neon/libpagestore.c | 25 +-
 pgxn/neon/walproposer.c | 268 +++---
 pgxn/neon/walproposer.h | 16 +-
 pgxn/neon/walproposer_pg.c | 77 +-
 poetry.lock | 174 +---
 pre-commit.py | 24 +-
 pyproject.toml | 20 +-
 s3_scrubber/Cargo.toml | 3 +
 s3_scrubber/src/lib.rs | 8 +-
 s3_scrubber/src/main.rs | 57 +-
 s3_scrubber/src/scan_metadata.rs | 11 +-
 safekeeper/Cargo.toml | 1 +
 safekeeper/src/control_file.rs | 7 +-
 safekeeper/src/copy_timeline.rs | 250 ++++++
 safekeeper/src/debug_dump.rs | 57 ++
 safekeeper/src/http/routes.rs | 65 +-
 safekeeper/src/lib.rs | 1 +
 safekeeper/src/pull_timeline.rs | 131 ++-
 safekeeper/src/timeline.rs | 3 +-
 safekeeper/src/timelines_global_map.rs | 15 +-
 safekeeper/src/wal_backup.rs | 61 +-
 safekeeper/src/wal_storage.rs | 2 +-
 scripts/export_import_between_pageservers.py | 2 +-
 scripts/reformat | 4 +-
 test_runner/fixtures/neon_fixtures.py | 26 +-
 test_runner/fixtures/pageserver/http.py | 10 +-
 test_runner/performance/test_perf_olap.py | 7 +-
 .../performance/test_wal_backpressure.py | 3 +-
 .../regress/test_attach_tenant_config.py | 13 +-
 test_runner/regress/test_compatibility.py | 8 +-
 test_runner/regress/test_crafted_wal_end.py | 1 -
 test_runner/regress/test_layer_eviction.py | 4 +-
 .../regress/test_layers_from_future.py | 3 +
 test_runner/regress/test_pageserver_api.py | 3 +-
 .../regress/test_pageserver_secondary.py | 149 +++-
 test_runner/regress/test_tenant_detach.py | 14 +-
 test_runner/regress/test_tenant_relocation.py | 6 +-
 .../test_tenants_with_remote_storage.py | 8 +-
 test_runner/regress/test_wal_acceptor.py | 82 +-
 vm-image-spec.yaml | 21 +-
 93 files changed, 4014 insertions(+), 1429 deletions(-)
 create mode 100644 libs/utils/src/yielding_loop.rs
 create mode 100644 pageserver/src/tenant/secondary/downloader.rs
 create mode 100644 pageserver/src/tenant/secondary/scheduler.rs
 create mode 100644 safekeeper/src/copy_timeline.rs

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 78deff6e85..880d6044f2 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -105,11 +105,11 @@ jobs:
       - name: Install Python deps
         run: ./scripts/pysync
 
-      - name: Run ruff to ensure code format
-        run: poetry run ruff .
+      - name: Run `ruff check` to ensure code format
+        run: poetry run ruff check .
 
-      - name: Run black to ensure code format
-        run: poetry run black --diff --check .
+      - name: Run `ruff format` to ensure code format
+        run: poetry run ruff format --check .
 
       - name: Run mypy to check types
         run: poetry run mypy .
diff --git a/Cargo.lock b/Cargo.lock
index 55e868a6d5..4dd195a895 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4405,12 +4405,14 @@ dependencies = [
 "async-stream",
 "aws-config",
 "aws-sdk-s3",
+ "aws-smithy-async",
 "bincode",
 "bytes",
 "chrono",
 "clap",
 "crc32c",
 "either",
+ "futures",
 "futures-util",
 "hex",
 "histogram",
@@ -4473,6 +4475,7 @@ dependencies = [
 "serde",
 "serde_json",
 "serde_with",
+ "sha2",
 "signal-hook",
 "storage_broker",
 "thiserror",
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index f974d6023d..fd19b7e53f 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,7 +3,7 @@ use std::{thread, time::Duration};
 
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, info};
+use tracing::{debug, info, warn};
 
 use crate::compute::ComputeNode;
 
@@ -84,6 +84,29 @@ fn watch_compute_activity(compute: &ComputeNode) {
             }
         }
 
+        // If there are existing (logical) walsenders, do not suspend.
+        //
+        // walproposer doesn't currently show up in pg_stat_replication,
+        // but protect if it will be
+        let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
+        match cli.query_one(ws_count_query, &[]) {
+            Ok(r) => match r.try_get::<&str, i64>("count") {
+                Ok(num_ws) => {
+                    if num_ws > 0 {
+                        last_active = Some(Utc::now());
+                    }
+                }
+                Err(e) => {
+                    warn!("failed to parse ws count: {:?}", e);
+                    continue;
+                }
+            },
+            Err(e) => {
+                warn!("failed to get list of walsenders: {:?}", e);
+                continue;
+            }
+        }
+
         // Update the last activity in the shared state if we got a more recent one.
         let mut state = compute.state.lock().unwrap();
         // NB: `Some()` is always greater than `None`.
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 7d490016bf..fb0d251722 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -485,6 +485,13 @@ impl PageServerNode {
         Ok(self.http_client.list_timelines(*tenant_id).await?)
} + pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> { + Ok(self + .http_client + .tenant_secondary_download(*tenant_id) + .await?) + } + pub async fn timeline_create( &self, tenant_id: TenantId, diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index 79df108896..23ea8f4060 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -11,6 +11,7 @@ use crate::{ use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; +use pageserver_api::shard::TenantShardId; use std::collections::HashMap; use std::time::Duration; use utils::{ @@ -40,9 +41,9 @@ async fn await_lsn( loop { let latest = match get_lsns(tenant_id, pageserver).await { Ok(l) => l, - Err(e) => { + Err(_e) => { println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", + "🕑 Waiting for pageserver {} to activate...", pageserver.conf.id ); std::thread::sleep(Duration::from_millis(500)); @@ -89,7 +90,7 @@ pub async fn migrate_tenant( tenant_id: TenantId, dest_ps: PageServerNode, ) -> anyhow::Result<()> { - // Get a new generation + println!("🤔 Checking existing status..."); let attachment_service = AttachmentService::from_env(env); fn build_location_config( @@ -135,6 +136,20 @@ pub async fn migrate_tenant( baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?); } + println!( + "🔁 Downloading latest layers to destination pageserver {}", + dest_ps.conf.id + ); + match dest_ps + .tenant_secondary_download(&TenantShardId::unsharded(tenant_id)) + .await + { + Ok(()) => {} + Err(_) => { + println!(" (skipping, destination wasn't in secondary mode)") + } + } + let gen = attachment_service .attach_hook(tenant_id, dest_ps.conf.id) .await?; diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 95bed83ae5..12fa80349e 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `black`, `ruff`, and type hints via `mypy`. +We force code formatting via `ruff`, and type hints via `mypy`. Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run black . # All code is reformatted -poetry run ruff . # Python linter -poetry run mypy . # Ensure there are no typing errors +poetry run ruff format . # All code is reformatted +poetry run ruff check . # Python linter +poetry run mypy . # Ensure there are no typing errors ``` **WARNING**: do not run `mypy` from a directory other than the root of the repository. 
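For reference, the secondary-download step used in the migration flow above maps onto the new pageserver endpoint `POST /v1/tenant/:tenant_shard_id/secondary/download` added later in this patch. A minimal sketch of invoking it directly, with a placeholder pageserver address and tenant id (neither is taken from this patch), noting that the call only does useful work for a tenant location configured in secondary mode:

```bash
# Hypothetical pageserver HTTP address and unsharded tenant id.
curl -X POST "http://127.0.0.1:9898/v1/tenant/<tenant_id>/secondary/download"
```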
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index d680a5600e..3e1bba2a06 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -142,7 +142,7 @@ impl Key { } pub fn is_rel_block_key(key: &Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 + key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff } impl std::str::FromStr for Key { diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 16651c322e..80183506d8 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -124,6 +124,9 @@ impl KeySpaceAccum { if range.start == accum.end { accum.end = range.end; } else { + // TODO: to efficiently support small sharding stripe sizes, we should avoid starting + // a new range here if the skipped region was all keys that don't belong on this shard. + // (https://github.com/neondatabase/neon/issues/6247) assert!(range.start > accum.end); self.ranges.push(accum.clone()); *accum = range; diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 3e4936eec4..18ef2be523 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -422,6 +422,21 @@ impl ShardIdentity { } } + /// Return true if the key should be discarded if found in this shard's + /// data store, e.g. during compaction after a split + pub fn is_key_disposable(&self, key: &Key) -> bool { + if key_is_shard0(key) { + // Q: Why can't we dispose of shard0 content if we're not shard 0? + // A: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + false + } else { + !self.is_key_local(key) + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) @@ -515,12 +530,7 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - // - // In this condition: - // - is_rel_block_key includes only relations, i.e. excludes SLRU data and - // all metadata. - // - field6 is set to -1 for relation size pages. 
- !(is_rel_block_key(key) && key.field6 != 0xffffffff) + !is_rel_block_key(key) } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 1dae008a4f..73d25619c3 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -35,6 +35,12 @@ pub enum QueryError { /// We were instructed to shutdown while processing the query #[error("Shutting down")] Shutdown, + /// Query handler indicated that client should reconnect + #[error("Server requested reconnect")] + Reconnect, + /// Query named an entity that was not found + #[error("Not found: {0}")] + NotFound(std::borrow::Cow<'static, str>), /// Authentication failure #[error("Unauthorized: {0}")] Unauthorized(std::borrow::Cow<'static, str>), @@ -54,9 +60,9 @@ impl From for QueryError { impl QueryError { pub fn pg_error_code(&self) -> &'static [u8; 5] { match self { - Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure + Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, - Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR, + Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR, Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error } } @@ -425,6 +431,11 @@ impl PostgresBackend { info!("Stopped due to shutdown"); Ok(()) } + Err(QueryError::Reconnect) => { + // Dropping out of this loop implicitly disconnects + info!("Stopped due to handler reconnect request"); + Ok(()) + } Err(QueryError::Disconnected(e)) => { info!("Disconnected ({e:#})"); // Disconnection is not an error: we just use it that way internally to drop @@ -974,7 +985,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I pub fn short_error(e: &QueryError) -> String { match e { QueryError::Disconnected(connection_error) => connection_error.to_string(), + QueryError::Reconnect => "reconnect".to_string(), QueryError::Shutdown => "shutdown".to_string(), + QueryError::NotFound(_) => "not found".to_string(), QueryError::Unauthorized(_e) => "JWT authentication error".to_string(), QueryError::SimulatedConnectionError => "simulated connection error".to_string(), QueryError::Other(e) => format!("{e:#}"), @@ -996,9 +1009,15 @@ fn log_query_error(query: &str, e: &QueryError) { QueryError::SimulatedConnectionError => { error!("query handler for query '{query}' failed due to a simulated connection error") } + QueryError::Reconnect => { + info!("query handler for '{query}' requested client to reconnect") + } QueryError::Shutdown => { info!("query handler for '{query}' cancelled during tenant shutdown") } + QueryError::NotFound(reason) => { + info!("query handler for '{query}' entity not found: {reason}") + } QueryError::Unauthorized(e) => { warn!("query handler for '{query}' failed with authentication error: {e}"); } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 7ea1103eb2..18cf5d97ba 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -322,6 +322,12 @@ impl RemoteStorage for AzureBlobStorage { } Ok(()) } + + async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> { + Err(anyhow::anyhow!( + "copy for azure blob storage is not implemented" + )) + } } pin_project_lite::pin_project! 
{ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 3e408e3119..942d0016b0 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -207,6 +207,9 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; + + /// Copy a remote object inside a bucket from one path to another. + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>; } pub type DownloadStream = Pin> + Unpin + Send + Sync>>; @@ -374,6 +377,15 @@ impl GenericRemoteStorage { Self::Unreliable(s) => s.delete_objects(paths).await, } } + + pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.copy(from, to).await, + Self::AwsS3(s) => s.copy(from, to).await, + Self::AzureBlob(s) => s.copy(from, to).await, + Self::Unreliable(s) => s.copy(from, to).await, + } + } } impl GenericRemoteStorage { @@ -660,6 +672,7 @@ impl ConcurrencyLimiter { RequestKind::Put => &self.write, RequestKind::List => &self.read, RequestKind::Delete => &self.write, + RequestKind::Copy => &self.write, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index d1e7d325b9..bf8b6b5dde 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -409,6 +409,20 @@ impl RemoteStorage for LocalFs { } Ok(()) } + + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + let from_path = from.with_base(&self.storage_root); + let to_path = to.with_base(&self.storage_root); + create_target_directory(&to_path).await?; + fs::copy(&from_path, &to_path).await.with_context(|| { + format!( + "Failed to copy file from '{from_path}' to '{to_path}'", + from_path = from_path, + to_path = to_path + ) + })?; + Ok(()) + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0f95458ad1..d7b41edaaf 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -493,6 +493,38 @@ impl RemoteStorage for S3Bucket { Ok(()) } + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + let kind = RequestKind::Copy; + let _guard = self.permit(kind).await; + + let started_at = start_measuring_requests(kind); + + // we need to specify bucket_name as a prefix + let copy_source = format!( + "{}/{}", + self.bucket_name, + self.relative_path_to_s3_object(from) + ); + + let res = self + .client + .copy_object() + .bucket(self.bucket_name.clone()) + .key(self.relative_path_to_s3_object(to)) + .copy_source(copy_source) + .send() + .await; + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + res?; + + Ok(()) + } + async fn download(&self, from: &RemotePath) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs index ea11edafa5..21dde14906 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/s3_bucket/metrics.rs @@ -11,6 +11,7 @@ pub(crate) enum RequestKind { Put = 1, Delete = 2, List = 3, + Copy = 4, } use RequestKind::*; @@ -22,6 +23,7 @@ impl RequestKind { 
Put => "put_object", Delete => "delete_object", List => "list_objects", + Copy => "copy_object", } } const fn as_index(&self) -> usize { @@ -29,7 +31,7 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 4]); +pub(super) struct RequestTyped([C; 5]); impl RequestTyped { pub(super) fn get(&self, kind: RequestKind) -> &C { @@ -38,8 +40,8 @@ impl RequestTyped { fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 802b0db7f5..7f5adcea30 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -162,4 +162,11 @@ impl RemoteStorage for UnreliableWrapper { } Ok(()) } + + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + // copy is equivalent to download + upload + self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Upload(to.clone()))?; + self.inner.copy_object(from, to).await + } } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 786712deb1..ce5a1e411e 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -51,3 +51,9 @@ pub struct SkTimelineInfo { #[serde(default)] pub http_connstr: Option, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineCopyRequest { + pub target_timeline_id: TimelineId, + pub until_lsn: Lsn, +} diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index ac68b04888..3e9281ac81 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -31,6 +31,9 @@ pub enum ApiError { #[error("Shutting down")] ShuttingDown, + #[error("Timeout")] + Timeout(Cow<'static, str>), + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -67,6 +70,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9e9b0adfe5..890061dc59 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -85,6 +85,8 @@ pub mod sync; pub mod failpoint_support; +pub mod yielding_loop; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 31c76d2f74..abc3842da8 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -15,6 +15,12 @@ pub struct Gate { name: String, } +impl std::fmt::Debug for Gate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Gate<{}>", self.name) + } +} + /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. 
 #[derive(Debug)]
diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs
new file mode 100644
index 0000000000..963279eb4c
--- /dev/null
+++ b/libs/utils/src/yielding_loop.rs
@@ -0,0 +1,35 @@
+use tokio_util::sync::CancellationToken;
+
+#[derive(thiserror::Error, Debug)]
+pub enum YieldingLoopError {
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
+/// yields to avoid blocking the executor, and after resuming checks the provided
+/// cancellation token to drop out promptly on shutdown.
+#[inline(always)]
+pub async fn yielding_loop<I, T, F>(
+    interval: usize,
+    cancel: &CancellationToken,
+    iter: I,
+    mut visitor: F,
+) -> Result<(), YieldingLoopError>
+where
+    I: Iterator<Item = T>,
+    F: FnMut(T),
+{
+    for (i, item) in iter.enumerate() {
+        visitor(item);
+
+        if (i + 1) % interval == 0 {
+            tokio::task::yield_now().await;
+            if cancel.is_cancelled() {
+                return Err(YieldingLoopError::Cancelled);
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 35c8f6904d..7251545792 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -425,7 +425,7 @@ mod tests {
         }
 
         fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-            println!("walprop_log[{}] {}", level, msg);
+            println!("wp_log[{}] {}", level, msg);
         }
 
         fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index ba41866935..4837626086 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -13,6 +13,7 @@ use bytes::{Buf, Bytes};
 use pageserver::{
     config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
+use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};
 
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) {
     let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
 
-    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
-    let manager = PostgresRedoManager::new(conf, tenant_id);
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
     let manager = Arc::new(manager);
 
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 87e4ed8efd..4c285293f7 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use pageserver_api::models::*;
+use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method};
 use utils::{
     http::error::HttpErrorBody,
@@ -164,6 +164,18 @@ impl Client {
         Ok(())
     }
 
+    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{}/secondary/download",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        self.request(Method::POST, &uri, ())
+            .await?
+ .error_for_status() + .map(|_| ()) + .map_err(|e| Error::ApiError(format!("{}", e))) + } + pub async fn location_config( &self, tenant_id: TenantId, diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index fc0d2311f7..231461267a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -115,15 +115,8 @@ impl PagestreamClient { pub async fn getpage( &mut self, - key: RelTagBlockNo, - lsn: Lsn, + req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamGetPageRequest { - latest: false, - rel: key.rel_tag, - blkno: key.block_no, - lsn, - }; let req = PagestreamFeMessage::GetPage(req); let req: bytes::Bytes = req.serialize(); // let mut req = tokio_util::io::ReaderStream::new(&req); diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 16d198ab0e..cb36a403f1 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -3,7 +3,7 @@ use futures::future::join_all; use pageserver::pgdatadir_mapping::key_to_rel_block; use pageserver::repository; use pageserver_api::key::is_rel_block_key; -use pageserver_client::page_service::RelTagBlockNo; +use pageserver_api::models::PagestreamGetPageRequest; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -39,6 +39,9 @@ pub(crate) struct Args { runtime: Option, #[clap(long)] per_target_rate_limit: Option, + /// Probability for sending `latest=true` in the request (uniform distribution). + #[clap(long, default_value = "1")] + req_latest_probability: f64, #[clap(long)] limit_to_first_n_targets: Option, targets: Option>, @@ -200,18 +203,26 @@ async fn main_impl( start_work_barrier.wait().await; loop { - let (range, key) = { + let (timeline, req) = { let mut rng = rand::thread_rng(); let r = &all_ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); let key = repository::Key::from_i128(key); let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); - (r, RelTagBlockNo { rel_tag, block_no }) + ( + r.timeline, + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + }, + ) }; - let sender = work_senders.get(&range.timeline).unwrap(); + let sender = work_senders.get(&timeline).unwrap(); // TODO: what if this blocks? 
- sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + sender.send(req).await.ok().unwrap(); } }), Some(rps_limit) => Box::pin(async move { @@ -240,16 +251,21 @@ async fn main_impl( ); loop { ticker.tick().await; - let (range, key) = { + let req = { let mut rng = rand::thread_rng(); let r = &ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); let key = repository::Key::from_i128(key); let (rel_tag, block_no) = key_to_rel_block(key) .expect("we filter non-rel-block keys out above"); - (r, RelTagBlockNo { rel_tag, block_no }) + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } }; - sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + sender.send(req).await.ok().unwrap(); } }) }; @@ -303,7 +319,7 @@ async fn client( args: &'static Args, timeline: TenantTimelineId, start_work_barrier: Arc, - mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>, + mut work: tokio::sync::mpsc::Receiver, all_work_done_barrier: Arc, live_stats: Arc, ) { @@ -317,10 +333,10 @@ async fn client( .await .unwrap(); - while let Some((key, lsn)) = work.recv().await { + while let Some(req) = work.recv().await { let start = Instant::now(); client - .getpage(key, lsn) + .getpage(req) .await .with_context(|| format!("getpage for {timeline}")) .unwrap(); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ed452eae7d..7e5ae892ad 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -23,6 +23,7 @@ use tracing::*; use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; +use crate::pgdatadir_mapping::Version; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -174,7 +175,7 @@ where ] { for segno in self .timeline - .list_slru_segments(kind, self.lsn, self.ctx) + .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx) .await? { self.add_slru_segment(kind, segno).await?; @@ -192,7 +193,7 @@ where // Otherwise only include init forks of unlogged relations. 
let rels = self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty @@ -267,7 +268,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(src, self.lsn, false, self.ctx) + .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) .await?; // If the relation is empty, create an empty file @@ -288,7 +289,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -310,7 +311,7 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn, self.ctx) + .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); @@ -352,7 +353,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) + .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; ensure!( @@ -399,7 +400,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await? .is_empty() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8516f397ca..7c03dc1bdd 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -37,8 +37,8 @@ use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME, - TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, + TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -75,6 +75,9 @@ pub mod defaults { pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; /// /// Default built-in configuration file. 
@@ -88,6 +91,7 @@ pub mod defaults { #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' +#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} # initial superuser role name to use when creating a new tenant @@ -108,6 +112,8 @@ pub mod defaults { #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' +#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -125,6 +131,7 @@ pub mod defaults { #gc_feedback = false #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} +#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} [remote_storage] @@ -233,6 +240,13 @@ pub struct PageServerConf { /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize /// heatmap uploads vs. other remote storage operations. pub heatmap_upload_concurrency: usize, + + /// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly + /// deprioritises secondary downloads vs. remote storage operations for attached tenants. + pub secondary_download_concurrency: usize, + + /// Maximum number of WAL records to be ingested and committed at the same time + pub ingest_batch_size: u64, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -314,6 +328,9 @@ struct PageServerConfigBuilder { control_plane_emergency_mode: BuilderValue, heatmap_upload_concurrency: BuilderValue, + secondary_download_concurrency: BuilderValue, + + ingest_batch_size: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -386,6 +403,9 @@ impl Default for PageServerConfigBuilder { control_plane_emergency_mode: Set(false), heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), } } } @@ -534,6 +554,14 @@ impl PageServerConfigBuilder { self.heatmap_upload_concurrency = BuilderValue::Set(value) } + pub fn secondary_download_concurrency(&mut self, value: usize) { + self.secondary_download_concurrency = BuilderValue::Set(value) + } + + pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { + self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -632,10 +660,15 @@ impl PageServerConfigBuilder { control_plane_emergency_mode: self .control_plane_emergency_mode .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self .heatmap_upload_concurrency .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, + secondary_download_concurrency: self + .secondary_download_concurrency + .ok_or(anyhow!("missing secondary_download_concurrency"))?, + ingest_batch_size: self + .ingest_batch_size + .ok_or(anyhow!("missing ingest_batch_size"))?, }) } } @@ -693,6 +726,11 @@ impl PageServerConf { .join(TENANT_LOCATION_CONFIG_NAME) } + pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) + .join(TENANT_HEATMAP_BASENAME) + } + pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TIMELINES_SEGMENT_NAME) @@ -878,6 +916,10 @@ impl PageServerConf { 
"heatmap_upload_concurrency" => { builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) }, + "secondary_download_concurrency" => { + builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) + }, + "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -949,6 +991,8 @@ impl PageServerConf { control_plane_api_token: None, control_plane_emergency_mode: false, heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, } } } @@ -1177,7 +1221,9 @@ background_task_maximum_delay = '334 s' control_plane_api: None, control_plane_api_token: None, control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, }, "Correct defaults should be used when no config values are provided" ); @@ -1238,7 +1284,9 @@ background_task_maximum_delay = '334 s' control_plane_api: None, control_plane_api_token: None, control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: 100, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 157e6b4e3e..5c7747d353 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -152,6 +152,7 @@ impl From for ApiError { PageReconstructError::AncestorStopping(_) => { ApiError::ResourceUnavailable(format!("{pre}").into()) } + PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } } @@ -1273,6 +1274,23 @@ async fn put_tenant_location_config_handler( // which is not a 400 but a 409. 
.map_err(ApiError::BadRequest)?; + if let Some(_flush_ms) = flush { + match state + .secondary_controller + .upload_tenant(tenant_shard_id) + .await + { + Ok(()) => { + tracing::info!("Uploaded heatmap during flush"); + } + Err(e) => { + tracing::warn!("Failed to flush heatmap: {e}"); + } + } + } else { + tracing::info!("No flush requested when configuring"); + } + json_response(StatusCode::OK, ()) } @@ -1610,6 +1628,21 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn secondary_download_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + state + .secondary_controller + .download_tenant(tenant_shard_id) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1878,6 +1911,9 @@ pub fn make_router( .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { + api_handler(r, secondary_download_handler) + }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d95d75449d..d66df36b3a 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -21,6 +21,7 @@ use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; @@ -312,13 +313,16 @@ async fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + WAL_INGEST.records_committed.inc(); + + modification.commit(ctx).await?; last_lsn = lsn; nrecords += 1; @@ -448,13 +452,14 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); - let mut modification = tline.begin_modification(end_lsn); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + modification.commit(ctx).await?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c1ce0af47b..26070e0cc1 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -117,6 +117,10 @@ pub const TENANT_CONFIG_NAME: &str = "config"; /// Full path: `tenants//config`. pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; +/// Per-tenant copy of their remote heatmap, downloaded into the local +/// tenant path while in secondary mode. +pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; + /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. 
pub const TEMP_FILE_SUFFIX: &str = "___temp"; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4725903783..6f4431c3cf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[ // Metrics collected on operations on the storage repository. #[derive(Debug, EnumVariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] -pub enum StorageTimeOperation { +pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, @@ -55,7 +55,7 @@ pub enum StorageTimeOperation { CreateTenant, } -pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { register_counter_vec!( "pageserver_storage_operations_seconds_sum", "Total time spent on storage operations with operation, tenant and timeline dimensions", @@ -64,7 +64,7 @@ pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_storage_operations_seconds_count", "Count of storage operations with operation, tenant and timeline dimensions", @@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub struct PageCacheMetricsForTaskKind { +pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, @@ -159,7 +159,7 @@ pub struct PageCacheMetricsForTaskKind { pub read_hits_materialized_page_older_lsn: IntCounter, } -pub struct PageCacheMetrics { +pub(crate) struct PageCacheMetrics { map: EnumMap>, } @@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { +pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { let task_kind = ::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); @@ -243,10 +243,9 @@ impl PageCacheMetrics { } } -pub struct PageCacheSizeMetrics { +pub(crate) struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, - pub current_bytes_ephemeral: UIntGauge, pub current_bytes_immutable: UIntGauge, pub current_bytes_materialized_page: UIntGauge, } @@ -260,31 +259,26 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheSizeMetrics { - max_bytes: { - register_uint_gauge!( - "pageserver_page_cache_size_max_bytes", - "Maximum size of the page cache in bytes" - ) - .expect("failed to define a metric") - }, - - current_bytes_ephemeral: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["ephemeral"]) - .unwrap() - }, - current_bytes_immutable: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["immutable"]) - .unwrap() - }, - current_bytes_materialized_page: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["materialized_page"]) - .unwrap() - }, -}); +pub(crate) static PAGE_CACHE_SIZE: Lazy = + Lazy::new(|| PageCacheSizeMetrics { + max_bytes: { + register_uint_gauge!( + "pageserver_page_cache_size_max_bytes", + "Maximum size of the page cache in bytes" + ) + .expect("failed to define a metric") + }, + 
current_bytes_immutable: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["immutable"]) + .unwrap() + }, + current_bytes_materialized_page: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["materialized_page"]) + .unwrap() + }, + }); pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; @@ -740,13 +734,13 @@ pub(crate) static TENANT: Lazy = Lazy::new(|| { /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] -pub struct EvictionsWithLowResidenceDuration { +pub(crate) struct EvictionsWithLowResidenceDuration { data_source: &'static str, threshold: Duration, counter: Option, } -pub struct EvictionsWithLowResidenceDurationBuilder { +pub(crate) struct EvictionsWithLowResidenceDurationBuilder { data_source: &'static str, threshold: Duration, } @@ -1009,7 +1003,7 @@ pub enum SmgrQueryType { } #[derive(Debug)] -pub struct SmgrQueryTimePerTimeline { +pub(crate) struct SmgrQueryTimePerTimeline { metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], } @@ -1181,8 +1175,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { .map(|ms| (ms as f64) / 1000.0) }); -pub struct BasebackupQueryTime(HistogramVec); -pub static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { +pub(crate) struct BasebackupQueryTime(HistogramVec); +pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { BasebackupQueryTime({ register_histogram_vec!( "pageserver_basebackup_query_seconds", @@ -1202,7 +1196,7 @@ impl DurationResultObserver for BasebackupQueryTime { } } -pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { +pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_live_connections", "Number of live network connections", @@ -1369,6 +1363,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, pub(crate) upload_heatmap_duration: Histogram, + pub(crate) download_heatmap: IntCounter, + pub(crate) download_layer: IntCounter, } pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { upload_heatmap: register_int_counter!( @@ -1386,6 +1382,16 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Time to build and upload a heatmap, including any waiting inside the S3 client" ) .expect("failed to define a metric"), + download_heatmap: register_int_counter!( + "pageserver_secondary_download_heatmap", + "Number of downloads of heatmaps by secondary mode locations" + ) + .expect("failed to define a metric"), + download_layer: register_int_counter!( + "pageserver_secondary_download_layer", + "Number of downloads of layers by secondary mode locations" + ) + .expect("failed to define a metric"), }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1655,7 +1661,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); /// Similar to `prometheus::HistogramTimer` but does not record on drop. -pub struct StorageTimeMetricsTimer { +pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, } @@ -1680,7 +1686,7 @@ impl StorageTimeMetricsTimer { /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. 
#[derive(Clone, Debug)] -pub struct StorageTimeMetrics { +pub(crate) struct StorageTimeMetrics { /// Sum of f64 seconds, per operation, tenant_id and timeline_id timeline_sum: Counter, /// Number of oeprations, per operation, tenant_id and timeline_id @@ -1719,7 +1725,7 @@ impl StorageTimeMetrics { } #[derive(Debug)] -pub struct TimelineMetrics { +pub(crate) struct TimelineMetrics { tenant_id: String, shard_id: String, timeline_id: String, @@ -1927,7 +1933,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge { } } -pub struct RemoteTimelineClientMetrics { +pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, remote_physical_size_gauge: Mutex>, @@ -2225,7 +2231,7 @@ impl Drop for RemoteTimelineClientMetrics { /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. -pub trait MeasureRemoteOp: Sized { +pub(crate) trait MeasureRemoteOp: Sized { fn measure_remote_op( self, tenant_id: TenantId, @@ -2250,7 +2256,7 @@ pub trait MeasureRemoteOp: Sized { impl MeasureRemoteOp for T {} pin_project! { - pub struct MeasuredRemoteOp + pub(crate) struct MeasuredRemoteOp { #[pin] inner: F, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d5ca7f7382..291490d016 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -25,6 +25,7 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; +use std::borrow::Cow; use std::io; use std::net::TcpListener; use std::pin::pin; @@ -53,7 +54,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::pgdatadir_mapping::rel_block_to_key; +use crate::pgdatadir_mapping::{rel_block_to_key, Version}; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; @@ -61,6 +62,9 @@ use crate::tenant::mgr; use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::ShardSelector; +use crate::tenant::timeline::WaitLsnError; +use crate::tenant::GetTimelineError; +use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::trace::Tracer; @@ -283,6 +287,64 @@ struct PageServerHandler { connection_ctx: RequestContext, } +#[derive(thiserror::Error, Debug)] +enum PageStreamError { + /// We encountered an error that should prompt the client to reconnect: + /// in practice this means we drop the connection without sending a response. + #[error("Reconnect required: {0}")] + Reconnect(Cow<'static, str>), + + /// We were instructed to shutdown while processing the query + #[error("Shutting down")] + Shutdown, + + /// Something went wrong reading a page: this likely indicates a pageserver bug + #[error("Read error: {0}")] + Read(PageReconstructError), + + /// Ran out of time waiting for an LSN + #[error("LSN timeout: {0}")] + LsnTimeout(WaitLsnError), + + /// The entity required to serve the request (tenant or timeline) is not found, + /// or is not found in a suitable state to serve a request. 
+ #[error("Not found: {0}")] + NotFound(std::borrow::Cow<'static, str>), + + /// Request asked for something that doesn't make sense, like an invalid LSN + #[error("Bad request: {0}")] + BadRequest(std::borrow::Cow<'static, str>), +} + +impl From for PageStreamError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => Self::Shutdown, + e => Self::Read(e), + } + } +} + +impl From for PageStreamError { + fn from(value: GetActiveTimelineError) -> Self { + match value { + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), + GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), + } + } +} + +impl From for PageStreamError { + fn from(value: WaitLsnError) -> Self { + match value { + e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), + WaitLsnError::Shutdown => Self::Shutdown, + WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + } + } +} + impl PageServerHandler { pub fn new( conf: &'static PageServerConf, @@ -428,7 +490,7 @@ impl PageServerHandler { // Check that the timeline exists let timeline = tenant .get_timeline(timeline_id, true) - .map_err(|e| anyhow::anyhow!(e))?; + .map_err(|e| QueryError::NotFound(format!("{e}").into()))?; // Avoid starting new requests if the timeline has already started shutting down, // and block timeline shutdown until this request is complete, or drops out due @@ -520,32 +582,44 @@ impl PageServerHandler { } }; - if let Err(e) = &response { - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - if timeline.cancel.is_cancelled() || timeline.is_stopping() { + match response { + Err(PageStreamError::Shutdown) => { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the // connection. - span.in_scope(|| info!("dropped response during shutdown: {e:#}")); + span.in_scope(|| info!("dropping connection due to shutdown")); return Err(QueryError::Shutdown); } + Err(PageStreamError::Reconnect(reason)) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => { + // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean + // shutdown error, this may be buried inside a PageReconstructError::Other for example. + // + // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, + // because wait_lsn etc will drop out + // is_stopping(): [`Timeline::flush_and_shutdown`] has entered + // is_canceled(): [`Timeline::shutdown`]` has entered + span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); + return Err(QueryError::Shutdown); + } + r => { + let response_msg = r.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. 
+ span.in_scope(|| error!("error reading relation or page version: {:#}", e)); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + self.flush_cancellable(pgb, &timeline.cancel).await?; + } } - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. - span.in_scope(|| error!("error reading relation or page version: {:#}", e)); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?; - self.flush_cancellable(pgb, &timeline.cancel).await?; } Ok(()) } @@ -692,7 +766,7 @@ impl PageServerHandler { latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { if latest { // Latest page version was requested. If LSN is given, it is a hint // to the page server that there have been no modifications to the @@ -723,15 +797,19 @@ impl PageServerHandler { } } else { if lsn == Lsn(0) { - anyhow::bail!("invalid LSN(0) in request"); + return Err(PageStreamError::BadRequest( + "invalid LSN(0) in request".into(), + )); } timeline.wait_lsn(lsn, ctx).await?; } - anyhow::ensure!( - lsn >= **latest_gc_cutoff_lsn, - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ); + + if lsn < **latest_gc_cutoff_lsn { + return Err(PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + lsn, **latest_gc_cutoff_lsn + ).into())); + } Ok(lsn) } @@ -740,14 +818,14 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamExistsRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; let exists = timeline - .get_rel_exists(req.rel, lsn, req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -760,13 +838,15 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamNblocksRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; + let n_blocks = timeline + .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -778,14 +858,20 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamDbSizeRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) + .get_db_size( + DEFAULTTABLESPACE_OID, + req.dbnode, + Version::Lsn(lsn), + req.latest, + ctx, + ) .await?; let db_size = total_blocks as i64 * BLCKSZ as 
i64; @@ -794,30 +880,35 @@ impl PageServerHandler { })) } + async fn do_handle_get_page_at_lsn_request( + &self, + timeline: &Timeline, + req: &PagestreamGetPageRequest, + ctx: &RequestContext, + ) -> Result { + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; + let page = timeline + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .await?; + + Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { + page, + })) + } + async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, ctx: &RequestContext, - ) -> anyhow::Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; - /* - // Add a 1s delay to some requests. The delay helps the requests to - // hit the race condition from github issue #1047 more easily. - use rand::Rng; - if rand::thread_rng().gen::() < 5 { - std::thread::sleep(std::time::Duration::from_millis(1000)); - } - */ - + ) -> Result { let key = rel_block_to_key(req.rel, req.blkno); - let page = if timeline.get_shard_identity().is_key_local(&key) { - timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) - .await? + if timeline.get_shard_identity().is_key_local(&key) { + self.do_handle_get_page_at_lsn_request(timeline, req, ctx) + .await } else { // The Tenant shard we looked up at connection start does not hold this particular // key: look for other shards in this tenant. This scenario occurs if a pageserver @@ -836,30 +927,30 @@ impl PageServerHandler { Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { // We already know this tenant exists in general, because we resolved it at // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node. - - // TODO: this should be some kind of structured error that the client will understand, - // so that it can block until its config is updated: this error is expected in the case - // that the Tenant's shards' placements are being updated and the client hasn't been - // informed yet. - // - // https://github.com/neondatabase/neon/issues/6038 - return Err(anyhow::anyhow!("Request routed to wrong shard")); + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}", + timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key); + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } Err(e) => return Err(e.into()), }; // Take a GateGuard for the duration of this request. If we were using our main Timeline object, // the GateGuard was already held over the whole connection. - let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?; - timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) - .await? 
- }; + let _timeline_guard = timeline + .gate + .enter() + .map_err(|_| PageStreamError::Shutdown)?; - Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { - page, - })) + self.do_handle_get_page_at_lsn_request(&timeline, req, ctx) + .await + } } #[allow(clippy::too_many_arguments)] @@ -1000,9 +1091,7 @@ impl PageServerHandler { ) .await .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?; + let timeline = tenant.get_timeline(timeline_id, true)?; Ok(timeline) } } @@ -1424,14 +1513,15 @@ enum GetActiveTimelineError { #[error(transparent)] Tenant(GetActiveTenantError), #[error(transparent)] - Timeline(anyhow::Error), + Timeline(#[from] GetTimelineError), } impl From for QueryError { fn from(e: GetActiveTimelineError) -> Self { match e { + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown, GetActiveTimelineError::Tenant(e) => e.into(), - GetActiveTimelineError::Timeline(e) => QueryError::Other(e), + GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()), } } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index e9884a15f5..f11a72f2ab 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -11,7 +11,7 @@ use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; -use anyhow::Context; +use anyhow::{ensure, Context}; use bytes::{Buf, Bytes}; use pageserver_api::key::is_rel_block_key; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -147,6 +147,7 @@ impl Timeline { { DatadirModification { tline: self, + pending_lsns: Vec::new(), pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, @@ -159,11 +160,11 @@ impl Timeline { //------------------------------------------------------------------------------ /// Look up given page version. 
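To make the error split in the page-service handlers above concrete, here is a minimal std-only sketch: validation failures become typed variants that are answered on the same connection, while shard-routing misses close the connection so the client can reload its shard map. `PageStreamSketchError`, the `Lsn` alias and the helper functions are simplified stand-ins, not the pageserver's real types.

    type Lsn = u64;

    #[derive(Debug)]
    enum PageStreamSketchError {
        BadRequest(String), // answered with an error response on the same connection
        Reconnect(String),  // connection is dropped; client retries after reloading its config
    }

    fn validate_lsn(req_lsn: Lsn, gc_cutoff: Lsn) -> Result<Lsn, PageStreamSketchError> {
        if req_lsn == 0 {
            return Err(PageStreamSketchError::BadRequest("invalid LSN(0) in request".into()));
        }
        if req_lsn < gc_cutoff {
            return Err(PageStreamSketchError::BadRequest(format!(
                "requested LSN {req_lsn} is below the GC cutoff {gc_cutoff}"
            )));
        }
        Ok(req_lsn)
    }

    fn route(key_is_local: bool) -> Result<(), PageStreamSketchError> {
        if key_is_local {
            Ok(())
        } else {
            Err(PageStreamSketchError::Reconnect(
                "getpage@lsn request routed to wrong shard".into(),
            ))
        }
    }

    fn main() {
        assert!(matches!(validate_lsn(0, 10), Err(PageStreamSketchError::BadRequest(_))));
        assert_eq!(validate_lsn(42, 10).unwrap(), 42);
        assert!(matches!(route(false), Err(PageStreamSketchError::Reconnect(_))));
    }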
- pub async fn get_rel_page_at_lsn( + pub(crate) async fn get_rel_page_at_lsn( &self, tag: RelTag, blknum: BlockNumber, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -173,44 +174,47 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", - tag, blknum, lsn, nblocks + tag, + blknum, + version.get_lsn(), + nblocks ); return Ok(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn, ctx).await + version.get(self, key, ctx).await } // Get size of a database in blocks - pub async fn get_db_size( + pub(crate) async fn get_db_size( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; + let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) } /// Get size of a relation file - pub async fn get_rel_size( + pub(crate) async fn get_rel_size( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -220,12 +224,12 @@ impl Timeline { )); } - if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest, ctx).await? + && !self.get_rel_exists(tag, version, latest, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -235,7 +239,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -246,16 +250,16 @@ impl Timeline { // latest=true, then it can not cause cache corruption, because with latest=true // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be // associated with most recent value of LSN. - self.update_cached_rel_size(tag, lsn, nblocks); + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); } Ok(nblocks) } /// Does relation exist? - pub async fn get_rel_exists( + pub(crate) async fn get_rel_exists( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, _latest: bool, ctx: &RequestContext, ) -> Result { @@ -266,12 +270,12 @@ impl Timeline { } // first try to lookup relation in cache - if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -287,16 +291,16 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
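The `Version<'_>` parameter threaded through these signatures (its definition appears later in this file's hunk) lets a read come either from a committed LSN or from a pending, not-yet-flushed modification. A self-contained sketch of that lookup order, with `TimelineSketch`, `ModificationSketch` and `VersionSketch` as simplified stand-ins for the real types:

    use std::collections::HashMap;

    type Key = u64;
    type Lsn = u64;

    struct TimelineSketch {
        committed: HashMap<(Key, Lsn), String>,
    }

    struct ModificationSketch {
        lsn: Lsn,
        pending: HashMap<Key, String>,
    }

    enum VersionSketch<'a> {
        Lsn(Lsn),
        Modified(&'a ModificationSketch),
    }

    impl<'a> VersionSketch<'a> {
        // Reads against a Modified version see the batch's own uncommitted writes
        // first and only then fall back to committed data at the batch's LSN.
        fn get(&self, tline: &TimelineSketch, key: Key) -> Option<String> {
            match self {
                VersionSketch::Lsn(lsn) => tline.committed.get(&(key, *lsn)).cloned(),
                VersionSketch::Modified(m) => m
                    .pending
                    .get(&key)
                    .cloned()
                    .or_else(|| tline.committed.get(&(key, m.lsn)).cloned()),
            }
        }

        fn get_lsn(&self) -> Lsn {
            match self {
                VersionSketch::Lsn(lsn) => *lsn,
                VersionSketch::Modified(m) => m.lsn,
            }
        }
    }

    fn main() {
        let mut committed = HashMap::new();
        committed.insert((1u64, 10u64), "committed page".to_string());
        let tline = TimelineSketch { committed };

        let mut m = ModificationSketch { lsn: 10, pending: HashMap::new() };
        m.pending.insert(1, "pending page".to_string());

        assert_eq!(VersionSketch::Lsn(10).get(&tline, 1).as_deref(), Some("committed page"));
        assert_eq!(VersionSketch::Modified(&m).get(&tline, 1).as_deref(), Some("pending page"));
        assert_eq!(VersionSketch::Modified(&m).get_lsn(), 10);
    }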
- pub async fn list_rels( + pub(crate) async fn list_rels( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -315,7 +319,7 @@ impl Timeline { } /// Look up given SLRU page version. - pub async fn get_slru_page_at_lsn( + pub(crate) async fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -328,29 +332,29 @@ impl Timeline { } /// Get size of an SLRU segment - pub async fn get_slru_segment_size( + pub(crate) async fn get_slru_segment_size( &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - pub async fn get_slru_segment_exists( + pub(crate) async fn get_slru_segment_exists( &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -368,7 +372,7 @@ impl Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - pub async fn find_lsn_for_timestamp( + pub(crate) async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, cancel: &CancellationToken, @@ -448,7 +452,7 @@ impl Timeline { /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits /// with a smaller/larger timestamp. /// - pub async fn is_latest_commit_timestamp_ge_than( + pub(crate) async fn is_latest_commit_timestamp_ge_than( &self, search_timestamp: TimestampTz, probe_lsn: Lsn, @@ -471,7 +475,7 @@ impl Timeline { /// Obtain the possible timestamp range for the given lsn. /// /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps. - pub async fn get_timestamp_for_lsn( + pub(crate) async fn get_timestamp_for_lsn( &self, probe_lsn: Lsn, ctx: &RequestContext, @@ -501,11 +505,11 @@ impl Timeline { mut f: impl FnMut(TimestampTz) -> ControlFlow, ) -> Result { for segno in self - .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx) .await? 
{ let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) + .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self @@ -528,36 +532,36 @@ impl Timeline { } /// Get a list of SLRU segments - pub async fn list_slru_segments( + pub(crate) async fn list_slru_segments( &self, kind: SlruKind, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), } } - pub async fn get_relmap_file( + pub(crate) async fn get_relmap_file( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; Ok(buf) } - pub async fn list_dbdirs( + pub(crate) async fn list_dbdirs( &self, lsn: Lsn, ctx: &RequestContext, @@ -571,7 +575,7 @@ impl Timeline { } } - pub async fn get_twophase_file( + pub(crate) async fn get_twophase_file( &self, xid: TransactionId, lsn: Lsn, @@ -582,7 +586,7 @@ impl Timeline { Ok(buf) } - pub async fn list_twophase_files( + pub(crate) async fn list_twophase_files( &self, lsn: Lsn, ctx: &RequestContext, @@ -596,7 +600,7 @@ impl Timeline { } } - pub async fn get_control_file( + pub(crate) async fn get_control_file( &self, lsn: Lsn, ctx: &RequestContext, @@ -604,7 +608,7 @@ impl Timeline { self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint( + pub(crate) async fn get_checkpoint( &self, lsn: Lsn, ctx: &RequestContext, @@ -612,7 +616,7 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub async fn list_aux_files( + pub(crate) async fn list_aux_files( &self, lsn: Lsn, ctx: &RequestContext, @@ -652,7 +656,10 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { + for rel in self + .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx) + .await? + { if self.cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } @@ -692,7 +699,7 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn, ctx) + .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx) .await? .into_iter() .collect(); @@ -799,18 +806,39 @@ pub struct DatadirModification<'a> { /// in the state in 'tline' yet. pub tline: &'a Timeline, - /// Lsn assigned by begin_modification - pub lsn: Lsn, + /// Current LSN of the modification + lsn: Lsn, // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. 
- pending_updates: HashMap, - pending_deletions: Vec>, + pending_lsns: Vec, + pending_updates: HashMap>, + pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, } impl<'a> DatadirModification<'a> { + /// Get the current lsn + pub(crate) fn get_lsn(&self) -> Lsn { + self.lsn + } + + /// Set the current lsn + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { + ensure!( + lsn >= self.lsn, + "setting an older lsn {} than {} is not allowed", + lsn, + self.lsn + ); + if lsn > self.lsn { + self.pending_lsns.push(self.lsn); + self.lsn = lsn; + } + Ok(()) + } + /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -984,11 +1012,9 @@ impl<'a> DatadirModification<'a> { dbnode: Oid, ctx: &RequestContext, ) -> anyhow::Result<()> { - let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) .await?; // Remove entry from dbdir @@ -1077,8 +1103,11 @@ impl<'a> DatadirModification<'a> { ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); - let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { + if self + .tline + .get_rel_exists(rel, Version::Modified(self), true, ctx) + .await? + { let size_key = rel_size_to_key(rel); // Fetch the old size first let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1323,17 +1352,23 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::new(); - for (key, value) in self.pending_updates.drain() { - if is_rel_block_key(&key) || is_slru_block_key(key) { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. - writer.put(key, self.lsn, &value, ctx).await?; - } else { - retained_pending_updates.insert(key, value); + let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); + for (key, values) in self.pending_updates.drain() { + for (lsn, value) in values { + if is_rel_block_key(&key) || is_slru_block_key(key) { + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put(key, lsn, &value, ctx).await?; + } else { + retained_pending_updates + .entry(key) + .or_default() + .push((lsn, value)); + } } } - self.pending_updates.extend(retained_pending_updates); + + self.pending_updates = retained_pending_updates; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1350,18 +1385,28 @@ impl<'a> DatadirModification<'a> { /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let writer = self.tline.writer().await; - let lsn = self.lsn; + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - for (key, value) in self.pending_updates.drain() { - writer.put(key, lsn, &value, ctx).await?; - } - for key_range in self.pending_deletions.drain(..) 
{ - writer.delete(key_range, lsn).await?; + if !self.pending_updates.is_empty() { + writer.put_batch(&self.pending_updates, ctx).await?; + self.pending_updates.clear(); } - writer.finish_write(lsn); + if !self.pending_deletions.is_empty() { + writer.delete_batch(&self.pending_deletions).await?; + self.pending_deletions.clear(); + } + + self.pending_lsns.push(self.lsn); + for pending_lsn in self.pending_lsns.drain(..) { + // Ideally, we should be able to call writer.finish_write() only once + // with the highest LSN. However, the last_record_lsn variable in the + // timeline keeps track of the latest LSN and the immediate previous LSN + // so we need to record every LSN to not leave a gap between them. + writer.finish_write(pending_lsn); + } if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1370,44 +1415,86 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub(crate) fn is_empty(&self) -> bool { - self.pending_updates.is_empty() && self.pending_deletions.is_empty() + pub(crate) fn len(&self) -> usize { + self.pending_updates.len() + self.pending_deletions.len() } // Internal helper functions to batch the modifications async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the pending updated + // Have we already updated the same key? Read the latest pending updated // version in that case. // // Note: we don't check pending_deletions. It is an error to request a // value that has been removed, deletion only avoids leaking storage. - if let Some(value) = self.pending_updates.get(&key) { - if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::from(anyhow::anyhow!( - "unexpected pending WAL record" - ))) + if let Some(values) = self.pending_updates.get(&key) { + if let Some((_, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. 
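A runnable sketch of the multi-LSN batching that `DatadirModification` gains in this change: advancing the LSN must be monotonic, a put at the current LSN replaces the previous value for that key, and puts at later LSNs are appended so one flush can carry several WAL records. `Key`, `Lsn` and `Value` here are simplified stand-ins, not the repository's real types.

    use std::collections::HashMap;

    type Key = u64;
    type Lsn = u64;
    type Value = String;

    #[derive(Default)]
    struct ModificationSketch {
        lsn: Lsn,
        pending_lsns: Vec<Lsn>,
        pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
    }

    impl ModificationSketch {
        fn set_lsn(&mut self, lsn: Lsn) -> Result<(), String> {
            if lsn < self.lsn {
                return Err(format!("setting an older lsn {lsn} than {} is not allowed", self.lsn));
            }
            if lsn > self.lsn {
                // Remember the LSN we are leaving so finish_write() can be called for it later.
                self.pending_lsns.push(self.lsn);
                self.lsn = lsn;
            }
            Ok(())
        }

        fn put(&mut self, key: Key, val: Value) {
            let values = self.pending_updates.entry(key).or_default();
            // Replace the previous value if it exists at the same LSN.
            if let Some((last_lsn, last_value)) = values.last_mut() {
                if *last_lsn == self.lsn {
                    *last_value = val;
                    return;
                }
            }
            values.push((self.lsn, val));
        }
    }

    fn main() {
        let mut m = ModificationSketch { lsn: 10, ..Default::default() };
        m.put(1, "a".into());
        m.put(1, "a2".into());          // same LSN: replaces "a"
        m.set_lsn(20).unwrap();
        m.put(1, "b".into());           // later LSN: appended
        assert_eq!(m.pending_updates[&1], vec![(10u64, "a2".to_string()), (20, "b".to_string())]);
        assert_eq!(m.pending_lsns, vec![10]);
        assert!(m.set_lsn(5).is_err()); // going backwards is rejected
    }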
+ Err(PageReconstructError::from(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; } - } else { - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn, ctx).await } + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + self.tline.get(key, lsn, ctx).await } fn put(&mut self, key: Key, val: Value) { - self.pending_updates.insert(key, val); + let values = self.pending_updates.entry(key).or_default(); + // Replace the previous value if it exists at the same lsn + if let Some((last_lsn, last_value)) = values.last_mut() { + if *last_lsn == self.lsn { + *last_value = val; + return; + } + } + values.push((self.lsn, val)); } fn delete(&mut self, key_range: Range) { trace!("DELETE {}-{}", key_range.start, key_range.end); - self.pending_deletions.push(key_range); + self.pending_deletions.push((key_range, self.lsn)); + } +} + +/// This struct facilitates accessing either a committed key from the timeline at a +/// specific LSN, or the latest uncommitted key from a pending modification. +/// During WAL ingestion, the records from multiple LSNs may be batched in the same +/// modification before being flushed to the timeline. Hence, the routines in WalIngest +/// need to look up the keys in the modification first before looking them up in the +/// timeline to not miss the latest updates. +#[derive(Clone, Copy)] +pub enum Version<'a> { + Lsn(Lsn), + Modified(&'a DatadirModification<'a>), +} + +impl<'a> Version<'a> { + async fn get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result { + match self { + Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await, + Version::Modified(modification) => modification.get(key, ctx).await, + } + } + + fn get_lsn(&self) -> Lsn { + match self { + Version::Lsn(lsn) => *lsn, + Version::Modified(modification) => modification.lsn, + } } } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index cb1b2b8011..5a06a97525 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| // else, but that has not been needed in a long time. std::env::var("TOKIO_WORKER_THREADS") .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(1, num_cpus::get())) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) }); #[derive(Debug, Clone, Copy)] @@ -258,6 +258,9 @@ pub enum TaskKind { /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, + /// See [`crate::tenant::secondary`]. + SecondaryDownloads, + /// See [`crate::tenant::secondary`]. 
SecondaryUploads, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e50987c84b..7c609452e5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -56,6 +56,7 @@ use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; +use self::timeline::WaitLsnError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -595,10 +596,9 @@ impl Tenant { mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); let TenantSharedResources { @@ -1145,10 +1145,9 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); Arc::new(Tenant::new( TenantState::Broken { @@ -1760,7 +1759,15 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn, ctx).await?; + ancestor_timeline + .wait_lsn(*lsn, ctx) + .await + .map_err(|e| match e { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) + } + WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, + })?; } self.branch_timeline( diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 25d97f51ce..2d4cd350d7 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -46,6 +46,8 @@ pub mod defaults { pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index b21bad51ba..2f606ed822 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -588,7 +588,7 @@ impl DeleteTenantFlow { } break; } - TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => { + TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => { // This is unexpected: this secondary tenants should not have been created, and we // are not in a position to shut it down from here. 
tracing::warn!("Tenant transitioned to secondary mode while deleting!"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 62922e8c99..70b41b7b1f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -44,6 +44,7 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; +use super::secondary::SecondaryTenant; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -57,7 +58,7 @@ use super::TenantSharedResources; /// having a properly acquired generation (Secondary doesn't need a generation) pub(crate) enum TenantSlot { Attached(Arc), - Secondary, + Secondary(Arc), /// In this state, other administrative operations acting on the TenantId should /// block, or return a retry indicator equivalent to HTTP 503. InProgress(utils::completion::Barrier), @@ -67,7 +68,7 @@ impl std::fmt::Debug for TenantSlot { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()), - Self::Secondary => write!(f, "Secondary"), + Self::Secondary(_) => write!(f, "Secondary"), Self::InProgress(_) => write!(f, "InProgress"), } } @@ -78,7 +79,7 @@ impl TenantSlot { fn get_attached(&self) -> Option<&Arc> { match self { Self::Attached(t) => Some(t), - Self::Secondary => None, + Self::Secondary(_) => None, Self::InProgress(_) => None, } } @@ -130,7 +131,7 @@ impl TenantsMap { /// A page service client sends a TenantId, and to look up the correct Tenant we must /// resolve this to a fully qualified TenantShardId. - fn resolve_shard( + fn resolve_attached_shard( &self, tenant_id: &TenantId, selector: ShardSelector, @@ -140,25 +141,27 @@ impl TenantsMap { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + _ => continue, + }; + match selector { ShardSelector::First => return Some(*slot.0), ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { return Some(*slot.0) } ShardSelector::Page(key) => { - if let Some(tenant) = slot.1.get_attached() { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time. - if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); - } + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. 
+ if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } - if Some(tenant.shard_identity.number) == want_shard { - return Some(*slot.0); - } - } else { - continue; + if Some(tenant.shard_identity.number) == want_shard { + return Some(*slot.0); } } _ => continue, @@ -464,12 +467,18 @@ pub async fn init_tenant_mgr( *gen } else { match &location_conf.mode { - LocationMode::Secondary(_) => { + LocationMode::Secondary(secondary_config) => { // We do not require the control plane's permission for secondary mode // tenants, because they do no remote writes and hence require no // generation number info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); - tenants.insert(tenant_shard_id, TenantSlot::Secondary); + tenants.insert( + tenant_shard_id, + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + secondary_config, + )), + ); } LocationMode::Attached(_) => { // TODO: augment re-attach API to enable the control plane to @@ -661,8 +670,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { total_attached += 1; } - TenantSlot::Secondary => { - shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary); + TenantSlot::Secondary(state) => { + // We don't need to wait for this individually per-tenant: the + // downloader task will be waited on eventually, this cancel + // is just to encourage it to drop out if it is doing work + // for this tenant right now. + state.cancel.cancel(); + + shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state)); } TenantSlot::InProgress(notify) => { // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will @@ -845,12 +860,28 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } - None | Some(TenantSlot::Secondary) => { + None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } } } + pub(crate) fn get_secondary_tenant_shard( + &self, + tenant_shard_id: TenantShardId, + ) -> Option> { + let locked = self.tenants.read().unwrap(); + + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) + .ok() + .flatten(); + + match peek_slot { + Some(TenantSlot::Secondary(s)) => Some(s.clone()), + _ => None, + } + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, @@ -862,10 +893,15 @@ impl TenantManager { debug_assert_current_span_has_tenant_id(); info!("configuring tenant location to state {new_location_config:?}"); - // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, + enum FastPathModified { + Attached(Arc), + Secondary(Arc), + } + + // Special case fast-path for updates to existing slots: if our upsert is only updating configuration, // then we do not need to set the slot to InProgress, we can just call into the // existng tenant. 
- let modify_tenant = { + let fast_path_taken = { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?; @@ -879,12 +915,19 @@ impl TenantManager { new_location_config.clone(), )?); - Some(tenant.clone()) + Some(FastPathModified::Attached(tenant.clone())) } else { // Different generations, fall through to general case None } } + ( + LocationMode::Secondary(secondary_conf), + Some(TenantSlot::Secondary(secondary_tenant)), + ) => { + secondary_tenant.set_config(secondary_conf); + Some(FastPathModified::Secondary(secondary_tenant.clone())) + } _ => { // Not an Attached->Attached transition, fall through to general case None @@ -893,34 +936,51 @@ impl TenantManager { }; // Fast-path continued: having dropped out of the self.tenants lock, do the async - // phase of waiting for flush, before returning. - if let Some(tenant) = modify_tenant { - // Transition to AttachedStale means we may well hold a valid generation - // still, and have been requested to go stale as part of a migration. If - // the caller set `flush`, then flush to remote storage. - if let LocationMode::Attached(AttachedLocationConfig { - generation: _, - attach_mode: AttachmentMode::Stale, - }) = &new_location_config.mode - { - if let Some(flush_timeout) = flush { - match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { - Ok(Err(e)) => { - return Err(e); - } - Ok(Ok(_)) => return Ok(()), - Err(_) => { - tracing::warn!( + // phase of writing config and/or waiting for flush, before returning. + match fast_path_taken { + Some(FastPathModified::Attached(tenant)) => { + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + // Transition to AttachedStale means we may well hold a valid generation + // still, and have been requested to go stale as part of a migration. If + // the caller set `flush`, then flush to remote storage. + if let LocationMode::Attached(AttachedLocationConfig { + generation: _, + attach_mode: AttachmentMode::Stale, + }) = &new_location_config.mode + { + if let Some(flush_timeout) = flush { + match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { + Ok(Err(e)) => { + return Err(e); + } + Ok(Ok(_)) => return Ok(()), + Err(_) => { + tracing::warn!( timeout_ms = flush_timeout.as_millis(), "Timed out waiting for flush to remote storage, proceeding anyway." ) + } } } } - } - return Ok(()); - } + return Ok(()); + } + Some(FastPathModified::Secondary(_secondary_tenant)) => { + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + return Ok(()); + } + None => { + // Proceed with the general case procedure, where we will shutdown & remove any existing + // slot contents and replace with a fresh one + } + }; // General case for upserts to TenantsMap, excluding the case above: we will substitute an // InProgress value to the slot while we make whatever changes are required. The state for @@ -929,33 +989,47 @@ impl TenantManager { // not do significant I/O, and shutdowns should be prompt via cancellation tokens. let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; - if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() { - // The case where we keep a Tenant alive was covered above in the special case - // for Attached->Attached transitions in the same generation. 
By this point, - // if we see an attached tenant we know it will be discarded and should be - // shut down. - let (_guard, progress) = utils::completion::channel(); + match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { + // The case where we keep a Tenant alive was covered above in the special case + // for Attached->Attached transitions in the same generation. By this point, + // if we see an attached tenant we know it will be discarded and should be + // shut down. + let (_guard, progress) = utils::completion::channel(); - match tenant.get_attach_mode() { - AttachmentMode::Single | AttachmentMode::Multi => { - // Before we leave our state as the presumed holder of the latest generation, - // flush any outstanding deletions to reduce the risk of leaking objects. - self.resources.deletion_queue_client.flush_advisory() - } - AttachmentMode::Stale => { - // If we're stale there's not point trying to flush deletions - } - }; + match tenant.get_attach_mode() { + AttachmentMode::Single | AttachmentMode::Multi => { + // Before we leave our state as the presumed holder of the latest generation, + // flush any outstanding deletions to reduce the risk of leaking objects. + self.resources.deletion_queue_client.flush_advisory() + } + AttachmentMode::Stale => { + // If we're stale there's not point trying to flush deletions + } + }; - info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { - Ok(()) => {} - Err(barrier) => { - info!("Shutdown already in progress, waiting for it to complete"); - barrier.wait().await; + info!("Shutting down attached tenant"); + match tenant.shutdown(progress, false).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } } + slot_guard.drop_old_value().expect("We just shut it down"); + } + Some(TenantSlot::Secondary(state)) => { + info!("Shutting down secondary tenant"); + state.shutdown().await; + } + Some(TenantSlot::InProgress(_)) => { + // This should never happen: acquire_slot should error out + // if the contents of a slot were InProgress. + anyhow::bail!("Acquired an InProgress slot, this is a bug.") + } + None => { + // Slot was vacant, nothing needs shutting down. } - slot_guard.drop_old_value().expect("We just shut it down"); } let tenant_path = self.conf.tenant_path(&tenant_shard_id); @@ -978,7 +1052,9 @@ impl TenantManager { .map_err(SetNewTenantConfigError::Persist)?; let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => TenantSlot::Secondary, + LocationMode::Secondary(secondary_config) => { + TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config)) + } LocationMode::Attached(_attach_config) => { let shard_identity = new_location_config.shard; let tenant = tenant_spawn( @@ -1091,6 +1167,30 @@ impl TenantManager { .collect(), } } + // Do some synchronous work for all tenant slots in Secondary state. The provided + // callback should be small and fast, as it will be called inside the global + // TenantsMap lock. 
+ pub(crate) fn foreach_secondary_tenants(&self, mut func: F) + where + // TODO: let the callback return a hint to drop out of the loop early + F: FnMut(&TenantShardId, &Arc), + { + let locked = self.tenants.read().unwrap(); + + let map = match &*locked { + TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return, + TenantsMap::Open(m) => m, + }; + + for (tenant_id, slot) in map { + if let TenantSlot::Secondary(state) = slot { + // Only expose secondary tenants that are not currently shutting down + if !state.cancel.is_cancelled() { + func(tenant_id, state) + } + } + } + } pub(crate) async fn delete_tenant( &self, @@ -1205,7 +1305,7 @@ pub(crate) fn get_tenant( Some(TenantSlot::InProgress(_)) => { Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } - None | Some(TenantSlot::Secondary) => { + None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } } @@ -1257,9 +1357,11 @@ pub(crate) async fn get_active_tenant_with_timeout( let locked = TENANTS.read().unwrap(); // Resolve TenantId to TenantShardId - let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or( - GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)), - )?; + let tenant_shard_id = locked + .resolve_attached_shard(&tenant_id, shard_selector) + .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + )))?; let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) .map_err(GetTenantError::MapState)?; @@ -1276,7 +1378,7 @@ pub(crate) async fn get_active_tenant_with_timeout( } } } - Some(TenantSlot::Secondary) => { + Some(TenantSlot::Secondary(_)) => { return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( tenant_id, ))) @@ -1540,7 +1642,7 @@ pub(crate) async fn list_tenants() -> Result, Ok(m.iter() .filter_map(|(id, tenant)| match tenant { TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), - TenantSlot::Secondary => None, + TenantSlot::Secondary(_) => None, TenantSlot::InProgress(_) => None, }) .collect()) @@ -1797,11 +1899,7 @@ impl SlotGuard { fn old_value_is_shutdown(&self) -> bool { match self.old_value.as_ref() { Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(), - Some(TenantSlot::Secondary) => { - // TODO: when adding secondary mode tenants, this will check for shutdown - // in the same way that we do for `Tenant` above - true - } + Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(), Some(TenantSlot::InProgress(_)) => { // A SlotGuard cannot be constructed for a slot that was already InProgress unreachable!() @@ -2011,26 +2109,19 @@ where let mut slot_guard = tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; - // The SlotGuard allows us to manipulate the Tenant object without fear of some - // concurrent API request doing something else for the same tenant ID. - let attached_tenant = match slot_guard.get_old_value() { - Some(TenantSlot::Attached(t)) => Some(t), - _ => None, - }; - // allow pageserver shutdown to await for our completion let (_guard, progress) = completion::channel(); - // If the tenant was attached, shut it down gracefully. For secondary - // locations this part is not necessary - match &attached_tenant { - Some(attached_tenant) => { + // The SlotGuard allows us to manipulate the Tenant object without fear of some + // concurrent API request doing something else for the same tenant ID. 
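`foreach_secondary_tenants` above runs a small callback while holding the tenants map lock and skips tenants that are already cancelled. A std-only sketch of that pattern, with a `u32` key and an `AtomicBool` standing in for `TenantShardId` and the cancellation token:

    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::{Arc, RwLock};

    struct SecondaryStateSketch {
        cancelled: AtomicBool,
    }

    fn foreach_secondary<F>(map: &RwLock<BTreeMap<u32, Arc<SecondaryStateSketch>>>, mut func: F)
    where
        F: FnMut(&u32, &Arc<SecondaryStateSketch>),
    {
        // The callback runs under the read lock, so it must stay small and fast.
        let locked = map.read().unwrap();
        for (id, state) in locked.iter() {
            // Skip tenants that are already shutting down.
            if !state.cancelled.load(Ordering::Relaxed) {
                func(id, state);
            }
        }
    }

    fn main() {
        let map = RwLock::new(BTreeMap::from([
            (1, Arc::new(SecondaryStateSketch { cancelled: AtomicBool::new(false) })),
            (2, Arc::new(SecondaryStateSketch { cancelled: AtomicBool::new(true) })),
        ]));
        let mut seen = Vec::new();
        foreach_secondary(&map, |id, _state| seen.push(*id));
        assert_eq!(seen, vec![1]);
    }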
+ let attached_tenant = match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload let freeze_and_flush = false; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match attached_tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, freeze_and_flush).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to @@ -2039,11 +2130,19 @@ where return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id)); } } + Some(tenant) } - None => { - // Nothing to wait on when not attached, proceed. + Some(TenantSlot::Secondary(secondary_state)) => { + tracing::info!("Shutting down in secondary mode"); + secondary_state.shutdown().await; + None } - } + Some(TenantSlot::InProgress(_)) => { + // Acquiring a slot guarantees its old value was not InProgress + unreachable!(); + } + None => None, + }; match tenant_cleanup .await diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1b0cf39fbe..2ea3ced008 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -229,6 +229,7 @@ use crate::{ tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, }, + TENANT_HEATMAP_BASENAME, }; use utils::id::{TenantId, TimelineId}; @@ -818,8 +819,25 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, ) { + // Filter out any layers which were not created by this tenant shard. These are + // layers that originate from some ancestor shard after a split, and may still + // be referenced by other shards. We are free to delete them locally and remove + // them from our index (and would have already done so when we reach this point + // in the code), but we may not delete them remotely. 
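A sketch of the shard-ownership filter described in the comment above (the real `retain` call follows just below): layers whose metadata carries a different shard number or count were inherited from an ancestor shard and must not be deleted remotely. `ShardIndexSketch` and the layer names are illustrative stand-ins:

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct ShardIndexSketch {
        number: u8,
        count: u8,
    }

    // Keep only layers this shard wrote; layers inherited from an ancestor shard
    // after a split stay in remote storage because sibling shards may still
    // reference them.
    fn retain_own_layers(my_shard: ShardIndexSketch, layers: &mut Vec<(String, ShardIndexSketch)>) {
        layers.retain(|(name, shard)| {
            let retain = shard.number == my_shard.number && shard.count == my_shard.count;
            if !retain {
                println!("skipping deletion of ancestor-shard layer {name} (from {shard:?})");
            }
            retain
        });
    }

    fn main() {
        let me = ShardIndexSketch { number: 1, count: 4 };
        let mut layers = vec![
            ("layer-written-by-me".to_string(), me),
            ("layer-from-parent".to_string(), ShardIndexSketch { number: 0, count: 2 }),
        ];
        retain_own_layers(me, &mut layers);
        assert_eq!(layers.len(), 1);
        assert_eq!(layers[0].0, "layer-written-by-me");
    }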
+ with_metadata.retain(|(name, meta)| { + let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count; + if !retain { + tracing::debug!( + "Skipping deletion of ancestor-shard layer {name}, from shard {}", + meta.shard + ); + } + retain + }); + for (name, meta) in &with_metadata { info!( "scheduling deletion of layer {}{} (shard {})", @@ -1724,11 +1742,11 @@ pub fn remote_index_path( .expect("Failed to construct path") } -pub const HEATMAP_BASENAME: &str = "heatmap-v1.json"; - pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath { - RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}")) - .expect("Failed to construct path") + RemotePath::from_string(&format!( + "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}" + )) + .expect("Failed to construct path") } /// Given the key of an index, parse out the generation part of the name diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index d25fe56b92..2331447266 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -1,24 +1,48 @@ +mod downloader; pub mod heatmap; mod heatmap_uploader; +mod scheduler; use std::sync::Arc; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use self::heatmap_uploader::heatmap_uploader_task; +use self::{ + downloader::{downloader_task, SecondaryDetail}, + heatmap_uploader::heatmap_uploader_task, +}; -use super::mgr::TenantManager; +use super::{config::SecondaryLocationConfig, mgr::TenantManager}; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use tokio_util::sync::CancellationToken; -use utils::completion::Barrier; +use utils::{completion::Barrier, sync::gate::Gate}; +enum DownloadCommand { + Download(TenantShardId), +} enum UploadCommand { Upload(TenantShardId), } +impl UploadCommand { + fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Upload(id) => id, + } + } +} + +impl DownloadCommand { + fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Download(id) => id, + } + } +} + struct CommandRequest { payload: T, response_tx: tokio::sync::oneshot::Sender, @@ -28,12 +52,73 @@ struct CommandResponse { result: anyhow::Result<()>, } +// Whereas [`Tenant`] represents an attached tenant, this type represents the work +// we do for secondary tenant locations: where we are not serving clients or +// ingesting WAL, but we are maintaining a warm cache of layer files. +// +// This type is all about the _download_ path for secondary mode. The upload path +// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists. +// +// This structure coordinates TenantManager and SecondaryDownloader, +// so that the downloader can indicate which tenants it is currently +// operating on, and the manager can indicate when a particular +// secondary tenant should cancel any work in flight. +#[derive(Debug)] +pub(crate) struct SecondaryTenant { + /// Carrying a tenant shard ID simplifies callers such as the downloader + /// which need to organize many of these objects by ID. + tenant_shard_id: TenantShardId, + + /// Cancellation token indicates to SecondaryDownloader that it should stop doing + /// any work for this tenant at the next opportunity. 
+ pub(crate) cancel: CancellationToken, + + pub(crate) gate: Gate, + + detail: std::sync::Mutex, +} + +impl SecondaryTenant { + pub(crate) fn new( + tenant_shard_id: TenantShardId, + config: &SecondaryLocationConfig, + ) -> Arc { + Arc::new(Self { + tenant_shard_id, + // todo: shall we make this a descendent of the + // main cancellation token, or is it sufficient that + // on shutdown we walk the tenants and fire their + // individual cancellations? + cancel: CancellationToken::new(), + gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")), + + detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + }) + } + + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + // Wait for any secondary downloader work to complete + self.gate.close().await; + } + + pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { + self.detail.lock().unwrap().config = config.clone(); + } + + fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } +} + /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, /// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, /// where we want to immediately upload/download for a particular tenant. In normal operation /// uploads & downloads are autonomous and not driven by this interface. pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, + download_req_tx: tokio::sync::mpsc::Sender>, } impl SecondaryController { @@ -63,6 +148,13 @@ impl SecondaryController { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } + pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + self.dispatch( + &self.download_req_tx, + DownloadCommand::Download(tenant_shard_id), + ) + .await + } } pub fn spawn_tasks( @@ -71,9 +163,37 @@ pub fn spawn_tasks( background_jobs_can_start: Barrier, cancel: CancellationToken, ) -> SecondaryController { + let mgr_clone = tenant_manager.clone(); + let storage_clone = remote_storage.clone(); + let cancel_clone = cancel.clone(); + let bg_jobs_clone = background_jobs_can_start.clone(); + + let (download_req_tx, download_req_rx) = + tokio::sync::mpsc::channel::>(16); let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::SecondaryDownloads, + None, + None, + "secondary tenant downloads", + false, + async move { + downloader_task( + mgr_clone, + storage_clone, + download_req_rx, + bg_jobs_clone, + cancel_clone, + ) + .await; + + Ok(()) + }, + ); + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::SecondaryUploads, @@ -89,16 +209,26 @@ pub fn spawn_tasks( background_jobs_can_start, cancel, ) - .await + .await; + + Ok(()) }, ); - SecondaryController { upload_req_tx } + SecondaryController { + download_req_tx, + upload_req_tx, + } } /// For running with remote storage disabled: a SecondaryController that is connected to nothing. 
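`SecondaryTenant::shutdown` below cancels first and then closes the gate, so any in-flight download drains before the tenant state is torn down. A synchronous, std-only sketch of that cancel-then-wait shape, with an `AtomicBool` and a `Condvar`-based `GateSketch` standing in for `CancellationToken` and `Gate`:

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::{Arc, Condvar, Mutex};
    use std::thread;
    use std::time::Duration;

    // Counts outstanding guards; close() blocks until every guard is dropped.
    #[derive(Default)]
    struct GateSketch {
        entered: Mutex<usize>,
        cond: Condvar,
    }

    struct GateGuard(Arc<GateSketch>);

    impl GateSketch {
        fn enter(self: Arc<Self>) -> GateGuard {
            *self.entered.lock().unwrap() += 1;
            GateGuard(self)
        }
        fn close(&self) {
            let mut n = self.entered.lock().unwrap();
            while *n > 0 {
                n = self.cond.wait(n).unwrap();
            }
        }
    }

    impl Drop for GateGuard {
        fn drop(&mut self) {
            *self.0.entered.lock().unwrap() -= 1;
            self.0.cond.notify_all();
        }
    }

    fn main() {
        let cancel = Arc::new(AtomicBool::new(false));
        let gate = Arc::new(GateSketch::default());

        let worker = {
            let cancel = Arc::clone(&cancel);
            let guard = Arc::clone(&gate).enter(); // taken before any work starts
            thread::spawn(move || {
                let _guard = guard; // held for the duration of the "download"
                while !cancel.load(Ordering::Relaxed) {
                    thread::sleep(Duration::from_millis(10));
                }
            })
        };

        // shutdown(): cancel first so the worker stops promptly, then wait on the gate.
        cancel.store(true, Ordering::Relaxed);
        gate.close();
        worker.join().unwrap();
    }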
pub fn null_controller() -> SecondaryController { + let (download_req_tx, _download_req_rx) = + tokio::sync::mpsc::channel::>(16); let (upload_req_tx, _upload_req_rx) = tokio::sync::mpsc::channel::>(16); - SecondaryController { upload_req_tx } + SecondaryController { + upload_req_tx, + download_req_tx, + } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs new file mode 100644 index 0000000000..6fdee08a4e --- /dev/null +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -0,0 +1,801 @@ +use std::{ + collections::{HashMap, HashSet}, + pin::Pin, + str::FromStr, + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use crate::{ + config::PageServerConf, + metrics::SECONDARY_MODE, + tenant::{ + config::SecondaryLocationConfig, + debug_assert_current_span_has_tenant_and_timeline_id, + remote_timeline_client::{ + index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + }, + span::debug_assert_current_span_has_tenant_id, + storage_layer::LayerFileName, + tasks::{warn_when_period_overrun, BackgroundLoopKind}, + }, + virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, + METADATA_FILE_NAME, TEMP_FILE_SUFFIX, +}; + +use super::{ + heatmap::HeatMapLayer, + scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + SecondaryTenant, +}; + +use crate::tenant::{ + mgr::TenantManager, + remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, +}; + +use chrono::format::{DelayedFormat, StrftimeItems}; +use futures::Future; +use pageserver_api::shard::TenantShardId; +use rand::Rng; +use remote_storage::{DownloadError, GenericRemoteStorage}; + +use tokio_util::sync::CancellationToken; +use tracing::{info_span, instrument, Instrument}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, +}; + +use super::{ + heatmap::{HeatMapTenant, HeatMapTimeline}, + CommandRequest, DownloadCommand, +}; + +/// For each tenant, how long must have passed since the last download_tenant call before +/// calling it again. This is approximately the time by which local data is allowed +/// to fall behind remote data. 
+/// +/// TODO: this should just be a default, and the actual period should be controlled +/// via the heatmap itself +/// `` +const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); + +pub(super) async fn downloader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) { + let concurrency = tenant_manager.get_conf().secondary_download_concurrency; + + let generator = SecondaryDownloader { + tenant_manager, + remote_storage, + }; + let mut scheduler = Scheduler::new(generator, concurrency); + + scheduler + .run(command_queue, background_jobs_can_start, cancel) + .instrument(info_span!("secondary_downloads")) + .await +} + +struct SecondaryDownloader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, +} + +#[derive(Debug, Clone)] +pub(super) struct OnDiskState { + metadata: LayerFileMetadata, + access_time: SystemTime, +} + +impl OnDiskState { + fn new( + _conf: &'static PageServerConf, + _tenant_shard_id: &TenantShardId, + _imeline_id: &TimelineId, + _ame: LayerFileName, + metadata: LayerFileMetadata, + access_time: SystemTime, + ) -> Self { + Self { + metadata, + access_time, + } + } +} + +#[derive(Debug, Clone, Default)] +pub(super) struct SecondaryDetailTimeline { + pub(super) on_disk_layers: HashMap, + + /// We remember when layers were evicted, to prevent re-downloading them. + pub(super) evicted_at: HashMap, +} + +/// This state is written by the secondary downloader, it is opaque +/// to TenantManager +#[derive(Debug)] +pub(super) struct SecondaryDetail { + pub(super) config: SecondaryLocationConfig, + + last_download: Option, + next_download: Option, + pub(super) timelines: HashMap, +} + +/// Helper for logging SystemTime +fn strftime(t: &'_ SystemTime) -> DelayedFormat> { + let datetime: chrono::DateTime = (*t).into(); + datetime.format("%d/%m/%Y %T") +} + +impl SecondaryDetail { + pub(super) fn new(config: SecondaryLocationConfig) -> Self { + Self { + config, + last_download: None, + next_download: None, + timelines: HashMap::new(), + } + } +} + +struct PendingDownload { + secondary_state: Arc, + last_download: Option, + target_time: Option, + period: Option, +} + +impl scheduler::PendingJob for PendingDownload { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.secondary_state.get_tenant_shard_id() + } +} + +struct RunningDownload { + barrier: Barrier, +} + +impl scheduler::RunningJob for RunningDownload { + fn get_barrier(&self) -> Barrier { + self.barrier.clone() + } +} + +struct CompleteDownload { + secondary_state: Arc, + completed_at: Instant, +} + +impl scheduler::Completion for CompleteDownload { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.secondary_state.get_tenant_shard_id() + } +} + +type Scheduler = TenantBackgroundJobs< + SecondaryDownloader, + PendingDownload, + RunningDownload, + CompleteDownload, + DownloadCommand, +>; + +#[async_trait::async_trait] +impl JobGenerator + for SecondaryDownloader +{ + #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))] + fn on_completion(&mut self, completion: CompleteDownload) { + let CompleteDownload { + secondary_state, + completed_at: _completed_at, + } = completion; + + tracing::debug!("Secondary tenant download completed"); + + // Update freshened_at even if there was an error: we don't want errored tenants to implicitly + // take priority to run again. 
+ let mut detail = secondary_state.detail.lock().unwrap(); + detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL); + } + + async fn schedule(&mut self) -> SchedulingResult { + let mut result = SchedulingResult { + jobs: Vec::new(), + want_interval: None, + }; + + // Step 1: identify some tenants that we may work on + let mut tenants: Vec> = Vec::new(); + self.tenant_manager + .foreach_secondary_tenants(|_id, secondary_state| { + tenants.push(secondary_state.clone()); + }); + + // Step 2: filter out tenants which are not yet elegible to run + let now = Instant::now(); + result.jobs = tenants + .into_iter() + .filter_map(|secondary_tenant| { + let (last_download, next_download) = { + let mut detail = secondary_tenant.detail.lock().unwrap(); + + if !detail.config.warm { + // Downloads are disabled for this tenant + detail.next_download = None; + return None; + } + + if detail.next_download.is_none() { + // Initialize with a jitter: this spreads initial downloads on startup + // or mass-attach across our freshen interval. + let jittered_period = + rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL); + detail.next_download = Some(now.checked_add(jittered_period).expect( + "Using our constant, which is known to be small compared with clock range", + )); + } + (detail.last_download, detail.next_download.unwrap()) + }; + + if now < next_download { + Some(PendingDownload { + secondary_state: secondary_tenant, + last_download, + target_time: Some(next_download), + period: Some(DOWNLOAD_FRESHEN_INTERVAL), + }) + } else { + None + } + }) + .collect(); + + // Step 3: sort by target execution time to run most urgent first. + result.jobs.sort_by_key(|j| j.target_time); + + result + } + + fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result { + let tenant_shard_id = command.get_tenant_shard_id(); + + let tenant = self + .tenant_manager + .get_secondary_tenant_shard(*tenant_shard_id); + let Some(tenant) = tenant else { + { + return Err(anyhow::anyhow!("Not found or not in Secondary mode")); + } + }; + + Ok(PendingDownload { + target_time: None, + period: None, + last_download: None, + secondary_state: tenant, + }) + } + + fn spawn( + &mut self, + job: PendingDownload, + ) -> ( + RunningDownload, + Pin + Send>>, + ) { + let PendingDownload { + secondary_state, + last_download, + target_time, + period, + } = job; + + let (completion, barrier) = utils::completion::channel(); + let remote_storage = self.remote_storage.clone(); + let conf = self.tenant_manager.get_conf(); + let tenant_shard_id = *secondary_state.get_tenant_shard_id(); + (RunningDownload { barrier }, Box::pin(async move { + let _completion = completion; + + match TenantDownloader::new(conf, &remote_storage, &secondary_state) + .download() + .await + { + Err(UpdateError::NoData) => { + tracing::info!("No heatmap found for tenant. This is fine if it is new."); + }, + Err(UpdateError::NoSpace) => { + tracing::warn!("Insufficient space while downloading. Will retry later."); + } + Err(UpdateError::Cancelled) => { + tracing::debug!("Shut down while downloading"); + }, + Err(UpdateError::Deserialize(e)) => { + tracing::error!("Corrupt content while downloading tenant: {e}"); + }, + Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => { + tracing::error!("Error while downloading tenant: {e}"); + }, + Ok(()) => {} + }; + + // Irrespective of the result, we will reschedule ourselves to run after our usual period. 
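The scheduling code in this hunk seeds `next_download` with a random offset inside the freshen interval so that a mass attach does not trigger every download at once. A std-only sketch of that first-pass jitter, where a hash of the tenant id stands in for `rand::thread_rng`:

    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    use std::time::{Duration, Instant};

    const FRESHEN_INTERVAL: Duration = Duration::from_secs(60);

    // First-pass scheduling: each tenant gets a pseudo-random offset in
    // [0, FRESHEN_INTERVAL) so startup downloads are spread over the interval.
    fn initial_next_download(tenant_id: &str, now: Instant) -> Instant {
        let mut h = DefaultHasher::new();
        tenant_id.hash(&mut h);
        let jitter = Duration::from_millis(h.finish() % FRESHEN_INTERVAL.as_millis() as u64);
        now + jitter
    }

    fn main() {
        let now = Instant::now();
        for id in ["tenant-a", "tenant-b", "tenant-c"] {
            let due = initial_next_download(id, now);
            println!("{id}: first download due in {:?}", due.duration_since(now));
        }
    }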
+ + // If the job had a target execution time, we may check our final execution + // time against that for observability purposes. + if let (Some(target_time), Some(period)) = (target_time, period) { + // Only track execution lag if this isn't our first download: otherwise, it is expected + // that execution will have taken longer than our configured interval, for example + // when starting up a pageserver and + if last_download.is_some() { + // Elapsed time includes any scheduling lag as well as the execution of the job + let elapsed = Instant::now().duration_since(target_time); + + warn_when_period_overrun( + elapsed, + period, + BackgroundLoopKind::SecondaryDownload, + ); + } + } + + CompleteDownload { + secondary_state, + completed_at: Instant::now(), + } + }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) + } +} + +/// This type is a convenience to group together the various functions involved in +/// freshening a secondary tenant. +struct TenantDownloader<'a> { + conf: &'static PageServerConf, + remote_storage: &'a GenericRemoteStorage, + secondary_state: &'a SecondaryTenant, +} + +/// Errors that may be encountered while updating a tenant +#[derive(thiserror::Error, Debug)] +enum UpdateError { + #[error("No remote data found")] + NoData, + #[error("Insufficient local storage space")] + NoSpace, + #[error("Failed to download")] + DownloadError(DownloadError), + #[error(transparent)] + Deserialize(#[from] serde_json::Error), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for UpdateError { + fn from(value: DownloadError) -> Self { + match &value { + DownloadError::Cancelled => Self::Cancelled, + DownloadError::NotFound => Self::NoData, + _ => Self::DownloadError(value), + } + } +} + +impl From for UpdateError { + fn from(value: std::io::Error) -> Self { + if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { + UpdateError::NoSpace + } else { + // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue + UpdateError::Other(anyhow::anyhow!(value)) + } + } +} + +impl<'a> TenantDownloader<'a> { + fn new( + conf: &'static PageServerConf, + remote_storage: &'a GenericRemoteStorage, + secondary_state: &'a SecondaryTenant, + ) -> Self { + Self { + conf, + remote_storage, + secondary_state, + } + } + + async fn download(&self) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_id(); + + // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure + // cover our access to local storage. + let Ok(_guard) = self.secondary_state.gate.enter() else { + // Shutting down + return Ok(()); + }; + + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + // Download the tenant's heatmap + let heatmap_bytes = tokio::select!( + bytes = self.download_heatmap() => {bytes?}, + _ = self.secondary_state.cancel.cancelled() => return Ok(()) + ); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + + // Save the heatmap: this will be useful on restart, allowing us to reconstruct + // layer metadata without having to re-download it. 
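The download path above races remote I/O against the tenant's CancellationToken so shutdown never has to wait for a slow fetch. A stripped-down version of that select pattern, assuming only tokio and tokio-util; `fetch` is a stand-in for the real heatmap download:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn fetch() -> Result<Vec<u8>, std::io::Error> {
    // Stand-in for a remote-storage download.
    tokio::time::sleep(Duration::from_millis(50)).await;
    Ok(b"heatmap".to_vec())
}

/// Returns Ok(None) if cancellation won the race before the fetch finished.
async fn fetch_unless_cancelled(
    cancel: &CancellationToken,
) -> Result<Option<Vec<u8>>, std::io::Error> {
    tokio::select! {
        bytes = fetch() => Ok(Some(bytes?)),
        _ = cancel.cancelled() => Ok(None),
    }
}

#[tokio::main]
async fn main() -> Result<(), std::io::Error> {
    let cancel = CancellationToken::new();
    assert!(fetch_unless_cancelled(&cancel).await?.is_some());

    cancel.cancel();
    assert!(fetch_unless_cancelled(&cancel).await?.is_none());
    Ok(())
}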
+ let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); + let heatmap_path_bg = heatmap_path.clone(); + tokio::task::spawn_blocking(move || { + tokio::runtime::Handle::current().block_on(async move { + VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await + }) + }) + .await + .expect("Blocking task is never aborted") + .maybe_fatal_err(&context_msg)?; + + tracing::debug!("Wrote local heatmap to {}", heatmap_path); + + // Download the layers in the heatmap + for timeline in heatmap.timelines { + if self.secondary_state.cancel.is_cancelled() { + return Ok(()); + } + + let timeline_id = timeline.timeline_id; + self.download_timeline(timeline) + .instrument(tracing::info_span!( + "secondary_download_timeline", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id + )) + .await?; + } + + Ok(()) + } + + async fn download_heatmap(&self) -> Result, UpdateError> { + debug_assert_current_span_has_tenant_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + // TODO: make download conditional on ETag having changed since last download + // (https://github.com/neondatabase/neon/issues/6199) + tracing::debug!("Downloading heatmap for secondary tenant",); + + let heatmap_path = remote_heatmap_path(tenant_shard_id); + + let heatmap_bytes = backoff::retry( + || async { + let download = self + .remote_storage + .download(&heatmap_path) + .await + .map_err(UpdateError::from)?; + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?; + Ok(heatmap_bytes) + }, + |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "download heatmap", + backoff::Cancel::new(self.secondary_state.cancel.clone(), || { + UpdateError::Cancelled + }), + ) + .await?; + + SECONDARY_MODE.download_heatmap.inc(); + + Ok(heatmap_bytes) + } + + async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_path = self + .conf + .timeline_path(tenant_shard_id, &timeline.timeline_id); + + // Accumulate updates to the state + let mut touched = Vec::new(); + + // Clone a view of what layers already exist on disk + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. 
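The saved heatmap uses the usual crash-safe pattern: write a temporary sibling file, flush it, then rename it over the target so readers never observe a half-written file. A plain std::fs sketch of the same idea (not the pageserver's VirtualFile; the suffix and the omission of a directory fsync are simplifications):

use std::fs::{self, File};
use std::io::{self, Write};
use std::path::{Path, PathBuf};

/// Write `bytes` so readers only ever see the old file or the complete new one.
fn overwrite_atomically(target: &Path, bytes: &[u8]) -> io::Result<()> {
    // Illustrative suffix; the real code appends its own TEMP_FILE_SUFFIX.
    let mut tmp = target.as_os_str().to_owned();
    tmp.push(".___temp");
    let tmp = PathBuf::from(tmp);

    let mut file = File::create(&tmp)?;
    file.write_all(bytes)?;
    file.sync_all()?; // the real code additionally fsyncs the containing directory
    fs::rename(&tmp, target)?; // atomic replacement on POSIX filesystems
    Ok(())
}

fn main() -> io::Result<()> {
    let target = std::env::temp_dir().join("heatmap.json");
    overwrite_atomically(&target, br#"{"timelines": []}"#)?;
    println!("wrote {}", target.display());
    Ok(())
}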
+ let timeline_state = + init_timeline_state(self.conf, tenant_shard_id, &timeline).await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + let layers_in_heatmap = timeline + .layers + .iter() + .map(|l| &l.name) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| l.0) + .collect::>(); + + // Remove on-disk layers that are no longer present in heatmap + for layer in layers_on_disk.difference(&layers_in_heatmap) { + let local_path = timeline_path.join(layer.to_string()); + tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary layer")?; + } + + // Download heatmap layers that are not present on local disk, or update their + // access time if they are already present. + for layer in timeline.layers { + if self.secondary_state.cancel.is_cancelled() { + return Ok(()); + } + + // Existing on-disk layers: just update their access time. + if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { + tracing::debug!("Layer {} is already on disk", layer.name); + if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) + || on_disk.access_time != layer.access_time + { + // We already have this layer on disk. Update its access time. + tracing::debug!( + "Access time updated for layer {}: {} -> {}", + layer.name, + strftime(&on_disk.access_time), + strftime(&layer.access_time) + ); + touched.push(layer); + } + continue; + } else { + tracing::debug!("Layer {} not present on disk yet", layer.name); + } + + // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more + // recently than it was evicted. + if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { + if &layer.access_time > evicted_at { + tracing::info!( + "Re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + } else { + tracing::trace!( + "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + continue; + } + } + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + let downloaded_bytes = match download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + timeline.timeline_id, + &layer.name, + &LayerFileMetadata::from(&layer.metadata), + &self.secondary_state.cancel, + ) + .await + { + Ok(bytes) => bytes, + Err(e) => { + if let DownloadError::NotFound = e { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + continue; + } else { + return Err(e.into()); + } + } + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = timeline_path.join(layer.name.to_string()); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. 
Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } + + SECONDARY_MODE.download_layer.inc(); + touched.push(layer) + } + + // Write updates to state to record layers we just downloaded or touched. + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); + + tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); + + for t in touched { + use std::collections::hash_map::Entry; + match timeline_detail.on_disk_layers.entry(t.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = t.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + t.name, + LayerFileMetadata::from(&t.metadata), + t.access_time, + )); + } + } + } + } + + Ok(()) + } +} + +/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline +async fn init_timeline_state( + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + heatmap: &HeatMapTimeline, +) -> SecondaryDetailTimeline { + let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); + let mut detail = SecondaryDetailTimeline::default(); + + let mut dir = match tokio::fs::read_dir(&timeline_path).await { + Ok(d) => d, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + let context = format!("Creating timeline directory {timeline_path}"); + tracing::info!("{}", context); + tokio::fs::create_dir_all(&timeline_path) + .await + .fatal_err(&context); + + // No entries to report: drop out. + return detail; + } else { + on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}")); + } + } + }; + + // As we iterate through layers found on disk, we will look up their metadata from this map. + // Layers not present in metadata will be discarded. + let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + + while let Some(dentry) = dir + .next_entry() + .await + .fatal_err(&format!("Listing {timeline_path}")) + { + let dentry_file_name = dentry.file_name(); + let file_name = dentry_file_name.to_string_lossy(); + let local_meta = dentry.metadata().await.fatal_err(&format!( + "Read metadata on {}", + dentry.path().to_string_lossy() + )); + + // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. + if file_name == METADATA_FILE_NAME { + continue; + } + + match LayerFileName::from_str(&file_name) { + Ok(name) => { + let remote_meta = heatmap_metadata.get(&name); + match remote_meta { + Some(remote_meta) => { + // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) + if local_meta.len() != remote_meta.metadata.file_size { + // This should not happen, because we do crashsafe write-then-rename when downloading + // layers, and layers in remote storage are immutable. Remove the local file because + // we cannot trust it. + tracing::warn!( + "Removing local layer {name} with unexpected local size {} != {}", + local_meta.len(), + remote_meta.metadata.file_size + ); + } else { + // We expect the access time to be initialized immediately afterwards, when + // the latest heatmap is applied to the state. 
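download_timeline reconciles what is on local disk against what the latest heatmap says should be there (with the evicted_at map deciding whether an evicted layer is re-downloaded). The cleanup step is essentially a set difference, as in this self-contained sketch where plain strings stand in for LayerFileName:

use std::collections::HashSet;

/// Layers present locally but absent from the latest heatmap should be deleted.
fn layers_to_remove<'a>(
    on_disk: &'a HashSet<String>,
    in_heatmap: &HashSet<String>,
) -> Vec<&'a String> {
    on_disk.difference(in_heatmap).collect()
}

fn main() {
    let on_disk: HashSet<String> =
        ["a.layer", "b.layer", "stale.layer"].iter().map(|s| s.to_string()).collect();
    let in_heatmap: HashSet<String> =
        ["a.layer", "b.layer", "new.layer"].iter().map(|s| s.to_string()).collect();

    // Only the layer that fell out of the heatmap is removed; "new.layer"
    // would be fetched by the download phase of the loop.
    let removed: Vec<&str> = layers_to_remove(&on_disk, &in_heatmap)
        .into_iter()
        .map(|s| s.as_str())
        .collect();
    assert_eq!(removed, vec!["stale.layer"]);
}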
+ detail.on_disk_layers.insert( + name.clone(), + OnDiskState::new( + conf, + tenant_shard_id, + &heatmap.timeline_id, + name, + LayerFileMetadata::from(&remote_meta.metadata), + remote_meta.access_time, + ), + ); + } + } + None => { + // FIXME: consider some optimization when transitioning from attached to secondary: maybe + // wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise + // we will end up deleting any layers which were created+uploaded more recently than the heatmap. + tracing::info!( + "Removing secondary local layer {} because it's absent in heatmap", + name + ); + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } + } + } + Err(_) => { + // Ignore it. + tracing::warn!("Unexpected file in timeline directory: {file_name}"); + } + } + } + + detail +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index ece2b93ce1..ef01c33e8e 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,5 +1,6 @@ use std::{ collections::HashMap, + pin::Pin, sync::{Arc, Weak}, time::{Duration, Instant}, }; @@ -7,35 +8,86 @@ use std::{ use crate::{ metrics::SECONDARY_MODE, tenant::{ - config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, - secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant, + config::AttachmentMode, + mgr::TenantManager, + remote_timeline_client::remote_heatmap_path, + span::debug_assert_current_span_has_tenant_id, + tasks::{warn_when_period_overrun, BackgroundLoopKind}, + Tenant, }, }; +use futures::Future; use md5; use pageserver_api::shard::TenantShardId; +use rand::Rng; use remote_storage::GenericRemoteStorage; -use tokio::task::JoinSet; +use super::{ + scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, + CommandRequest, +}; use tokio_util::sync::CancellationToken; -use tracing::instrument; -use utils::{backoff, completion::Barrier}; +use tracing::{info_span, instrument, Instrument}; +use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; -use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand}; +use super::{heatmap::HeatMapTenant, UploadCommand}; -/// Period between heatmap uploader walking Tenants to look for work to do. -/// If any tenants have a heatmap upload period lower than this, it will be adjusted -/// downward to match. 
-const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000); -const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000); +pub(super) async fn heatmap_uploader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) { + let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency; + + let generator = HeatmapUploader { + tenant_manager, + remote_storage, + cancel: cancel.clone(), + tenants: HashMap::new(), + }; + let mut scheduler = Scheduler::new(generator, concurrency); + + scheduler + .run(command_queue, background_jobs_can_start, cancel) + .instrument(info_span!("heatmap_uploader")) + .await +} + +/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event +/// handling loop and mutates it as needed: there are no locks here, because that event loop +/// can hold &mut references to this type throughout. +struct HeatmapUploader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + + tenants: HashMap, +} struct WriteInProgress { barrier: Barrier, } +impl RunningJob for WriteInProgress { + fn get_barrier(&self) -> Barrier { + self.barrier.clone() + } +} + struct UploadPending { tenant: Arc, last_digest: Option, + target_time: Option, + period: Option, +} + +impl scheduler::PendingJob for UploadPending { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.tenant.get_tenant_shard_id() + } } struct WriteComplete { @@ -45,6 +97,12 @@ struct WriteComplete { next_upload: Option, } +impl scheduler::Completion for WriteComplete { + fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } +} + /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember /// when we last did a write. We only populate this after doing at least one /// write for a tenant -- this avoids holding state for tenants that have @@ -68,267 +126,111 @@ struct UploaderTenantState { next_upload: Option, } -/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event -/// handling loop and mutates it as needed: there are no locks here, because that event loop -/// can hold &mut references to this type throughout. -struct HeatmapUploader { - tenant_manager: Arc, - remote_storage: GenericRemoteStorage, - cancel: CancellationToken, +type Scheduler = TenantBackgroundJobs< + HeatmapUploader, + UploadPending, + WriteInProgress, + WriteComplete, + UploadCommand, +>; - tenants: HashMap, - - /// Tenants with work to do, for which tasks should be spawned as soon as concurrency - /// limits permit it. - tenants_pending: std::collections::VecDeque, - - /// Tenants for which a task in `tasks` has been spawned. - tenants_uploading: HashMap, - - tasks: JoinSet<()>, - - /// Channel for our child tasks to send results to: we use a channel for results rather than - /// just getting task results via JoinSet because we need the channel's recv() "sleep until something - /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty" - /// behavior. 
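Where the old uploader plumbed results back through a channel, the new scheduler-based design has each running job hold a completion guard whose Barrier any caller can wait on, so a command handler can join an in-flight upload instead of starting a second one. A rough stand-in for that guard/barrier pair built on a tokio watch channel (the real `utils::completion` type is the project's own; this only shows the shape of the pattern):

use std::time::Duration;
use tokio::sync::watch;

/// Dropped when the job finishes; every clone of `Waiter` then unblocks.
struct Guard {
    _tx: watch::Sender<()>,
}

#[derive(Clone)]
struct Waiter(watch::Receiver<()>);

impl Waiter {
    async fn wait(mut self) {
        // `changed()` keeps returning Ok while the sender is alive and errors
        // out once the Guard has been dropped: that is the completion signal.
        while self.0.changed().await.is_ok() {}
    }
}

fn channel() -> (Guard, Waiter) {
    let (tx, rx) = watch::channel(());
    (Guard { _tx: tx }, Waiter(rx))
}

#[tokio::main]
async fn main() {
    let (guard, waiter) = channel();
    let observer = waiter.clone();

    let job = tokio::spawn(async move {
        let _guard = guard; // held for the whole "upload"
        tokio::time::sleep(Duration::from_millis(20)).await;
    });

    observer.wait().await; // a command handler waiting on the in-flight job
    waiter.wait().await; // any number of clones may wait
    job.await.unwrap();
}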
- task_result_tx: tokio::sync::mpsc::UnboundedSender, - task_result_rx: tokio::sync::mpsc::UnboundedReceiver, - - concurrent_uploads: usize, - - scheduling_interval: Duration, -} - -/// The uploader task runs a loop that periodically wakes up and schedules tasks for -/// tenants that require an upload, or handles any commands that have been sent into -/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we -/// spawn. -/// -/// Scheduling iterations are somewhat infrequent. However, each one will enqueue -/// all tenants that require an upload, and in between scheduling iterations we will -/// continue to spawn new tasks for pending tenants, as our concurrency limit permits. -/// -/// While we take a CancellationToken here, it is subordinate to the CancellationTokens -/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise -/// we might block waiting on a Tenant. -pub(super) async fn heatmap_uploader_task( - tenant_manager: Arc, - remote_storage: GenericRemoteStorage, - mut command_queue: tokio::sync::mpsc::Receiver>, - background_jobs_can_start: Barrier, - cancel: CancellationToken, -) -> anyhow::Result<()> { - let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency; - - let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); - - let mut uploader = HeatmapUploader { - tenant_manager, - remote_storage, - cancel: cancel.clone(), - tasks: JoinSet::new(), - tenants: HashMap::new(), - tenants_pending: std::collections::VecDeque::new(), - tenants_uploading: HashMap::new(), - task_result_tx: result_tx, - task_result_rx: result_rx, - concurrent_uploads, - scheduling_interval: DEFAULT_SCHEDULING_INTERVAL, - }; - - tracing::info!("Waiting for background_jobs_can start..."); - background_jobs_can_start.wait().await; - tracing::info!("background_jobs_can is ready, proceeding."); - - while !cancel.is_cancelled() { - // Look for new work: this is relatively expensive because we have to go acquire the lock on - // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones - // require an upload. - uploader.schedule_iteration().await?; - - // Between scheduling iterations, we will: - // - Drain any complete tasks and spawn pending tasks - // - Handle incoming administrative commands - // - Check our cancellation token - let next_scheduling_iteration = Instant::now() - .checked_add(uploader.scheduling_interval) - .unwrap_or_else(|| { - tracing::warn!( - "Scheduling interval invalid ({}s), running immediately!", - uploader.scheduling_interval.as_secs_f64() - ); - Instant::now() - }); - loop { - tokio::select! { - _ = cancel.cancelled() => { - // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. 
- tracing::info!("Heatmap uploader joining tasks"); - while let Some(_r) = uploader.tasks.join_next().await {}; - tracing::info!("Heatmap uploader terminating"); - - break; - }, - _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { - tracing::debug!("heatmap_uploader_task: woke for scheduling interval"); - break;}, - cmd = command_queue.recv() => { - tracing::debug!("heatmap_uploader_task: woke for command queue"); - let cmd = match cmd { - Some(c) =>c, - None => { - // SecondaryController was destroyed, and this has raced with - // our CancellationToken - tracing::info!("Heatmap uploader terminating"); - cancel.cancel(); - break; - } - }; - - let CommandRequest{ - response_tx, - payload - } = cmd; - uploader.handle_command(payload, response_tx); - }, - _ = uploader.process_next_completion() => { - if !cancel.is_cancelled() { - uploader.spawn_pending(); - } - } - } - } - } - - Ok(()) -} - -impl HeatmapUploader { - /// Periodic execution phase: inspect all attached tenants and schedule any work they require. - async fn schedule_iteration(&mut self) -> anyhow::Result<()> { +#[async_trait::async_trait] +impl JobGenerator + for HeatmapUploader +{ + async fn schedule(&mut self) -> SchedulingResult { // Cull any entries in self.tenants whose Arc is gone self.tenants .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some()); - // The priority order of previously scheduled work may be invalidated by current state: drop - // all pending work (it will be re-scheduled if still needed) - self.tenants_pending.clear(); - - // Used a fixed 'now' through the following loop, for efficiency and fairness. let now = Instant::now(); - // While iterating over the potentially-long list of tenants, we will periodically yield - // to avoid blocking executor. - const YIELD_ITERATIONS: usize = 1000; + let mut result = SchedulingResult { + jobs: Vec::new(), + want_interval: None, + }; - // Iterate over tenants looking for work to do. let tenants = self.tenant_manager.get_attached_active_tenant_shards(); - for (i, tenant) in tenants.into_iter().enumerate() { - // Process is shutting down, drop out - if self.cancel.is_cancelled() { - return Ok(()); - } - // Skip tenants that already have a write in flight - if self - .tenants_uploading - .contains_key(tenant.get_tenant_shard_id()) - { - continue; - } + yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| { + let period = match tenant.get_heatmap_period() { + None => { + // Heatmaps are disabled for this tenant + return; + } + Some(period) => { + // If any tenant has asked for uploads more frequent than our scheduling interval, + // reduce it to match so that we can keep up. This is mainly useful in testing, where + // we may set rather short intervals. + result.want_interval = match result.want_interval { + None => Some(period), + Some(existing) => Some(std::cmp::min(period, existing)), + }; - self.maybe_schedule_upload(&now, tenant); + period + } + }; - if i + 1 % YIELD_ITERATIONS == 0 { - tokio::task::yield_now().await; - } - } - - // Spawn tasks for as many of our pending tenants as we can. - self.spawn_pending(); - - Ok(()) - } - - /// - /// Cancellation: this method is cancel-safe. 
- async fn process_next_completion(&mut self) { - match self.task_result_rx.recv().await { - Some(r) => { - self.on_completion(r); - } - None => { - unreachable!("Result sender is stored on Self"); - } - } - } - - /// The 'maybe' refers to the tenant's state: whether it is configured - /// for heatmap uploads at all, and whether sufficient time has passed - /// since the last upload. - fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc) { - match tenant.get_heatmap_period() { - None => { - // Heatmaps are disabled for this tenant + // Stale attachments do not upload anything: if we are in this state, there is probably some + // other attachment in mode Single or Multi running on another pageserver, and we don't + // want to thrash and overwrite their heatmap uploads. + if tenant.get_attach_mode() == AttachmentMode::Stale { return; } - Some(period) => { - // If any tenant has asked for uploads more frequent than our scheduling interval, - // reduce it to match so that we can keep up. This is mainly useful in testing, where - // we may set rather short intervals. - if period < self.scheduling_interval { - self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL); - } + + // Create an entry in self.tenants if one doesn't already exist: this will later be updated + // with the completion time in on_completion. + let state = self + .tenants + .entry(*tenant.get_tenant_shard_id()) + .or_insert_with(|| { + let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); + + UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), + last_digest: None, + } + }); + + // Decline to do the upload if insufficient time has passed + if state.next_upload.map(|nu| nu > now).unwrap_or(false) { + return; } - } - // Stale attachments do not upload anything: if we are in this state, there is probably some - // other attachment in mode Single or Multi running on another pageserver, and we don't - // want to thrash and overwrite their heatmap uploads. - if tenant.get_attach_mode() == AttachmentMode::Stale { - return; - } - - // Create an entry in self.tenants if one doesn't already exist: this will later be updated - // with the completion time in on_completion. 
- let state = self - .tenants - .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(Instant::now()), - last_digest: None, + let last_digest = state.last_digest; + result.jobs.push(UploadPending { + tenant, + last_digest, + target_time: state.next_upload, + period: Some(period), }); + }) + .await + .ok(); - // Decline to do the upload if insufficient time has passed - if state.next_upload.map(|nu| &nu > now).unwrap_or(false) { - return; - } + result + } - let last_digest = state.last_digest; - self.tenants_pending.push_back(UploadPending { + fn spawn( + &mut self, + job: UploadPending, + ) -> ( + WriteInProgress, + Pin + Send>>, + ) { + let UploadPending { tenant, last_digest, - }) - } + target_time, + period, + } = job; - fn spawn_pending(&mut self) { - while !self.tenants_pending.is_empty() - && self.tenants_uploading.len() < self.concurrent_uploads - { - // unwrap: loop condition includes !is_empty() - let pending = self.tenants_pending.pop_front().unwrap(); - self.spawn_upload(pending.tenant, pending.last_digest); - } - } - - fn spawn_upload(&mut self, tenant: Arc, last_digest: Option) { let remote_storage = self.remote_storage.clone(); - let tenant_shard_id = *tenant.get_tenant_shard_id(); let (completion, barrier) = utils::completion::channel(); - let result_tx = self.task_result_tx.clone(); - self.tasks.spawn(async move { + let tenant_shard_id = *tenant.get_tenant_shard_id(); + (WriteInProgress { barrier }, Box::pin(async move { // Guard for the barrier in [`WriteInProgress`] let _completion = completion; @@ -362,22 +264,47 @@ impl HeatmapUploader { }; let now = Instant::now(); + + // If the job had a target execution time, we may check our final execution + // time against that for observability purposes. 
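The observability check mentioned above compares the whole cycle (scheduling lag plus execution) against the configured period and warns on overrun, skipping the first run where lag is expected. A stand-alone sketch of that decision with invented names; the real reporting goes through `warn_when_period_overrun`:

use std::time::{Duration, Instant};

/// True when a periodic job that was due at `target_time` finished so late
/// that its next run is already overdue. First executions are never reported.
fn overran(
    previous_run: Option<Instant>,
    target_time: Instant,
    finished_at: Instant,
    period: Duration,
) -> bool {
    if previous_run.is_none() {
        return false;
    }
    // Elapsed time includes scheduling lag as well as the job itself.
    finished_at.saturating_duration_since(target_time) > period
}

fn main() {
    let t0 = Instant::now();
    let period = Duration::from_secs(60);
    // First run: not reported, however late it finished.
    assert!(!overran(None, t0, t0 + Duration::from_secs(300), period));
    // Later run that took longer than one period: reported.
    assert!(overran(Some(t0), t0 + period, t0 + period + Duration::from_secs(90), period));
}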
+ if let (Some(target_time), Some(period)) = (target_time, period) { + // Elapsed time includes any scheduling lag as well as the execution of the job + let elapsed = now.duration_since(target_time); + + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload); + } + let next_upload = tenant .get_heatmap_period() .and_then(|period| now.checked_add(period)); - result_tx - .send(WriteComplete { + WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, digest, next_upload, - }) - .ok(); - }); + } + }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) + } - self.tenants_uploading - .insert(tenant_shard_id, WriteInProgress { barrier }); + fn on_command(&mut self, command: UploadCommand) -> anyhow::Result { + let tenant_shard_id = command.get_tenant_shard_id(); + + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Starting heatmap write on command"); + let tenant = self + .tenant_manager + .get_attached_tenant_shard(*tenant_shard_id, true) + .map_err(|e| anyhow::anyhow!(e))?; + + Ok(UploadPending { + // Ignore our state for last digest: this forces an upload even if nothing has changed + last_digest: None, + tenant, + target_time: None, + period: None, + }) } #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))] @@ -389,7 +316,6 @@ impl HeatmapUploader { digest, next_upload, } = completion; - self.tenants_uploading.remove(&tenant_shard_id); use std::collections::hash_map::Entry; match self.tenants.entry(tenant_shard_id) { Entry::Vacant(_) => { @@ -402,69 +328,6 @@ impl HeatmapUploader { } } } - - fn handle_command( - &mut self, - command: UploadCommand, - response_tx: tokio::sync::oneshot::Sender, - ) { - match command { - UploadCommand::Upload(tenant_shard_id) => { - // If an upload was ongoing for this tenant, let it finish first. - let barrier = if let Some(writing_state) = - self.tenants_uploading.get(&tenant_shard_id) - { - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Waiting for heatmap write to complete"); - writing_state.barrier.clone() - } else { - // Spawn the upload then immediately wait for it. This will block processing of other commands and - // starting of other background work. - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Starting heatmap write on command"); - let tenant = match self - .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, true) - { - Ok(t) => t, - Err(e) => { - // Drop result of send: we don't care if caller dropped their receiver - drop(response_tx.send(CommandResponse { - result: Err(e.into()), - })); - return; - } - }; - self.spawn_upload(tenant, None); - let writing_state = self - .tenants_uploading - .get(&tenant_shard_id) - .expect("We just inserted this"); - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Waiting for heatmap upload to complete"); - - writing_state.barrier.clone() - }; - - // This task does no I/O: it only listens for a barrier's completion and then - // sends to the command response channel. It is therefore safe to spawn this without - // any gates/task_mgr hooks. 
- tokio::task::spawn(async move { - barrier.wait().await; - - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Heatmap upload complete"); - - // Drop result of send: we don't care if caller dropped their receiver - drop(response_tx.send(CommandResponse { result: Ok(()) })) - }); - } - } - } } enum UploadHeatmapOutcome { @@ -487,7 +350,6 @@ enum UploadHeatmapError { /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. -#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))] async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs new file mode 100644 index 0000000000..cf01a100d9 --- /dev/null +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -0,0 +1,361 @@ +use async_trait; +use futures::Future; +use std::{ + collections::HashMap, + marker::PhantomData, + pin::Pin, + time::{Duration, Instant}, +}; + +use pageserver_api::shard::TenantShardId; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use utils::{completion::Barrier, yielding_loop::yielding_loop}; + +use super::{CommandRequest, CommandResponse}; + +/// Scheduling interval is the time between calls to JobGenerator::schedule. +/// When we schedule jobs, the job generator may provide a hint of its preferred +/// interval, which we will respect within these intervals. +const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); +const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); + +/// Scheduling helper for background work across many tenants. +/// +/// Systems that need to run background work across many tenants may use this type +/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`] +/// implementation to provide the work to execute. This is a simple scheduler that just +/// polls the generator for outstanding work, replacing its queue of pending work with +/// what the generator yields on each call: the job generator can change its mind about +/// the order of jobs between calls. The job generator is notified when jobs complete, +/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement +/// admin APIs). +/// +/// For an example see [`crate::tenant::secondary::heatmap_uploader`] +/// +/// G: A JobGenerator that this scheduler will poll to find pending jobs +/// PJ: 'Pending Job': type for job descriptors that are ready to run +/// RJ: 'Running Job' type' for jobs that have been spawned +/// C : 'Completion' type that spawned jobs will send when they finish +/// CMD: 'Command' type that the job generator will accept to create jobs on-demand +pub(super) struct TenantBackgroundJobs +where + G: JobGenerator, + C: Completion, + PJ: PendingJob, + RJ: RunningJob, +{ + generator: G, + + /// Ready to run. Will progress to `running` once concurrent limit is satisfied, or + /// be removed on next scheduling pass. + pending: std::collections::VecDeque, + + /// Tasks currently running in Self::tasks for these tenants. Check this map + /// before pushing more work into pending for the same tenant. + running: HashMap, + + tasks: JoinSet, + + concurrency: usize, + + /// How often we would like schedule_interval to be called. 
+ pub(super) scheduling_interval: Duration, + + _phantom: PhantomData<(PJ, RJ, C, CMD)>, +} + +#[async_trait::async_trait] +pub(crate) trait JobGenerator +where + C: Completion, + PJ: PendingJob, + RJ: RunningJob, +{ + /// Called at each scheduling interval. Return a list of jobs to run, most urgent first. + /// + /// This function may be expensive (e.g. walk all tenants), but should not do any I/O. + /// Implementations should take care to yield the executor periodically if running + /// very long loops. + /// + /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending + /// jobs is not drained by the next scheduling interval, pending jobs will be cleared + /// and re-generated. + async fn schedule(&mut self) -> SchedulingResult; + + /// Called when a pending job is ready to be run. + /// + /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it. + fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin + Send>>); + + /// Called when a job previously spawned with spawn() transmits its completion + fn on_completion(&mut self, completion: C); + + /// Called when a command is received. A job will be spawned immediately if the return + /// value is Some, ignoring concurrency limits and the pending queue. + fn on_command(&mut self, cmd: CMD) -> anyhow::Result; +} + +/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling +pub(super) struct SchedulingResult { + pub(super) jobs: Vec, + /// The job generator would like to be called again this soon + pub(super) want_interval: Option, +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait PendingJob { + fn get_tenant_shard_id(&self) -> &TenantShardId; +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait Completion: Send + 'static { + fn get_tenant_shard_id(&self) -> &TenantShardId; +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait RunningJob { + fn get_barrier(&self) -> Barrier; +} + +impl TenantBackgroundJobs +where + C: Completion, + PJ: PendingJob, + RJ: RunningJob, + G: JobGenerator, +{ + pub(super) fn new(generator: G, concurrency: usize) -> Self { + Self { + generator, + pending: std::collections::VecDeque::new(), + running: HashMap::new(), + tasks: JoinSet::new(), + concurrency, + scheduling_interval: MAX_SCHEDULING_INTERVAL, + _phantom: PhantomData, + } + } + + pub(super) async fn run( + &mut self, + mut command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, + ) { + tracing::info!("Waiting for background_jobs_can start..."); + background_jobs_can_start.wait().await; + tracing::info!("background_jobs_can is ready, proceeding."); + + while !cancel.is_cancelled() { + // Look for new work: this is relatively expensive because we have to go acquire the lock on + // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones + // require an upload. 
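schedule_iteration (called just below) also folds the generator's `want_interval` hint into this polling cadence, never polling more often than MIN_SCHEDULING_INTERVAL or less often than MAX_SCHEDULING_INTERVAL. In isolation that adjustment looks roughly like the following; the real code additionally rounds to whole seconds:

use std::time::Duration;

const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);

/// Apply the generator's preferred interval, clamped to the allowed range.
fn effective_interval(current: Duration, want: Option<Duration>) -> Duration {
    match want {
        None => current,
        Some(want) => want.clamp(MIN_SCHEDULING_INTERVAL, MAX_SCHEDULING_INTERVAL),
    }
}

fn main() {
    // A very eager generator is throttled to the minimum interval.
    assert_eq!(
        effective_interval(MAX_SCHEDULING_INTERVAL, Some(Duration::from_millis(100))),
        MIN_SCHEDULING_INTERVAL
    );
    // A hint inside the range is taken as-is.
    assert_eq!(
        effective_interval(MAX_SCHEDULING_INTERVAL, Some(Duration::from_secs(5))),
        Duration::from_secs(5)
    );
    // No hint: keep the current cadence.
    assert_eq!(
        effective_interval(MAX_SCHEDULING_INTERVAL, None),
        MAX_SCHEDULING_INTERVAL
    );
}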
+ self.schedule_iteration(&cancel).await; + + if cancel.is_cancelled() { + return; + } + + // Schedule some work, if concurrency limit permits it + self.spawn_pending(); + + // Between scheduling iterations, we will: + // - Drain any complete tasks and spawn pending tasks + // - Handle incoming administrative commands + // - Check our cancellation token + let next_scheduling_iteration = Instant::now() + .checked_add(self.scheduling_interval) + .unwrap_or_else(|| { + tracing::warn!( + "Scheduling interval invalid ({}s)", + self.scheduling_interval.as_secs_f64() + ); + // unwrap(): this constant is small, cannot fail to add to time unless + // we are close to the end of the universe. + Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap() + }); + loop { + tokio::select! { + _ = cancel.cancelled() => { + tracing::info!("joining tasks"); + // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. + // It is the callers responsibility to make sure that the tasks they scheduled + // respect an appropriate cancellation token, to shut down promptly. It is only + // safe to wait on joining these tasks because we can see the cancellation token + // has been set. + while let Some(_r) = self.tasks.join_next().await {} + tracing::info!("terminating on cancellation token."); + + break; + }, + _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { + tracing::debug!("woke for scheduling interval"); + break;}, + cmd = command_queue.recv() => { + tracing::debug!("woke for command queue"); + let cmd = match cmd { + Some(c) =>c, + None => { + // SecondaryController was destroyed, and this has raced with + // our CancellationToken + tracing::info!("terminating on command queue destruction"); + cancel.cancel(); + break; + } + }; + + let CommandRequest{ + response_tx, + payload + } = cmd; + self.handle_command(payload, response_tx); + }, + _ = async { + let completion = self.process_next_completion().await; + match completion { + Some(c) => { + self.generator.on_completion(c); + if !cancel.is_cancelled() { + self.spawn_pending(); + } + }, + None => { + // Nothing is running, so just wait: expect that this future + // will be dropped when something in the outer select! fires. + cancel.cancelled().await; + } + } + + } => {} + } + } + } + } + + fn do_spawn(&mut self, job: PJ) { + let tenant_shard_id = *job.get_tenant_shard_id(); + let (in_progress, fut) = self.generator.spawn(job); + + self.tasks.spawn(fut); + + self.running.insert(tenant_shard_id, in_progress); + } + + /// For all pending tenants that are elegible for execution, spawn their task. + /// + /// Caller provides the spawn operation, we track the resulting execution. + fn spawn_pending(&mut self) { + while !self.pending.is_empty() && self.running.len() < self.concurrency { + // unwrap: loop condition includes !is_empty() + let pending = self.pending.pop_front().unwrap(); + self.do_spawn(pending); + } + } + + /// For administrative commands: skip the pending queue, ignore concurrency limits + fn spawn_now(&mut self, job: PJ) -> &RJ { + let tenant_shard_id = *job.get_tenant_shard_id(); + self.do_spawn(job); + self.running + .get(&tenant_shard_id) + .expect("We just inserted this") + } + + /// Wait until the next task completes, and handle its completion + /// + /// Cancellation: this method is cancel-safe. 
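The spawn_pending/process_next_completion pair above amounts to a concurrency-limited pump: pull work off the pending queue while the running set is below the limit, and drain completions from the JoinSet to free slots. A toy version of that loop, using integers as job keys (the real scheduler keys by TenantShardId and stores a RunningJob handle):

use std::collections::{HashSet, VecDeque};
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    const CONCURRENCY: usize = 2;

    let mut pending: VecDeque<(u32, u64)> = (0u32..5).map(|k| (k, u64::from(k) * 10)).collect();
    let mut running: HashSet<u32> = HashSet::new();
    let mut tasks: JoinSet<u32> = JoinSet::new();
    let mut finished = 0;

    while finished < 5 {
        // Spawn while the concurrency limit permits and work is queued.
        while running.len() < CONCURRENCY {
            let Some((key, work_ms)) = pending.pop_front() else { break };
            running.insert(key);
            tasks.spawn(async move {
                tokio::time::sleep(std::time::Duration::from_millis(work_ms)).await;
                key // the "completion" tells us which key finished
            });
        }
        // Drain one completion, freeing a slot for the next pending job.
        if let Some(done) = tasks.join_next().await {
            let key = done.expect("task panicked");
            running.remove(&key);
            finished += 1;
        }
    }
    assert!(pending.is_empty() && running.is_empty());
}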
+ async fn process_next_completion(&mut self) -> Option { + match self.tasks.join_next().await { + Some(r) => { + // We use a channel to drive completions, but also + // need to drain the JoinSet to avoid completed tasks + // accumulating. These calls are 1:1 because every task + // we spawn into this joinset submits is result to the channel. + let completion = r.expect("Panic in background task"); + + self.running.remove(completion.get_tenant_shard_id()); + Some(completion) + } + None => { + // Nothing is running, so we have nothing to wait for. We may drop out: the + // main even loop will call us again after the next time it has run something. + None + } + } + } + + /// Convert the command into a pending job, spawn it, and when the spawned + /// job completes, send the result down `response_tx`. + fn handle_command( + &mut self, + cmd: CMD, + response_tx: tokio::sync::oneshot::Sender, + ) { + let job = match self.generator.on_command(cmd) { + Ok(j) => j, + Err(e) => { + response_tx.send(CommandResponse { result: Err(e) }).ok(); + return; + } + }; + + let tenant_shard_id = job.get_tenant_shard_id(); + let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { + barrier + } else { + let running = self.spawn_now(job); + running.get_barrier().clone() + }; + + // This task does no I/O: it only listens for a barrier's completion and then + // sends to the command response channel. It is therefore safe to spawn this without + // any gates/task_mgr hooks. + tokio::task::spawn(async move { + barrier.wait().await; + + response_tx.send(CommandResponse { result: Ok(()) }).ok(); + }); + } + + fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option { + self.running.get(tenant_shard_id).map(|r| r.get_barrier()) + } + + /// Periodic execution phase: inspect all attached tenants and schedule any work they require. + /// + /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`] + /// + /// This function resets the pending list: it is assumed that the caller may change their mind about + /// which tenants need work between calls to schedule_iteration. + async fn schedule_iteration(&mut self, cancel: &CancellationToken) { + let SchedulingResult { + jobs, + want_interval, + } = self.generator.schedule().await; + + // Adjust interval based on feedback from the job generator + if let Some(want_interval) = want_interval { + // Calculation uses second granularity: this scheduler is not intended for high frequency tasks + self.scheduling_interval = Duration::from_secs(std::cmp::min( + std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()), + MAX_SCHEDULING_INTERVAL.as_secs(), + )); + } + + // The priority order of previously scheduled work may be invalidated by current state: drop + // all pending work (it will be re-scheduled if still needed) + self.pending.clear(); + + // While iterating over the potentially-long list of tenants, we will periodically yield + // to avoid blocking executor. 
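The yielding mentioned in that comment is what the new `yielding_loop` utility provides: walk a potentially very long iterator, yield back to the executor every N items, and bail out early on cancellation. A sketch of the idea under those assumptions (this is not the utility's real signature):

use tokio_util::sync::CancellationToken;

/// Visit every item, yielding every `interval` iterations and stopping early
/// if `cancel` fires at a yield point.
async fn yielding_for_each<I, T, F>(
    interval: usize,
    cancel: &CancellationToken,
    items: I,
    mut visit: F,
) -> Result<(), &'static str>
where
    I: IntoIterator<Item = T>,
    F: FnMut(T),
{
    for (i, item) in items.into_iter().enumerate() {
        visit(item);
        if (i + 1) % interval == 0 {
            tokio::task::yield_now().await;
            if cancel.is_cancelled() {
                return Err("cancelled");
            }
        }
    }
    Ok(())
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let mut sum = 0u64;
    yielding_for_each(1000, &cancel, 0u64..10_000, |n| sum += n)
        .await
        .unwrap();
    assert_eq!(sum, (0..10_000u64).sum::<u64>());
}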
+ yielding_loop(1000, cancel, jobs.into_iter(), |job| { + // Skip tenants that already have a write in flight + if !self.running.contains_key(job.get_tenant_shard_id()) { + self.pending.push_back(job); + } + }) + .await + .ok(); + } +} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 003cf0e92b..7c9103eea8 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use tokio::sync::RwLock; +use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{DeltaLayerWriter, ResidentLayer}; @@ -246,16 +246,43 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub async fn put_value( + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, val: &Value, ctx: &RequestContext, ) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - let inner: &mut _ = &mut *self.inner.write().await; + let mut inner = self.inner.write().await; self.assert_writable(); + self.put_value_locked(&mut inner, key, lsn, val, ctx).await + } + + pub(crate) async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> Result<()> { + let mut inner = self.inner.write().await; + self.assert_writable(); + for (key, vals) in values { + for (lsn, val) in vals { + self.put_value_locked(&mut inner, *key, *lsn, val, ctx) + .await?; + } + } + Ok(()) + } + + async fn put_value_locked( + &self, + locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, + key: Key, + lsn: Lsn, + val: &Value, + ctx: &RequestContext, + ) -> Result<()> { + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { // Avoid doing allocations for "small" values. @@ -264,7 +291,7 @@ impl InMemoryLayer { let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); buf.clear(); val.ser_into(&mut buf)?; - inner + locked_inner .file .write_blob( &buf, @@ -275,7 +302,7 @@ impl InMemoryLayer { .await? }; - let vec_map = inner.index.entry(key).or_default(); + let vec_map = locked_inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. @@ -285,13 +312,11 @@ impl InMemoryLayer { Ok(()) } - pub async fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys - Ok(()) } - /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. 
/// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8ae911b31e..f5adf9d639 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1118,6 +1118,7 @@ impl LayerInner { tracing::info!("evicted layer after unknown residence period"); } } + timeline.metrics.evictions.inc(); timeline .metrics .resident_physical_size_sub(self.desc.file_size); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 7ff1873eda..aa5894cc37 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind { ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, + HeatmapUpload, + SecondaryDownload, } impl BackgroundLoopKind { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e84fa1848..24a92859b7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -373,15 +373,20 @@ pub struct GcInfo { } /// An error happened in a get() operation. -#[derive(thiserror::Error)] -pub enum PageReconstructError { +#[derive(thiserror::Error, Debug)] +pub(crate) enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), + #[error("Ancestor LSN wait error: {0}")] + AncestorLsnTimeout(#[from] WaitLsnError), + /// The operation was cancelled + #[error("Cancelled")] Cancelled, /// The ancestor of this is being stopped + #[error("ancestor timeline {0} is being stopped")] AncestorStopping(TimelineId), /// An error happened replaying WAL records @@ -402,32 +407,6 @@ enum FlushLayerError { Other(#[from] anyhow::Error), } -impl std::fmt::Debug for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - Self::Other(err) => err.fmt(f), - Self::Cancelled => write!(f, "cancelled"), - Self::AncestorStopping(timeline_id) => { - write!(f, "ancestor timeline {timeline_id} is being stopped") - } - Self::WalRedo(err) => err.fmt(f), - } - } -} - -impl std::fmt::Display for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - Self::Other(err) => err.fmt(f), - Self::Cancelled => write!(f, "cancelled"), - Self::AncestorStopping(timeline_id) => { - write!(f, "ancestor timeline {timeline_id} is being stopped") - } - Self::WalRedo(err) => err.fmt(f), - } - } -} - #[derive(Clone, Copy)] pub enum LogicalSizeCalculationCause { Initial, @@ -452,6 +431,21 @@ impl std::fmt::Debug for Timeline { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum WaitLsnError { + // Called on a timeline which is shutting down + #[error("Shutdown")] + Shutdown, + + // Called on an timeline not in active state or shutting down + #[error("Bad state (not active)")] + BadState, + + // Timeout expired while waiting for LSN to catch up with goal. + #[error("{0}")] + Timeout(String), +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -486,7 +480,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn get( + pub(crate) async fn get( &self, key: Key, lsn: Lsn, @@ -496,6 +490,11 @@ impl Timeline { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // This check is debug-only because of the cost of hashing, and because it's a double-check: we + // already checked the key against the shard_identity when looking up the Timeline from + // page_service. + debug_assert!(!self.shard_identity.is_key_disposable(&key)); + // XXX: structured stats collection for layer eviction here. trace!( "get page request for {}@{} from task kind {:?}", @@ -629,24 +628,28 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn( + pub(crate) async fn wait_lsn( &self, lsn: Lsn, _ctx: &RequestContext, /* Prepare for use by cancellation */ - ) -> anyhow::Result<()> { - anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); + ) -> Result<(), WaitLsnError> { + if self.cancel.is_cancelled() { + return Err(WaitLsnError::Shutdown); + } else if !self.is_active() { + return Err(WaitLsnError::BadState); + } // This should never be called from the WAL receiver, because that could lead // to a deadlock. - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), "wait_lsn cannot be called in WAL receiver" ); - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), "wait_lsn cannot be called in WAL receiver" ); - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -660,18 +663,22 @@ impl Timeline { { Ok(()) => Ok(()), Err(e) => { - // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo - drop(_timer); - let walreceiver_status = self.walreceiver_status(); - Err(anyhow::Error::new(e).context({ - format!( + use utils::seqwait::SeqWaitError::*; + match e { + Shutdown => Err(WaitLsnError::Shutdown), + Timeout => { + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo + drop(_timer); + let walreceiver_status = self.walreceiver_status(); + Err(WaitLsnError::Timeout(format!( "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn(), walreceiver_status, - ) - })) + ))) + } + } } } } @@ -1459,6 +1466,7 @@ impl Timeline { max_lsn_wal_lag, auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), + ingest_batch_size: self.conf.ingest_batch_size, }, broker_client, ctx, @@ -2223,13 +2231,13 @@ impl Timeline { return Err(layer_traversal_error( if cfg!(test) { format!( - "could not find data for key {} at LSN {}, for request at LSN {}\n{}", - key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), ) } else { format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn + "could not find data for key {} (shard {:?}) at LSN {}, for request at 
LSN {}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn ) }, traversal_path, @@ -2289,11 +2297,12 @@ impl Timeline { ancestor .wait_lsn(timeline.ancestor_lsn, ctx) .await - .with_context(|| { - format!( - "wait for lsn {} on ancestor timeline_id={}", - timeline.ancestor_lsn, ancestor.timeline_id - ) + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => PageReconstructError::Cancelled, + e @ WaitLsnError::BadState => { + PageReconstructError::Other(anyhow::anyhow!(e)) + } })?; timeline_owned = ancestor; @@ -2471,9 +2480,27 @@ impl Timeline { Ok(()) } - async fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - let layer = self.get_layer_for_write(lsn).await?; - layer.put_tombstone(key_range, lsn).await?; + async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // Pick the first LSN in the batch to get the layer to write to. + for lsns in values.values() { + if let Some((lsn, _)) = lsns.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_values(values, ctx).await?; + break; + } + } + Ok(()) + } + + async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = tombstones.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_tombstones(tombstones).await?; + } Ok(()) } @@ -3035,6 +3062,15 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + key = key.next(); + continue; + } let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { @@ -3061,6 +3097,7 @@ impl Timeline { } } }; + image_layer_writer.put_image(key, &img).await?; key = key.next(); } @@ -3631,7 +3668,15 @@ impl Timeline { ))) }); - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + if !self.shard_identity.is_key_disposable(&key) { + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } if !new_layers.is_empty() { fail_point!("after-timeline-compacted-first-L1"); @@ -4186,7 +4231,7 @@ impl Timeline { .context("Failed to reconstruct a page image:") { Ok(img) => img, - Err(e) => return Err(PageReconstructError::from(e)), + Err(e) => return Err(PageReconstructError::WalRedo(e)), }; if img.len() == page_cache::PAGE_SZ { @@ -4529,8 +4574,16 @@ impl<'a> TimelineWriter<'a> { self.tl.put_value(key, lsn, value, ctx).await } - pub async fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - self.tl.put_tombstone(key_range, lsn).await + pub(crate) async fn put_batch( + &self, + batch: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.tl.put_values(batch, ctx).await + } + + pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + self.tl.put_tombstones(batch).await } /// Track the end of the latest digested WAL record. @@ -4541,11 +4594,11 @@ impl<'a> TimelineWriter<'a> { /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for /// the 'lsn' or anything older. The previous last record LSN is stored alongside /// the latest and can be read. 
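With wait_lsn returning a structured WaitLsnError, callers classify the failure explicitly instead of wrapping an anyhow context string, as the ancestor-wait code above does. The shape of that pattern with trimmed-down enums; thiserror and anyhow are assumed, as in the real crate:

use thiserror::Error;

#[derive(Error, Debug)]
enum WaitLsnError {
    #[error("Shutdown")]
    Shutdown,
    #[error("Bad state (not active)")]
    BadState,
    #[error("{0}")]
    Timeout(String),
}

#[derive(Error, Debug)]
enum PageReconstructError {
    #[error(transparent)]
    Other(#[from] anyhow::Error),
    #[error("Ancestor LSN wait error: {0}")]
    AncestorLsnTimeout(WaitLsnError),
    #[error("Cancelled")]
    Cancelled,
}

/// How a caller that waited on an ancestor branch might classify the failure.
fn classify(e: WaitLsnError) -> PageReconstructError {
    match e {
        e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
        WaitLsnError::Shutdown => PageReconstructError::Cancelled,
        e @ WaitLsnError::BadState => PageReconstructError::Other(anyhow::anyhow!(e)),
    }
}

fn main() {
    let err = classify(WaitLsnError::Timeout("timed out waiting for WAL".to_string()));
    println!("{err}");
}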
- pub fn finish_write(&self, new_lsn: Lsn) { + pub(crate) fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } - pub fn update_current_logical_size(&self, delta: i64) { + pub(crate) fn update_current_logical_size(&self, delta: i64) { self.tl.update_current_logical_size(delta) } } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index e32265afb5..2fab6722b8 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -58,6 +58,7 @@ pub struct WalReceiverConf { pub max_lsn_wal_lag: NonZeroU64, pub auth_token: Option>, pub availability_zone: Option, + pub ingest_batch_size: u64, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 5a5b3d7586..7fa5bb7689 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -411,6 +411,7 @@ impl ConnectionManagerState { let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; + let ingest_batch_size = self.conf.ingest_batch_size; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -430,6 +431,7 @@ impl ConnectionManagerState { connect_timeout, ctx, node_id, + ingest_batch_size, ) .await; @@ -1345,6 +1347,7 @@ mod tests { max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), auth_token: None, availability_zone: None, + ingest_batch_size: 1, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 61ab236322..e398d683e5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS}, + metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, @@ -106,6 +106,7 @@ impl From for WalReceiverError { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. +#[allow(clippy::too_many_arguments)] pub(super) async fn handle_walreceiver_connection( timeline: Arc, wal_source_connconf: PgConnectionConfig, @@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection( connect_timeout: Duration, ctx: RequestContext, node: NodeId, + ingest_batch_size: u64, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection( { let mut decoded = DecodedWALRecord::default(); - let mut modification = timeline.begin_modification(endlsn); + let mut modification = timeline.begin_modification(startlsn); + let mut uncommitted_records = 0; + let mut filtered_records = 0; while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. 
Without this alignment we are @@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } - walingest + // Ingest the records without immediately committing them. + let ingested = walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; + if !ingested { + tracing::debug!("ingest: filtered out record @ LSN {lsn}"); + WAL_INGEST.records_filtered.inc(); + filtered_records += 1; + } fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; + + // Commit every ingest_batch_size records. Even if we filtered out + // all records, we still need to call commit to advance the LSN. + uncommitted_records += 1; + if uncommitted_records >= ingest_batch_size { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; + uncommitted_records = 0; + filtered_records = 0; + } + } + + // Commit the remaining records. + if uncommitted_records > 0 { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index a6a8972970..8df0c81c7a 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -48,20 +48,18 @@ use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; -pub struct WalIngest<'a> { +pub struct WalIngest { shard: ShardIdentity, - timeline: &'a Timeline, - checkpoint: CheckPoint, checkpoint_modified: bool, } -impl<'a> WalIngest<'a> { +impl WalIngest { pub async fn new( - timeline: &'a Timeline, + timeline: &Timeline, startpoint: Lsn, - ctx: &'_ RequestContext, - ) -> anyhow::Result> { + ctx: &RequestContext, + ) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -70,7 +68,6 @@ impl<'a> WalIngest<'a> { Ok(WalIngest { shard: *timeline.get_shard_identity(), - timeline, checkpoint, checkpoint_modified: false, }) @@ -84,6 +81,8 @@ impl<'a> WalIngest<'a> { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
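Note on the walreceiver hunks above: ingestion moves from commit-per-record to commit-per-batch. ingest_record now only buffers writes into the DatadirModification and reports whether anything was ingested, while the connection loop calls modification.commit() once every ingest_batch_size records plus a final flush, tracking filtered records separately so the records_committed metric stays accurate. The sketch below restates that control flow in isolation; the ingest and commit callables are stand-ins for the real WalIngest::ingest_record and DatadirModification::commit calls, not their actual signatures.

    // Commit-every-N-records loop, mirroring the hunk above. `ingest` returns
    // true when the record produced writes (i.e. it was not filtered out);
    // `commit` flushes buffered writes and advances last_record_lsn.
    fn ingest_batched<I, C>(lsns: &[u64], batch_size: u64, mut ingest: I, mut commit: C)
    where
        I: FnMut(u64) -> bool,
        C: FnMut(u64, u64), // commit(last_lsn, committed_record_count)
    {
        let mut uncommitted = 0u64;
        let mut filtered = 0u64;
        let mut last_lsn = 0u64;
        for &lsn in lsns {
            if !ingest(lsn) {
                // Filtered records still advance the LSN once the batch commits.
                filtered += 1;
            }
            last_lsn = lsn;
            uncommitted += 1;
            if uncommitted >= batch_size {
                commit(last_lsn, uncommitted - filtered);
                uncommitted = 0;
                filtered = 0;
            }
        }
        // Flush the tail so the last records become visible to readers.
        if uncommitted > 0 {
            commit(last_lsn, uncommitted - filtered);
        }
    }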
/// + /// This function returns `true` if the record was ingested, and `false` if it was filtered out + /// pub async fn ingest_record( &mut self, recdata: Bytes, @@ -91,11 +90,13 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { WAL_INGEST.records_received.inc(); + let pg_version = modification.tline.pg_version; + let prev_len = modification.len(); - modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version)?; + modification.set_lsn(lsn)?; + decode_wal_record(recdata, decoded, pg_version)?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -132,9 +133,9 @@ impl<'a> WalIngest<'a> { } pg_constants::RM_DBASE_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID"); + debug!(%info, %pg_version, "handle RM_DBASE_ID"); - if self.timeline.pg_version == 14 { + if pg_version == 14 { if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); @@ -150,7 +151,7 @@ impl<'a> WalIngest<'a> { .await?; } } - } else if self.timeline.pg_version == 15 { + } else if pg_version == 15 { if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { @@ -170,7 +171,7 @@ impl<'a> WalIngest<'a> { .await?; } } - } else if self.timeline.pg_version == 16 { + } else if pg_version == 16 { if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { @@ -399,19 +400,11 @@ impl<'a> WalIngest<'a> { self.checkpoint_modified = false; } - if modification.is_empty() { - tracing::debug!("ingest: filtered out record @ LSN {lsn}"); - WAL_INGEST.records_filtered.inc(); - modification.tline.finish_write(lsn); - } else { - WAL_INGEST.records_committed.inc(); - modification.commit(ctx).await?; - } + // Note that at this point this record is only cached in the modification + // until commit() is called to flush the data into the repository and update + // the latest LSN. - // Now that this record has been fully handled, including updating the - // checkpoint data, let the repository know that it is up-to-date to this LSN. - - Ok(()) + Ok(modification.len() > prev_len) } /// Do not store this block, but observe it for the purposes of updating our relation size state. @@ -458,7 +451,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? 
// do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 { @@ -511,7 +504,7 @@ impl<'a> WalIngest<'a> { let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - match self.timeline.pg_version { + match modification.tline.pg_version { 14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -735,7 +728,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -816,10 +809,11 @@ impl<'a> WalIngest<'a> { let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + let pg_version = modification.tline.pg_version; assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); - match self.timeline.pg_version { + match pg_version { 16 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -882,7 +876,7 @@ impl<'a> WalIngest<'a> { } _ => bail!( "Neon RMGR has no known compatibility with PostgreSQL version {}", - self.timeline.pg_version + pg_version ), } @@ -905,7 +899,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -983,16 +977,14 @@ impl<'a> WalIngest<'a> { let src_db_id = rec.src_db_id; let src_tablespace_id = rec.src_tablespace_id; - // Creating a database is implemented by copying the template (aka. source) database. - // To copy all the relations, we need to ask for the state as of the same LSN, but we - // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for - // the last valid LSN to advance up to it. So we use the previous record's LSN in the - // get calls instead. 
- let req_lsn = modification.tline.get_last_record_lsn(); - let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) + .list_rels( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -1000,7 +992,12 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) + .get_relmap_file( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; modification .put_relmap_file(tablespace_id, db_id, filemap, ctx) @@ -1014,7 +1011,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1032,7 +1029,13 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + true, + ctx, + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1103,7 +1106,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) @@ -1125,7 +1128,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no, ctx) @@ -1198,10 +1201,9 @@ impl<'a> WalIngest<'a> { dbnode: xnode.dbnode, relnode: xnode.relnode, }; - let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true, ctx) + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1255,10 +1257,9 @@ impl<'a> WalIngest<'a> { // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) .await? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1470,20 +1471,6 @@ impl<'a> WalIngest<'a> { Ok(()) } - async fn get_relsize( - &mut self, - rel: RelTag, - lsn: Lsn, - ctx: &RequestContext, - ) -> anyhow::Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { - 0 - } else { - self.timeline.get_rel_size(rel, lsn, true, ctx).await? - }; - Ok(nblocks) - } - async fn handle_rel_extend( &mut self, modification: &mut DatadirModification<'_>, @@ -1495,7 +1482,6 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. 
// TODO: would be nice if to be more explicit about it - let last_lsn = modification.lsn; // Get current size and put rel creation if rel doesn't exist // @@ -1503,11 +1489,14 @@ impl<'a> WalIngest<'a> { // check the cache too. This is because eagerly checking the cache results in // less work overall and 10% better performance. It's more work on cache miss // but cache miss is rare. - let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) { + let old_nblocks = if let Some(nblocks) = modification + .tline + .get_cached_rel_size(&rel, modification.get_lsn()) + { nblocks - } else if !self - .timeline - .get_rel_exists(rel, last_lsn, true, ctx) + } else if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1517,7 +1506,10 @@ impl<'a> WalIngest<'a> { .context("Relation Error")?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? }; if new_nblocks > old_nblocks { @@ -1570,10 +1562,9 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it - let last_lsn = self.timeline.get_last_record_lsn(); - let old_nblocks = if !self - .timeline - .get_slru_segment_exists(kind, segno, last_lsn, ctx) + let old_nblocks = if !modification + .tline + .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1582,8 +1573,9 @@ impl<'a> WalIngest<'a> { .await?; 0 } else { - self.timeline - .get_slru_segment_size(kind, segno, last_lsn, ctx) + modification + .tline + .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx) .await? }; @@ -1606,6 +1598,26 @@ impl<'a> WalIngest<'a> { } } +async fn get_relsize( + modification: &DatadirModification<'_>, + rel: RelTag, + ctx: &RequestContext, +) -> anyhow::Result { + let nblocks = if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .await? + { + 0 + } else { + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? + }; + Ok(nblocks) +} + #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { @@ -1632,10 +1644,7 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test<'a>( - tline: &'a Timeline, - ctx: &RequestContext, - ) -> Result> { + async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file @@ -1680,29 +1689,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
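Note on the walingest hunks above: reads of relation metadata no longer go through last_record_lsn; they pass Version::Modified(&modification) so lookups made while replaying a record see the writes already buffered in the in-progress DatadirModification, while the tests that follow keep reading historical pages with Version::Lsn(...). Version itself is defined in pgdatadir_mapping.rs, which is outside this excerpt, so the sketch below is only a rough, simplified guess at its shape to make the two call styles concrete.

    // Rough sketch with simplified stand-in types; not the real definitions.
    #[derive(Clone, Copy)]
    struct Lsn(u64);

    struct DatadirModification {
        // LSN of the record currently being applied, plus buffered writes (omitted).
        lsn: Lsn,
    }

    enum Version<'a> {
        // Read committed data as of a fixed LSN (used by the tests below).
        Lsn(Lsn),
        // Read through an in-progress modification: buffered writes are visible
        // and the read does not wait for last_record_lsn to catch up.
        Modified(&'a DatadirModification),
    }

    impl<'a> Version<'a> {
        fn get_lsn(&self) -> Lsn {
            match self {
                Version::Lsn(lsn) => *lsn,
                Version::Modified(m) => m.lsn,
            }
        }
    }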
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); @@ -1710,46 +1719,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1765,19 +1774,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); @@ -1785,13 +1794,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1804,7 +1813,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) .await?, 0 ); @@ -1817,19 +1826,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x70), 
false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, TEST_IMG("foo blk 1") ); @@ -1842,21 +1851,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1883,13 +1892,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); @@ -1902,7 +1911,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, false ); @@ -1920,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, 1 ); @@ -1959,24 +1968,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, relsize ); @@ -1987,7 +1996,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2004,7 +2013,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 1 ); @@ -2014,7 +2023,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2023,7 +2032,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, relsize ); @@ -2032,7 +2041,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2052,13 +2061,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, relsize ); @@ -2068,7 +2077,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2101,7 +2110,9 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE + 1 ); @@ -2113,7 +2124,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2126,7 +2139,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2142,7 +2157,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), 
false, &ctx) + .await?, size as BlockNumber ); @@ -2179,7 +2196,7 @@ mod tests { let wal_segment_path = format!("{path}/000000010000000000000001.zst"); let source_initdb_path = format!("{path}/{INITDB_PATH}"); let startpoint = Lsn::from_hex("14AEC08").unwrap(); - let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); let (tenant, ctx) = harness.load().await; @@ -2221,7 +2238,7 @@ mod tests { let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx) .await .unwrap(); - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(startpoint); let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); @@ -2235,6 +2252,7 @@ mod tests { .await .unwrap(); } + modification.commit(&ctx).await.unwrap(); } let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 94e95fd3b3..6918698f29 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,6 +22,7 @@ use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; +use pageserver_api::shard::TenantShardId; use serde::Serialize; use std::collections::VecDeque; use std::io; @@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; +use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock}; #[cfg(feature = "testing")] use std::sync::atomic::{AtomicUsize, Ordering}; -#[cfg(feature = "testing")] -use pageserver_api::shard::TenantShardId; - use crate::config::PageServerConf; use crate::metrics::{ WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, @@ -92,7 +90,7 @@ struct ProcessOutput { /// records. /// pub struct PostgresRedoManager { - tenant_id: TenantId, + tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, redo_process: RwLock>>, @@ -186,10 +184,13 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { + pub fn new( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + ) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { - tenant_id, + tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), redo_process: RwLock::new(None), @@ -244,8 +245,12 @@ impl PostgresRedoManager { let timer = WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); let proc = Arc::new( - WalRedoProcess::launch(self.conf, self.tenant_id, pg_version) - .context("launch walredo process")?, + WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, ); timer.observe_duration(); *proc_guard = Some(Arc::clone(&proc)); @@ -638,7 +643,7 @@ impl CloseFileDescriptors for C { struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, // Some() on construction, only becomes None on Drop. child: Option, stdout: Mutex, @@ -652,10 +657,10 @@ impl WalRedoProcess { // // Start postgres binary in special WAL redo mode. 
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] fn launch( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. @@ -680,7 +685,7 @@ impl WalRedoProcess { // as close-on-exec by default, but that's not enough, since we use // libraries that directly call libc open without setting that flag. .close_fds() - .spawn_no_leak_child(tenant_id) + .spawn_no_leak_child(tenant_shard_id) .context("spawn process")?; WAL_REDO_PROCESS_COUNTERS.started.inc(); let mut child = scopeguard::guard(child, |child| { @@ -741,12 +746,12 @@ impl WalRedoProcess { error!(error=?e, "failed to read from walredo stderr"); } } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version)) + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) ); Ok(Self { conf, - tenant_id, + tenant_shard_id, child: Some(child), stdin: Mutex::new(ProcessInput { stdin, @@ -772,7 +777,7 @@ impl WalRedoProcess { // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] fn apply_wal_records( &self, tag: BufferTag, @@ -966,11 +971,7 @@ impl WalRedoProcess { // these files will be collected to an allure report let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId. - let path = self - .conf - .tenant_path(&TenantShardId::unsharded(self.tenant_id)) - .join(&filename); + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); let res = std::fs::OpenOptions::new() .write(true) @@ -1004,7 +1005,7 @@ impl Drop for WalRedoProcess { /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. struct NoLeakChild { - tenant_id: TenantId, + tenant_id: TenantShardId, child: Option, } @@ -1023,7 +1024,7 @@ impl DerefMut for NoLeakChild { } impl NoLeakChild { - fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result { + fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { let child = command.spawn()?; Ok(NoLeakChild { tenant_id, @@ -1078,7 +1079,7 @@ impl Drop for NoLeakChild { Some(child) => child, None => return, }; - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_id; // Offload the kill+wait of the child process into the background. // If someone stops the runtime, we'll leak the child process. // We can ignore that case because we only stop the runtime on pageserver exit. @@ -1086,7 +1087,11 @@ impl Drop for NoLeakChild { tokio::task::spawn_blocking(move || { // Intentionally don't inherit the tracing context from whoever is dropping us. // This thread here is going to outlive of our dropper. 
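Note on the walredo hunks above: TenantShardId is threaded through WalRedoProcess and NoLeakChild so log spans carry both the tenant and the shard, but the kill-on-drop shape of NoLeakChild itself is unchanged: the child postgres process sits in an Option, and if the wrapper is dropped without an explicit kill-and-wait, Drop takes the child and kills it in the background. A stripped-down sketch of that pattern, using a std thread instead of tokio and omitting logging and metrics:

    use std::process::{Child, Command};

    // Kill-on-drop wrapper: the child is never leaked even if the owner
    // forgets to call kill_and_wait() explicitly.
    struct NoLeakChild {
        child: Option<Child>,
    }

    impl NoLeakChild {
        fn spawn(command: &mut Command) -> std::io::Result<Self> {
            Ok(NoLeakChild {
                child: Some(command.spawn()?),
            })
        }

        fn kill_and_wait(mut self) {
            if let Some(mut child) = self.child.take() {
                let _ = child.kill();
                let _ = child.wait();
            }
        }
    }

    impl Drop for NoLeakChild {
        fn drop(&mut self) {
            // Only runs if kill_and_wait() was not called; offload to a
            // background thread so Drop itself never blocks.
            if let Some(mut child) = self.child.take() {
                std::thread::spawn(move || {
                    let _ = child.kill();
                    let _ = child.wait();
                });
            }
        }
    }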
- let span = tracing::info_span!("walredo", %tenant_id); + let span = tracing::info_span!( + "walredo", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + ); let _entered = span.enter(); Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); }) @@ -1096,11 +1101,11 @@ impl Drop for NoLeakChild { } trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result; + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; } impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { NoLeakChild::spawn(tenant_id, self) } } @@ -1155,6 +1160,7 @@ mod tests { use crate::repository::Key; use crate::{config::PageServerConf, walrecord::NeonWalRecord}; use bytes::Bytes; + use pageserver_api::shard::TenantShardId; use std::str::FromStr; use utils::{id::TenantId, lsn::Lsn}; @@ -1264,9 +1270,9 @@ mod tests { let repo_dir = camino_tempfile::tempdir()?; let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); Ok(RedoHarness { _repo_dir: repo_dir, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3b038f906f..3a7c0f1bb6 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -35,7 +35,8 @@ #define PageStoreTrace DEBUG5 -#define RECONNECT_INTERVAL_USEC 1000000 +#define MIN_RECONNECT_INTERVAL_USEC 1000 +#define MAX_RECONNECT_INTERVAL_USEC 1000000 bool connected = false; PGconn *pageserver_conn = NULL; @@ -133,6 +134,11 @@ pageserver_connect(int elevel) const char *values[3]; int n; + static TimestampTz last_connect_time = 0; + static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; + TimestampTz now; + uint64_t us_since_last_connect; + Assert(!connected); if (CheckConnstringUpdated()) @@ -140,6 +146,22 @@ pageserver_connect(int elevel) ReloadConnstring(); } + now = GetCurrentTimestamp(); + us_since_last_connect = now - last_connect_time; + if (us_since_last_connect < delay_us) + { + pg_usleep(delay_us - us_since_last_connect); + delay_us *= 2; + if (delay_us > MAX_RECONNECT_INTERVAL_USEC) + delay_us = MAX_RECONNECT_INTERVAL_USEC; + last_connect_time = GetCurrentTimestamp(); + } + else + { + delay_us = MIN_RECONNECT_INTERVAL_USEC; + last_connect_time = now; + } + /* * Connect using the connection string we got from the * neon.pageserver_connstring GUC. 
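Note on the libpagestore.c hunk above: the fixed one-second sleep between pageserver reconnect attempts is replaced with capped exponential backoff inside pageserver_connect: wait out the remainder of the current delay, double it up to MAX_RECONNECT_INTERVAL_USEC, and reset to MIN_RECONNECT_INTERVAL_USEC once the previous attempt is old enough. The same policy is restated below as a small self-contained Rust sketch; the constants mirror the C values, and the C hunk above remains the actual change.

    use std::time::{Duration, Instant};

    const MIN_RECONNECT_INTERVAL: Duration = Duration::from_micros(1_000);
    const MAX_RECONNECT_INTERVAL: Duration = Duration::from_micros(1_000_000);

    // Capped exponential backoff matching the C logic above.
    struct ReconnectBackoff {
        last_attempt: Option<Instant>,
        delay: Duration,
    }

    impl ReconnectBackoff {
        fn new() -> Self {
            ReconnectBackoff {
                last_attempt: None,
                delay: MIN_RECONNECT_INTERVAL,
            }
        }

        // Call right before each connection attempt.
        fn before_connect(&mut self) {
            let now = Instant::now();
            match self.last_attempt {
                Some(last) if now.duration_since(last) < self.delay => {
                    // Reconnecting too fast: sleep off the rest of the delay,
                    // then back off further, capped at the maximum.
                    std::thread::sleep(self.delay - now.duration_since(last));
                    self.delay = (self.delay * 2).min(MAX_RECONNECT_INTERVAL);
                }
                _ => {
                    // The last attempt is old (or this is the first one): reset.
                    self.delay = MIN_RECONNECT_INTERVAL;
                }
            }
            self.last_attempt = Some(Instant::now());
        }
    }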
If the NEON_AUTH_TOKEN environment @@ -333,7 +355,6 @@ pageserver_send(NeonRequest *request) { HandleMainLoopInterrupts(); n_reconnect_attempts += 1; - pg_usleep(RECONNECT_INTERVAL_USEC); } n_reconnect_attempts = 0; } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7fb0cab9a0..1f7c473e7d 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -99,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) port = strchr(host, ':'); if (port == NULL) { - walprop_log(FATAL, "port is not specified"); + wp_log(FATAL, "port is not specified"); } *port++ = '\0'; sep = strchr(port, ','); @@ -107,7 +107,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) *sep++ = '\0'; if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS) { - walprop_log(FATAL, "Too many safekeepers"); + wp_log(FATAL, "too many safekeepers"); } wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; @@ -123,7 +123,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant); if (written > MAXCONNINFO || written < 0) - walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); @@ -133,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } if (wp->n_safekeepers < 1) { - walprop_log(FATAL, "Safekeepers addresses are not specified"); + wp_log(FATAL, "safekeepers addresses are not specified"); } wp->quorum = wp->n_safekeepers / 2 + 1; @@ -144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); wp->greetRequest.systemId = wp->config->systemId; if (!wp->config->neon_timeline) - walprop_log(FATAL, "neon.timeline_id is not provided"); + wp_log(FATAL, "neon.timeline_id is not provided"); if (*wp->config->neon_timeline != '\0' && !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); if (!wp->config->neon_tenant) - walprop_log(FATAL, "neon.tenant_id is not provided"); + wp_log(FATAL, "neon.tenant_id is not provided"); if (*wp->config->neon_tenant != '\0' && !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant); + wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); wp->greetRequest.timeline = wp->config->pgTimeline; wp->greetRequest.walSegSize = wp->config->wal_segment_size; @@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp) if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wp->config->safekeeper_connection_timeout)) { - walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); + wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection 
attempt took longer than that", + sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } @@ -356,8 +356,8 @@ ResetConnection(Safekeeper *sk) * * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ - walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * Even though the connection failed, we still need to clean up the @@ -380,7 +380,7 @@ ResetConnection(Safekeeper *sk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port); + wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); @@ -434,7 +434,7 @@ ReconnectSafekeepers(WalProposer *wp) static void AdvancePollState(Safekeeper *sk, uint32 events) { -#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */ +#ifdef WALPROPOSER_LIB /* wp_log needs wp in lib build */ WalProposer *wp = sk->wp; #endif @@ -452,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * ResetConnection */ case SS_OFFLINE: - walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); + wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -488,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * requests. */ case SS_VOTING: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -517,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * Idle state for waiting votes from quorum. 
*/ case SS_IDLE: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -543,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk) switch (result) { case WP_CONN_POLLING_OK: - walprop_log(LOG, "connected with node %s:%s", sk->host, - sk->port); + wp_log(LOG, "connected with node %s:%s", sk->host, + sk->port); sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* @@ -567,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk) break; case WP_CONN_POLLING_FAILED: - walprop_log(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * If connecting failed, we don't want to restart the connection @@ -604,8 +604,8 @@ SendStartWALPush(Safekeeper *sk) if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) { - walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -641,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk) break; case WP_EXEC_FAILED: - walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; @@ -652,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk) * wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); + wp_log(WARNING, "received bad response from safekeeper %s:%s query execution", + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -708,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (wp->n_connected == wp->quorum) { wp->propTerm++; - walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); + wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); wp->voteRequest = (VoteRequest) { @@ -721,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk) else if (sk->greetResponse.term > wp->propTerm) { /* Another compute with higher term is running. 
*/ - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->greetResponse.term, wp->propTerm); } /* @@ -763,7 +763,7 @@ SendVoteRequest(Safekeeper *sk) WalProposer *wp = sk->wp; /* We have quorum for voting, send our vote request */ - walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); + wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); /* On failure, logging & resetting is handled */ if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) return; @@ -780,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; - walprop_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + wp_log(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -795,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk) if ((!sk->voteResponse.voteGiven) && (sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum)) { - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, wp->propTerm); } Assert(sk->voteResponse.term == wp->propTerm); @@ -841,7 +841,7 @@ HandleElectedProposer(WalProposer *wp) */ if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) { - walprop_log(FATAL, "failed to download WAL for logical replicaiton"); + wp_log(FATAL, "failed to download WAL for logical replicaiton"); } if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) @@ -948,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->timelineStartLsn != InvalidXLogRecPtr && wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) { - walprop_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); + wp_log(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(wp->timelineStartLsn), + LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); } wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; } @@ -969,7 +969,7 @@ 
DetermineEpochStartLsn(WalProposer *wp) { wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); } - walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); + wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } /* @@ -996,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; - walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, - wp->propTerm, - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, - LSN_FORMAT_ARGS(wp->truncateLsn)); + wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + wp->quorum, + wp->propTerm, + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + LSN_FORMAT_ARGS(wp->truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN @@ -1034,10 +1034,10 @@ DetermineEpochStartLsn(WalProposer *wp) * scenario. */ disable_core_dump(); - walprop_log(PANIC, - "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); + wp_log(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } walprop_shared->mineLastElectedTerm = wp->propTerm; @@ -1091,34 +1091,10 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - - if (sk->startStreamingAt < wp->truncateLsn) - { - /* - * There's a gap between the WAL starting point and a truncateLsn, - * which can't appear in a normal working cluster. That gap means - * that all safekeepers reported that they have persisted WAL up - * to the truncateLsn before, but now current safekeeper tells - * otherwise. - * - * Also we have a special condition here, which is empty - * safekeeper with no history. In combination with a gap, that can - * happen when we introduce a new safekeeper to the cluster. This - * is a rare case, which is triggered manually for now, and should - * be treated with care. - */ - - /* - * truncateLsn will not change without ack from current - * safekeeper, and it's aligned to the WAL record, so we can - * safely start streaming from this point. 
- */ - sk->startStreamingAt = wp->truncateLsn; - - walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", - sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn), - LSN_FORMAT_ARGS(sk->startStreamingAt)); - } + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else { @@ -1141,7 +1117,7 @@ SendProposerElected(Safekeeper *sk) } } - Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn); + Assert(sk->startStreamingAt <= wp->availableLsn); msg.tag = 'e'; msg.term = wp->propTerm; @@ -1150,9 +1126,9 @@ SendProposerElected(Safekeeper *sk) msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0; - walprop_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + wp_log(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); resetStringInfo(&sk->outbuf); pq_sendint64_le(&sk->outbuf, msg.tag); @@ -1261,8 +1237,8 @@ HandleActiveState(Safekeeper *sk, uint32 events) /* expected never to happen, c.f. 
walprop_pg_active_state_update_event_set */ if (events & WL_SOCKET_CLOSED) { - walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", - sk->host, sk->port); + wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -1323,12 +1299,12 @@ SendAppendRequests(Safekeeper *sk) req = &sk->appendRequest; PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); - walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); + wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); resetStringInfo(&sk->outbuf); @@ -1355,8 +1331,8 @@ SendAppendRequests(Safekeeper *sk) case NEON_WALREAD_WOULDBLOCK: return true; case NEON_WALREAD_ERROR: - walprop_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); + wp_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: @@ -1388,9 +1364,9 @@ SendAppendRequests(Safekeeper *sk) return true; case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: @@ -1429,11 +1405,11 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", - sk->appendResponse.term, - LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), - LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), - sk->host, sk->port); + wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port); if (sk->appendResponse.term > wp->propTerm) { @@ -1443,9 +1419,9 @@ RecvAppendResponses(Safekeeper *sk) * core as this is kinda expected scenario. 
*/ disable_core_dump(); - walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, wp->propTerm); + wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, wp->propTerm); } readAnything = true; @@ -1489,32 +1465,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->last_received_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { @@ -1526,8 +1502,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1541,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese * Skip unknown keys to support backward compatibile protocol * changes */ - walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1606,7 +1582,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) if (wp->n_votes < wp->quorum) { - walprop_log(WARNING, "GetDonor called before elections are won"); + wp_log(WARNING, "GetDonor called before elections are won"); return NULL; } @@ -1734,9 +1710,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; case 
PG_ASYNC_READ_FAIL: - walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1774,8 +1750,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) tag = pq_getmsgint64_le(&s); if (tag != anymsg->tag) { - walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return false; } @@ -1851,9 +1827,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes if (!wp->api.conn_blocking_write(sk, msg, msg_size)) { - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1904,9 +1880,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: @@ -1943,9 +1919,9 @@ AsyncFlush(Safekeeper *sk) /* Nothing to do; try again when the socket's ready */ return false; case -1: - walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ResetConnection(sk); return false; default: @@ -1974,11 +1950,11 @@ CompareLsn(const void *a, const void *b) * * The strings are intended to be used as a prefix to "state", e.g.: * - * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); + * wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); + * wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * FormatSafekeeperState(Safekeeper *sk) @@ -2059,8 +2035,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk) * To give a descriptive message in the case of failure, we use elog * and then an assertion that's guaranteed to fail. 
*/ - walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } @@ -2199,8 +2175,8 @@ FormatEvents(WalProposer *wp, uint32 events) if (events & (~all_flags)) { - walprop_log(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); + wp_log(WARNING, "event formatting found unexpected component %d", + events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 6d478076fe..688d8e6e52 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -707,11 +707,23 @@ extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); #define WPEVENT 1337 /* special log level for walproposer internal * events */ +#define WP_LOG_PREFIX "[WP] " + +/* + * wp_log is used in pure wp code (walproposer.c), allowing API callback to + * catch logging. + */ #ifdef WALPROPOSER_LIB extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...); -#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__) #else -#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) #endif +/* + * And wpg_log is used all other (postgres specific) walproposer code, just + * adding prefix. + */ +#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) + #endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7773aabfab..61a2a54809 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -424,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) { StartReplicationCmd cmd; - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); + wpg_log(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = wp->greetRequest.timeline; cmd.startpoint = startpos; @@ -549,7 +549,7 @@ walprop_pg_load_libpqwalreceiver(void) { load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } /* Helper function */ @@ -630,7 +630,7 @@ libpqwp_connect_start(char *conninfo) * PGconn structure" */ if (!pg_conn) - elog(FATAL, "failed to allocate new PGconn object"); + wpg_log(FATAL, "failed to allocate new PGconn object"); /* * And in theory this allocation can fail as well, but it's incredibly @@ -680,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk) * unused. We'll expect it's never returned. 
*/ case PGRES_POLLING_ACTIVE: - elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); /* * This return is never actually reached, but it's here to make @@ -745,7 +745,7 @@ libpqwp_get_query_result(WalProposerConn *conn) */ if (!result) { - elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); return WP_EXEC_UNEXPECTED_SUCCESS; } @@ -793,7 +793,7 @@ libpqwp_get_query_result(WalProposerConn *conn) } if (unexpected_success) - elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); return return_val; } @@ -872,7 +872,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status); /* * If there was actually an error, it'll be properly reported @@ -937,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQputCopyData", result); + wpg_log(FATAL, "invalid return %d from PQputCopyData", result); } /* @@ -958,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQflush", result); + wpg_log(FATAL, "invalid return %d from PQflush", result); } } @@ -1237,19 +1237,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) return true; /* recovery not needed */ endpos = wp->propEpochStartLsn; - /* - * If we need to download more than a max_slot_wal_keep_size, cap to it to - * avoid risk of exploding pg_wal. Logical replication won't work until - * recreated, but at least compute would start; this also follows - * max_slot_wal_keep_size semantics. 
- */ - download_range_mb = (endpos - startpos) / 1024 / 1024; - if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) - { - startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024; - walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb); - } timeline = wp->greetRequest.timeline; if (!neon_auth_token) @@ -1262,7 +1249,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not append password to the safekeeper connection string"); + wpg_log(FATAL, "could not append password to the safekeeper connection string"); } #if PG_MAJORVERSION_NUM < 16 @@ -1279,11 +1266,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) err))); return false; } - elog(LOG, - "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + wpg_log(LOG, + "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + sk->host, sk->port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; options.startpoint = startpos; @@ -1481,11 +1468,11 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) { char log_prefix[64]; - snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port); + snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); if (sk->xlogreader == NULL) - elog(FATAL, "Failed to allocate xlog reader"); + wpg_log(FATAL, "failed to allocate xlog reader"); } static NeonWALReadResult @@ -1549,7 +1536,7 @@ static void walprop_pg_init_event_set(WalProposer *wp) { if (waitEvents) - elog(FATAL, "double-initialization of event set"); + wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); @@ -1581,7 +1568,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events) Assert(sk->nwrEventPos == -1); sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); - elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); + wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } static void @@ -1680,8 +1667,8 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) { WalProposer *wp = to_remove->wp; - elog(DEBUG5, "sk %s:%s: removing event, is_sk %d", - to_remove->host, to_remove->port, is_sk); + wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d", + to_remove->host, to_remove->port, is_sk); /* * Shortpath for exiting if have nothing to do. 
We never call this @@ -1835,13 +1822,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); + wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); } /* @@ -1987,7 +1974,7 @@ GetLogRepRestartLSN(WalProposer *wp) { uint64 download_range_mb; - elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); /* * If we need to download more than a max_slot_wal_keep_size, @@ -1999,8 +1986,8 @@ GetLogRepRestartLSN(WalProposer *wp) download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) { - walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); + wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); return InvalidXLogRecPtr; } diff --git a/poetry.lock b/poetry.lock index 76dfd6d37d..428698cb5a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -288,70 +288,21 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] -[[package]] -name = "black" -version = "23.3.0" -description = "The uncompromising code formatter." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, - {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, - {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, - {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, - {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, - {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, - {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, - {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, - {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, - {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, - {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, - {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, - {file = "black-23.3.0-py3-none-any.whl", hash = 
"sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, - {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, -] - -[package.dependencies] -click = ">=8.0.0" -mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=0.9.0" -platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} - -[package.extras] -colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)"] -jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2)"] - [[package]] name = "boto3" -version = "1.26.16" +version = "1.34.11" description = "The AWS SDK for Python" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"}, - {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"}, + {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, + {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, ] [package.dependencies] -botocore = ">=1.29.16,<1.30.0" +botocore = ">=1.34.11,<1.35.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.6.0,<0.7.0" +s3transfer = ">=0.10.0,<0.11.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] @@ -702,22 +653,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] [[package]] name = "botocore" -version = "1.29.16" +version = "1.34.11" description = "Low-level, data-driven core of boto 3." optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"}, - {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"}, + {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, + {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, ] [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = ">=1.25.4,<1.27" +urllib3 = [ + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, + {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, +] [package.extras] -crt = ["awscrt (==0.14.0)"] +crt = ["awscrt (==0.19.19)"] [[package]] name = "botocore-stubs" @@ -1624,17 +1578,6 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] -[[package]] -name = "pathspec" -version = "0.9.0" -description = "Utility library for gitignore style pattern matching of file paths." 
-optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" -files = [ - {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, - {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, -] - [[package]] name = "pbr" version = "5.9.0" @@ -1646,21 +1589,6 @@ files = [ {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, ] -[[package]] -name = "platformdirs" -version = "2.5.2" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -optional = false -python-versions = ">=3.7" -files = [ - {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, - {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, -] - -[package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] -test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] - [[package]] name = "pluggy" version = "1.0.0" @@ -1889,13 +1817,13 @@ files = [ [[package]] name = "pytest" -version = "7.3.1" +version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] [package.dependencies] @@ -1907,7 +1835,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-asyncio" @@ -2204,46 +2132,46 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.0.269" -description = "An extremely fast Python linter, written in Rust." +version = "0.1.11" +description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"}, - {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"}, - {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"}, - {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"}, - {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"}, - {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"}, + {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"}, + {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = 
"sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"}, + {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"}, + {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"}, + {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"}, + {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"}, ] [[package]] name = "s3transfer" -version = "0.6.0" +version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"}, - {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"}, + {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, + {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, ] [package.dependencies] -botocore = ">=1.12.36,<2.0a.0" +botocore = ">=1.33.2,<2.0a.0" [package.extras] -crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] +crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] [[package]] name = "sarif-om" @@ -2493,16 +2421,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = 
"wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2740,4 +2658,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b" +content-hash = "35c237fe6a9278b2dc65b06ed96bde5afb9e393d52c01b00c59acf1df3a8d482" diff --git a/pre-commit.py b/pre-commit.py index dc0b9ed588..c5ed63ac44 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: return cmd -def black(fix_inplace: bool) -> str: - cmd = "poetry run black" - if not fix_inplace: - cmd += " --diff --check" +def ruff_check(fix_inplace: bool) -> str: + cmd = "poetry run ruff check" + if fix_inplace: + cmd += " --fix" return cmd -def ruff(fix_inplace: bool) -> str: - cmd = "poetry run ruff" - if fix_inplace: - cmd += " --fix" +def ruff_format(fix_inplace: bool) -> str: + cmd = "poetry run ruff format" + if not fix_inplace: + cmd += " --diff --check" return cmd @@ -109,16 +109,16 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="black", + name="ruff check", suffix=".py", - cmd=black(fix_inplace=args.fix_inplace), + cmd=ruff_check(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) check( - name="ruff", + name="ruff format", suffix=".py", - cmd=ruff(fix_inplace=args.fix_inplace), + cmd=ruff_format(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) diff --git a/pyproject.toml b/pyproject.toml index 401acaeba4..bb04123e05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = [] [tool.poetry.dependencies] python = "^3.9" -pytest = "^7.3.1" +pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} @@ -17,7 +17,7 @@ aiopg = "^1.4.0" Jinja2 = "^3.0.2" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" -boto3 = "^1.26.16" +boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} moto = {extras = ["server"], version = "^4.1.2"} backoff = 
"^2.2.1" @@ -40,22 +40,13 @@ pytest-split = "^0.8.1" zstandard = "^0.21.0" [tool.poetry.group.dev.dependencies] -black = "^23.3.0" mypy = "==1.3.0" -ruff = "^0.0.269" +ruff = "^0.1.11" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" -[tool.black] -line-length = 100 -extend-exclude = ''' -/( - vendor -)/ -''' - [tool.mypy] exclude = "^vendor/" check_untyped_defs = true @@ -82,7 +73,9 @@ ignore_missing_imports = true [tool.ruff] target-version = "py39" extend-exclude = ["vendor/"] -ignore = ["E501"] +ignore = [ + "E501", # Line too long, we don't want to be too strict about it +] select = [ "E", # pycodestyle "F", # Pyflakes @@ -90,3 +83,4 @@ select = [ "W", # pycodestyle "B", # bugbear ] +line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index fdae378d55..4d136472e0 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true +aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true @@ -39,3 +40,5 @@ tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" histogram = "0.7" + +futures.workspace = true diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 8fb1346c8e..d2842877d0 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -16,10 +16,12 @@ use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::retry::RetryConfig; use aws_config::sso::SsoCredentialsProvider; use aws_config::BehaviorVersion; -use aws_sdk_s3::config::Region; +use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; +use aws_smithy_async::rt::sleep::TokioSleep; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; @@ -283,9 +285,13 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ) }; + let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let mut builder = Config::builder() .behavior_version(BehaviorVersion::v2023_11_09()) .region(bucket_region) + .retry_config(RetryConfig::adaptive().with_max_attempts(3)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)) .credentials_provider(credentials_provider); if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index ef020edc2a..957213856b 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,3 +1,4 @@ +use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use s3_scrubber::scan_metadata::scan_metadata; use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; @@ -34,6 +35,8 @@ enum Command { ScanMetadata { #[arg(short, long, default_value_t = false)] json: bool, + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, }, } @@ -57,35 +60,37 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", 
summary.summary_string()); + Command::ScanMetadata { json, tenant_ids } => { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } - }, + } Command::FindGarbage { node_kind, depth, diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index bcc4d2e618..bfde8f0213 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -187,10 +187,17 @@ Timeline layer count: {6} } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. -pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result { +pub async fn scan_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, +) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; - let tenants = stream_tenants(&s3_client, &target); + let tenants = if tenant_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&s3_client, &target)) + } else { + futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 4015c27933..364cad7892 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -54,6 +54,7 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 7aadd67ac6..591bfea182 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -66,12 +66,10 @@ impl FileStorage { /// Create file storage for a new timeline, but don't persist it yet. 
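The scrubber's scan_metadata change above accepts an optional tenant list and switches between "list the whole bucket" and "iterate the given IDs" by wrapping both sources in futures::future::Either, which implements Stream whenever both arms do, so the downstream code is unchanged. A minimal, self-contained sketch of that pattern (illustrative types only, not the scrubber's real code; assumes the futures and tokio crates):

use futures::{future::Either, stream, StreamExt};

#[tokio::main]
async fn main() {
    // Stand-in for the optional --tenant-id arguments.
    let explicit_ids: Vec<u32> = vec![1, 2, 3];

    let tenants = if explicit_ids.is_empty() {
        // Hypothetical "list everything" source.
        Either::Left(stream::iter(0u32..10))
    } else {
        Either::Right(stream::iter(explicit_ids))
    };

    // The consumer sees a single Stream type regardless of which branch was taken.
    tenants
        .for_each(|id| async move { println!("scanning tenant {id}") })
        .await;
}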
pub fn create_new( - ttid: &TenantTimelineId, + timeline_dir: Utf8PathBuf, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { - let timeline_dir = conf.timeline_dir(ttid); - let store = FileStorage { timeline_dir, conf: conf.clone(), @@ -277,7 +275,8 @@ mod test { .await .expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); - let storage = FileStorage::create_new(ttid, conf, state.clone())?; + let timeline_dir = conf.timeline_dir(ttid); + let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs new file mode 100644 index 0000000000..ef88eb27e3 --- /dev/null +++ b/safekeeper/src/copy_timeline.rs @@ -0,0 +1,250 @@ +use std::sync::Arc; + +use anyhow::{bail, Result}; +use camino::Utf8PathBuf; + +use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use tokio::{ + fs::OpenOptions, + io::{AsyncSeekExt, AsyncWriteExt}, +}; +use tracing::{info, warn}; +use utils::{id::TenantTimelineId, lsn::Lsn}; + +use crate::{ + control_file::{FileStorage, Storage}, + pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, + safekeeper::SafeKeeperState, + timeline::{Timeline, TimelineError}, + wal_backup::copy_s3_segments, + wal_storage::{wal_file_paths, WalReader}, + GlobalTimelines, SafeKeeperConf, +}; + +// we don't want to have more than 10 segments on disk after copy, because they take space +const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; + +pub struct Request { + pub source: Arc, + pub until_lsn: Lsn, + pub destination_ttid: TenantTimelineId, +} + +pub async fn handle_request(request: Request) -> Result<()> { + // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :( + // if LSN will point to the middle of a WAL record, timeline will be in "broken" state + + match GlobalTimelines::get(request.destination_ttid) { + // timeline already exists. would be good to check that this timeline is the copy + // of the source timeline, but it isn't obvious how to do that + Ok(_) => return Ok(()), + // timeline not found, we are going to create it + Err(TimelineError::NotFound(_)) => {} + // error, probably timeline was deleted + res => { + res?; + } + } + + let conf = &GlobalTimelines::get_global_config(); + let ttid = request.destination_ttid; + + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; + + let (mem_state, state) = request.source.get_state().await; + let start_lsn = state.timeline_start_lsn; + if start_lsn == Lsn::INVALID { + bail!("timeline is not initialized"); + } + let backup_lsn = mem_state.backup_lsn; + + { + let commit_lsn = mem_state.commit_lsn; + let flush_lsn = request.source.get_flush_lsn().await; + + info!( + "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", + start_lsn, backup_lsn, commit_lsn, flush_lsn + ); + + assert!(backup_lsn >= start_lsn); + assert!(commit_lsn >= start_lsn); + assert!(flush_lsn >= start_lsn); + + if request.until_lsn > flush_lsn { + bail!("requested LSN is beyond the end of the timeline"); + } + if request.until_lsn < start_lsn { + bail!("requested LSN is before the start of the timeline"); + } + + if request.until_lsn > commit_lsn { + warn!("copy_timeline WAL is not fully committed"); + } + + if backup_lsn < request.until_lsn && request.until_lsn.0 - backup_lsn.0 > MAX_BACKUP_LAG { + // we have a lot of segments that are not backed up. 
we can try to wait here until + // segments will be backed up to remote storage, but it's not clear how long to wait + bail!("too many segments are not backed up"); + } + } + + let wal_seg_size = state.server.wal_seg_size as usize; + if wal_seg_size == 0 { + bail!("wal_seg_size is not set"); + } + + let first_segment = start_lsn.segment_number(wal_seg_size); + let last_segment = request.until_lsn.segment_number(wal_seg_size); + + let new_backup_lsn = { + // we can't have new backup_lsn greater than existing backup_lsn or start of the last segment + let max_backup_lsn = backup_lsn.min(Lsn(last_segment * wal_seg_size as u64)); + + if max_backup_lsn <= start_lsn { + // probably we are starting from the first segment, which was not backed up yet. + // note that start_lsn can be in the middle of the segment + start_lsn + } else { + // we have some segments backed up, so we will assume all WAL below max_backup_lsn is backed up + assert!(max_backup_lsn.segment_offset(wal_seg_size) == 0); + max_backup_lsn + } + }; + + // all previous segments will be copied inside S3 + let first_ondisk_segment = new_backup_lsn.segment_number(wal_seg_size); + assert!(first_ondisk_segment <= last_segment); + assert!(first_ondisk_segment >= first_segment); + + copy_s3_segments( + wal_seg_size, + &request.source.ttid, + &request.destination_ttid, + first_segment, + first_ondisk_segment, + ) + .await?; + + copy_disk_segments( + conf, + &state, + wal_seg_size, + &request.source.ttid, + new_backup_lsn, + request.until_lsn, + &tli_dir_path, + ) + .await?; + + let mut new_state = SafeKeeperState::new( + &request.destination_ttid, + state.server.clone(), + vec![], + request.until_lsn, + start_lsn, + ); + new_state.timeline_start_lsn = start_lsn; + new_state.peer_horizon_lsn = request.until_lsn; + new_state.backup_lsn = new_backup_lsn; + + let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?; + file_storage.persist(&new_state).await?; + + // now we have a ready timeline in a temp directory + validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + + Ok(()) +} + +async fn copy_disk_segments( + conf: &SafeKeeperConf, + persisted_state: &SafeKeeperState, + wal_seg_size: usize, + source_ttid: &TenantTimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + tli_dir_path: &Utf8PathBuf, +) -> Result<()> { + let mut wal_reader = WalReader::new( + conf.workdir.clone(), + conf.timeline_dir(source_ttid), + persisted_state, + start_lsn, + true, + )?; + + let mut buf = [0u8; MAX_SEND_SIZE]; + + let first_segment = start_lsn.segment_number(wal_seg_size); + let last_segment = end_lsn.segment_number(wal_seg_size); + + for segment in first_segment..=last_segment { + let segment_start = segment * wal_seg_size as u64; + let segment_end = segment_start + wal_seg_size as u64; + + let copy_start = segment_start.max(start_lsn.0); + let copy_end = segment_end.min(end_lsn.0); + + let copy_start = copy_start - segment_start; + let copy_end = copy_end - segment_start; + + let wal_file_path = { + let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?; + + if segment == last_segment { + partial + } else { + normal + } + }; + + write_segment( + &mut buf, + &wal_file_path, + wal_seg_size as u64, + copy_start, + copy_end, + &mut wal_reader, + ) + .await?; + } + + Ok(()) +} + +async fn write_segment( + buf: &mut [u8], + file_path: &Utf8PathBuf, + wal_seg_size: u64, + from: u64, + to: u64, + reader: &mut 
WalReader, +) -> Result<()> { + assert!(from <= to); + assert!(to <= wal_seg_size); + + let mut file = OpenOptions::new() + .create(true) + .write(true) + .open(&file_path) + .await?; + + // maybe fill with zeros, as in wal_storage.rs? + file.set_len(wal_seg_size).await?; + file.seek(std::io::SeekFrom::Start(from)).await?; + + let mut bytes_left = to - from; + while bytes_left > 0 { + let len = bytes_left as usize; + let len = len.min(buf.len()); + let len = reader.read(&mut buf[..len]).await?; + file.write_all(&buf[..len]).await?; + bytes_left -= len as u64; + } + + file.flush().await?; + file.sync_all().await?; + Ok(()) +} diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index daf9255ecb..c9ff1afdea 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -7,13 +7,16 @@ use std::io::Read; use std::path::PathBuf; use std::sync::Arc; +use anyhow::bail; use anyhow::Result; use camino::Utf8Path; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; +use postgres_ffi::MAX_SEND_SIZE; use serde::Deserialize; use serde::Serialize; +use sha2::{Digest, Sha256}; use utils::id::NodeId; use utils::id::TenantTimelineId; use utils::id::{TenantId, TimelineId}; @@ -25,6 +28,7 @@ use crate::safekeeper::TermHistory; use crate::SafeKeeperConf; use crate::send_wal::WalSenderState; +use crate::wal_storage::WalReader; use crate::GlobalTimelines; /// Various filters that influence the resulting JSON output. @@ -300,3 +304,56 @@ fn build_config(config: SafeKeeperConf) -> Config { wal_backup_enabled: config.wal_backup_enabled, } } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineDigestRequest { + pub from_lsn: Lsn, + pub until_lsn: Lsn, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineDigest { + pub sha256: String, +} + +pub async fn calculate_digest( + tli: &Arc, + request: TimelineDigestRequest, +) -> Result { + if request.from_lsn > request.until_lsn { + bail!("from_lsn is greater than until_lsn"); + } + + let conf = GlobalTimelines::get_global_config(); + let (_, persisted_state) = tli.get_state().await; + + if persisted_state.timeline_start_lsn > request.from_lsn { + bail!("requested LSN is before the start of the timeline"); + } + + let mut wal_reader = WalReader::new( + conf.workdir.clone(), + tli.timeline_dir.clone(), + &persisted_state, + request.from_lsn, + true, + )?; + + let mut hasher = Sha256::new(); + let mut buf = [0u8; MAX_SEND_SIZE]; + + let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize; + while bytes_left > 0 { + let bytes_to_read = std::cmp::min(buf.len(), bytes_left); + let bytes_read = wal_reader.read(&mut buf[..bytes_to_read]).await?; + if bytes_read == 0 { + bail!("wal_reader.read returned 0 bytes"); + } + hasher.update(&buf[..bytes_read]); + bytes_left -= bytes_read; + } + + let digest = hasher.finalize(); + let digest = hex::encode(digest); + Ok(TimelineDigest { sha256: digest }) +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 25a3334e63..5283ea19c1 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -2,7 +2,7 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::SkTimelineInfo; +use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt; @@ -14,19 +14,21 @@ use tokio::fs::File; use tokio::io::AsyncReadExt; use 
tokio_util::sync::CancellationToken; use utils::failpoint_support::failpoints_handler; +use utils::http::request::parse_query_param; use std::io::Write as _; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; -use tracing::info_span; +use tracing::{info_span, Instrument}; use utils::http::endpoint::{request_span, ChannelWriter}; +use crate::debug_dump::TimelineDigestRequest; use crate::receive_wal::WalReceiverState; use crate::safekeeper::Term; use crate::safekeeper::{ServerInfo, TermLsn}; use crate::send_wal::WalSenderState; use crate::timeline::PeerInfo; -use crate::{debug_dump, pull_timeline}; +use crate::{copy_timeline, debug_dump, pull_timeline}; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; @@ -204,6 +206,56 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, ApiError> { + check_permission(&request, None)?; + + let request_data: TimelineCopyRequest = json_request(&mut request).await?; + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "source_timeline_id")?, + ); + + let source = GlobalTimelines::get(ttid)?; + + copy_timeline::handle_request(copy_timeline::Request{ + source, + until_lsn: request_data.until_lsn, + destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id), + }) + .instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn)) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +async fn timeline_digest_handler(request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let from_lsn: Option = parse_query_param(&request, "from_lsn")?; + let until_lsn: Option = parse_query_param(&request, "until_lsn")?; + + let request = TimelineDigestRequest { + from_lsn: from_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!( + "from_lsn is required" + )))?, + until_lsn: until_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!( + "until_lsn is required" + )))?, + }; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let response = debug_dump::calculate_digest(&tli, request) + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, response) +} + /// Download a file from the timeline directory. 
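The new digest endpoint above hashes an LSN range without buffering it: calculate_digest reads fixed-size chunks from the WAL reader and feeds each chunk to Sha256 until the requested range is exhausted. A standalone sketch of that incremental-hashing loop, with a synchronous std::io::Read standing in for the WAL reader (assumes the sha2 and hex crates; sizes are illustrative):

use sha2::{Digest, Sha256};
use std::io::{self, Read};

fn digest_range<R: Read>(mut reader: R, mut bytes_left: usize) -> io::Result<String> {
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 8192];
    while bytes_left > 0 {
        let want = buf.len().min(bytes_left);
        let got = reader.read(&mut buf[..want])?;
        if got == 0 {
            // Mirrors the "read returned 0 bytes" bail in calculate_digest above.
            return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "source ended early"));
        }
        hasher.update(&buf[..got]);
        bytes_left -= got;
    }
    Ok(hex::encode(hasher.finalize()))
}

fn main() -> io::Result<()> {
    let data = vec![0xABu8; 100_000];
    println!("sha256 = {}", digest_range(&data[..], data.len())?);
    Ok(())
}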
// TODO: figure out a better way to copy files between safekeepers async fn timeline_files_handler(request: Request) -> Result, ApiError> { @@ -472,11 +524,18 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", |r| request_span(r, timeline_files_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", + |r| request_span(r, timeline_copy_handler), + ) // for tests .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { + request_span(r, timeline_digest_handler) + }) } #[cfg(test)] diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 3a086f1f54..fc5f99eb00 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -16,6 +16,7 @@ mod auth; pub mod broker; pub mod control_file; pub mod control_file_upgrade; +pub mod copy_timeline; pub mod debug_dump; pub mod handler; pub mod http; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index ad3a18a536..93b51f32c0 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,16 +1,24 @@ +use std::sync::Arc; + +use camino::Utf8PathBuf; +use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use anyhow::{bail, Context, Result}; use tokio::io::AsyncWriteExt; use tracing::info; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; use crate::{ control_file, debug_dump, http::routes::TimelineStatus, + timeline::{Timeline, TimelineError}, wal_storage::{self, Storage}, - GlobalTimelines, + GlobalTimelines, SafeKeeperConf, }; /// Info about timeline on safekeeper ready for reporting. @@ -91,7 +99,7 @@ pub async fn handle_request(request: Request) -> Result { async fn pull_timeline(status: TimelineStatus, host: String) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( - "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", + "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", ttid, host, status.commit_lsn, @@ -121,14 +129,14 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result if dump.timelines.len() != 1 { bail!( - "Expected to fetch single timeline, got {} timelines", + "expected to fetch single timeline, got {} timelines", dump.timelines.len() ); } let timeline = dump.timelines.into_iter().next().unwrap(); let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( - "Timeline {} doesn't have disk content", + "timeline {} doesn't have disk content", ttid ))?; @@ -155,29 +163,12 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result filenames.insert(0, "safekeeper.control".to_string()); info!( - "Downloading {} files from safekeeper {}", + "downloading {} files from safekeeper {}", filenames.len(), host ); - // Creating temp directory for a new timeline. It needs to be - // located on the same filesystem as the rest of the timelines. - - // conf.workdir is usually /storage/safekeeper/data - // will try to transform it into /storage/safekeeper/tmp - let temp_base = conf - .workdir - .parent() - .ok_or(anyhow::anyhow!("workdir has no parent"))? 
- .join("tmp"); - - tokio::fs::create_dir_all(&temp_base).await?; - - let tli_dir = camino_tempfile::Builder::new() - .suffix("_temptli") - .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) - .tempdir_in(temp_base)?; - let tli_dir_path = tli_dir.path().to_path_buf(); + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; // Note: some time happens between fetching list of files and fetching files themselves. // It's possible that some files will be removed from safekeeper and we will fail to fetch them. @@ -201,47 +192,105 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result // TODO: fsync? // Let's create timeline from temp directory and verify that it's correct + let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?; + info!( + "finished downloading timeline {}, commit_lsn={}, flush_lsn={}", + ttid, commit_lsn, flush_lsn + ); + assert!(status.commit_lsn <= status.flush_lsn); - let control_path = tli_dir_path.join("safekeeper.control"); + // Finally, load the timeline. + let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?; + + Ok(Response { + safekeeper_host: host, + }) +} + +/// Create temp directory for a new timeline. It needs to be located on the same +/// filesystem as the rest of the timelines. It will be automatically deleted when +/// Utf8TempDir goes out of scope. +pub async fn create_temp_timeline_dir( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, +) -> Result<(Utf8TempDir, Utf8PathBuf)> { + // conf.workdir is usually /storage/safekeeper/data + // will try to transform it into /storage/safekeeper/tmp + let temp_base = conf + .workdir + .parent() + .ok_or(anyhow::anyhow!("workdir has no parent"))? + .join("tmp"); + + tokio::fs::create_dir_all(&temp_base).await?; + + let tli_dir = camino_tempfile::Builder::new() + .suffix("_temptli") + .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) + .tempdir_in(temp_base)?; + + let tli_dir_path = tli_dir.path().to_path_buf(); + + Ok((tli_dir, tli_dir_path)) +} + +/// Do basic validation of a temp timeline, before moving it to the global map. +pub async fn validate_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + path: &Utf8PathBuf, +) -> Result<(Lsn, Lsn)> { + let control_path = path.join("safekeeper.control"); let control_store = control_file::FileStorage::load_control_file(control_path)?; if control_store.server.wal_seg_size == 0 { bail!("wal_seg_size is not set"); } - let wal_store = - wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?; + let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; - let commit_lsn = status.commit_lsn; + let commit_lsn = control_store.commit_lsn; let flush_lsn = wal_store.flush_lsn(); - info!( - "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}", - ttid, commit_lsn, flush_lsn - ); - assert!(status.commit_lsn <= status.flush_lsn); + Ok((commit_lsn, flush_lsn)) +} + +/// Move timeline from a temp directory to the main storage, and load it to the global map. +/// This operation is done under a lock to prevent bugs if several concurrent requests are +/// trying to load the same timeline. Note that it doesn't guard against creating the +/// timeline with the same ttid, but no one should be doing this anyway. 
+pub async fn load_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + tmp_path: &Utf8PathBuf, +) -> Result> { + // Take a lock to prevent concurrent loadings + let load_lock = GlobalTimelines::loading_lock().await; + let guard = load_lock.lock().await; + + if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) { + bail!("timeline already exists, cannot overwrite it") + } // Move timeline dir to the correct location let timeline_path = conf.timeline_dir(&ttid); info!( - "Moving timeline {} from {} to {}", - ttid, tli_dir_path, timeline_path + "moving timeline {} from {} to {}", + ttid, tmp_path, timeline_path ); tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; - tokio::fs::rename(tli_dir_path, &timeline_path).await?; + tokio::fs::rename(tmp_path, &timeline_path).await?; - let tli = GlobalTimelines::load_timeline(ttid) + let tli = GlobalTimelines::load_timeline(&guard, ttid) .await .context("Failed to load timeline after copy")?; info!( - "Loaded timeline {}, flush_lsn={}", + "loaded timeline {}, flush_lsn={}", ttid, tli.get_flush_lsn().await ); - Ok(Response { - safekeeper_host: host, - }) + Ok(tli) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index bdc9088138..2f284abe8c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -141,7 +141,8 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let timeline_dir = conf.timeline_dir(ttid); + let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index cbb3342e40..92ac5ba66d 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -21,8 +21,12 @@ struct GlobalTimelinesState { timelines: HashMap>, wal_backup_launcher_tx: Option>, conf: Option, + load_lock: Arc>, } +// Used to prevent concurrent timeline loading. +pub struct TimelineLoadLock; + impl GlobalTimelinesState { /// Get configuration, which must be set once during init. fn get_conf(&self) -> &SafeKeeperConf { @@ -63,6 +67,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { timelines: HashMap::new(), wal_backup_launcher_tx: None, conf: None, + load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), }) }); @@ -174,8 +179,16 @@ impl GlobalTimelines { Ok(()) } + /// Take a lock for timeline loading. + pub async fn loading_lock() -> Arc> { + TIMELINES_STATE.lock().unwrap().load_lock.clone() + } + /// Load timeline from disk to the memory. 
- pub async fn load_timeline(ttid: TenantTimelineId) -> Result> { + pub async fn load_timeline<'a>( + _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, + ttid: TenantTimelineId, + ) -> Result> { let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies(); match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index c99bbc7d61..e4499eaf50 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -7,7 +7,7 @@ use tokio::task::JoinHandle; use utils::id::NodeId; use std::cmp::min; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -531,3 +531,62 @@ pub async fn read_object( Ok(Box::pin(reader)) } + +/// Copy segments from one timeline to another. Used in copy_timeline. +pub async fn copy_s3_segments( + wal_seg_size: usize, + src_ttid: &TenantTimelineId, + dst_ttid: &TenantTimelineId, + from_segment: XLogSegNo, + to_segment: XLogSegNo, +) -> Result<()> { + const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; + + let storage = REMOTE_STORAGE + .get() + .expect("failed to get remote storage") + .as_ref() + .unwrap(); + + let relative_dst_path = + Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); + + let remote_path = RemotePath::new(&relative_dst_path)?; + + let files = storage.list_files(Some(&remote_path)).await?; + let uploaded_segments = &files + .iter() + .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .collect::>(); + + debug!( + "these segments have already been uploaded: {:?}", + uploaded_segments + ); + + let relative_src_path = + Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); + + for segno in from_segment..to_segment { + if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { + info!("copied all segments from {} until {}", from_segment, segno); + } + + let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size); + if uploaded_segments.contains(&segment_name) { + continue; + } + debug!("copying segment {}", segment_name); + + let from = RemotePath::new(&relative_src_path.join(&segment_name))?; + let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + + storage.copy_object(&from, &to).await?; + } + + info!( + "finished copying segments from {} until {}", + from_segment, to_segment + ); + Ok(()) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e7538f805c..8d138c701f 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -728,7 +728,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { } /// Helper returning full path to WAL segment file and its .partial brother. -fn wal_file_paths( +pub fn wal_file_paths( timeline_dir: &Utf8Path, segno: XLogSegNo, wal_seg_size: usize, diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index ff584bd4b0..980f343047 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -63,7 +63,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: If those files already exist, we will overwrite them. Returns basepath for files with captured output. 
""" - assert type(cmd) is list + assert isinstance(cmd, list) base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) basepath = os.path.join(capture_dir, base) stdout_filename = basepath + ".stdout" diff --git a/scripts/reformat b/scripts/reformat index 8688044f66..3533c4dcb8 100755 --- a/scripts/reformat +++ b/scripts/reformat @@ -6,5 +6,5 @@ set -euox pipefail echo 'Reformatting Rust code' cargo fmt echo 'Reformatting Python code' -poetry run ruff --fix test_runner scripts -poetry run black test_runner scripts +poetry run ruff check --fix test_runner scripts +poetry run ruff format test_runner scripts diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b1a8ba27d..001d4e23a9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1101,8 +1101,8 @@ class AbstractNeonCli(abc.ABC): If `local_binpath` is true, then we are invoking a test utility """ - assert type(arguments) == list - assert type(self.COMMAND) == str + assert isinstance(arguments, list) + assert isinstance(self.COMMAND, str) if local_binpath: # Test utility @@ -3032,6 +3032,28 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def timeline_create( self, tenant_id: TenantId, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index add6c4288a..a779dcc436 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) + def tenant_secondary_download(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + self.verbose_error(res) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( @@ -361,9 +365,9 @@ class PageserverHttpClient(requests.Session): assert isinstance(res, dict) assert TenantId(res["id"]) == tenant_id size = res["size"] - assert type(size) == int + assert isinstance(size, int) inputs = res["inputs"] - assert type(inputs) is dict + assert isinstance(inputs, dict) return (size, inputs) def tenant_size_debug(self, tenant_id: TenantId) -> str: @@ -714,7 +718,7 @@ class PageserverHttpClient(requests.Session): ) self.verbose_error(res) - assert res.status_code == 200 + assert res.status_code in (200, 304) def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 1e6e9a0174..8a9509ea44 100644 --- a/test_runner/performance/test_perf_olap.py +++ 
b/test_runner/performance/test_perf_olap.py @@ -42,9 +42,10 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): # Please do not alter the label for the query, as it is used to identify it. # Labels for ClickBench queries match the labels in ClickBench reports # on https://benchmark.clickhouse.com/ (the DB size may differ). +# +# Disable auto formatting for the list of queries so that it's easier to read +# fmt: off QUERIES: Tuple[LabelledQuery, ...] = ( - # Disable `black` formatting for the list of queries so that it's easier to read - # fmt: off ### ClickBench queries: LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"), LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"), @@ -96,8 +97,8 @@ QUERIES: Tuple[LabelledQuery, ...] = ( # LabelledQuery("NQ0", r"..."), # LabelledQuery("NQ1", r"..."), # ... - # fmt: on ) +# fmt: on EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 3cb4b667ff..7eb244d378 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -32,8 +32,7 @@ def pg_compare(request) -> PgCompare: else: assert ( len(x) == 2 - ), f"request param ({request.param}) should have a format of \ - `neon_{{safekeepers_enable_fsync}}`" + ), f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`" # `NeonCompare` interface neon_env_builder = request.getfixturevalue("neon_env_builder") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 32397bbcc1..ed389b1aa2 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -194,12 +194,13 @@ def test_fully_custom_config(positive_env: NeonEnv): assert set(our_tenant_config.effective_config.keys()) == set( fully_custom_config.keys() ), "ensure we cover all config options" - assert { - k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] - for k in fully_custom_config.keys() - } == { - k: True for k in fully_custom_config.keys() - }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" + assert ( + { + k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] + for k in fully_custom_config.keys() + } + == {k: True for k in fully_custom_config.keys()} + ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" ps_http.tenant_detach(tenant_id) env.pageserver.tenant_attach(tenant_id, config=fully_custom_config) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5a9c2782e6..f9d6d0a934 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -186,9 +186,7 @@ def test_backward_compatibility( else: raise - assert ( - not breaking_changes_allowed - ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" @check_ondisk_data_compatibility_if_enabled @@ -247,9 +245,7 @@ def test_forward_compatibility( else: 
raise - assert ( - not breaking_changes_allowed - ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 7ec901af34..01ecc2b95f 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -2,7 +2,6 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft - # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 2cd2406065..efba2033fb 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -102,9 +102,7 @@ def test_basic_eviction( ), f"Did not expect to find {local_layer} layer after evicting" empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) - assert ( - not empty_layers - ), f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" + assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) assert ( diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 340188c1ae..999e077e45 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 573d2139ce..e29db1e252 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -145,8 +145,7 @@ def expect_updated_msg_lsn( last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"]) assert ( prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn - ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ - compared to the previous message's LSN {prev_msg_lsn}" + ), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}" return last_msg_lsn diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8ae4297983..a9eff99a0c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,9 +1,11 @@ import random +from pathlib import Path from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.neon_fixtures 
import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until @@ -251,6 +253,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): flush_ms=5000, ) + # Encourage the new location to download while still in secondary mode + pageserver_b.http_client().tenant_secondary_download(tenant_id) + migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) log.info(f"Acquired generation {migrated_generation} for destination pageserver") assert migrated_generation == initial_generation + 1 @@ -258,8 +263,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Writes and reads still work in AttachedStale. workload.validate(pageserver_a.id) - # TODO: call into secondary mode API hooks to do an upload/download sync - # Generate some more dirty writes: we expect the origin to ingest WAL in # in AttachedStale workload.churn_rows(64, pageserver_a.id, upload=False) @@ -369,3 +372,143 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first validate_heatmap(heatmap_second) + + +def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. + + :return: list of relative paths to layers, from the timeline root. + """ + timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) + + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) + ) + + +def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): + """ + Test the overall data flow in secondary mode: + - Heatmap uploads from the attached location + - Heatmap & layer downloads from the secondary location + - Eviction of layers on the attached location results in deletion + on the secondary location as well. 
+ """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_attached = env.pageservers[0] + ps_secondary = env.pageservers[1] + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, ps_attached.id) + + # Configure a secondary location + log.info("Setting up secondary location...") + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + readback_conf = ps_secondary.read_tenant_location_conf(tenant_id) + log.info(f"Read back conf: {readback_conf}") + + # Explicit upload/download cycle + # ============================== + log.info("Synchronizing after initial write...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # Make changes on attached pageserver, check secondary downloads them + # =================================================================== + log.info("Synchronizing after subsequent write...") + workload.churn_rows(128, ps_attached.id) + + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while + # walreceiver is still doing something. + import time + + time.sleep(5) + + # Do evictions on attached pageserver, check secondary follows along + # ================================================================== + log.info("Evicting a layer...") + layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] + ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # Scrub the remote storage + # ======================== + # This confirms that the scrubber isn't upset by the presence of the heatmap + S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata() + + # Detach secondary and delete tenant + # =================================== + # This confirms that the heatmap gets cleaned up as well as other normal content. 
+ log.info("Detaching secondary location...") + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + }, + ) + + log.info("Deleting tenant...") + tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10) + + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0dcbb23ad4..c6dbc77885 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -391,8 +391,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] assert ( tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} \ - should not be present in pageserver's memory" + ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. @@ -430,8 +429,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] assert ( tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} \ - should not be present in pageserver's memory" + ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" def test_detach_while_attaching( @@ -817,9 +815,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( if found_broken: break time.sleep(0.5) - assert ( - found_broken - ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" env.pageserver.tenant_load(env.initial_tenant) @@ -837,6 +833,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( break time.sleep(0.5) - assert ( - found_active - ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index dcd7232b1b..1887bca23b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -161,12 +161,10 @@ def switch_pg_to_new_pageserver( files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( "metadata" in files_before_detach - ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ - but got: {files_before_detach}" + ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}" assert ( len(files_before_detach) >= 2 - ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ - but got {files_before_detach}" + ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" return timeline_to_detach_local_path diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 07fb6dc5ca..6f05d7f7cb 100644 --- 
a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -201,8 +201,8 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): len(restored_timelines) == 1 ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" restored_timeline = restored_timelines[0] - assert restored_timeline["timeline_id"] == str( - timeline_id + assert ( + restored_timeline["timeline_id"] == str(timeline_id) ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" # Check that we had to retry the downloads @@ -280,8 +280,8 @@ def test_tenant_redownloads_truncated_file_on_startup( len(restored_timelines) == 1 ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" retored_timeline = restored_timelines[0] - assert retored_timeline["timeline_id"] == str( - timeline_id + assert ( + retored_timeline["timeline_id"] == str(timeline_id) ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" # Request non-incremental logical size. Calculating it needs the layer file that diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cf8df389c8..b4ce633531 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -566,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) - endpoint.stop_and_destroy() + endpoint.stop() timeline_delete_wait_completed(ps_http, tenant_id, timeline_id) # Also delete and manually create timeline on safekeepers -- this tests @@ -1838,3 +1838,83 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): assert final_stats.get("START_REPLICATION", 0) >= 1 # walproposer should connect to each safekeeper at least once assert final_stats.get("START_WAL_PUSH", 0) >= 3 + + +@pytest.mark.parametrize("insert_rows", [0, 100, 100000, 500000]) +def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): + target_percents = [10, 50, 90, 100] + + neon_env_builder.num_safekeepers = 3 + # we need remote storage that supports copy_object S3 API + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + + lsns = [] + + def remember_lsn(): + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + lsns.append(lsn) + return lsn + + # remember LSN right after timeline creation + lsn = remember_lsn() + log.info(f"LSN after timeline creation: {lsn}") + + endpoint.safe_psql("create table t(key int, value text)") + + timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id) + timeline_start_lsn = timeline_status.timeline_start_lsn + log.info(f"Timeline start LSN: {timeline_start_lsn}") + + current_percent = 0.0 + for new_percent in target_percents: + new_rows = insert_rows * (new_percent - current_percent) / 100 + current_percent = new_percent + + if new_rows == 0: + continue + + endpoint.safe_psql( + f"insert into t select generate_series(1, {new_rows}), repeat('payload!', 10)" + ) + + # remember LSN right after reaching new_percent + lsn = remember_lsn() + log.info(f"LSN 
after inserting {new_rows} rows: {lsn}") + + # TODO: would be also good to test cases where not all segments are uploaded to S3 + + for lsn in lsns: + new_timeline_id = TimelineId.generate() + log.info(f"Copying branch for LSN {lsn}, to timeline {new_timeline_id}") + + orig_digest = ( + env.safekeepers[0] + .http_client() + .timeline_digest(tenant_id, timeline_id, timeline_start_lsn, lsn) + ) + log.info(f"Original digest: {orig_digest}") + + for sk in env.safekeepers: + sk.http_client().copy_timeline( + tenant_id, + timeline_id, + { + "target_timeline_id": str(new_timeline_id), + "until_lsn": str(lsn), + }, + ) + + new_digest = sk.http_client().timeline_digest( + tenant_id, new_timeline_id, timeline_start_lsn, lsn + ) + log.info(f"Digest after timeline copy on safekeeper {sk.id}: {new_digest}") + + assert orig_digest == new_digest + + # TODO: test timelines can start after copy diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 68be0b3617..704e3721d6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -167,22 +167,21 @@ build: | && apt-get update \ && apt-get install -y \ build-essential \ - curl \ + git \ libevent-dev \ - libssl-dev \ - patchutils \ + libtool \ pkg-config - ENV PGBOUNCER_VERSION 1.21.0 - ENV PGBOUNCER_GITPATH 1_21_0 + # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. + # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) + ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 RUN set -e \ - && curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && cd pgbouncer-${PGBOUNCER_VERSION} \ - && curl https://github.com/pgbouncer/pgbouncer/commit/a7b3c0a5f4caa9dbe92743d04cf1e28c4c05806c.patch | filterdiff --include a/src/server.c | patch -p1 \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && cd pgbouncer \ + && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ - && make -j $(nproc) \ - && make install + && make -j $(nproc) dist_man_MANS= \ + && make install dist_man_MANS= merge: | # tweak nofile limits RUN set -e \
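The secondary-location flow exercised by test_secondary_downloads can also be driven directly against the pageserver HTTP API. This is a hedged sketch rather than fixture code: the pageserver addresses and tenant ID are placeholders, it assumes the tenant is attached on one pageserver and configured as a warm Secondary on another, and it uses only the heatmap_upload and secondary/download endpoints referenced in this patch.

import requests

# Placeholders -- substitute real pageserver HTTP addresses and a real tenant ID.
ATTACHED_PS = "http://localhost:9898"
SECONDARY_PS = "http://localhost:19898"
TENANT_ID = "9e4b3cfdbd465785b9d665e8b1bbda25"

# 1. The attached location uploads its heatmap (the list of hot layers) to remote storage.
requests.post(f"{ATTACHED_PS}/v1/tenant/{TENANT_ID}/heatmap_upload").raise_for_status()

# 2. The secondary location fetches that heatmap and downloads the layers it names,
#    so its local layer set converges with the attached location's.
requests.post(f"{SECONDARY_PS}/v1/tenant/{TENANT_ID}/secondary/download").raise_for_status()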