diff --git a/Cargo.lock b/Cargo.lock index 1364c9d84f..1b6b423444 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -184,7 +184,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" dependencies = [ "concurrent-queue", - "event-listener", + "event-listener 2.5.3", "futures-core", ] @@ -205,11 +205,13 @@ dependencies = [ [[package]] name = "async-lock" -version = "2.8.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c" dependencies = [ - "event-listener", + "event-listener 4.0.0", + "event-listener-strategy", + "pin-project-lite", ] [[package]] @@ -692,9 +694,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.16.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467" +checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" dependencies = [ "async-trait", "base64 0.21.1", @@ -702,8 +704,10 @@ dependencies = [ "dyn-clone", "futures", "getrandom 0.2.11", + "hmac", "http-types", "log", + "once_cell", "paste", "pin-project", "quick-xml", @@ -712,6 +716,7 @@ dependencies = [ "rustc_version", "serde", "serde_json", + "sha2", "time", "url", "uuid", @@ -719,9 +724,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.16.2" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9" +checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" dependencies = [ "async-lock", "async-trait", @@ -731,7 +736,6 @@ dependencies = [ "oauth2", "pin-project", "serde", - "serde_json", "time", "tz-rs", "url", @@ -740,21 +744,18 
@@ dependencies = [ [[package]] name = "azure_storage" -version = "0.16.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97" +checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" dependencies = [ "RustyXML", + "async-lock", "async-trait", "azure_core", "bytes", - "futures", - "hmac", "log", "serde", "serde_derive", - "serde_json", - "sha2", "time", "url", "uuid", @@ -762,13 +763,14 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.16.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4" +checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" dependencies = [ "RustyXML", "azure_core", "azure_storage", + "azure_svc_blobstorage", "bytes", "futures", "log", @@ -780,6 +782,22 @@ dependencies = [ "uuid", ] +[[package]] +name = "azure_svc_blobstorage" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backtrace" version = "0.3.67" @@ -1686,6 +1704,27 @@ version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" +[[package]] +name = "event-listener" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3" +dependencies = [ + "event-listener 4.0.0", + "pin-project-lite", +] + [[package]] name = "fail" version = "0.5.1" @@ -3678,9 +3717,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" dependencies = [ "memchr", "serde", diff --git a/Cargo.toml b/Cargo.toml index 33f56e084f..496a9d7839 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,10 +38,10 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.16" -azure_identity = "0.16" -azure_storage = "0.16" -azure_storage_blobs = "0.16" +azure_core = "0.18" +azure_identity = "0.18" +azure_storage = "0.18" +azure_storage_blobs = "0.18" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 8d53a6a658..6f0b929ac6 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -168,7 +168,7 @@ fn print_timelines_tree( info: t.clone(), children: BTreeSet::new(), name: timeline_name_mappings - .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)), + .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)), }, ) }) diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index c0c44e279f..fbb0358158 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -165,7 +165,7 @@ pub fn migrate_tenant( let found = other_ps_tenants .into_iter() .map(|t| t.id) - .any(|i| i == tenant_id); + .any(|i| i.tenant_id == tenant_id); if !found { continue; } 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 495a58e865..a3029e67a5 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -357,7 +357,7 @@ pub enum TenantAttachmentStatus { #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { - pub id: TenantId, + pub id: TenantShardId, // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's pub state: TenantState, /// Sum of the size of all layer files. @@ -369,7 +369,7 @@ pub struct TenantInfo { /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { - pub tenant_id: TenantId, + pub tenant_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, @@ -385,6 +385,9 @@ pub struct TimelineInfo { /// The LSN that we are advertizing to safekeepers pub remote_consistent_lsn_visible: Lsn, + /// The LSN from the start of the root timeline (never changes) + pub initdb_lsn: Lsn, + pub current_logical_size: u64, pub current_logical_size_is_accurate: bool, @@ -823,7 +826,7 @@ mod tests { fn test_tenantinfo_serde() { // Test serialization/deserialization of TenantInfo let original_active = TenantInfo { - id: TenantId::generate(), + id: TenantShardId::unsharded(TenantId::generate()), state: TenantState::Active, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, @@ -840,7 +843,7 @@ mod tests { }); let original_broken = TenantInfo { - id: TenantId::generate(), + id: TenantShardId::unsharded(TenantId::generate()), state: TenantState::Broken { reason: "reason".into(), backtrace: "backtrace info".into(), diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 9e83e0eee2..052fbd1402 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -76,6 +76,11 @@ impl TenantShardId { pub fn 
shard_slug(&self) -> impl std::fmt::Display + '_ { ShardSlug(self) } + + /// Convenience for code that has special behavior on the 0th shard. + pub fn is_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } } /// Formatting helper diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index e559d00ded..548bde02f6 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -271,17 +271,12 @@ impl RemoteStorage for AzureBlobStorage { let mut builder = blob_client.get(); - if let Some(end_exclusive) = end_exclusive { - builder = builder.range(Range::new(start_inclusive, end_exclusive)); + let range: Range = if let Some(end_exclusive) = end_exclusive { + (start_inclusive..end_exclusive).into() } else { - // Open ranges are not supported by the SDK so we work around - // by setting the upper limit extremely high (but high enough - // to still be representable by signed 64 bit integers). - // TODO remove workaround once the SDK adds open range support - // https://github.com/Azure/azure-sdk-for-rust/issues/1438 - let end_exclusive = u64::MAX / 4; - builder = builder.range(Range::new(start_inclusive, end_exclusive)); - } + (start_inclusive..).into() + }; + builder = builder.range(range); self.download_for_builder(builder).await } diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 9aad0af22d..31c76d2f74 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -30,18 +30,32 @@ async fn warn_if_stuck( let mut fut = std::pin::pin!(fut); - loop { + let mut warned = false; + let ret = loop { match tokio::time::timeout(warn_period, &mut fut).await { - Ok(ret) => return ret, + Ok(ret) => break ret, Err(_) => { tracing::warn!( gate = name, elapsed_ms = started.elapsed().as_millis(), "still waiting, taking longer than expected..." 
); + warned = true; } } + }; + + // If we emitted a warning for slowness, also emit a message when we complete, so that + // someone debugging a shutdown can know for sure whether we have moved past this operation. + if warned { + tracing::info!( + gate = name, + elapsed_ms = started.elapsed().as_millis(), + "completed, after taking longer than expected" + ) } + + ret } #[derive(Debug)] diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index bfd7897b49..8f2b88d191 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -269,12 +269,18 @@ async fn calculate_synthetic_size_worker( } }; - for (tenant_id, tenant_state) in tenants { + for (tenant_shard_id, tenant_state) in tenants { if tenant_state != TenantState::Active { continue; } - if let Ok(tenant) = mgr::get_tenant(tenant_id, true) { + if !tenant_shard_id.is_zero() { + // We only send consumption metrics from shard 0, so don't waste time calculating + // synthetic size on other shards. + continue; + } + + if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) { // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? // We can put in some prioritization for consumption metrics. // Same for the loop that fetches computed metrics. 
@@ -291,7 +297,9 @@ async fn calculate_synthetic_size_worker( ); if !is_cancelled { - error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); + error!( + "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}" + ); } } } diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 918e45ea9e..0b827816bc 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -2,7 +2,6 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; -use pageserver_api::shard::ShardNumber; use std::{sync::Arc, time::SystemTime}; use utils::{ id::{TenantId, TimelineId}, @@ -198,12 +197,12 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move { - if state != TenantState::Active { + if state != TenantState::Active || !id.is_zero() { None } else { crate::tenant::mgr::get_tenant(id, true) .ok() - .map(|tenant| (id, tenant)) + .map(|tenant| (id.tenant_id, tenant)) } }); @@ -229,11 +228,6 @@ where while let Some((tenant_id, tenant)) = tenants.next().await { let mut tenant_resident_size = 0; - // Sharded tenants report all consumption metrics from shard zero - if tenant.tenant_shard_id().shard_number != ShardNumber(0) { - continue; - } - for timeline in tenant.list_timelines() { let timeline_id = timeline.timeline_id; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 237109abfe..9422ccb2fd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -84,7 +84,6 @@ paths: required: true schema: type: string - format: hex get: description: Get tenant status responses: @@ -181,7 +180,6 @@ paths: required: true schema: type: string - format: hex get: description: Get timelines for 
tenant responses: @@ -232,7 +230,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -338,7 +335,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -401,7 +397,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -469,7 +464,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -523,7 +517,6 @@ paths: required: true schema: type: string - format: hex post: description: | Schedules attach operation to happen in the background for the given tenant. @@ -631,7 +624,6 @@ paths: required: true schema: type: string - format: hex - name: flush_ms in: query required: false @@ -724,7 +716,6 @@ paths: required: true schema: type: string - format: hex - name: detach_ignored in: query required: false @@ -784,7 +775,6 @@ paths: required: true schema: type: string - format: hex post: description: | Remove tenant data (including all corresponding timelines) from pageserver's memory. @@ -833,7 +823,6 @@ paths: required: true schema: type: string - format: hex post: description: | Schedules an operation that attempts to load a tenant from the local disk and @@ -890,7 +879,6 @@ paths: required: true schema: type: string - format: hex get: description: | Calculate tenant's synthetic size @@ -933,7 +921,6 @@ paths: required: true schema: type: string - format: hex - name: inputs_only in: query required: false @@ -1003,7 +990,6 @@ paths: required: true schema: type: string - format: hex post: description: | Create a timeline. 
Returns new timeline id on success.\ @@ -1137,7 +1123,6 @@ paths: application/json: schema: type: string - format: hex "400": description: Malformed tenant create request content: @@ -1234,7 +1219,6 @@ paths: required: true schema: type: string - format: hex get: description: | Returns tenant's config description: specific config overrides a tenant has @@ -1340,7 +1324,6 @@ components: properties: new_tenant_id: type: string - format: hex generation: type: integer description: Attachment generation number. @@ -1369,7 +1352,6 @@ components: properties: tenant_id: type: string - format: hex TenantLocationConfigRequest: type: object required: @@ -1377,7 +1359,6 @@ components: properties: tenant_id: type: string - format: hex mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] @@ -1446,7 +1427,6 @@ components: format: hex tenant_id: type: string - format: hex last_record_lsn: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9e41d912c2..fee50460a5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -319,6 +319,7 @@ async fn build_timeline_info_common( ctx: &RequestContext, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + let initdb_lsn = timeline.initdb_lsn; let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -352,14 +353,14 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); let info = TimelineInfo { - // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id - tenant_id: timeline.tenant_shard_id.tenant_id, + tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id, ancestor_lsn, disk_consistent_lsn: timeline.get_disk_consistent_lsn(), 
remote_consistent_lsn: remote_consistent_lsn_projected, remote_consistent_lsn_visible, + initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), @@ -480,15 +481,15 @@ async fn timeline_list_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -507,7 +508,9 @@ async fn timeline_list_handler( } Ok::, ApiError>(response_data) } - .instrument(info_span!("timeline_list", %tenant_id)) + .instrument(info_span!("timeline_list", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, response_data) @@ -517,17 +520,17 @@ async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; // 
Logical size calculation needs downloading. let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline_info = async { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; let timeline = tenant .get_timeline(timeline_id, false) @@ -544,7 +547,10 @@ async fn timeline_detail_handler( Ok::<_, ApiError>(timeline_info) } - .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_detail", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + %timeline_id)) .await?; json_response(StatusCode::OK, timeline_info) @@ -554,8 +560,15 @@ async fn get_lsn_by_timestamp_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + if !tenant_shard_id.is_zero() { + // Requires SLRU contents, which are only stored on shard zero + return Err(ApiError::BadRequest(anyhow!( + "Lsn calculations by timestamp are only available on shard zero" + ))); + } let version: Option = parse_query_param(&request, "version")?; @@ -567,7 +580,7 @@ async fn get_lsn_by_timestamp_handler( let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; @@ -602,8 +615,15 @@ async fn get_timestamp_of_lsn_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId =
parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + if !tenant_shard_id.is_zero() { + // Requires SLRU contents, which are only stored on shard zero + return Err(ApiError::BadRequest(anyhow!( + "Timestamp calculations by lsn are only available on shard zero" + ))); + } let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -613,7 +633,7 @@ async fn get_timestamp_of_lsn_handler( .map_err(ApiError::BadRequest)?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -805,11 +825,11 @@ async fn tenant_status( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let tenant_info = async { - let tenant = mgr::get_tenant(tenant_id, false)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -819,13 +839,15 @@ async fn tenant_status( let state = tenant.current_state(); Result::<_, ApiError>::Ok(TenantInfo { - id: tenant_id, + id: tenant_shard_id, state: state.clone(), current_physical_size: Some(current_physical_size), attachment_status: state.attachment_status(), }) } - .instrument(info_span!("tenant_status_handler", %tenant_id)) + .instrument(info_span!("tenant_status_handler", + tenant_id =
%tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, tenant_info) @@ -868,14 +890,20 @@ async fn tenant_size_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let inputs_only: Option = parse_query_param(&request, "inputs_only")?; let retention_period: Option = parse_query_param(&request, "retention_period")?; let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; + + if !tenant_shard_id.is_zero() { + return Err(ApiError::BadRequest(anyhow!( + "Size calculations are only available on shard zero" + ))); + } // this can be long operation let inputs = tenant @@ -927,7 +955,7 @@ async fn tenant_size_handler( json_response( StatusCode::OK, TenantHistorySize { - id: tenant_id, + id: tenant_shard_id.tenant_id, size: sizes.as_ref().map(|x| x.total_size), segment_sizes: sizes.map(|x| x.segments), inputs, @@ -939,14 +967,14 @@ async fn layer_map_info_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset); - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, 
timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let layer_map_info = timeline.layer_map_info(reset).await; json_response(StatusCode::OK, layer_map_info) @@ -956,13 +984,12 @@ async fn layer_download_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let downloaded = timeline .download_layer(layer_file_name) .await @@ -973,7 +1000,7 @@ async fn layer_download_handler( Some(false) => json_response(StatusCode::NOT_MODIFIED, ()), None => json_response( StatusCode::BAD_REQUEST, - format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"), + format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"), ), } } @@ -982,12 +1009,12 @@ async fn evict_timeline_layer_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; - let timeline = active_timeline_of_active_tenant(tenant_id, 
timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let evicted = timeline .evict_layer(layer_file_name) .await @@ -998,7 +1025,7 @@ async fn evict_timeline_layer_handler( Some(false) => json_response(StatusCode::NOT_MODIFIED, ()), None => json_response( StatusCode::BAD_REQUEST, - format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"), + format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"), ), } } @@ -1130,10 +1157,10 @@ async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let tenant = mgr::get_tenant(tenant_id, false)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; let response = HashMap::from([ ( @@ -1227,9 +1254,9 @@ async fn handle_tenant_break( r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) + let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true) .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; tenant.set_broken("broken from test".to_owned()).await; @@ -1270,14 +1297,15 @@ async fn timeline_gc_handler( mut request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + 
check_permission(&request, Some(tenant_shard_id.tenant_id))?; let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?; + let wait_task_done = + mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; let gc_result = wait_task_done .await .context("wait for gc task") @@ -1292,9 +1320,9 @@ async fn timeline_compact_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? 
{ @@ -1302,14 +1330,14 @@ async fn timeline_compact_handler( } async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; timeline .compact(&cancel, flags, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; json_response(StatusCode::OK, ()) } - .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id)) + .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1318,9 +1346,9 @@ async fn timeline_checkpoint_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? 
{ @@ -1328,7 +1356,7 @@ async fn timeline_checkpoint_handler( } async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; timeline .freeze_and_flush() .await @@ -1340,7 +1368,7 @@ async fn timeline_checkpoint_handler( json_response(StatusCode::OK, ()) } - .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id)) + .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1348,12 +1376,12 @@ async fn timeline_download_remote_layers_handler_post( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; match timeline.spawn_download_all_remote_layers(body).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), @@ -1364,11 +1392,11 @@ async fn timeline_download_remote_layers_handler_get( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, 
Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") @@ -1414,9 +1442,9 @@ async fn getpage_at_lsn_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; struct Key(crate::repository::Key); @@ -1435,7 +1463,7 @@ async fn getpage_at_lsn_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let page = timeline.get(key.0, lsn, &ctx).await?; @@ -1447,7 +1475,7 @@ async fn getpage_at_lsn_handler( .unwrap(), ) } - .instrument(info_span!("timeline_get", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1455,9 +1483,9 @@ async fn timeline_collect_keyspace( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + 
check_permission(&request, Some(tenant_shard_id.tenant_id))?; struct Partitioning { keys: crate::keyspace::KeySpace, @@ -1526,7 +1554,7 @@ async fn timeline_collect_keyspace( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let keys = timeline .collect_keyspace(at_lsn, &ctx) @@ -1535,15 +1563,15 @@ async fn timeline_collect_keyspace( json_response(StatusCode::OK, Partitioning { keys, at_lsn }) } - .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } async fn active_timeline_of_active_tenant( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; tenant .get_timeline(timeline_id, true) .map_err(|e| ApiError::NotFound(e.into())) @@ -1820,23 +1848,25 @@ pub fn make_router( }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) - .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) + .get("/v1/tenant/:tenant_shard_id", |r| { + api_handler(r, tenant_status) + }) .delete("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_delete_handler) }) - .get("/v1/tenant/:tenant_id/synthetic_size", |r| { + .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) - .get("/v1/tenant/:tenant_id/config", |r| { + .get("/v1/tenant/:tenant_shard_id/config", |r| { 
api_handler(r, get_tenant_config_handler) }) .put("/v1/tenant/:tenant_shard_id/location_config", |r| { api_handler(r, put_tenant_location_config_handler) }) - .get("/v1/tenant/:tenant_id/timeline", |r| { + .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_shard_id/timeline", |r| { @@ -1857,47 +1887,50 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/ignore", |r| { api_handler(r, tenant_ignore_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp", |r| api_handler(r, get_lsn_by_timestamp_handler), ) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) - .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { - api_handler(r, timeline_gc_handler) - }) - .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| { - testing_api_handler("run timeline compaction", r, timeline_compact_handler) - }) .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", + |r| api_handler(r, timeline_gc_handler), + ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler), + ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), ) .post( - "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, 
timeline_download_remote_layers_handler_post), ) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { - api_handler(r, layer_map_info_handler) - }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", + |r| api_handler(r, layer_map_info_handler), + ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), ) .delete( - "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) .put("/v1/disk_usage_eviction/run", |r| { @@ -1906,18 +1939,19 @@ pub fn make_router( .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) - .put("/v1/tenant/:tenant_id/break", |r| { + .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) .get("/v1/panic", |r| api_handler(r, always_panic_handler)) .post("/v1/tracing/event", |r| { testing_api_handler("emit a tracing event", r, post_tracing_event_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| { - testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler) - }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", + |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), + ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| testing_api_handler("read out the keyspace", r, 
timeline_collect_keyspace), ) .any(handler_404)) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 67d798c1d4..7cc0333ee5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -650,7 +650,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| "pageserver_evictions_with_low_residence_duration", "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \ Residence duration is determined using the `residence_duration_data_source`.", - &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] + &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] ) .expect("failed to define a metric") }); @@ -714,10 +714,16 @@ impl EvictionsWithLowResidenceDurationBuilder { } } - fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration { + fn build( + &self, + tenant_id: &str, + shard_id: &str, + timeline_id: &str, + ) -> EvictionsWithLowResidenceDuration { let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION .get_metric_with_label_values(&[ tenant_id, + shard_id, timeline_id, self.data_source, &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold), @@ -748,21 +754,24 @@ impl EvictionsWithLowResidenceDuration { pub fn change_threshold( &mut self, tenant_id: &str, + shard_id: &str, timeline_id: &str, new_threshold: Duration, ) { if new_threshold == self.threshold { return; } - let mut with_new = - EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold) - .build(tenant_id, timeline_id); + let mut with_new = EvictionsWithLowResidenceDurationBuilder::new( + self.data_source, + new_threshold, + ) + .build(tenant_id, shard_id, timeline_id); std::mem::swap(self, &mut with_new); - with_new.remove(tenant_id, timeline_id); + with_new.remove(tenant_id, shard_id, timeline_id); } // This could be a `Drop` impl, but, we need the `tenant_id` and 
`timeline_id`. - fn remove(&mut self, tenant_id: &str, timeline_id: &str) { + fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) { let Some(_counter) = self.counter.take() else { return; }; @@ -771,6 +780,7 @@ impl EvictionsWithLowResidenceDuration { let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[ tenant_id, + shard_id, timeline_id, self.data_source, &threshold, @@ -1603,6 +1613,7 @@ impl StorageTimeMetrics { #[derive(Debug)] pub struct TimelineMetrics { tenant_id: String, + shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, @@ -1623,11 +1634,12 @@ pub struct TimelineMetrics { impl TimelineMetrics { pub fn new( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, ) -> Self { - let tenant_id = tenant_id.to_string(); + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); let flush_time_histo = StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id); @@ -1664,11 +1676,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let evictions_with_low_residence_duration = - evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); + let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder + .build(&tenant_id, &shard_id, &timeline_id); TimelineMetrics { tenant_id, + shard_id, timeline_id, flush_time_histo, compact_time_histo, @@ -1714,6 +1727,7 @@ impl Drop for TimelineMetrics { fn drop(&mut self) { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; + let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); { 
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); @@ -1727,7 +1741,7 @@ impl Drop for TimelineMetrics { self.evictions_with_low_residence_duration .write() .unwrap() - .remove(tenant_id, timeline_id); + .remove(tenant_id, shard_id, timeline_id); // The following metrics are born outside of the TimelineMetrics lifecycle but still // removed at the end of it. The idea is to have the metrics outlive the diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index dbd85d2dcf..c3c98af406 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -28,7 +28,7 @@ //! Page cache maps from a cache key to a buffer slot. //! The cache key uniquely identifies the piece of data that is being cached. //! -//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`]. +//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`]. //! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access. //! //! The cache key for **immutable file** pages is [`FileId`] and a block number. @@ -83,10 +83,8 @@ use std::{ use anyhow::Context; use once_cell::sync::OnceCell; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use pageserver_api::shard::TenantShardId; +use utils::{id::TimelineId, lsn::Lsn}; use crate::{ context::RequestContext, @@ -154,7 +152,13 @@ enum CacheKey { #[derive(Debug, PartialEq, Eq, Hash, Clone)] struct MaterializedPageHashKey { - tenant_id: TenantId, + /// Why is this TenantShardId rather than TenantId? + /// + /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this + /// is not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this + /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are + /// special-cased in some other way. 
+ tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: Key, } @@ -378,7 +382,7 @@ impl PageCache { /// returned page. pub async fn lookup_materialized_page( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: &Key, lsn: Lsn, @@ -395,7 +399,7 @@ impl PageCache { let mut cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { - tenant_id, + tenant_shard_id, timeline_id, key: *key, }, @@ -436,7 +440,7 @@ impl PageCache { /// pub async fn memorize_materialized_page( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: Key, lsn: Lsn, @@ -444,7 +448,7 @@ impl PageCache { ) -> anyhow::Result<()> { let cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { - tenant_id, + tenant_shard_id, timeline_id, key, }, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c653f0b7ea..b81037ae47 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -822,10 +822,7 @@ impl<'a> DatadirModification<'a> { self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + self.init_aux_dir()?; let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), @@ -933,10 +930,7 @@ impl<'a> DatadirModification<'a> { self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory as well - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + self.init_aux_dir()?; } if r.is_none() { // Create RelDirectory @@ -1261,6 +1255,14 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { + let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { + files: HashMap::new(), + })?; + 
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + Ok(()) + } + pub async fn put_file( &mut self, path: &str, @@ -1767,6 +1769,13 @@ const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +// AUX_FILES currently stores only data for logical replication (slots etc), and +// we don't preserve these on a branch because safekeepers can't follow timeline +// switch (and generally it likely should be optional), so ignore these. +pub fn is_inherited_key(key: Key) -> bool { + key != AUX_FILES_KEY +} + pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 4270b6edb0..8747d9ad50 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -42,6 +42,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; +use pageserver_api::shard::TenantShardId; use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; @@ -51,7 +52,7 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; use crate::shutdown_pageserver; @@ -317,7 +318,7 @@ struct PageServerTask { /// Tasks may optionally be launched for a particular tenant/timeline, enabling /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`] - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, mutable: Mutex, @@ -329,7 +330,7 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, name: &str, shutdown_process_on_error: bool, @@ -345,7 +346,7 @@ where kind, name: name.to_string(), cancel: cancel.clone(), - tenant_id, + tenant_shard_id, timeline_id, mutable: Mutex::new(MutableTaskState { join_handle: None }), }); @@ -424,28 +425,28 @@ async fn 
task_finish( Ok(Err(err)) => { if shutdown_process_on_error { error!( - "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); shutdown_process = true; } else { error!( - "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); } } Err(err) => { if shutdown_process_on_error { error!( - "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); shutdown_process = true; } else { error!( - "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); } } @@ -467,11 +468,11 @@ async fn task_finish( /// /// Or to shut down all tasks for given timeline: /// -/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id)) +/// shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id)) /// pub async fn shutdown_tasks( kind: Option, - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, ) { let mut victim_tasks = Vec::new(); @@ -480,35 +481,35 @@ pub async fn shutdown_tasks( let tasks = TASKS.lock().unwrap(); for task in tasks.values() { if (kind.is_none() || Some(task.kind) == kind) - && (tenant_id.is_none() || task.tenant_id == tenant_id) + && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id) && 
(timeline_id.is_none() || task.timeline_id == timeline_id) { task.cancel.cancel(); victim_tasks.push(( Arc::clone(task), task.kind, - task.tenant_id, + task.tenant_shard_id, task.timeline_id, )); } } } - let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none(); + let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none(); - for (task, task_kind, tenant_id, timeline_id) in victim_tasks { + for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks { let join_handle = { let mut task_mut = task.mutable.lock().unwrap(); task_mut.join_handle.take() }; if let Some(mut join_handle) = join_handle { if log_all { - if tenant_id.is_none() { + if tenant_shard_id.is_none() { // there are quite few of these info!(name = task.name, kind = ?task_kind, "stopping global task"); } else { // warn to catch these in tests; there shouldn't be any - warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); + warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } } if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle) @@ -517,12 +518,13 @@ pub async fn shutdown_tasks( { // allow some time to elapse before logging to cut down the number of log // lines. 
- info!("waiting for {} to shut down", task.name); + info!("waiting for task {} to shut down", task.name); // we never handled this return value, but: // - we don't deschedule which would lead to is_cancelled // - panics are already logged (is_panicked) // - task errors are already logged in the wrapper let _ = join_handle.await; + info!("task {} completed", task.name); } } else { // Possibly one of: diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 48f71d7747..a8e8b4cbfa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -608,7 +608,7 @@ impl Tenant { task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, - Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), None, "attach tenant", false, @@ -1917,7 +1917,7 @@ impl Tenant { // // this will additionally shutdown and await all timeline tasks. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await; + task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await; // Wait for any in-flight operations to complete self.gate.close().await; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index b8d6d0a321..acd311ace6 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -463,7 +463,7 @@ impl DeleteTenantFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), None, "tenant_delete", false, @@ -550,7 +550,7 @@ impl DeleteTenantFlow { // we encounter an InProgress marker, yield the barrier it contains and wait on it. let barrier = { let mut locked = tenants.write().unwrap(); - let removed = locked.remove(&tenant.tenant_shard_id.tenant_id); + let removed = locked.remove(tenant.tenant_shard_id); // FIXME: we should not be modifying this from outside of mgr.rs. 
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 8466fe7fca..4d7bd4259f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -98,33 +98,6 @@ pub(crate) enum TenantsMap { ShuttingDown(BTreeMap), } -/// Helper for mapping shard-unaware functions to a sharding-aware map -/// TODO(sharding): all users of this must be made shard-aware. -fn exactly_one_or_none<'a>( - map: &'a BTreeMap, - tenant_id: &TenantId, -) -> Option<(&'a TenantShardId, &'a TenantSlot)> { - let mut slots = map.range(TenantShardId::tenant_range(*tenant_id)); - - // Retrieve the first two slots in the range: if both are populated, we must panic because the caller - // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time. - let slot_a = slots.next(); - let slot_b = slots.next(); - match (slot_a, slot_b) { - (None, None) => None, - (Some(slot), None) => { - // Exactly one matching slot - Some(slot) - } - (Some(_slot_a), Some(_slot_b)) => { - // Multiple shards for this tenant: cannot handle this yet. - // TODO(sharding): callers of get() should be shard-aware. - todo!("Attaching multiple shards in teh same tenant to the same pageserver") - } - (None, Some(_)) => unreachable!(), - } -} - pub(crate) enum TenantsMapRemoveResult { Occupied(TenantSlot), Vacant, @@ -147,12 +120,11 @@ impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, /// None is returned. - pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - // TODO(sharding): callers of get() should be shard-aware. 
- exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached()) + m.get(tenant_shard_id).and_then(|slot| slot.get_attached()) } } } @@ -204,25 +176,19 @@ impl TenantsMap { /// /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded /// slot if the enclosed tenant is shutdown. - pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult { + pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult { use std::collections::btree_map::Entry; match self { TenantsMap::Initializing => TenantsMapRemoveResult::Vacant, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k); - match key { - Some(key) => match m.entry(key) { - Entry::Occupied(entry) => match entry.get() { - TenantSlot::InProgress(barrier) => { - TenantsMapRemoveResult::InProgress(barrier.clone()) - } - _ => TenantsMapRemoveResult::Occupied(entry.remove()), - }, - Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, - }, - None => TenantsMapRemoveResult::Vacant, - } - } + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) { + Entry::Occupied(entry) => match entry.get() { + TenantSlot::InProgress(barrier) => { + TenantsMapRemoveResult::InProgress(barrier.clone()) + } + _ => TenantsMapRemoveResult::Occupied(entry.remove()), + }, + Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, + }, } } @@ -822,14 +788,16 @@ pub(crate) async fn set_new_tenant_config( new_tenant_conf: TenantConfOpt, tenant_id: TenantId, ) -> Result<(), SetNewTenantConfigError> { + // Legacy API: does not support sharding + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_id, true)?; + let tenant = get_tenant(tenant_shard_id, true)?; // This is a legacy API that only operates on attached tenants: the preferred // API to use is the 
location_config/ endpoint, which lets the caller provide // the full LocationConf. let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation); - let tenant_shard_id = TenantShardId::unsharded(tenant_id); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) .await @@ -1143,14 +1111,11 @@ pub(crate) enum GetTenantError { /// /// This method is cancel-safe. pub(crate) fn get_tenant( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, active_only: bool, ) -> Result, GetTenantError> { let locked = TENANTS.read().unwrap(); - // TODO(sharding): make all callers of get_tenant shard-aware - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; match peek_slot { @@ -1162,14 +1127,18 @@ pub(crate) fn get_tenant( TenantState::Active => Ok(Arc::clone(tenant)), _ => { if active_only { - Err(GetTenantError::NotActive(tenant_id)) + Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } else { Ok(Arc::clone(tenant)) } } }, - Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)), - None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)), + Some(TenantSlot::InProgress(_)) => { + Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) + } + None | Some(TenantSlot::Secondary) => { + Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) + } } } @@ -1542,7 +1511,8 @@ pub(crate) enum TenantMapListError { /// /// Get list of tenants, for the mgmt API /// -pub(crate) async fn list_tenants() -> Result, TenantMapListError> { +pub(crate) async fn list_tenants() -> Result, TenantMapListError> +{ let tenants = TENANTS.read().unwrap(); let m = match &*tenants { TenantsMap::Initializing => return Err(TenantMapListError::Initializing), @@ -1550,12 +1520,10 @@ pub(crate) async fn list_tenants() -> Result, Tenan }; Ok(m.iter() .filter_map(|(id, tenant)| match tenant { - TenantSlot::Attached(tenant) 
=> Some((id, tenant.current_state())), + TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), TenantSlot::Secondary => None, TenantSlot::InProgress(_) => None, }) - // TODO(sharding): make callers of this function shard-aware - .map(|(k, v)| (k.tenant_id, v)) .collect()) } @@ -2089,21 +2057,19 @@ use { }; pub(crate) async fn immediate_gc( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, cancel: CancellationToken, ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().unwrap(); - let tenant = guard - .get(&tenant_id) - .map(Arc::clone) - .with_context(|| format!("tenant {tenant_id}")) - .map_err(|e| ApiError::NotFound(e.into()))?; - // TODO(sharding): make callers of this function shard-aware - let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let tenant = guard + .get(&tenant_shard_id) + .map(Arc::clone) + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))?; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting @@ -2116,9 +2082,9 @@ pub(crate) async fn immediate_gc( task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::GarbageCollector, - Some(tenant_id), + Some(tenant_shard_id), Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"), + &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), false, async move { fail::fail_point!("immediate_gc_task_pre"); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 03600cf5ae..3765ff6e7a 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1223,7 +1223,7 @@ impl RemoteTimelineClient { task_mgr::spawn( &self.runtime, TaskKind::RemoteUploadTask, - 
Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "remote upload", false, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 126d4d5563..112128ead8 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -837,7 +837,7 @@ impl LayerInner { crate::task_mgr::spawn( &tokio::runtime::Handle::current(), crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id.tenant_id), + Some(self.desc.tenant_shard_id), Some(self.desc.timeline_id), &task_name, false, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index bc404c41a0..dc23030218 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -87,13 +87,13 @@ pub fn start_background_loops( tenant: &Arc, background_jobs_can_start: Option<&completion::Barrier>, ) { - let tenant_id = tenant.tenant_shard_id.tenant_id; + let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, - Some(tenant_id), + Some(tenant_shard_id), None, - &format!("compactor for tenant {tenant_id}"), + &format!("compactor for tenant {tenant_shard_id}"), false, { let tenant = Arc::clone(tenant); @@ -105,7 +105,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) - .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } @@ -114,9 +114,9 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, - Some(tenant_id), + Some(tenant_shard_id), None, - &format!("garbage collector for tenant {tenant_id}"), + &format!("garbage collector for tenant {tenant_shard_id}"), false, { let tenant = 
Arc::clone(tenant); @@ -128,7 +128,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; gc_loop(tenant, cancel) - .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 551b66b77d..81dbc04793 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -66,7 +66,7 @@ use crate::metrics::{ TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; use crate::tenant::config::{EvictionPolicy, TenantConfOpt}; use pageserver_api::reltag::RelTag; @@ -77,7 +77,7 @@ use postgres_ffi::to_pg_timestamp; use utils::{ completion, generation::Generation, - id::{TenantId, TimelineId}, + id::TimelineId, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, simple_rcu::{Rcu, RcuReadGuard}, @@ -926,7 +926,7 @@ impl Timeline { tracing::debug!("Waiting for WalReceiverManager..."); task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), ) .await; @@ -977,7 +977,7 @@ impl Timeline { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), ) .await; @@ -995,12 +995,7 @@ impl Timeline { tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks( - None, - 
Some(self.tenant_shard_id.tenant_id), - Some(self.timeline_id), - ) - .await; + task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; // Finally wait until any gate-holders are complete self.gate.close().await; @@ -1314,16 +1309,20 @@ impl Timeline { &self.conf.default_tenant_conf, ); - // TODO(sharding): make evictions state shard aware - // (https://github.com/neondatabase/neon/issues/5953) let tenant_id_str = self.tenant_shard_id.tenant_id.to_string(); + let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug()); let timeline_id_str = self.timeline_id.to_string(); self.metrics .evictions_with_low_residence_duration .write() .unwrap() - .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold); + .change_threshold( + &tenant_id_str, + &shard_id_str, + &timeline_id_str, + new_threshold, + ); } } @@ -1395,7 +1394,7 @@ impl Timeline { ancestor_lsn: metadata.ancestor_lsn(), metrics: TimelineMetrics::new( - &tenant_shard_id.tenant_id, + &tenant_shard_id, &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", @@ -1496,7 +1495,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "layer flush task", false, @@ -1847,7 +1846,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "initial size calculation", false, @@ -2020,7 +2019,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "ondemand logical size calculation", false, @@ -2279,7 +2278,7 @@ impl Timeline { } // Recurse into ancestor if needed - if 
Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { trace!( "going into ancestor {}, cont_lsn is {}", timeline.ancestor_lsn, @@ -2461,13 +2460,7 @@ impl Timeline { // FIXME: It's pointless to check the cache for things that are not 8kB pages. // We should look at the key to determine if it's a cacheable object let (lsn, read_guard) = cache - .lookup_materialized_page( - self.tenant_shard_id.tenant_id, - self.timeline_id, - key, - lsn, - ctx, - ) + .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx) .await?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) @@ -3209,7 +3202,7 @@ impl DurationRecorder { #[derive(Default)] struct CompactLevel0Phase1StatsBuilder { version: Option, - tenant_id: Option, + tenant_id: Option, timeline_id: Option, read_lock_acquisition_micros: DurationRecorder, read_lock_held_spawn_blocking_startup_micros: DurationRecorder, @@ -3226,7 +3219,7 @@ struct CompactLevel0Phase1StatsBuilder { #[derive(serde::Serialize)] struct CompactLevel0Phase1Stats { version: u64, - tenant_id: TenantId, + tenant_id: TenantShardId, timeline_id: TimelineId, read_lock_acquisition_micros: RecordedDuration, read_lock_held_spawn_blocking_startup_micros: RecordedDuration, @@ -3745,7 +3738,7 @@ impl Timeline { let ctx = ctx.attached_child(); let mut stats = CompactLevel0Phase1StatsBuilder { version: Some(2), - tenant_id: Some(self.tenant_shard_id.tenant_id), + tenant_id: Some(self.tenant_shard_id), timeline_id: Some(self.timeline_id), ..Default::default() }; @@ -4207,7 +4200,7 @@ impl Timeline { let cache = page_cache::get(); if let Err(e) = cache .memorize_materialized_page( - self.tenant_shard_id.tenant_id, + self.tenant_shard_id, self.timeline_id, key, last_rec_lsn, @@ -4251,7 +4244,7 @@ impl Timeline { let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, - Some(self.tenant_shard_id.tenant_id), 
+ Some(self.tenant_shard_id), Some(self.timeline_id), "download all remote layers task", false, diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 2a103a7ff4..be873181d9 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -43,7 +43,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), ) .await; @@ -71,7 +71,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks( None, - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), ) .await; @@ -528,7 +528,7 @@ impl DeleteTimelineFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), Some(timeline_id), "timeline_delete", false, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3fe4bc0f83..020c5a9e9f 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -60,7 +60,7 @@ impl Timeline { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), &format!( "layer eviction for {}/{}", @@ -343,7 +343,7 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. 
- let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) { + let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) { Ok(t) => t, Err(_) => { return ControlFlow::Break(()); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 04ff8602d6..e32265afb5 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -30,6 +30,7 @@ use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; +use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; use std::ops::ControlFlow; @@ -41,7 +42,7 @@ use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TenantTimelineId; +use utils::id::TimelineId; use self::connection_manager::ConnectionManagerStatus; @@ -60,7 +61,8 @@ pub struct WalReceiverConf { } pub struct WalReceiver { - timeline: TenantTimelineId, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, manager_status: Arc>>, } @@ -71,7 +73,7 @@ impl WalReceiver { mut broker_client: BrokerClientChannel, ctx: &RequestContext, ) -> Self { - let tenant_id = timeline.tenant_shard_id.tenant_id; + let tenant_shard_id = timeline.tenant_shard_id; let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); @@ -81,9 +83,9 @@ impl WalReceiver { task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, - Some(tenant_id), + Some(timeline.tenant_shard_id), Some(timeline_id), - &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), + &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), false, async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -117,11 +119,12 @@ impl WalReceiver { *loop_status.write().unwrap() = None; Ok(()) } - 
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) ); Self { - timeline: TenantTimelineId::new(tenant_id, timeline_id), + tenant_shard_id, + timeline_id, manager_status, } } @@ -129,8 +132,8 @@ impl WalReceiver { pub async fn stop(self) { task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), - Some(self.timeline.tenant_id), - Some(self.timeline.timeline_id), + Some(self.tenant_shard_id), + Some(self.timeline_id), ) .await; } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3bcb7ff891..61ab236322 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection( task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), "walreceiver connection", false, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 75b29a2fed..738216afa5 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -458,8 +458,10 @@ impl<'a> WalIngest<'a> { && decoded.xl_rmid == pg_constants::RM_XLOG_ID && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) - // compression of WAL is not yet supported: fall back to storing the original WAL record + // compression of WAL is not yet supported: fall back to storing the original WAL record && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? 
+ // do not materialize null pages because them most likely be soon replaced with real data + && blk.bimg_len != 0 { // Extract page image from FPI record let img_len = blk.bimg_len as usize; diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 7d79d34045..eadb9abd43 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -62,6 +62,9 @@ pub enum AuthErrorImpl { Please add it to the allowed list in the Neon console." )] IpAddressNotAllowed, + + #[error("Too many connections to this endpoint. Please try again later.")] + TooManyConnections, } #[derive(Debug, Error)] @@ -80,6 +83,10 @@ impl AuthError { pub fn ip_address_not_allowed() -> Self { AuthErrorImpl::IpAddressNotAllowed.into() } + + pub fn too_many_connections() -> Self { + AuthErrorImpl::TooManyConnections.into() + } } impl> From for AuthError { @@ -102,6 +109,7 @@ impl UserFacingError for AuthError { MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), IpAddressNotAllowed => self.to_string(), + TooManyConnections => self.to_string(), } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 649b3f40f2..ba054b53eb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -166,7 +166,7 @@ impl TryFrom for ComputeUserInfo { /// All authentication flows will emit an AuthenticationOk message if successful. async fn auth_quirks( api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: ClientCredentials, client: &mut stream::PqStream>, allow_cleartext: bool, @@ -235,7 +235,7 @@ async fn auth_quirks( /// only if authentication was successfuly. 
async fn auth_and_wake_compute( api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: ClientCredentials, client: &mut stream::PqStream>, allow_cleartext: bool, @@ -314,7 +314,7 @@ impl<'a> BackendType<'a, ClientCredentials> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, @@ -387,7 +387,7 @@ impl<'a> BackendType<'a, ClientCredentials> { impl BackendType<'_, ComputeUserInfo> { pub async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, ) -> Result>, GetAuthInfoError> { use BackendType::*; match self { @@ -404,7 +404,7 @@ impl BackendType<'_, ComputeUserInfo> { /// The link auth flow doesn't support this, so we return [`None`] in that case. pub async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, ) -> Result, console::errors::WakeComputeError> { use BackendType::*; diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index fc1c44809a..1fa2d5599f 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -112,6 +112,9 @@ struct ProxyCliArgs { /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] rate_limiter_timeout: tokio::time::Duration, + /// Endpoint rate limiter max number of requests per second. + #[clap(long, default_value_t = 300)] + endpoint_rps_limit: u32, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. 
#[clap(long, default_value_t = 100)] initial_limit: usize, @@ -317,6 +320,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, + endpoint_rps_limit: args.endpoint_rps_limit, })); Ok(config) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 182d71f9be..dea446eb22 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -20,6 +20,7 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, + pub endpoint_rps_limit: u32, } #[derive(Debug)] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index b0a73fd03d..deab966d9e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -196,15 +196,15 @@ pub mod errors { } /// Extra query params we'd like to pass to the console. -pub struct ConsoleReqExtra<'a> { +pub struct ConsoleReqExtra { /// A unique identifier for a connection. pub session_id: uuid::Uuid, /// Name of client application, if set. - pub application_name: Option<&'a str>, + pub application_name: String, pub options: Vec<(String, String)>, } -impl<'a> ConsoleReqExtra<'a> { +impl ConsoleReqExtra { // https://swagger.io/docs/specification/serialization/ DeepObject format // paramName[prop1]=value1¶mName[prop2]=value2&.... pub fn options_as_deep_object(&self) -> Vec<(String, String)> { @@ -259,20 +259,20 @@ pub trait Api { /// Get the client's auth secret for authentication. async fn get_auth_info( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result; } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 8aad8c06bc..c464b4daf2 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -144,7 +144,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_auth_info( &self, - _extra: &ConsoleReqExtra<'_>, + _extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { self.do_get_auth_info(creds).await @@ -152,7 +152,7 @@ impl super::Api for Api { async fn get_allowed_ips( &self, - _extra: &ConsoleReqExtra<'_>, + _extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips)) @@ -161,7 +161,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _extra: &ConsoleReqExtra<'_>, + _extra: &ConsoleReqExtra, _creds: &ComputeUserInfo, ) -> Result { self.do_wake_compute() diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index f8c3ee5b58..192252a0df 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -48,7 +48,7 @@ impl Api { async fn do_get_auth_info( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); @@ -60,9 +60,9 @@ impl Api { .header("Authorization", format!("Bearer {}", &self.jwt)) .query(&[("session_id", extra.session_id)]) .query(&[ - ("application_name", extra.application_name), - ("project", Some(&creds.endpoint)), - ("role", Some(&creds.inner.user)), + ("application_name", extra.application_name.as_str()), + ("project", creds.endpoint.as_str()), + ("role", creds.inner.user.as_str()), ]) .build()?; @@ -101,7 +101,7 @@ impl Api { async fn do_wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: 
&ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); @@ -113,8 +113,8 @@ impl Api { .header("Authorization", format!("Bearer {}", &self.jwt)) .query(&[("session_id", extra.session_id)]) .query(&[ - ("application_name", extra.application_name), - ("project", Some(&creds.endpoint)), + ("application_name", extra.application_name.as_str()), + ("project", creds.endpoint.as_str()), ]); request_builder = if extra.options.is_empty() { @@ -161,7 +161,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_auth_info( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { self.do_get_auth_info(extra, creds).await @@ -169,7 +169,7 @@ impl super::Api for Api { async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { let key: &str = &creds.endpoint; @@ -192,7 +192,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { let key: &str = &creds.inner.cache_key; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 018f774c7e..ae8b294841 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -9,6 +9,7 @@ use crate::{ console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api}, http::StatusCode, protocol2::WithClientIp, + rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, usage_metrics::{Ids, USAGE_METRICS}, }; @@ -307,6 +308,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); let cancel_map = Arc::new(CancelMap::default()); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit)); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -315,6 +317,8 @@ pub async fn task_main( 
let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + connections.spawn( async move { info!("accepted postgres client connection"); @@ -340,6 +344,7 @@ pub async fn task_main( socket, ClientMode::Tcp, peer_addr.ip(), + endpoint_rate_limiter, ) .await } @@ -415,6 +420,7 @@ pub async fn handle_client( stream: S, mode: ClientMode, peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { info!( protocol = mode.protocol_label(), @@ -463,6 +469,7 @@ pub async fn handle_client( ¶ms, session_id, mode.allow_self_signed_compute(config), + endpoint_rate_limiter, ); cancel_map .with_session(|session| client.connect_to_db(session, mode, &config.authentication_config)) @@ -671,7 +678,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { pub async fn connect_to_compute( mechanism: &M, mut node_info: console::CachedNodeInfo, - extra: &console::ConsoleReqExtra<'_>, + extra: &console::ConsoleReqExtra, creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, mut latency_timer: LatencyTimer, ) -> Result @@ -928,6 +935,8 @@ struct Client<'a, S> { session_id: uuid::Uuid, /// Allow self-signed certificates (for testing). 
allow_self_signed_compute: bool, + /// Rate limiter for endpoints + endpoint_rate_limiter: Arc, } impl<'a, S> Client<'a, S> { @@ -938,6 +947,7 @@ impl<'a, S> Client<'a, S> { params: &'a StartupMessageParams, session_id: uuid::Uuid, allow_self_signed_compute: bool, + endpoint_rate_limiter: Arc, ) -> Self { Self { stream, @@ -945,6 +955,7 @@ impl<'a, S> Client<'a, S> { params, session_id, allow_self_signed_compute, + endpoint_rate_limiter, } } } @@ -966,15 +977,29 @@ impl Client<'_, S> { params, session_id, allow_self_signed_compute, + endpoint_rate_limiter, } = self; + // check rate limit + if let Some(ep) = creds.get_endpoint() { + if !endpoint_rate_limiter.check(ep) { + return stream + .throw_error(auth::AuthError::too_many_connections()) + .await; + } + } + + let proto = mode.protocol_label(); let extra = console::ConsoleReqExtra { session_id, // aka this connection's id - application_name: params.get("application_name"), + application_name: format!( + "{}/{}", + params.get("application_name").unwrap_or_default(), + proto + ), options: neon_options(params), }; - - let mut latency_timer = LatencyTimer::new(mode.protocol_label()); + let mut latency_timer = LatencyTimer::new(proto); let user = creds.get_user().to_owned(); let auth_result = match creds @@ -1012,7 +1037,6 @@ impl Client<'_, S> { .or_else(|e| stream.throw_error(e)) .await?; - let proto = mode.protocol_label(); NUM_DB_CONNECTIONS_OPENED_COUNTER .with_label_values(&[proto]) .inc(); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 31c3ad1055..4691abbfb9 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -484,13 +484,13 @@ fn helper_create_connect_info( mechanism: &TestConnectMechanism, ) -> ( CachedNodeInfo, - console::ConsoleReqExtra<'static>, + console::ConsoleReqExtra, auth::BackendType<'_, ComputeUserInfo>, ) { let cache = helper_create_cached_node_info(); let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), - application_name: 
Some("TEST"), + application_name: "TEST".into(), options: vec![], }; let creds = auth::BackendType::Test(mechanism); diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 5622c44a68..f40b8dbd1c 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -3,4 +3,5 @@ mod limit_algorithm; mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; +pub use limiter::EndpointRateLimiter; pub use limiter::Limiter; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 3a9fed3919..9d28bb67b3 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -6,6 +6,9 @@ use std::{ time::Duration, }; +use dashmap::DashMap; +use parking_lot::Mutex; +use smol_str::SmolStr; use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Instant}; use tracing::info; @@ -15,6 +18,74 @@ use super::{ RateLimiterConfig, }; +// Simple per-endpoint rate limiter. +// +// Check that number of connections to the endpoint is below `max_rps` rps. +// Purposefully ignore user name and database name as clients can reconnect +// with different names, so we'll end up sending some http requests to +// the control plane. +// +// We also may save quite a lot of CPU (I think) by bailing out right after we +// saw SNI, before doing TLS handshake. User-side error messages in that case +// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now +// I went with a more expensive way that yields user-friendlier error messages. +// +// TODO: add a better bucketing here, e.g. not more than 300 requests per second, +// and not more than 1000 requests per 10 seconds, etc. Short bursts of reconnects +// are noramal during redeployments, so we should not block them. 
+pub struct EndpointRateLimiter { + map: DashMap>>, + max_rps: u32, + access_count: AtomicUsize, +} + +impl EndpointRateLimiter { + pub fn new(max_rps: u32) -> Self { + Self { + map: DashMap::new(), + max_rps, + access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request + } + } + + /// Check that number of connections to the endpoint is below `max_rps` rps. + pub fn check(&self, endpoint: SmolStr) -> bool { + // do GC every 100k requests (worst case memory usage is about 10MB) + if self.access_count.fetch_add(1, Ordering::AcqRel) % 100_000 == 0 { + self.do_gc(); + } + + let now = chrono::Utc::now().naive_utc().time(); + let entry = self + .map + .entry(endpoint) + .or_insert_with(|| Arc::new(Mutex::new((now, 0)))); + let mut entry = entry.lock(); + let (last_time, count) = *entry; + + if now - last_time < chrono::Duration::seconds(1) { + if count >= self.max_rps { + return false; + } + *entry = (last_time, count + 1); + } else { + *entry = (now, 1); + } + true + } + + /// Clean the map. Simple strategy: remove all entries. At worst, we'll + /// double the effective max_rps during the cleanup. But that way deletion + /// does not aquire mutex on each entry access. + pub fn do_gc(&self) { + info!( + "cleaning up endpoint rate limiter, current size = {}", + self.map.len() + ); + self.map.clear(); + } +} + /// Limits the number of concurrent jobs. /// /// Concurrency is limited through the use of [Token]s. 
Acquire a token to run a job, and release the diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index cd496ff01e..92d6e2d851 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -14,6 +14,7 @@ use tokio_util::task::TaskTracker; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER}; +use crate::rate_limiter::EndpointRateLimiter; use crate::{cancellation::CancelMap, config::ProxyConfig}; use futures::StreamExt; use hyper::{ @@ -43,6 +44,7 @@ pub async fn task_main( } let conn_pool = conn_pool::GlobalConnPool::new(config); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit)); // shutdown the connection pool tokio::spawn({ @@ -91,6 +93,7 @@ pub async fn task_main( let sni_name = tls.server_name().map(|s| s.to_string()); let conn_pool = conn_pool.clone(); let ws_connections = ws_connections.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); async move { let peer_addr = match client_addr { @@ -103,6 +106,7 @@ pub async fn task_main( let sni_name = sni_name.clone(); let conn_pool = conn_pool.clone(); let ws_connections = ws_connections.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); async move { let cancel_map = Arc::new(CancelMap::default()); @@ -117,6 +121,7 @@ pub async fn task_main( session_id, sni_name, peer_addr.ip(), + endpoint_rate_limiter, ) .instrument(info_span!( "serverless", @@ -190,6 +195,7 @@ async fn request_handler( session_id: uuid::Uuid, sni_hostname: Option, peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> Result, ApiError> { let host = request .headers() @@ -214,6 +220,7 @@ async fn request_handler( session_id, host, peer_addr, + endpoint_rate_limiter, ) .await { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 734df11368..4f3b31b9be 100644 --- a/proxy/src/serverless/conn_pool.rs +++ 
b/proxy/src/serverless/conn_pool.rs @@ -37,7 +37,7 @@ use crate::proxy::ConnectMechanism; use tracing::{error, warn, Span}; use tracing::{info, info_span, Instrument}; -pub const APP_NAME: &str = "sql_over_http"; +pub const APP_NAME: &str = "/sql_over_http"; const MAX_CONNS_PER_ENDPOINT: usize = 20; #[derive(Debug, Clone)] @@ -432,7 +432,7 @@ async fn connect_to_compute( let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), - application_name: Some(APP_NAME), + application_name: APP_NAME.to_string(), options: console_options, }; // TODO(anna): this is a bit hacky way, consider using console notification listener. diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 199b03550d..cd6184cdee 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,6 +3,7 @@ use crate::{ config::ProxyConfig, error::io_error, proxy::{handle_client, ClientMode}, + rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -13,6 +14,7 @@ use pin_project_lite::pin_project; use std::{ net::IpAddr, pin::Pin, + sync::Arc, task::{ready, Context, Poll}, }; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; @@ -134,6 +136,7 @@ pub async fn serve_websocket( session_id: uuid::Uuid, hostname: Option, peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; handle_client( @@ -143,6 +146,7 @@ pub async fn serve_websocket( WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, peer_addr, + endpoint_rate_limiter, ) .await?; Ok(()) diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 510a128663..a15a908212 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -142,7 +142,9 @@ pub(crate) async fn branch_cleanup_and_check_errors( .collect(); if !orphan_layers.is_empty() { - result.errors.push(format!( + // An orphan layer is not an error: it's arguably not even a 
warning, but it is helpful to report + // these as a hint that there is something worth cleaning up here. + result.warnings.push(format!( "index_part.json does not contain layers from S3: {:?}", orphan_layers .iter() @@ -170,6 +172,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( )); } } + BlobDataParseResult::Relic => {} BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( parse_errors .into_iter() @@ -215,6 +218,8 @@ pub(crate) enum BlobDataParseResult { index_part_generation: Generation, s3_layers: HashSet<(LayerFileName, Generation)>, }, + /// The remains of a deleted Timeline (i.e. an initdb archive only) + Relic, Incorrect(Vec), } @@ -245,6 +250,7 @@ pub(crate) async fn list_timeline_blobs( timeline_dir_target.delimiter = String::new(); let mut index_parts: Vec = Vec::new(); + let mut initdb_archive: bool = false; let stream = stream_listing(s3_client, &timeline_dir_target); pin_mut!(stream); @@ -258,6 +264,10 @@ pub(crate) async fn list_timeline_blobs( tracing::info!("Index key {key}"); index_parts.push(obj) } + Some("initdb.tar.zst") => { + tracing::info!("initdb archive {key}"); + initdb_archive = true; + } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { tracing::info!("Parsed layer key: {} {:?}", new_layer, gen); @@ -279,6 +289,16 @@ pub(crate) async fn list_timeline_blobs( } } + if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive { + tracing::info!( + "Timeline is empty apart from initdb archive: expected post-deletion state." 
+ ); + return Ok(S3TimelineBlobData { + blob_data: BlobDataParseResult::Relic, + keys_to_remove: Vec::new(), + }); + } + // Choose the index_part with the highest generation let (index_part_object, index_part_generation) = match index_parts .iter() diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e5465952fb..6607db21e6 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -86,7 +86,9 @@ impl S3Target { if new_self.prefix_in_bucket.is_empty() { new_self.prefix_in_bucket = format!("/{}/", new_segment); } else { - let _ = new_self.prefix_in_bucket.pop(); + if new_self.prefix_in_bucket.ends_with('/') { + new_self.prefix_in_bucket.pop(); + } new_self.prefix_in_bucket = [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter); } diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 1f0ceebdaf..ef020edc2a 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -57,7 +57,7 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json } => match scan_metadata(bucket_config).await { + Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await { Err(e) => { tracing::error!("Failed: {e}"); Err(e) @@ -70,6 +70,17 @@ async fn main() -> anyhow::Result<()> { } if summary.is_fatal() { Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) } else { Ok(()) } diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index ad82db1e76..228f8d6763 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ 
b/s3_scrubber/src/scan_metadata.rs @@ -174,6 +174,10 @@ Timeline layer count: {6} pub fn is_fatal(&self) -> bool { !self.with_errors.is_empty() } + + pub fn is_empty(&self) -> bool { + self.count == 0 + } } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9e0beeb4d1..4b23650960 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -28,6 +28,7 @@ import jwt import psycopg2 import pytest import requests +import toml from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest @@ -436,7 +437,7 @@ class NeonEnvBuilder: # Pageserver remote storage self.pageserver_remote_storage = pageserver_remote_storage # Safekeepers remote storage - self.sk_remote_storage: Optional[RemoteStorage] = None + self.safekeepers_remote_storage: Optional[RemoteStorage] = None self.broker = broker self.run_id = run_id @@ -506,6 +507,66 @@ class NeonEnvBuilder: return env + def from_repo_dir( + self, + repo_dir: Path, + neon_binpath: Optional[Path] = None, + pg_distrib_dir: Optional[Path] = None, + ) -> NeonEnv: + """ + A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir. 
+ """ + + # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests + self.neon_binpath = neon_binpath or self.neon_binpath + self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir + + # Get the initial tenant and timeline from the snapshot config + snapshot_config_toml = repo_dir / "config" + with snapshot_config_toml.open("r") as f: + snapshot_config = toml.load(f) + + self.initial_tenant = TenantId(snapshot_config["default_tenant_id"]) + self.initial_timeline = TimelineId( + dict(snapshot_config["branch_name_mappings"][DEFAULT_BRANCH_NAME])[ + str(self.initial_tenant) + ] + ) + self.env = self.init_configs() + + for ps_dir in repo_dir.glob("pageserver_*"): + tenants_from_dir = ps_dir / "tenants" + tenants_to_dir = self.repo_dir / ps_dir.name / "tenants" + + log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}") + shutil.copytree(tenants_from_dir, tenants_to_dir) + + for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"): + sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name + log.info(f"Copying safekeeper directory {sk_from_dir} to {sk_to_dir}") + sk_to_dir.rmdir() + shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid")) + + shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) + shutil.copytree( + repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + ) + + if (attachments_json := Path(repo_dir / "attachments.json")).exists(): + shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) + + # Update the config with info about tenants and timelines + with (self.repo_dir / "config").open("r") as f: + config = toml.load(f) + + config["default_tenant_id"] = snapshot_config["default_tenant_id"] + config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + + with (self.repo_dir / "config").open("w") as f: + toml.dump(config, f) + + return self.env + def 
enable_scrub_on_exit(self): """ Call this if you would like the fixture to automatically run @@ -534,9 +595,11 @@ class NeonEnvBuilder: self.pageserver_remote_storage = ret def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind): - assert self.sk_remote_storage is None, "sk_remote_storage already configured" + assert ( + self.safekeepers_remote_storage is None + ), "safekeepers_remote_storage already configured" - self.sk_remote_storage = self._configure_and_create_remote_storage( + self.safekeepers_remote_storage = self._configure_and_create_remote_storage( kind, RemoteStorageUser.SAFEKEEPER ) @@ -589,7 +652,7 @@ class NeonEnvBuilder: directory_to_clean.rmdir() def cleanup_remote_storage(self): - for x in [self.pageserver_remote_storage, self.sk_remote_storage]: + for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]: if isinstance(x, S3Storage): x.do_cleanup() @@ -693,7 +756,7 @@ class NeonEnv: self.pageservers: List[NeonPageserver] = [] self.broker = config.broker self.pageserver_remote_storage = config.pageserver_remote_storage - self.safekeepers_remote_storage = config.sk_remote_storage + self.safekeepers_remote_storage = config.safekeepers_remote_storage self.pg_version = config.pg_version # Binary path for pageserver, safekeeper, etc self.neon_binpath = config.neon_binpath @@ -718,25 +781,17 @@ class NeonEnv: self.attachment_service = None # Create a config file corresponding to the options - toml = textwrap.dedent( - f""" - default_tenant_id = '{config.initial_tenant}' - """ - ) + cfg: Dict[str, Any] = { + "default_tenant_id": str(self.initial_tenant), + "broker": { + "listen_addr": self.broker.listen_addr(), + }, + "pageservers": [], + "safekeepers": [], + } if self.control_plane_api is not None: - toml += textwrap.dedent( - f""" - control_plane_api = '{self.control_plane_api}' - """ - ) - - toml += textwrap.dedent( - f""" - [broker] - listen_addr = '{self.broker.listen_addr()}' - """ - ) + cfg["control_plane_api"] = 
self.control_plane_api # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -749,26 +804,24 @@ class NeonEnv: http=self.port_distributor.get_port(), ) - toml += textwrap.dedent( - f""" - [[pageservers]] - id={ps_id} - listen_pg_addr = 'localhost:{pageserver_port.pg}' - listen_http_addr = 'localhost:{pageserver_port.http}' - pg_auth_type = '{pg_auth_type}' - http_auth_type = '{http_auth_type}' - """ - ) - + ps_cfg: Dict[str, Any] = { + "id": ps_id, + "listen_pg_addr": f"localhost:{pageserver_port.pg}", + "listen_http_addr": f"localhost:{pageserver_port.http}", + "pg_auth_type": pg_auth_type, + "http_auth_type": http_auth_type, + } # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( self, ps_id, port=pageserver_port, - config_override=config.pageserver_config_override, + config_override=self.pageserver_config_override, ) ) + cfg["pageservers"].append(ps_cfg) + # Create config and a Safekeeper object for each safekeeper for i in range(1, config.num_safekeepers + 1): port = SafekeeperPort( @@ -777,32 +830,22 @@ class NeonEnv: http=self.port_distributor.get_port(), ) id = config.safekeepers_id_start + i # assign ids sequentially - toml += textwrap.dedent( - f""" - [[safekeepers]] - id = {id} - pg_port = {port.pg} - pg_tenant_only_port = {port.pg_tenant_only} - http_port = {port.http} - sync = {'true' if config.safekeepers_enable_fsync else 'false'}""" - ) + sk_cfg: Dict[str, Any] = { + "id": id, + "pg_port": port.pg, + "pg_tenant_only_port": port.pg_tenant_only, + "http_port": port.http, + "sync": config.safekeepers_enable_fsync, + } if config.auth_enabled: - toml += textwrap.dedent( - """ - auth_enabled = true - """ - ) - if config.sk_remote_storage is not None: - toml += textwrap.dedent( - f""" - remote_storage = "{remote_storage_to_toml_inline_table(config.sk_remote_storage)}" - """ - ) - safekeeper = Safekeeper(env=self, id=id, port=port) - self.safekeepers.append(safekeeper) + 
sk_cfg["auth_enabled"] = True + if self.safekeepers_remote_storage is not None: + sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() + self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) + cfg["safekeepers"].append(sk_cfg) - log.info(f"Config: {toml}") - self.neon_cli.init(toml) + log.info(f"Config: {cfg}") + self.neon_cli.init(cfg) def start(self): # Start up broker, pageserver and all safekeepers @@ -1288,10 +1331,10 @@ class NeonCli(AbstractNeonCli): def init( self, - config_toml: str, + config: Dict[str, Any], ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: - tmp.write(config_toml) + tmp.write(toml.dumps(config)) tmp.flush() cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] @@ -1729,11 +1772,16 @@ class NeonPageserver(PgProtocol): @property def workdir(self) -> Path: - return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}")) + return self.env.repo_dir / f"pageserver_{self.id}" def assert_no_errors(self): - logfile = open(os.path.join(self.workdir, "pageserver.log"), "r") - errors = scan_pageserver_log_for_errors(logfile, self.allowed_errors) + logfile = self.workdir / "pageserver.log" + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return + + with logfile.open("r") as f: + errors = scan_pageserver_log_for_errors(f, self.allowed_errors) for _lineno, error in errors: log.info(f"not allowed error: {error.strip()}") @@ -1757,7 +1805,10 @@ class NeonPageserver(PgProtocol): def log_contains(self, pattern: str) -> Optional[str]: """Check that the pageserver log contains a line that matches the given regex""" - logfile = open(os.path.join(self.workdir, "pageserver.log"), "r") + logfile = self.workdir / "pageserver.log" + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None contains_re = re.compile(pattern) @@ -1766,14 +1817,11 @@ class 
NeonPageserver(PgProtocol): # no guarantee it is already present in the log file. This hasn't # been a problem in practice, our python tests are not fast enough # to hit that race condition. - while True: - line = logfile.readline() - if not line: - break - - if contains_re.search(line): - # found it! - return line + with logfile.open("r") as f: + for line in f: + if contains_re.search(line): + # found it! + return line return None @@ -1796,6 +1844,27 @@ class NeonPageserver(PgProtocol): client = self.http_client() return client.tenant_detach(tenant_id) + def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): + # This API is only for use when generations are enabled + assert self.env.attachment_service is not None + + if config["mode"].startswith("Attached") and "generation" not in config: + config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + + client = self.http_client() + return client.tenant_location_conf(tenant_id, config, **kwargs) + + def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: + path = self.tenant_dir(tenant_id) / "config-v1" + log.info(f"Reading location conf from {path}") + bytes = open(path, "r").read() + try: + decoded: dict[str, Any] = toml.loads(bytes) + return decoded + except: + log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") + raise + def tenant_create( self, tenant_id: TenantId, @@ -2729,6 +2798,7 @@ class EndpointFactory: lsn: Optional[Lsn] = None, hot_standby: bool = False, config_lines: Optional[List[str]] = None, + pageserver_id: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -2748,6 +2818,7 @@ class EndpointFactory: lsn=lsn, hot_standby=hot_standby, config_lines=config_lines, + pageserver_id=pageserver_id, ) def stop_all(self) -> "EndpointFactory": @@ -3094,7 +3165,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] 
- r"config|metadata|.+\.(?:toml|pid|json|sql)" + r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)" ) @@ -3355,8 +3426,6 @@ def parse_project_git_version_output(s: str) -> str: The information is generated by utils::project_git_version! """ - import re - res = re.search(r"git(-env)?:([0-9a-fA-F]{8,40})(-\S+)?", s) if res and (commit := res.group(2)): return commit diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 3e75bac424..b46ddf5527 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -150,7 +150,7 @@ class PageserverHttpClient(requests.Session): # (this may change in future if we do fault injection of a kind that causes # requests TCP flows to stick) read=False, - backoff_factor=0, + backoff_factor=0.2, status_forcelist=[503], allowed_methods=None, remove_headers_on_redirect=[], @@ -277,6 +277,23 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params) self.verbose_error(res) + def tenant_location_conf( + self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None + ): + body = location_conf.copy() + body["tenant_id"] = str(tenant_id) + + params = {} + if flush_ms is not None: + params["flush_ms"] = str(flush_ms) + + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", + json=body, + params=params, + ) + self.verbose_error(res) + def tenant_delete(self, tenant_id: TenantId): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 565e5fa7f8..824531bea4 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -9,6 +9,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union import boto3 +import toml from mypy_boto3_s3 
import S3Client from fixtures.log_helper import log @@ -133,7 +134,10 @@ class LocalFsStorage: return json.load(f) def to_toml_inline_table(self) -> str: - return f"local_path='{self.root}'" + rv = { + "local_path": str(self.root), + } + return toml.TomlEncoder().dump_inline_table(rv) def cleanup(self): # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files @@ -174,18 +178,18 @@ class S3Storage: ) def to_toml_inline_table(self) -> str: - s = [ - f"bucket_name='{self.bucket_name}'", - f"bucket_region='{self.bucket_region}'", - ] + rv = { + "bucket_name": self.bucket_name, + "bucket_region": self.bucket_region, + } if self.prefix_in_bucket is not None: - s.append(f"prefix_in_bucket='{self.prefix_in_bucket}'") + rv["prefix_in_bucket"] = self.prefix_in_bucket if self.endpoint is not None: - s.append(f"endpoint='{self.endpoint}'") + rv["endpoint"] = self.endpoint - return ",".join(s) + return toml.TomlEncoder().dump_inline_table(rv) def do_cleanup(self): if not self.cleanup: @@ -384,4 +388,4 @@ def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): raise Exception("invalid remote storage type") - return f"{{{remote_storage.to_toml_inline_table()}}}" + return remote_storage.to_toml_inline_table() diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py new file mode 100644 index 0000000000..241531437c --- /dev/null +++ b/test_runner/fixtures/workload.py @@ -0,0 +1,148 @@ +from typing import Optional + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.types import TenantId, TimelineId + + +class Workload: + """ + This is not a general purpose load generator: it exists for storage tests 
that need to inject some + high level types of storage work via the postgres interface: + - layer writes (`write_rows`) + - work for compaction (`churn_rows`) + - reads, checking we get the right data (`validate`) + """ + + def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + self.env = env + self.tenant_id = tenant_id + self.timeline_id = timeline_id + self.table = "foo" + + self.expect_rows = 0 + self.churn_cursor = 0 + + self._endpoint: Optional[Endpoint] = None + + def endpoint(self, pageserver_id: int) -> Endpoint: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + "main", + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id="ep-workload", + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) + + connstring = self._endpoint.safe_psql( + "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" + ) + log.info(f"Workload.endpoint: connstr={connstring}") + + return self._endpoint + + def __del__(self): + if self._endpoint is not None: + self._endpoint.stop() + + def init(self, pageserver_id: int): + endpoint = self.endpoint(pageserver_id) + + endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);") + endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") + last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + + def write_rows(self, n, pageserver_id): + endpoint = self.endpoint(pageserver_id) + start = self.expect_rows + end = start + n - 1 + self.expect_rows += n + dummy_value = "blah" + endpoint.safe_psql( + f""" + INSERT INTO {self.table} (id, val) + SELECT g, '{dummy_value}' + FROM generate_series({start}, {end}) g + """ + ) + + return last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + + def churn_rows(self, n, pageserver_id, upload=True): + 
assert self.expect_rows >= n + + max_iters = 10 + endpoint = self.endpoint(pageserver_id) + todo = n + i = 0 + while todo > 0: + i += 1 + if i > max_iters: + raise RuntimeError("oops") + start = self.churn_cursor % self.expect_rows + n_iter = min((self.expect_rows - start), todo) + todo -= n_iter + + end = start + n_iter - 1 + + log.info( + f"start,end = {start},{end}, cursor={self.churn_cursor}, expect_rows={self.expect_rows}" + ) + + assert end < self.expect_rows + + self.churn_cursor += n_iter + dummy_value = "blah" + endpoint.safe_psql_many( + [ + f""" + INSERT INTO {self.table} (id, val) + SELECT g, '{dummy_value}' + FROM generate_series({start}, {end}) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + f"VACUUM {self.table}", + ] + ) + + last_flush_lsn = wait_for_last_flush_lsn( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + ps_http = self.env.get_pageserver(pageserver_id).http_client() + wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn) + + if upload: + # force a checkpoint to trigger upload + ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id) + wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + + def validate(self, pageserver_id): + endpoint = self.endpoint(pageserver_id) + result = endpoint.safe_psql_many( + [ + "select clear_buffer_cache()", + f""" + SELECT COUNT(*) FROM {self.table} + """, + ] + ) + + log.info(f"validate({self.expect_rows}): {result}") + assert result == [[("",)], [(self.expect_rows,)]] diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 7487106c44..bd87ff3efd 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -92,8 +92,9 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): def 
test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") - env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") + env.pageserver.allowed_errors.extend( + [".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"] + ) pageserver_token_old = env.auth_keys.generate_pageserver_token() pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) @@ -145,9 +146,9 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") - env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") - + env.pageserver.allowed_errors.extend( + [".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"] + ) pageserver_token_old = env.auth_keys.generate_pageserver_token() pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index a19b2862f8..9879254897 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -14,8 +14,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*invalid branch start lsn.*") - env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*") + env.pageserver.allowed_errors.extend( + [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] + ) # Branch at the point where 
only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index a908dd713a..82ca985d01 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -148,11 +148,11 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" - ) - env.pageserver.allowed_errors.append( - ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading", + ] ) ps_http = env.pageserver.http_client() @@ -247,11 +247,11 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" - ) - env.pageserver.allowed_errors.append( - ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory", + ] ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index f3c6af4427..3f5de100fd 100644 --- 
a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,30 +1,25 @@ -import copy import os import shutil import subprocess import tempfile from pathlib import Path -from typing import Any, List, Optional +from typing import List, Optional import pytest -import toml # TODO: replace with tomllib for Python >= 3.11 -from fixtures.log_helper import log +import toml from fixtures.neon_fixtures import ( - NeonCli, + NeonEnv, NeonEnvBuilder, PgBin, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, ) from fixtures.pg_version import PgVersion -from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn -from pytest import FixtureRequest # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. @@ -37,8 +32,8 @@ from pytest import FixtureRequest # If the breakage is intentional, the test can be xfaild with setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true. # # The file contains a couple of helper functions: -# - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files). # - check_neon_works performs the test itself, feel free to add more checks there. +# - dump_differs compares two SQL dumps and writes the diff to a file. 
# # # How to run `test_backward_compatibility` locally: @@ -46,6 +41,7 @@ from pytest import FixtureRequest # export DEFAULT_PG_VERSION=15 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # # # Build previous version of binaries and create a data snapshot: # rm -rf pg_install target @@ -59,8 +55,7 @@ from pytest import FixtureRequest # CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` # # # Run backward compatibility test -# COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} \ -# ./scripts/pytest -k test_backward_compatibility +# ./scripts/pytest -k test_backward_compatibility # # # How to run `test_forward_compatibility` locally: @@ -68,6 +63,8 @@ from pytest import FixtureRequest # export DEFAULT_PG_VERSION=15 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true +# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} +# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install # # # Build previous version of binaries and store them somewhere: # rm -rf pg_install target @@ -84,9 +81,7 @@ from pytest import FixtureRequest # ./scripts/pytest -k test_create_snapshot # # # Run forward compatibility test -# COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} \ -# COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install \ -# ./scripts/pytest -k test_forward_compatibility +# ./scripts/pytest -k test_forward_compatibility # check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( @@ -155,13 +150,9 @@ def test_create_snapshot( @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_backward_compatibility( - pg_bin: PgBin, - port_distributor: PortDistributor, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - neon_binpath: Path, - pg_distrib_dir: Path, pg_version: PgVersion, - request: FixtureRequest, 
): """ Test that the new binaries can read old data @@ -177,23 +168,15 @@ def test_backward_compatibility( ) try: - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, - ) + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") + neon_env_builder.start() check_neon_works( - test_output_dir / "compatibility_snapshot" / "repo", - neon_binpath, - neon_binpath, - pg_distrib_dir, - pg_version, - port_distributor, - test_output_dir, - pg_bin, - request, + env, + test_output_dir=test_output_dir, + sql_dump_path=compatibility_snapshot_dir / "dump.sql", + repo_dir=env.repo_dir, ) except Exception: if breaking_changes_allowed: @@ -212,12 +195,10 @@ def test_backward_compatibility( @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_forward_compatibility( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, top_output_dir: Path, - port_distributor: PortDistributor, pg_version: PgVersion, - request: FixtureRequest, - neon_binpath: Path, ): """ Test that the old binaries can read new data @@ -244,24 +225,19 @@ def test_forward_compatibility( ) try: - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir( + compatibility_snapshot_dir / "repo", + neon_binpath=compatibility_neon_bin, pg_distrib_dir=compatibility_postgres_distrib_dir, ) + neon_env_builder.start() check_neon_works( - test_output_dir / "compatibility_snapshot" / "repo", - compatibility_neon_bin, - neon_binpath, - compatibility_postgres_distrib_dir, - pg_version, - port_distributor, - test_output_dir, - 
PgBin(test_output_dir, compatibility_postgres_distrib_dir, pg_version), - request, + env, + test_output_dir=test_output_dir, + sql_dump_path=compatibility_snapshot_dir / "dump.sql", + repo_dir=env.repo_dir, ) except Exception: if breaking_changes_allowed: @@ -276,189 +252,26 @@ def test_forward_compatibility( ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -def prepare_snapshot( - from_dir: Path, - to_dir: Path, - port_distributor: PortDistributor, - pg_distrib_dir: Optional[Path] = None, -): - assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" - assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory" - assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql" +def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): + ep = env.endpoints.create_start("main") + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) - log.info(f"Copying snapshot from {from_dir} to {to_dir}") - shutil.copytree(from_dir, to_dir) - - repo_dir = to_dir / "repo" - - snapshot_config_toml = repo_dir / "config" - snapshot_config = toml.load(snapshot_config_toml) - - # Remove old logs to avoid confusion in test artifacts - for logfile in repo_dir.glob("**/*.log"): - logfile.unlink() - - # Remove old computes in 'endpoints'. Old versions of the control plane used a directory - # called "pgdatadirs". Delete it, too. 
- if (repo_dir / "endpoints").exists(): - shutil.rmtree(repo_dir / "endpoints") - if (repo_dir / "pgdatadirs").exists(): - shutil.rmtree(repo_dir / "pgdatadirs") - os.mkdir(repo_dir / "endpoints") - - # Update paths and ports in config files - legacy_pageserver_toml = repo_dir / "pageserver.toml" - legacy_bundle = os.path.exists(legacy_pageserver_toml) - - path_to_config: dict[Path, dict[Any, Any]] = {} - if legacy_bundle: - os.mkdir(repo_dir / "pageserver_1") - path_to_config[repo_dir / "pageserver_1" / "pageserver.toml"] = toml.load( - legacy_pageserver_toml - ) - os.remove(legacy_pageserver_toml) - os.rename(repo_dir / "tenants", repo_dir / "pageserver_1" / "tenants") - else: - for ps_conf in snapshot_config["pageservers"]: - config_path = repo_dir / f"pageserver_{ps_conf['id']}" / "pageserver.toml" - path_to_config[config_path] = toml.load(config_path) - - # For each pageserver config, edit it and rewrite - for config_path, pageserver_config in path_to_config.items(): - pageserver_config["remote_storage"]["local_path"] = str( - LocalFsStorage.component_path(repo_dir, RemoteStorageUser.PAGESERVER) - ) - - for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"): - pageserver_config[param] = port_distributor.replace_with_new_port( - pageserver_config[param] - ) - - # We don't use authentication in compatibility tests - # so just remove authentication related settings. - pageserver_config.pop("pg_auth_type", None) - pageserver_config.pop("http_auth_type", None) - - if pg_distrib_dir: - pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) - - with config_path.open("w") as f: - toml.dump(pageserver_config, f) - - # neon_local config doesn't have to be backward compatible. If we're using a dump from before - # it supported multiple pageservers, fix it up. 
- if "pageservers" not in snapshot_config: - snapshot_config["pageservers"] = [snapshot_config["pageserver"]] - del snapshot_config["pageserver"] - - for param in ("listen_http_addr", "listen_pg_addr"): - for pageserver in snapshot_config["pageservers"]: - pageserver[param] = port_distributor.replace_with_new_port(pageserver[param]) - snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port( - snapshot_config["broker"]["listen_addr"] - ) - for sk in snapshot_config["safekeepers"]: - for param in ("http_port", "pg_port", "pg_tenant_only_port"): - sk[param] = port_distributor.replace_with_new_port(sk[param]) - - if pg_distrib_dir: - snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir) - - with snapshot_config_toml.open("w") as f: - toml.dump(snapshot_config, f) - - # Ensure that snapshot doesn't contain references to the original path - rv = subprocess.run( - [ - "grep", - "--recursive", - "--binary-file=without-match", - "--files-with-matches", - "test_create_snapshot/repo", - str(repo_dir), - ], - capture_output=True, - text=True, - ) - assert ( - rv.returncode != 0 - ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" - - -def check_neon_works( - repo_dir: Path, - neon_target_binpath: Path, - neon_current_binpath: Path, - pg_distrib_dir: Path, - pg_version: PgVersion, - port_distributor: PortDistributor, - test_output_dir: Path, - pg_bin: PgBin, - request: FixtureRequest, -): - snapshot_config_toml = repo_dir / "config" - snapshot_config = toml.load(snapshot_config_toml) - snapshot_config["neon_distrib_dir"] = str(neon_target_binpath) - snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir) - with (snapshot_config_toml).open("w") as f: - toml.dump(snapshot_config, f) - - # TODO: replace with NeonEnvBuilder / NeonEnv - config: Any = type("NeonEnvStub", (object,), {}) - config.rust_log_override = None - config.repo_dir = repo_dir - config.pg_version = pg_version - 
config.initial_tenant = snapshot_config["default_tenant_id"] - config.pg_distrib_dir = pg_distrib_dir - config.remote_storage = None - config.sk_remote_storage = None - - # Use the "target" binaries to launch the storage nodes - config_target = config - config_target.neon_binpath = neon_target_binpath - # We are using maybe-old binaries for neon services, but want to use current - # binaries for test utilities like neon_local - config_target.neon_local_binpath = neon_current_binpath - cli_target = NeonCli(config_target) - - # And the current binaries to launch computes - snapshot_config["neon_distrib_dir"] = str(neon_current_binpath) - with (snapshot_config_toml).open("w") as f: - toml.dump(snapshot_config, f) - config_current = copy.copy(config) - config_current.neon_binpath = neon_current_binpath - cli_current = NeonCli(config_current) - - cli_target.raw_cli(["start"]) - request.addfinalizer(lambda: cli_target.raw_cli(["stop"])) - - pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() - cli_current.endpoint_create( - branch_name="main", pg_port=pg_port, http_port=http_port, endpoint_id="ep-main" - ) - cli_current.endpoint_start("ep-main") - request.addfinalizer(lambda: cli_current.endpoint_stop("ep-main")) - - connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" + connstr = ep.connstr() pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"] ) initial_dump_differs = dump_differs( - repo_dir.parent / "dump.sql", + sql_dump_path, test_output_dir / "dump.sql", test_output_dir / "dump.filediff", ) # Check that project can be recovered from WAL # loosely based on https://www.notion.so/neondatabase/Storage-Recovery-from-WAL-d92c0aac0ebf40df892b938045d7d720 - tenant_id = snapshot_config["default_tenant_id"] - timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] - pageserver_port = snapshot_config["pageservers"][0]["listen_http_addr"].split(":")[-1] - 
pageserver_http = PageserverHttpClient( - port=pageserver_port, - is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled - ) + pageserver_http = env.pageserver.http_client() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + pg_version = env.pg_version shutil.rmtree(repo_dir / "local_fs_remote_storage") timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id) @@ -494,6 +307,11 @@ def dump_differs( Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False). """ + if not first.exists(): + raise FileNotFoundError(f"{first} doesn't exist") + if not second.exists(): + raise FileNotFoundError(f"{second} doesn't exist") + with output.open("w") as stdout: res = subprocess.run( [ diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py old mode 100755 new mode 100644 diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 920e8d0b72..faedf5d944 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -99,12 +99,13 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ] ) - # FIXME: we should clean up pageserver to not print this - env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*") - - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: we should clean up pageserver to not print this + ".*exited with error: unexpected message type: CopyData.*", + # FIXME: Is this expected? 
+ ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + ] ) def import_tar(base, wal): diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index d2d8d71e3f..51e358e60d 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -236,3 +236,30 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): assert vanilla_pg.safe_psql( "select sum(somedata) from replication_example" ) == endpoint.safe_psql("select sum(somedata) from replication_example") + + +# +# Check that slots are not inherited in branch +# +def test_slots_and_branching(neon_simple_env: NeonEnv): + env = neon_simple_env + + tenant, timeline = env.neon_cli.create_tenant() + env.pageserver.http_client() + + main_branch = env.endpoints.create_start("main", tenant_id=tenant) + main_cur = main_branch.connect().cursor() + + # Create a logical replication slot on the main branch + main_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + wait_for_last_flush_lsn(env, main_branch, tenant, timeline) + + # Create branch ws.
+ env.neon_cli.create_branch("ws", "main", tenant_id=tenant) + ws_branch = env.endpoints.create_start("ws", tenant_id=tenant) + log.info("postgres is running on 'ws' branch") + + # Check that we can create slot with the same name + ws_cur = ws_branch.connect().cursor() + ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 66cc286aba..4488be31c5 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -23,14 +23,20 @@ from fixtures.neon_fixtures import ( PgBin, S3Scrubber, last_flush_lsn_upload, - wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import list_prefix +from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import ( + assert_tenant_state, + list_prefix, + wait_for_last_record_lsn, + wait_for_upload, +) from fixtures.remote_storage import ( RemoteStorageKind, ) from fixtures.types import TenantId, TimelineId from fixtures.utils import print_gc_result, wait_until +from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -93,7 +99,10 @@ def generate_uploads_and_deletions( ) assert tenant_id is not None assert timeline_id is not None - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + # We are waiting for uploads as well as local flush, in order to avoid leaving the system + # in a state where there are "future layers" in remote storage that will generate deletions + # after a restart. 
+ last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers @@ -560,3 +569,91 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): read_all(env, tenant_id, timeline_id) evict_all_layers(env, tenant_id, timeline_id) read_all(env, tenant_id, timeline_id) + + +def test_multi_attach( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + neon_env_builder.enable_generations = True + neon_env_builder.num_pageservers = 3 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + pageservers = env.pageservers + http_clients = list([p.http_client() for p in pageservers]) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # We will intentionally create situations where stale deletions happen from non-latest-generation + # nodes when the tenant is multiply-attached + for ps in env.pageservers: + ps.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) + + # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) + wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) + _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + http_clients[1].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + http_clients[2].timeline_detail(tenant_id, timeline_id) + + workload = Workload(env, tenant_id, timeline_id) + workload.init(pageservers[0].id) + workload.write_rows(1000, pageservers[0].id) + + # Attach the tenant to the other two pageservers + pageservers[1].tenant_attach(env.initial_tenant) + pageservers[2].tenant_attach(env.initial_tenant) + + wait_until(10, 0.2, lambda: 
assert_tenant_state(http_clients[1], tenant_id, "Active")) + wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) + + # Now they all have it attached + _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients]) + _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) + _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + + # The endpoint can use any pageserver to service its reads + for pageserver in pageservers: + workload.validate(pageserver.id) + + # If we write some more data, all the nodes can see it, including stale ones + wrote_lsn = workload.write_rows(1000, pageservers[0].id) + for ps_http in http_clients: + wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, wrote_lsn) + + # ...and indeed endpoints can see it via any of the pageservers + for pageserver in pageservers: + workload.validate(pageserver.id) + + # Prompt all the pageservers, including stale ones, to upload ingested layers to remote storage + for ps_http in http_clients: + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, wrote_lsn) + + # Now, the contents of remote storage will be a set of layers from each pageserver, but with unique + # generation numbers + # TODO: validate remote storage contents + + # Stop all pageservers + for ps in pageservers: + ps.stop() + + # Returning to a normal healthy state: all pageservers will start, but only the one most + # recently attached via the control plane will re-attach on startup + for ps in pageservers: + ps.start() + + with pytest.raises(PageserverApiException): + _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) + _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + + # All data we wrote while multi-attached remains readable + workload.validate(pageservers[2].id) diff 
--git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index b76dbbee03..042961baa5 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -64,13 +64,13 @@ def test_metric_collection( # spin up neon, after http server is ready env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) # httpserver is shut down before pageserver during passing run - env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") - # we have a fast rate of calculation, these can happen at shutdown - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*" - ) - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + ".*metrics endpoint refused the sent metrics*", + # we have a fast rate of calculation, these can happen at shutdown + ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", + ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ] ) tenant_id = env.initial_tenant @@ -212,13 +212,13 @@ def test_metric_collection_cleans_up_tempfile( pageserver_http = env.pageserver.http_client() # httpserver is shut down before pageserver during passing run - env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") - # we have a fast rate of calculation, these can happen at shutdown - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*" - ) - 
env.pageserver.allowed_errors.append( - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + ".*metrics endpoint refused the sent metrics*", + # we have a fast rate of calculation, these can happen at shutdown + ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", + ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ] ) tenant_id = env.initial_tenant diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py new file mode 100644 index 0000000000..b14b7f1328 --- /dev/null +++ b/test_runner/regress/test_pageserver_secondary.py @@ -0,0 +1,332 @@ +import random +from typing import Any, Dict, Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.remote_storage import RemoteStorageKind +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until +from fixtures.workload import Workload + +# A tenant configuration that is convenient for generating uploads and deletions +# without a large amount of postgres traffic. +TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_target_size": f"{128 * 1024}", + "compaction_threshold": "1", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", +} + + +def evict_random_layers( + rng: random.Random, pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId +): + """ + Evict 50% of the layers on a pageserver + """ + timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) + initial_local_layers = sorted( + list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + ) + client = pageserver.http_client() + for layer in initial_local_layers: + if "ephemeral" in layer.name or "temp_download" in layer.name: + continue + + if rng.choice([True, False]): + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") + client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + + +@pytest.mark.parametrize("seed", [1, 2, 3]) +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): + """ + Issue many location configuration changes, ensure that tenants + remain readable & we don't get any unexpected errors. We should + have no ERROR in the log, and no 500s in the API. + + The location_config API is intentionally designed so that all destination + states are valid, so that we may test it in this way: the API should always + work as long as the tenant exists. 
+ """ + neon_env_builder.enable_generations = True + neon_env_builder.num_pageservers = 3 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + pageservers = env.pageservers + list([p.http_client() for p in pageservers]) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # We will make no effort to avoid stale attachments + for ps in env.pageservers: + ps.allowed_errors.extend( + [ + ".*Dropped remote consistent LSN updates.*", + ".*Dropping stale deletions.*", + # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found + ".*query handler.*Tenant.*not found.*", + # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active + ".*query handler.*Tenant.*not active.*", + ] + ) + + # these can happen, if we shutdown at a good time. to be fixed as part of #5172. + message = ".*duplicated L1 layer layer=.*" + ps.allowed_errors.append(message) + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, env.pageservers[0].id) + + # We use a fixed seed to make the test reproducible: we want a randomly + # chosen order, but not to change the order every time we run the test. 
+ rng = random.Random(seed) + + initial_generation = 1 + last_state = { + env.pageservers[0].id: ("AttachedSingle", initial_generation), + env.pageservers[1].id: ("Detached", None), + env.pageservers[2].id: ("Detached", None), + } + + latest_attached = env.pageservers[0].id + + for _i in range(0, 64): + # Pick a pageserver + pageserver = rng.choice(env.pageservers) + + # Pick a pseudorandom state + modes = [ + "AttachedSingle", + "AttachedMulti", + "AttachedStale", + "Secondary", + "Detached", + "_Evictions", + "_Restart", + ] + + mode = rng.choice(modes) + + last_state_ps = last_state[pageserver.id] + if mode == "_Evictions": + if last_state_ps[0].startswith("Attached"): + log.info(f"Action: evictions on pageserver {pageserver.id}") + evict_random_layers(rng, pageserver, tenant_id, timeline_id) + else: + log.info( + f"Action: skipping evictions on pageserver {pageserver.id}, is not attached" + ) + elif mode == "_Restart": + log.info(f"Action: restarting pageserver {pageserver.id}") + pageserver.stop() + pageserver.start() + if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id: + log.info("Entering postgres...") + workload.churn_rows(rng.randint(128, 256), pageserver.id) + workload.validate(pageserver.id) + elif last_state_ps[0].startswith("Attached"): + # The `attachment_service` will only re-attach on startup when a pageserver was the + # holder of the latest generation: otherwise the pageserver will revert to detached + # state if it was running attached with a stale generation + last_state[pageserver.id] = ("Detached", None) + else: + secondary_conf: Optional[Dict[str, Any]] = None + if mode == "Secondary": + secondary_conf = {"warm": rng.choice([True, False])} + + location_conf: Dict[str, Any] = { + "mode": mode, + "secondary_conf": secondary_conf, + "tenant_conf": {}, + } + + log.info(f"Action: Configuring pageserver {pageserver.id} to {location_conf}") + + # Select a generation number + if mode.startswith("Attached"): + if 
last_state_ps[1] is not None: + if rng.choice([True, False]): + # Move between attached states, staying in the same generation + generation = last_state_ps[1] + else: + # Switch generations, while also jumping between attached states + generation = env.attachment_service.attach_hook_issue( + tenant_id, pageserver.id + ) + latest_attached = pageserver.id + else: + generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id) + latest_attached = pageserver.id + else: + generation = None + + location_conf["generation"] = generation + + pageserver.tenant_location_configure(tenant_id, location_conf) + last_state[pageserver.id] = (mode, generation) + + if mode.startswith("Attached"): + # This is a basic test: we are validating that the endpoint works properly _between_ + # configuration changes. A stronger test would be to validate that clients see + # no errors while we are making the changes. + workload.churn_rows( + rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale" + ) + workload.validate(pageserver.id) + + # Attach all pageservers + for ps in env.pageservers: + location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}} + ps.tenant_location_configure(tenant_id, location_conf) + + # Confirm that all are readable + for ps in env.pageservers: + workload.validate(ps.id) + + # Detach all pageservers + for ps in env.pageservers: + location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}} + ps.tenant_location_configure(tenant_id, location_conf) + + # Confirm that all local disk state was removed on detach + # TODO + + +def test_live_migration(neon_env_builder: NeonEnvBuilder): + """ + Test the sequence of location states that are used in a live migration.
+ """ + neon_env_builder.enable_generations = True + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver_a = env.pageservers[0] + pageserver_b = env.pageservers[1] + + initial_generation = 1 + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, env.pageservers[0].id) + + # Make the destination a secondary location + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + + workload.churn_rows(64, pageserver_a.id, upload=False) + + # Set origin attachment to stale + log.info("Setting origin to AttachedStale") + pageserver_a.tenant_location_configure( + tenant_id, + { + "mode": "AttachedStale", + "secondary_conf": None, + "tenant_conf": {}, + "generation": initial_generation, + }, + flush_ms=5000, + ) + + migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) + log.info(f"Acquired generation {migrated_generation} for destination pageserver") + assert migrated_generation == initial_generation + 1 + + # Writes and reads still work in AttachedStale. 
+ workload.validate(pageserver_a.id) + + # TODO: call into secondary mode API hooks to do an upload/download sync + + # Generate some more dirty writes: we expect the origin to ingest WAL + # in AttachedStale + workload.churn_rows(64, pageserver_a.id, upload=False) + workload.validate(pageserver_a.id) + + # Attach the destination + log.info("Setting destination to AttachedMulti") + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "AttachedMulti", + "secondary_conf": None, + "tenant_conf": {}, + "generation": migrated_generation, + }, + ) + + # Wait for destination LSN to catch up with origin + origin_lsn = pageserver_a.http_client().timeline_detail(tenant_id, timeline_id)[ + "last_record_lsn" + ] + + def caught_up(): + destination_lsn = pageserver_b.http_client().timeline_detail(tenant_id, timeline_id)[ + "last_record_lsn" + ] + log.info( + f"Waiting for LSN to catch up: origin {origin_lsn} vs destination {destination_lsn}" + ) + assert destination_lsn >= origin_lsn + + wait_until(100, 0.1, caught_up) + + # The destination should accept writes + workload.churn_rows(64, pageserver_b.id) + + # Dual attached: both are readable.
+ workload.validate(pageserver_a.id) + workload.validate(pageserver_b.id) + + # Revert the origin to secondary + log.info("Setting origin to Secondary") + pageserver_a.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + + workload.churn_rows(64, pageserver_b.id) + + # Put the destination into final state + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": migrated_generation, + }, + ) + + workload.churn_rows(64, pageserver_b.id) + workload.validate(pageserver_b.id) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0a5046e219..3004d69f50 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -73,19 +73,20 @@ def test_remote_storage_backup_and_restore( ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: Is this expected? + ".*marking .* as locally complete, while it doesnt exist in remote index.*", + ".*No timelines to attach received.*", + ".*Failed to get local tenant state.*", + # FIXME retry downloads without throwing errors + ".*failed to load remote timeline.*", + # we have a bunch of pytest.raises for these below + ".*tenant .*? 
already exists, state:.*", + ".*tenant directory already exists.*", + ".*simulated failure of remote operation.*", + ] ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - - env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*") - # FIXME retry downloads without throwing errors - env.pageserver.allowed_errors.append(".*failed to load remote timeline.*") - # we have a bunch of pytest.raises for these below - env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") - env.pageserver.allowed_errors.append(".*tenant directory already exists.*") - env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*") pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 48f5682371..fece876459 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -395,13 +395,13 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE env.start() pageserver_http = env.pageserver.http_client() - # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero - env.pageserver.allowed_errors.append( - ".*Timeline got dropped without initializing, cleaning its files" - ) - # the response hit_pausable_failpoint_and_later_fail - env.pageserver.allowed_errors.append( - f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn" + env.pageserver.allowed_errors.extend( + [ + # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero + ".*Timeline got dropped without initializing, cleaning its files", + # the response hit_pausable_failpoint_and_later_fail + f".*Error processing HTTP request: InternalServerError\\(new timeline 
{env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn", + ] ) env.pageserver.tenant_create(env.initial_tenant) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 5b63bd6161..0dcbb23ad4 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -307,10 +307,14 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) - # the error will be printed to the log too - env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") - # Timelines get stopped during detach, ignore the gc calls that error, witnessing that - env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") + env.pageserver.allowed_errors.extend( + [ + # the error will be printed to the log too + ".*gc target timeline does not exist.*", + # Timelines get stopped during detach, ignore the gc calls that error, witnessing that + ".*InternalServerError\\(timeline is Stopping.*", + ] + ) # Detach while running manual GC. # It should wait for manual GC to finish because it runs in a task associated with the tenant. diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index feacdcc802..dcd7232b1b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -216,16 +216,17 @@ def test_tenant_relocation( tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209") - # FIXME: Is this expected? - env.pageservers[0].allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageservers[0].allowed_errors.extend( + [ + # FIXME: Is this expected? 
+ ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + # Needed for detach polling on the original pageserver + f".*NotFound: tenant {tenant_id}.*", + # We will dual-attach in this test, so stale generations are expected + ".*Dropped remote consistent LSN updates.*", + ] ) - # Needed for detach polling on the original pageserver - env.pageservers[0].allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") - # We will dual-attach in this test, so stale generations are expected - env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates.*") - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) # we use two branches to check that they are both relocated diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index b7b4e2be0b..07fb6dc5ca 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -117,10 +117,12 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - # FIXME: Are these expected? - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - env.pageserver.allowed_errors.append( - ".*marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: Are these expected? + ".*No timelines to attach received.*", + ".*marking .* as locally complete, while it doesnt exist in remote index.*", + ] ) pageserver_http = env.pageserver.http_client() @@ -218,13 +220,14 @@ def test_tenant_redownloads_truncated_file_on_startup( assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - env.pageserver.allowed_errors.append(".*removing local file .* because .*") - - # FIXME: Are these expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + ".*removing local file .* because .*", + # FIXME: Are these expected? + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + ".*No timelines to attach received.*", + ] ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 27d5cce5f2..5f72cfd747 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -36,12 +36,13 @@ def test_threshold_based_eviction( ".*metrics_collection:.* upload consumption_metrics (still failed|failed, will retry).*" ) env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(metrics_refused_log_line) - - # these can happen whenever we run consumption metrics collection - env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled") - env.pageserver.allowed_errors.append( - r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + metrics_refused_log_line, + # these can happen whenever we run consumption metrics collection + r".*failed to calculate logical size at \S+: cancelled", + r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes", + ] ) tenant_id, timeline_id = env.initial_tenant, env.initial_timeline diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 17113a6bc5..c6d578a7a2 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py 
@@ -39,10 +39,14 @@ from urllib3.util.retry import Retry def test_timeline_delete(neon_simple_env: NeonEnv): env = neon_simple_env - env.pageserver.allowed_errors.append(".*Timeline .* was not found.*") - env.pageserver.allowed_errors.append(".*timeline not found.*") - env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*") - env.pageserver.allowed_errors.append(".*Precondition failed: Requested tenant is missing.*") + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found.*", + ".*timeline not found.*", + ".*Cannot delete timeline which has child timelines.*", + ".*Precondition failed: Requested tenant is missing.*", + ] + ) ps_http = env.pageserver.http_client() @@ -198,22 +202,22 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ), ) - env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") - # It appears when we stopped flush loop during deletion and then pageserver is stopped - env.pageserver.allowed_errors.append( - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + env.pageserver.allowed_errors.extend( + [ + f".*{timeline_id}.*failpoint: {failpoint}", + # It appears when we stopped flush loop during deletion and then pageserver is stopped + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + # This happens when we fail before scheduling background operation. + # Timeline is left in stopping state and retry tries to stop it again. 
+ ".*Ignoring new state, equal to the existing one: Stopping", + # This happens when we retry delete requests for broken timelines + ".*Ignoring state update Stopping for broken timeline", + # This happens when timeline remains are cleaned up during loading + ".*Timeline dir entry become invalid.*", + # In one of the branches we poll for tenant to become active. Polls can generate this log message: + f".*Tenant {env.initial_tenant} is not active*", + ] ) - # This happens when we fail before scheduling background operation. - # Timeline is left in stopping state and retry tries to stop it again. - env.pageserver.allowed_errors.append( - ".*Ignoring new state, equal to the existing one: Stopping" - ) - # This happens when we retry delete requests for broken timelines - env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") - # This happens when timeline remains are cleaned up during loading - env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*") - # In one of the branches we poll for tenant to become active. 
Polls can generate this log message: - env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*") ps_http.configure_failpoints((failpoint, "return")) @@ -398,13 +402,13 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm") - env.pageserver.allowed_errors.append( - ".*Ignoring new state, equal to the existing one: Stopping" - ) - # this happens, because the stuck timeline is visible to shutdown - env.pageserver.allowed_errors.append( - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + env.pageserver.allowed_errors.extend( + [ + ".*failpoint: timeline-delete-before-rm", + ".*Ignoring new state, equal to the existing one: Stopping", + # this happens, because the stuck timeline is visible to shutdown + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ] ) ps_http = env.pageserver.http_client() @@ -551,10 +555,12 @@ def test_concurrent_timeline_delete_stuck_on( with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) assert second_call_err.value.status_code == 409 - env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*") - # the second call will try to transition the timeline into Stopping state as well - env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" + env.pageserver.allowed_errors.extend( + [ + f".*{child_timeline_id}.*{error_msg_re}.*", + # the second call will try to transition the timeline into Stopping state as well + f".*{child_timeline_id}.*Ignoring new 
state, equal to the existing one: Stopping", + ] ) log.info("second call failed as expected")