diff --git a/Cargo.lock b/Cargo.lock index dead212156..c23162971e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.5" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" +checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" dependencies = [ "cfg-if", "const-random", @@ -1389,9 +1389,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" +checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" dependencies = [ "rustc_version", ] diff --git a/README.md b/README.md index 95926b4628..c44ae695d6 100644 --- a/README.md +++ b/README.md @@ -230,6 +230,8 @@ postgres=# select * from t; > cargo neon stop ``` +More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md). + #### Handling build failures If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again. diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 142bb14fe5..a82b999cfb 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -18,8 +18,6 @@ use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; use postgres::{Client, NoTls}; -use tokio; -use tokio_postgres; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 2cec12119f..ef1db73982 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::{self, Result}; +use anyhow::Result; use anyhow::{bail, Context}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index f076951239..128783b477 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -13,8 +13,6 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use num_cpus; -use serde_json; use tokio::task; use tracing::{error, info, warn}; use tracing_utils::http::OtelName; diff --git a/control_plane/README.md b/control_plane/README.md new file mode 100644 index 0000000000..827aba5c1f --- /dev/null +++ b/control_plane/README.md @@ -0,0 +1,26 @@ +# Control Plane and Neon Local + +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. + +## Example: Start with Postgres 16 + +To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands. + +```shell +cargo neon init --pg-version 16 +cargo neon start +cargo neon tenant create --set-default --pg-version 16 +cargo neon endpoint create main --pg-version 16 +cargo neon endpoint start main +``` + +## Example: Create Test User and Database + +By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint. + +```shell +cargo neon endpoint create main --pg-version 16 --update-catalog true +cargo neon endpoint start main --create-test-user true +``` + +The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command. diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql new file mode 100644 index 0000000000..503231f69d --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE tenant_shards ALTER generation SET NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL; diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql new file mode 100644 index 0000000000..7e1e3cfe90 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql @@ -0,0 +1,4 @@ + + +ALTER TABLE tenant_shards ALTER generation DROP NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL; \ No newline at end of file diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index f1153c2c18..384bdcef0c 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,9 +1,10 @@ use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::PlacementPolicy; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ - TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; @@ -117,9 +118,14 @@ async fn handle_tenant_create( check_permissions(&req, Scope::PageServerApi)?; let create_req = json_request::(&mut req).await?; + + // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they + // have no expectation of HA). + let placement_policy = PlacementPolicy::Single; + json_response( StatusCode::CREATED, - service.tenant_create(create_req).await?, + service.tenant_create(create_req, placement_policy).await?, ) } @@ -185,6 +191,27 @@ async fn handle_tenant_location_config( ) } +async fn handle_tenant_config_set( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + + let config_req = json_request::(&mut req).await?; + + json_response(StatusCode::OK, service.tenant_config_set(config_req).await?) +} + +async fn handle_tenant_config_get( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?) +} + async fn handle_tenant_time_travel_remote_storage( service: Arc, mut req: Request, @@ -216,7 +243,15 @@ async fn handle_tenant_time_travel_remote_storage( done_if_after_raw, ) .await?; + json_response(StatusCode::OK, ()) +} +async fn handle_tenant_secondary_download( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + service.tenant_secondary_download(tenant_id).await?; json_response(StatusCode::OK, ()) } @@ -551,12 +586,21 @@ pub fn make_router( .delete("/v1/tenant/:tenant_id", |r| { tenant_service_handler(r, handle_tenant_delete) }) + .put("/v1/tenant/config", |r| { + tenant_service_handler(r, handle_tenant_config_set) + }) + .get("/v1/tenant/:tenant_id/config", |r| { + tenant_service_handler(r, handle_tenant_config_get) + }) .put("/v1/tenant/:tenant_id/location_config", |r| { tenant_service_handler(r, handle_tenant_location_config) }) .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { tenant_service_handler(r, handle_tenant_time_travel_remote_storage) }) + .post("/v1/tenant/:tenant_id/secondary/download", |r| { + tenant_service_handler(r, handle_tenant_secondary_download) + }) // Timeline operations .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { tenant_service_handler(r, handle_tenant_timeline_delete) diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index ce613e858f..7ae7e264c7 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -13,14 +13,20 @@ mod schema; pub mod service; mod tenant_state; -#[derive(Clone, Serialize, Deserialize, Debug)] +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] enum PlacementPolicy { /// Cheapest way to attach a tenant: just one pageserver, no secondary Single, /// Production-ready way to attach a tenant: one attached pageserver and /// some number of secondaries. Double(usize), - /// Do not attach to any pageservers + /// Create one secondary mode locations. This is useful when onboarding + /// a tenant, or for an idle tenant that we might want to bring online quickly. + Secondary, + + /// Do not attach to any pageservers. This is appropriate for tenants that + /// have been idle for a long time, where we do not mind some delay in making + /// them available in future. Detached, } diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index db4f00644f..d9acbc0abd 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -9,7 +9,7 @@ use attachment_service::http::make_router; use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; use attachment_service::service::{Config, Service}; -use aws_config::{self, BehaviorVersion, Region}; +use aws_config::{BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; @@ -79,13 +79,38 @@ impl Secrets { "neon-storage-controller-control-plane-jwt-token"; const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; + const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; + const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; + const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; + + /// Load secrets from, in order of preference: + /// - CLI args if database URL is provided on the CLI + /// - Environment variables if DATABASE_URL is set. + /// - AWS Secrets Manager secrets async fn load(args: &Cli) -> anyhow::Result { match &args.database_url { Some(url) => Self::load_cli(url, args), - None => Self::load_aws_sm().await, + None => match std::env::var(Self::DATABASE_URL_ENV) { + Ok(database_url) => Self::load_env(database_url), + Err(_) => Self::load_aws_sm().await, + }, } } + fn load_env(database_url: String) -> anyhow::Result { + let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) { + Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?), + Err(_) => None, + }; + Ok(Self { + database_url, + public_key, + jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(), + control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(), + }) + } + async fn load_aws_sm() -> anyhow::Result { let Ok(region) = std::env::var("AWS_REGION") else { anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets"); diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 1b98cc7655..d5c304385c 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -7,8 +7,10 @@ use self::split_state::SplitState; use camino::Utf8Path; use camino::Utf8PathBuf; use diesel::pg::PgConnection; -use diesel::prelude::*; -use diesel::Connection; +use diesel::{ + Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl, + Selectable, SelectableHelper, +}; use pageserver_api::controller_api::NodeSchedulingPolicy; use pageserver_api::models::TenantConfig; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; @@ -331,7 +333,15 @@ impl Persistence { shard_number: ShardNumber(tsp.shard_number as u8), shard_count: ShardCount::new(tsp.shard_count as u8), }; - result.insert(tenant_shard_id, Generation::new(tsp.generation as u32)); + + let Some(g) = tsp.generation else { + // If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL: + // we only set generation_pageserver when setting generation. + return Err(DatabaseError::Logical( + "Generation should always be set after incrementing".to_string(), + )); + }; + result.insert(tenant_shard_id, Generation::new(g as u32)); } Ok(result) @@ -364,7 +374,85 @@ impl Persistence { }) .await?; - Ok(Generation::new(updated.generation as u32)) + // Generation is always non-null in the rseult: if the generation column had been NULL, then we + // should have experienced an SQL Confilict error while executing a query that tries to increment it. + debug_assert!(updated.generation.is_some()); + let Some(g) = updated.generation else { + return Err(DatabaseError::Logical( + "Generation should always be set after incrementing".to_string(), + ) + .into()); + }; + + Ok(Generation::new(g as u32)) + } + + /// For use when updating a persistent property of a tenant, such as its config or placement_policy. + /// + /// Do not use this for settting generation, unless in the special onboarding code path (/location_config) + /// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing + /// that we only do the first time a tenant is set to an attached policy via /location_config. + pub(crate) async fn update_tenant_shard( + &self, + tenant_shard_id: TenantShardId, + input_placement_policy: PlacementPolicy, + input_config: TenantConfig, + input_generation: Option, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + + self.with_conn(move |conn| { + let query = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)); + + if let Some(input_generation) = input_generation { + // Update includes generation column + query + .set(( + generation.eq(Some(input_generation.into().unwrap() as i32)), + placement_policy + .eq(serde_json::to_string(&input_placement_policy).unwrap()), + config.eq(serde_json::to_string(&input_config).unwrap()), + )) + .execute(conn)?; + } else { + // Update does not include generation column + query + .set(( + placement_policy + .eq(serde_json::to_string(&input_placement_policy).unwrap()), + config.eq(serde_json::to_string(&input_config).unwrap()), + )) + .execute(conn)?; + } + + Ok(()) + }) + .await?; + + Ok(()) + } + + pub(crate) async fn update_tenant_config( + &self, + input_tenant_id: TenantId, + input_config: TenantConfig, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + + self.with_conn(move |conn| { + diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) + .execute(conn)?; + + Ok(()) + }) + .await?; + + Ok(()) } pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { @@ -375,7 +463,7 @@ impl Persistence { .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) .set(( - generation_pageserver.eq(i64::MAX), + generation_pageserver.eq(Option::::None), placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), )) .execute(conn)?; @@ -501,12 +589,15 @@ pub(crate) struct TenantShardPersistence { pub(crate) shard_stripe_size: i32, // Latest generation number: next time we attach, increment this - // and use the incremented number when attaching - pub(crate) generation: i32, + // and use the incremented number when attaching. + // + // Generation is only None when first onboarding a tenant, where it may + // be in PlacementPolicy::Secondary and therefore have no valid generation state. + pub(crate) generation: Option, // Currently attached pageserver #[serde(rename = "pageserver")] - pub(crate) generation_pageserver: i64, + pub(crate) generation_pageserver: Option, #[serde(default)] pub(crate) placement_policy: String, diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index ce91c1f5e9..b633b217c7 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -26,7 +26,7 @@ pub(super) struct Reconciler { /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, - pub(crate) generation: Generation, + pub(crate) generation: Option, pub(crate) intent: TargetState, pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, @@ -312,7 +312,7 @@ impl Reconciler { &self.shard, &self.config, LocationConfigMode::AttachedStale, - Some(self.generation), + self.generation, None, ); self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10))) @@ -335,16 +335,17 @@ impl Reconciler { } // Increment generation before attaching to new pageserver - self.generation = self - .persistence - .increment_generation(self.tenant_shard_id, dest_ps_id) - .await?; + self.generation = Some( + self.persistence + .increment_generation(self.tenant_shard_id, dest_ps_id) + .await?, + ); let dest_conf = build_location_config( &self.shard, &self.config, LocationConfigMode::AttachedMulti, - Some(self.generation), + self.generation, None, ); @@ -401,7 +402,7 @@ impl Reconciler { &self.shard, &self.config, LocationConfigMode::AttachedSingle, - Some(self.generation), + self.generation, None, ); self.location_config(dest_ps_id, dest_final_conf.clone(), None) @@ -433,22 +434,62 @@ impl Reconciler { // If the attached pageserver is not attached, do so now. if let Some(node_id) = self.intent.attached { - let mut wanted_conf = - attached_location_conf(self.generation, &self.shard, &self.config); + // If we are in an attached policy, then generation must have been set (null generations + // are only present when a tenant is initially loaded with a secondary policy) + debug_assert!(self.generation.is_some()); + let Some(generation) = self.generation else { + return Err(ReconcileError::Other(anyhow::anyhow!( + "Attempted to attach with NULL generation" + ))); + }; + + let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do tracing::info!(%node_id, "Observed configuration already correct.") } - _ => { + observed => { // In all cases other than a matching observed configuration, we will // reconcile this location. This includes locations with different configurations, as well // as locations with unknown (None) observed state. - self.generation = self - .persistence - .increment_generation(self.tenant_shard_id, node_id) - .await?; - wanted_conf.generation = self.generation.into(); + + // The general case is to increment the generation. However, there are cases + // where this is not necessary: + // - if we are only updating the TenantConf part of the location + // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale) + // and the location was already in the correct generation + let increment_generation = match observed { + None => true, + Some(ObservedStateLocation { conf: None }) => true, + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let generations_match = observed.generation == wanted_conf.generation; + + use LocationConfigMode::*; + let mode_transition_requires_gen_inc = + match (observed.mode, wanted_conf.mode) { + // Usually the short-lived attachment modes (multi and stale) are only used + // in the case of [`Self::live_migrate`], but it is simple to handle them correctly + // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation. + (AttachedSingle, AttachedStale) => false, + (AttachedMulti, AttachedSingle) => false, + (lhs, rhs) => lhs != rhs, + }; + + !generations_match || mode_transition_requires_gen_inc + } + }; + + if increment_generation { + let generation = self + .persistence + .increment_generation(self.tenant_shard_id, node_id) + .await?; + self.generation = Some(generation); + wanted_conf.generation = generation.into(); + } tracing::info!(%node_id, "Observed configuration requires update."); self.location_config(node_id, wanted_conf, None).await?; self.compute_notify().await?; diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 3224751e47..87fce3df25 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -284,7 +284,6 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { use super::*; - use utils::id::NodeId; use crate::tenant_state::IntentState; #[test] diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs index db5a957443..76e4e56a66 100644 --- a/control_plane/attachment_service/src/schema.rs +++ b/control_plane/attachment_service/src/schema.rs @@ -17,8 +17,8 @@ diesel::table! { shard_number -> Int4, shard_count -> Int4, shard_stripe_size -> Int4, - generation -> Int4, - generation_pageserver -> Int8, + generation -> Nullable, + generation_pageserver -> Nullable, placement_policy -> Varchar, splitting -> Int2, config -> Text, diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 02c1a65545..4209b62db3 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -14,10 +14,13 @@ use control_plane::attachment_service::{ use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, + TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, + }, + models::TenantConfigRequest, }; use pageserver_api::{ models::{ @@ -65,6 +68,11 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); // some data in it. const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +// If we receive a call using Secondary mode initially, it will omit generation. We will initialize +// tenant shards into this generation, and as long as it remains in this generation, we will accept +// input generation from future requests as authoritative. +const INITIAL_GENERATION: Generation = Generation::new(0); + /// How long [`Service::startup_reconcile`] is allowed to take before it should give /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); @@ -167,6 +175,21 @@ impl From for ApiError { } } +#[allow(clippy::large_enum_variant)] +enum TenantCreateOrUpdate { + Create((TenantCreateRequest, PlacementPolicy)), + Update(Vec), +} + +struct ShardUpdate { + tenant_shard_id: TenantShardId, + placement_policy: PlacementPolicy, + tenant_config: TenantConfig, + + /// If this is None, generation is not updated. + generation: Option, +} + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -571,6 +594,9 @@ impl Service { // the shard so that a future [`TenantState::maybe_reconcile`] will try again. tenant.pending_compute_notification = result.pending_compute_notification; + // Let the TenantState know it is idle. + tenant.reconcile_complete(result.sequence); + match result.result { Ok(()) => { for (node_id, loc) in &result.observed.locations { @@ -661,8 +687,8 @@ impl Service { // after when pageservers start up and register. let mut node_ids = HashSet::new(); for tsp in &tenant_shard_persistence { - if tsp.generation_pageserver != i64::MAX { - node_ids.insert(tsp.generation_pageserver); + if let Some(node_id) = tsp.generation_pageserver { + node_ids.insert(node_id); } } for node_id in node_ids { @@ -699,18 +725,15 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); - if tsp.generation_pageserver != i64::MAX { - intent.set_attached( - &mut scheduler, - Some(NodeId(tsp.generation_pageserver as u64)), - ); + if let Some(generation_pageserver) = tsp.generation_pageserver { + intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); } let new_tenant = TenantState { tenant_shard_id, shard: shard_identity, sequence: Sequence::initial(), - generation: Generation::new(tsp.generation as u32), + generation: tsp.generation.map(|g| Generation::new(g as u32)), policy: serde_json::from_str(&tsp.placement_policy).unwrap(), intent, observed: ObservedState::new(), @@ -790,8 +813,8 @@ impl Service { shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: 0, - generation: 0, - generation_pageserver: i64::MAX, + generation: Some(0), + generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), @@ -846,7 +869,7 @@ impl Service { .expect("Checked for existence above"); if let Some(new_generation) = new_generation { - tenant_state.generation = new_generation; + tenant_state.generation = Some(new_generation); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during attachment service restart. @@ -896,7 +919,7 @@ impl Service { node_id, ObservedStateLocation { conf: Some(attached_location_conf( - tenant_state.generation, + tenant_state.generation.unwrap(), &tenant_state.shard, &tenant_state.config, )), @@ -910,7 +933,7 @@ impl Service { Ok(AttachHookResponse { gen: attach_req .node_id - .map(|_| tenant_state.generation.into().unwrap()), + .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) } @@ -923,7 +946,7 @@ impl Service { attachment: tenant_state.and_then(|s| { s.intent .get_attached() - .map(|ps| (s.generation.into().unwrap(), ps)) + .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) }), } } @@ -973,7 +996,17 @@ impl Service { continue; }; - shard_state.generation = std::cmp::max(shard_state.generation, new_gen); + // If [`Persistence::re_attach`] selected this shard, it must have alread + // had a generation set. + debug_assert!(shard_state.generation.is_some()); + let Some(old_gen) = shard_state.generation else { + // Should never happen: would only return incremented generation + // for a tenant that already had a non-null generation. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Generation must be set while re-attaching" + ))); + }; + shard_state.generation = Some(std::cmp::max(old_gen, new_gen)); if let Some(observed) = shard_state .observed .locations @@ -1003,7 +1036,7 @@ impl Service { for req_tenant in validate_req.tenants { if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_state.generation == Generation::new(req_tenant.gen); + let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, @@ -1030,8 +1063,9 @@ impl Service { pub(crate) async fn tenant_create( &self, create_req: TenantCreateRequest, + placement_policy: PlacementPolicy, ) -> Result { - let (response, waiters) = self.do_tenant_create(create_req).await?; + let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; Ok(response) @@ -1040,6 +1074,7 @@ impl Service { pub(crate) async fn do_tenant_create( &self, create_req: TenantCreateRequest, + placement_policy: PlacementPolicy, ) -> Result<(TenantCreateResponse, Vec), ApiError> { // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. @@ -1065,9 +1100,27 @@ impl Service { }) .collect::>(); - // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they - // have no expectation of HA). - let placement_policy: PlacementPolicy = PlacementPolicy::Single; + // If the caller specifies a None generation, it means "start from default". This is different + // to [`Self::tenant_location_config`], where a None generation is used to represent + // an incompletely-onboarded tenant. + let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) { + tracing::info!( + "tenant_create: secondary mode, generation is_some={}", + create_req.generation.is_some() + ); + create_req.generation.map(Generation::new) + } else { + tracing::info!( + "tenant_create: not secondary mode, generation is_some={}", + create_req.generation.is_some() + ); + Some( + create_req + .generation + .map(Generation::new) + .unwrap_or(INITIAL_GENERATION), + ) + }; // Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart @@ -1079,8 +1132,10 @@ impl Service { shard_number: tenant_shard_id.shard_number.0 as i32, shard_count: tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, - generation: create_req.generation.map(|g| g as i32).unwrap_or(0), - generation_pageserver: i64::MAX, + generation: initial_generation.map(|g| g.into().unwrap() as i32), + // The pageserver is not known until scheduling happens: we will set this column when + // incrementing the generation the first time we attach to a pageserver. + generation_pageserver: None, placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), splitting: SplitState::default(), @@ -1120,15 +1175,17 @@ impl Service { )) })?; - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: entry + if let Some(node_id) = entry.get().intent.get_attached() { + let generation = entry .get() - .intent - .get_attached() - .expect("We just set pageserver if it was None"), - generation: entry.get().generation.into().unwrap(), - }); + .generation + .expect("Generation is set when in attached mode"); + response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }); + } continue; } @@ -1142,9 +1199,7 @@ impl Service { placement_policy.clone(), ); - if let Some(create_gen) = create_req.generation { - state.generation = Generation::new(create_gen); - } + state.generation = initial_generation; state.config = create_req.config.clone(); state.schedule(scheduler).map_err(|e| { @@ -1153,14 +1208,18 @@ impl Service { )) })?; - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: state - .intent - .get_attached() - .expect("We just set pageserver if it was None"), - generation: state.generation.into().unwrap(), - }); + // Only include shards in result if we are attaching: the purpose + // of the response is to tell the caller where the shards are attached. + if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }); + } entry.insert(state) } }; @@ -1214,12 +1273,114 @@ impl Service { Ok(()) } - /// This API is used by the cloud control plane to do coarse-grained control of tenants: - /// - Call with mode Attached* to upsert the tenant. - /// - Call with mode Detached to switch to PolicyMode::Detached + /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, + /// and transform it into either a tenant creation of a series of shard updates. + fn tenant_location_config_prepare( + &self, + tenant_id: TenantId, + req: TenantLocationConfigRequest, + ) -> TenantCreateOrUpdate { + let mut updates = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + + // Use location config mode as an indicator of policy. + let placement_policy = match req.config.mode { + LocationConfigMode::Detached => PlacementPolicy::Detached, + LocationConfigMode::Secondary => PlacementPolicy::Secondary, + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + if nodes.len() > 1 { + PlacementPolicy::Double(1) + } else { + // Convenience for dev/test: if we just have one pageserver, import + // tenants into Single mode so that scheduling will succeed. + PlacementPolicy::Single + } + } + }; + + let mut create = true; + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + // Saw an existing shard: this is not a creation + create = false; + + // Shards may have initially been created by a Secondary request, where we + // would have left generation as None. + // + // We only update generation the first time we see an attached-mode request, + // and if there is no existing generation set. The caller is responsible for + // ensuring that no non-storage-controller pageserver ever uses a higher + // generation than they passed in here. + use LocationConfigMode::*; + let set_generation = match req.config.mode { + AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => { + req.config.generation.map(Generation::new) + } + _ => None, + }; + + if shard.policy != placement_policy + || shard.config != req.config.tenant_conf + || set_generation.is_some() + { + updates.push(ShardUpdate { + tenant_shard_id: *shard_id, + placement_policy: placement_policy.clone(), + tenant_config: req.config.tenant_conf.clone(), + generation: set_generation, + }); + } + } + + if create { + use LocationConfigMode::*; + let generation = match req.config.mode { + AttachedMulti | AttachedSingle | AttachedStale => req.config.generation, + // If a caller provided a generation in a non-attached request, ignore it + // and leave our generation as None: this enables a subsequent update to set + // the generation when setting an attached mode for the first time. + _ => None, + }; + + TenantCreateOrUpdate::Create( + // Synthesize a creation request + ( + TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + shard_parameters: ShardParameters { + // Must preserve the incoming shard_count do distinguish unsharded (0) + // from single-sharded (1): this distinction appears in the S3 keys of the tenant. + count: req.tenant_id.shard_count, + // We only import un-sharded or single-sharded tenants, so stripe + // size can be made up arbitrarily here. + stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, + }, + config: req.config.tenant_conf, + }, + placement_policy, + ), + ) + } else { + TenantCreateOrUpdate::Update(updates) + } + } + + /// This API is used by the cloud control plane to migrate unsharded tenants that it created + /// directly with pageservers into this service. /// - /// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has - /// secondary locations. + /// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it + /// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption. + /// Think of the first attempt to call this API as a transfer of absolute authority over the + /// tenant's source of generation numbers. + /// + /// The mode in this request coarse-grained control of tenants: + /// - Call with mode Attached* to upsert the tenant. + /// - Call with mode Secondary to either onboard a tenant without attaching it, or + /// to set an existing tenant to PolicyMode::Secondary + /// - Call with mode Detached to switch to PolicyMode::Detached pub(crate) async fn tenant_location_config( &self, tenant_id: TenantId, @@ -1231,131 +1392,96 @@ impl Service { ))); } - let mut waiters = Vec::new(); + // First check if this is a creation or an update + let create_or_update = self.tenant_location_config_prepare(tenant_id, req); + let mut result = TenantLocationConfigResponse { shards: Vec::new() }; - let maybe_create = { - let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let waiters = match create_or_update { + TenantCreateOrUpdate::Create((create_req, placement_policy)) => { + let (create_resp, waiters) = + self.do_tenant_create(create_req, placement_policy).await?; + result.shards = create_resp + .shards + .into_iter() + .map(|s| TenantShardLocation { + node_id: s.node_id, + shard_id: s.shard_id, + }) + .collect(); + waiters + } + TenantCreateOrUpdate::Update(updates) => { + // Persist updates + // Ordering: write to the database before applying changes in-memory, so that + // we will not appear time-travel backwards on a restart. + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation, + } in &updates + { + self.persistence + .update_tenant_shard( + *tenant_shard_id, + placement_policy.clone(), + tenant_config.clone(), + *generation, + ) + .await?; + } - // Maybe we have existing shards - let mut create = true; - for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - // Saw an existing shard: this is not a creation - create = false; + // Apply updates in-memory + let mut waiters = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + let result_tx = locked.result_tx.clone(); + let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); - // Note that for existing tenants we do _not_ respect the generation in the request: this is likely - // to be stale. Once a tenant is created in this service, our view of generation is authoritative, and - // callers' generations may be ignored. This represents a one-way migration of tenants from the outer - // cloud control plane into this service. + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation: update_generation, + } in updates + { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + tracing::warn!("Shard {tenant_shard_id} removed while updating"); + continue; + }; - // Use location config mode as an indicator of policy: if they ask for - // attached we go to default HA attached mode. If they ask for secondary - // we go to secondary-only mode. If they ask for detached we detach. - match req.config.mode { - LocationConfigMode::Detached => { - shard.policy = PlacementPolicy::Detached; - } - LocationConfigMode::Secondary => { - // TODO: implement secondary-only mode. - todo!(); - } - LocationConfigMode::AttachedMulti - | LocationConfigMode::AttachedSingle - | LocationConfigMode::AttachedStale => { - // TODO: persistence for changes in policy - if nodes.len() > 1 { - shard.policy = PlacementPolicy::Double(1) - } else { - // Convenience for dev/test: if we just have one pageserver, import - // tenants into Single mode so that scheduling will succeed. - shard.policy = PlacementPolicy::Single + shard.policy = placement_policy; + shard.config = tenant_config; + if let Some(generation) = update_generation { + shard.generation = Some(generation); + } + + shard.schedule(scheduler)?; + + let maybe_waiter = shard.maybe_reconcile( + result_tx.clone(), + nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ); + if let Some(waiter) = maybe_waiter { + waiters.push(waiter); + } + + if let Some(node_id) = shard.intent.get_attached() { + result.shards.push(TenantShardLocation { + shard_id: tenant_shard_id, + node_id: *node_id, + }) } } } - - shard.schedule(scheduler)?; - - let maybe_waiter = shard.maybe_reconcile( - result_tx.clone(), - nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ); - if let Some(waiter) = maybe_waiter { - waiters.push(waiter); - } - - if let Some(node_id) = shard.intent.get_attached() { - result.shards.push(TenantShardLocation { - shard_id: *shard_id, - node_id: *node_id, - }) - } + waiters } - - if create { - // Validate request mode - match req.config.mode { - LocationConfigMode::Detached | LocationConfigMode::Secondary => { - // When using this API to onboard an existing tenant to this service, it must start in - // an attached state, because we need the request to come with a generation - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Imported tenant must be in attached mode" - ))); - } - - LocationConfigMode::AttachedMulti - | LocationConfigMode::AttachedSingle - | LocationConfigMode::AttachedStale => { - // Pass - } - } - - // Validate request generation - let Some(generation) = req.config.generation else { - // We can only import attached tenants, because we need the request to come with a generation - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Generation is mandatory when importing tenant" - ))); - }; - - // Synthesize a creation request - Some(TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation: Some(generation), - shard_parameters: ShardParameters { - // Must preserve the incoming shard_count do distinguish unsharded (0) - // from single-sharded (1): this distinction appears in the S3 keys of the tenant. - count: req.tenant_id.shard_count, - // We only import un-sharded or single-sharded tenants, so stripe - // size can be made up arbitrarily here. - stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, - }, - config: req.config.tenant_conf, - }) - } else { - None - } - }; - - let waiters = if let Some(create_req) = maybe_create { - let (create_resp, waiters) = self.do_tenant_create(create_req).await?; - result.shards = create_resp - .shards - .into_iter() - .map(|s| TenantShardLocation { - node_id: s.node_id, - shard_id: s.shard_id, - }) - .collect(); - waiters - } else { - waiters }; if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { @@ -1375,6 +1501,91 @@ impl Service { Ok(result) } + pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + let tenant_id = req.tenant_id; + let config = req.config; + + self.persistence + .update_tenant_config(req.tenant_id, config.clone()) + .await?; + + let waiters = { + let mut waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let result_tx = locked.result_tx.clone(); + let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.config = config.clone(); + if let Some(waiter) = shard.maybe_reconcile( + result_tx.clone(), + nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ) { + waiters.push(waiter); + } + } + waiters + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Treat this as success because we have stored the configuration. If e.g. + // a node was unavailable at this time, it should not stop us accepting a + // configuration change. + tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}"); + } + + Ok(()) + } + + pub(crate) fn tenant_config_get( + &self, + tenant_id: TenantId, + ) -> Result, ApiError> { + let config = { + let locked = self.inner.read().unwrap(); + + match locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + { + Some((_tenant_shard_id, shard)) => shard.config.clone(), + None => { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )) + } + } + }; + + // Unlike the pageserver, we do not have a set of global defaults: the config is + // entirely per-tenant. Therefore the distinction between `tenant_specific_overrides` + // and `effective_config` in the response is meaningless, but we retain that syntax + // in order to remain compatible with the pageserver API. + + let response = HashMap::from([ + ( + "tenant_specific_overrides", + serde_json::to_value(&config) + .context("serializing tenant specific overrides") + .map_err(ApiError::InternalServerError)?, + ), + ( + "effective_config", + serde_json::to_value(&config) + .context("serializing effective config") + .map_err(ApiError::InternalServerError)?, + ), + ]); + + Ok(response) + } + pub(crate) async fn tenant_time_travel_remote_storage( &self, time_travel_req: &TenantTimeTravelRequest, @@ -1460,6 +1671,60 @@ impl Service { })?; } } + Ok(()) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantId, + ) -> Result<(), ApiError> { + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + for node_id in shard.intent.get_secondary() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running + // downloads, they can return a clean 202 response instead of the HTTP client timing out. + + // Issue concurrent requests to all shards' locations + let mut futs = FuturesUnordered::new(); + for (tenant_shard_id, node) in targets { + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + futs.push(async move { + let result = client.tenant_secondary_download(tenant_shard_id).await; + (result, node) + }) + } + + // Handle any errors returned by pageservers. This includes cases like this request racing with + // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as + // well as more general cases like 503s, 500s, or timeouts. + while let Some((result, node)) = futs.next().await { + let Err(e) = result else { continue }; + + // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever + // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache + // than they had hoped for. + tracing::warn!( + "Ignoring tenant secondary download error from pageserver {}: {e}", + node.id, + ); + } Ok(()) } @@ -2039,8 +2304,8 @@ impl Service { // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will // populate the correct generation as part of its transaction, to protect us // against racing with changes in the state of the parent. - generation: 0, - generation_pageserver: target.node.id.0 as i64, + generation: None, + generation_pageserver: Some(target.node.id.0 as i64), placement_policy: serde_json::to_string(&policy).unwrap(), // TODO: get the config out of the map config: serde_json::to_string(&TenantConfig::default()).unwrap(), @@ -2161,7 +2426,8 @@ impl Service { .expect("It was present, we just split it"); let old_attached = old_state.intent.get_attached().unwrap(); old_state.intent.clear(scheduler); - (old_attached, old_state.generation, old_state.config.clone()) + let generation = old_state.generation.expect("Shard must have been attached"); + (old_attached, generation, old_state.config.clone()) }; for child in child_ids { @@ -2182,7 +2448,7 @@ impl Service { child_state.observed = ObservedState { locations: child_observed, }; - child_state.generation = generation; + child_state.generation = Some(generation); child_state.config = config.clone(); // The child's TenantState::splitting is intentionally left at the default value of Idle, @@ -2247,6 +2513,7 @@ impl Service { match shard.policy { PlacementPolicy::Single => { shard.intent.clear_secondary(scheduler); + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); } PlacementPolicy::Double(_n) => { // If our new attached node was a secondary, it no longer should be. @@ -2256,6 +2523,12 @@ impl Service { if let Some(old_attached) = old_attached { shard.intent.push_secondary(scheduler, old_attached); } + + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + } + PlacementPolicy::Secondary => { + shard.intent.clear(scheduler); + shard.intent.push_secondary(scheduler, migrate_req.node_id); } PlacementPolicy::Detached => { return Err(ApiError::BadRequest(anyhow::anyhow!( @@ -2263,9 +2536,6 @@ impl Service { ))) } } - shard - .intent - .set_attached(scheduler, Some(migrate_req.node_id)); tracing::info!("Migrating: new intent {:?}", shard.intent); shard.sequence = shard.sequence.next(); @@ -2593,7 +2863,7 @@ impl Service { observed_loc.conf = None; } - if tenant_state.intent.notify_offline(config_req.node_id) { + if tenant_state.intent.demote_attached(config_req.node_id) { tenant_state.sequence = tenant_state.sequence.next(); match tenant_state.schedule(scheduler) { Err(e) => { @@ -2660,6 +2930,9 @@ impl Service { /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. + /// + /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is + /// an attached policy. We should error out if it isn't. fn ensure_attached_schedule( &self, mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index c14fe6699e..33b7d578c7 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -53,8 +53,11 @@ pub(crate) struct TenantState { pub(crate) sequence: Sequence, // Latest generation number: next time we attach, increment this - // and use the incremented number when attaching - pub(crate) generation: Generation, + // and use the incremented number when attaching. + // + // None represents an incompletely onboarded tenant via the [`Service::location_config`] + // API, where this tenant may only run in PlacementPolicy::Secondary. + pub(crate) generation: Option, // High level description of how the tenant should be set up. Provided // externally. @@ -181,6 +184,13 @@ impl IntentState { } } + /// Remove the last secondary node from the list of secondaries + pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { + if let Some(node_id) = self.secondary.pop() { + scheduler.node_dec_ref(node_id); + } + } + pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { scheduler.node_dec_ref(old_attached); @@ -208,11 +218,13 @@ impl IntentState { &self.secondary } - /// When a node goes offline, we update intents to avoid using it - /// as their attached pageserver. + /// If the node is in use as the attached location, demote it into + /// the list of secondary locations. This is used when a node goes offline, + /// and we want to use a different node for attachment, but not permanently + /// forget the location on the offline node. /// /// Returns true if a change was made - pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool { + pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool { if self.attached == Some(node_id) { // TODO: when scheduler starts tracking attached + secondary counts separately, we will // need to call into it here. @@ -315,7 +327,7 @@ pub(crate) struct ReconcileResult { pub(crate) result: Result<(), ReconcileError>, pub(crate) tenant_shard_id: TenantShardId, - pub(crate) generation: Generation, + pub(crate) generation: Option, pub(crate) observed: ObservedState, /// Set [`TenantState::pending_compute_notification`] from this flag @@ -340,7 +352,7 @@ impl TenantState { tenant_shard_id, policy, intent: IntentState::default(), - generation: Generation::new(0), + generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), config: TenantConfig::default(), @@ -438,10 +450,16 @@ impl TenantState { // more work on the same pageservers we're already using. let mut modified = false; + // Add/remove nodes to fulfil policy use PlacementPolicy::*; match self.policy { Single => { // Should have exactly one attached, and zero secondaries + if !self.intent.secondary.is_empty() { + self.intent.clear_secondary(scheduler); + modified = true; + } + let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?; modified |= modified_attached; @@ -451,6 +469,23 @@ impl TenantState { } } Double(secondary_count) => { + let retain_secondaries = if self.intent.attached.is_none() + && scheduler.node_preferred(&self.intent.secondary).is_some() + { + // If we have no attached, and one of the secondaries is elegible to be promoted, retain + // one more secondary than we usually would, as one of them will become attached futher down this function. + secondary_count + 1 + } else { + secondary_count + }; + + while self.intent.secondary.len() > retain_secondaries { + // We have no particular preference for one secondary location over another: just + // arbitrarily drop from the end + self.intent.pop_secondary(scheduler); + modified = true; + } + // Should have exactly one attached, and N secondaries let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?; modified |= modified_attached; @@ -463,15 +498,28 @@ impl TenantState { modified = true; } } - Detached => { - // Should have no attached or secondary pageservers - if self.intent.attached.is_some() { - self.intent.set_attached(scheduler, None); + Secondary => { + if let Some(node_id) = self.intent.get_attached() { + // Populate secondary by demoting the attached node + self.intent.demote_attached(*node_id); + modified = true; + } else if self.intent.secondary.is_empty() { + // Populate secondary by scheduling a fresh node + let node_id = scheduler.schedule_shard(&[])?; + self.intent.push_secondary(scheduler, node_id); modified = true; } - - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); + while self.intent.secondary.len() > 1 { + // We have no particular preference for one secondary location over another: just + // arbitrarily drop from the end + self.intent.pop_secondary(scheduler); + modified = true; + } + } + Detached => { + // Never add locations in this mode + if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() { + self.intent.clear(scheduler); modified = true; } } @@ -518,7 +566,12 @@ impl TenantState { fn dirty(&self) -> bool { if let Some(node_id) = self.intent.attached { - let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config); + // Maybe panic: it is a severe bug if we try to attach while generation is null. + let generation = self + .generation + .expect("Attempted to enter attached state without a generation"); + + let wanted_conf = attached_location_conf(generation, &self.shard, &self.config); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { @@ -596,6 +649,10 @@ impl TenantState { // Reconcile already in flight for the current sequence? if let Some(handle) = &self.reconciler { if handle.sequence == self.sequence { + tracing::info!( + "Reconciliation already in progress for sequence {:?}", + self.sequence, + ); return Some(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), @@ -615,6 +672,10 @@ impl TenantState { return None; }; + // Advance the sequence before spawning a reconciler, so that sequence waiters + // can distinguish between before+after the reconcile completes. + self.sequence = self.sequence.next(); + let reconciler_cancel = cancel.child_token(); let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, @@ -716,6 +777,17 @@ impl TenantState { }) } + /// Called when a ReconcileResult has been emitted and the service is updating + /// our state: if the result is from a sequence >= my ReconcileHandle, then drop + /// the handle to indicate there is no longer a reconciliation in progress. + pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) { + if let Some(reconcile_handle) = &self.reconciler { + if reconcile_handle.sequence <= sequence { + self.reconciler = None; + } + } + } + // If we had any state at all referring to this node ID, drop it. Does not // attempt to reschedule. pub(crate) fn deref_node(&mut self, node_id: NodeId) { @@ -736,13 +808,8 @@ impl TenantState { shard_number: self.tenant_shard_id.shard_number.0 as i32, shard_count: self.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: self.shard.stripe_size.0 as i32, - generation: self.generation.into().unwrap_or(0) as i32, - generation_pageserver: self - .intent - .get_attached() - .map(|n| n.0 as i64) - .unwrap_or(i64::MAX), - + generation: self.generation.map(|g| g.into().unwrap_or(0) as i32), + generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64), placement_policy: serde_json::to_string(&self.policy).unwrap(), config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), @@ -805,8 +872,10 @@ pub(crate) mod tests { assert_ne!(attached_node_id, secondary_node_id); // Notifying the attached node is offline should demote it to a secondary - let changed = tenant_state.intent.notify_offline(attached_node_id); + let changed = tenant_state.intent.demote_attached(attached_node_id); assert!(changed); + assert!(tenant_state.intent.attached.is_none()); + assert_eq!(tenant_state.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline; diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 92342b478b..610d7386d9 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -200,7 +200,7 @@ impl AttachmentService { "localhost", "-p", &format!("{}", self.postgres_port), - &DB_NAME, + DB_NAME, ]) .output() .await diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index de7eb797d6..5a75bc2a1d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -605,7 +605,7 @@ impl Endpoint { let conn_str = self.connstr("cloud_admin", "postgres"); println!("Starting postgres node at '{}'", conn_str); if create_test_user { - let conn_str = self.connstr("user", "neondb"); + let conn_str = self.connstr("test", "neondb"); println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 61aa8a5ae8..d583866290 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -14,7 +14,6 @@ use byteorder::{BigEndian, ReadBytesExt}; use postgres_ffi::BLCKSZ; use serde::{Deserialize, Serialize}; use serde_with::serde_as; -use strum_macros; use utils::{ completion, history_buffer::HistoryBufferWithDropCounter, @@ -1077,7 +1076,6 @@ impl PagestreamBeMessage { #[cfg(test)] mod tests { - use bytes::Buf; use serde_json::json; use super::*; diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 467a4cf0c1..a2a9165184 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -6,7 +6,6 @@ use crate::{ }; use hex::FromHex; use serde::{Deserialize, Serialize}; -use thiserror; use utils::id::TenantId; #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] @@ -656,10 +655,7 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke #[cfg(test)] mod tests { - use std::str::FromStr; - - use bincode; - use utils::{id::TenantId, Hex}; + use utils::Hex; use super::*; diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 6f847cf9d7..478ad81dc1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -623,9 +623,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result { mod fs_tests { use super::*; - use bytes::Bytes; use camino_tempfile::tempdir; - use futures_util::Stream; use std::{collections::HashMap, io::Write}; async fn read_and_check_metadata( diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index af70dc7ca2..438f45fbde 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -1040,7 +1040,7 @@ mod tests { Some("test/prefix/"), Some("/test/prefix/"), ]; - let expected_outputs = vec![ + let expected_outputs = [ vec!["", "some/path", "some/path"], vec!["/", "/some/path", "/some/path"], vec![ diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index fbf0dff665..03e65f74fe 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,7 +1,6 @@ // For details about authentication see docs/authentication.md use arc_swap::ArcSwap; -use serde; use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use anyhow::Result; diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index ea05cf54b1..2fef8d35df 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] -pub struct Completion(TaskTrackerToken); +pub struct Completion { + _token: TaskTrackerToken, +} /// Barrier will wait until all clones of [`Completion`] have been dropped. #[derive(Clone)] @@ -49,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion(token), Barrier(tracker)) + (Completion { _token: token }, Barrier(tracker)) } diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 6f6c46cfeb..af15cee924 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -45,7 +45,7 @@ impl Generation { Self::Broken } - pub fn new(v: u32) -> Self { + pub const fn new(v: u32) -> Self { Self::Valid(v) } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 550ab10700..a60971abf0 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tracing::{self, debug, info, info_span, warn, Instrument}; +use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; use std::str::FromStr; @@ -156,6 +156,10 @@ pub struct ChannelWriter { buffer: BytesMut, pub tx: mpsc::Sender>, written: usize, + /// Time spent waiting for the channel to make progress. It is not the same as time to upload a + /// buffer because we cannot know anything about that, but this should allow us to understand + /// the actual time taken without the time spent `std::thread::park`ed. + wait_time: std::time::Duration, } impl ChannelWriter { @@ -168,6 +172,7 @@ impl ChannelWriter { buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2), tx, written: 0, + wait_time: std::time::Duration::ZERO, } } @@ -180,6 +185,8 @@ impl ChannelWriter { tracing::trace!(n, "flushing"); let ready = self.buffer.split().freeze(); + let wait_started_at = std::time::Instant::now(); + // not ideal to call from blocking code to block_on, but we are sure that this // operation does not spawn_blocking other tasks let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async { @@ -192,6 +199,9 @@ impl ChannelWriter { // sending it to the client. Ok(()) }); + + self.wait_time += wait_started_at.elapsed(); + if res.is_err() { return Err(std::io::ErrorKind::BrokenPipe.into()); } @@ -202,6 +212,10 @@ impl ChannelWriter { pub fn flushed_bytes(&self) -> usize { self.written } + + pub fn wait_time(&self) -> std::time::Duration { + self.wait_time + } } impl std::io::Write for ChannelWriter { @@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request) -> Result { tracing::info!( bytes = writer.flushed_bytes(), - elapsed_ms = started_at.elapsed().as_millis(), + total_ms = total.as_millis(), + spawning_ms = spawned_in.as_millis(), + collection_ms = collected_in.as_millis(), + encoding_ms = encoded_in.as_millis(), "responded /metrics" ); } Err(e) => { - tracing::warn!("failed to write out /metrics response: {e:#}"); + // there is a chance that this error is not the BrokenPipe we generate in the writer + // for "closed connection", but it is highly unlikely. + tracing::warn!( + after_bytes = writer.flushed_bytes(), + total_ms = total.as_millis(), + spawning_ms = spawned_in.as_millis(), + collection_ms = collected_in.as_millis(), + encoding_ms = encoded_in.as_millis(), + "failed to write out /metrics response: {e:?}" + ); // semantics of this error are quite... unclear. we want to error the stream out to // abort the response to somehow notify the client that we failed. // diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index b3269ae049..1aebe91428 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -415,7 +415,6 @@ mod tests { use super::*; - use serde::ser::Serialize; use serde_assert::{Deserializer, Serializer, Token, Tokens}; #[test] diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index effc9c67b5..b7301776eb 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -1,6 +1,6 @@ #![warn(missing_docs)] -use std::cmp::{Eq, Ordering, PartialOrd}; +use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; use std::fmt::Debug; use std::mem; @@ -249,7 +249,6 @@ where mod tests { use super::*; use std::sync::Arc; - use std::time::Duration; impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index dc4a599111..ecc5353be3 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -221,7 +221,7 @@ impl RcuWaitList { #[cfg(test)] mod tests { use super::*; - use std::sync::{Arc, Mutex}; + use std::sync::Mutex; use std::time::Duration; #[tokio::test] diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 0773abba2d..703a6dfd52 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -239,7 +239,6 @@ mod tests { use std::{ convert::Infallible, pin::{pin, Pin}, - sync::atomic::{AtomicUsize, Ordering}, time::Duration, }; diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index a12f691504..22a410b4af 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -6,7 +6,6 @@ use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; use pin_project_lite::pin_project; -use std::cmp::Ord; use std::collections::BinaryHeap; use std::collections::VecDeque; use std::future::Future; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index d18b8d6885..437387164d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -20,7 +20,6 @@ use std::num::NonZeroUsize; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit; use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; @@ -212,9 +211,9 @@ pub struct PageServerConf { pub log_format: LogFormat, - /// Number of tenants which will be concurrently loaded from remote storage proactively on startup, - /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes - /// loading such tenants, vs. other work in the system. + /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach. + /// + /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system. pub concurrent_tenant_warmup: ConfigurableSemaphore, /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. @@ -1203,10 +1202,7 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{ - fs, - num::{NonZeroU32, NonZeroUsize}, - }; + use std::{fs, num::NonZeroU32}; use camino_tempfile::{tempdir, Utf8TempDir}; use pageserver_api::models::EvictionPolicy; diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 38a4c9eb5d..f9cbcea565 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,5 @@ use super::*; use std::collections::HashMap; -use std::time::SystemTime; -use utils::lsn::Lsn; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index ca9ae8f983..313eb2663d 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath}; use serde::Deserialize; use serde::Serialize; use thiserror::Error; -use tokio; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use tracing::{self, debug, error}; +use tracing::{debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; @@ -726,7 +725,7 @@ mod test { use camino::Utf8Path; use hex_literal::hex; use pageserver_api::shard::ShardIndex; - use std::{io::ErrorKind, time::Duration}; + use std::io::ErrorKind; use tracing::info; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -735,10 +734,7 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{ - harness::TenantHarness, remote_timeline_client::remote_timeline_path, - storage_layer::DeltaFileName, - }, + tenant::{harness::TenantHarness, storage_layer::DeltaFileName}, }; use super::*; @@ -1161,13 +1157,8 @@ mod test { pub(crate) mod mock { use tracing::info; - use crate::tenant::remote_timeline_client::remote_layer_path; - use super::*; - use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }; + use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index b1c6f35704..92c1475aef 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -58,6 +58,7 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, + metrics::disk_usage_based_eviction::METRICS, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ self, @@ -65,7 +66,6 @@ use crate::{ remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, - Timeline, }, }; @@ -409,13 +409,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = + let (candidates, collection_time) = { + let started_at = std::time::Instant::now(); match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? { EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } - EvictionCandidates::Finished(partitioned) => partitioned, - }; + EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()), + } + }; + + METRICS.layers_collected.inc_by(candidates.len() as u64); + + tracing::info!( + elapsed_ms = collection_time.as_millis(), + total_layers = candidates.len(), + "collection completed" + ); // Debug-log the list of candidates let now = SystemTime::now(); @@ -446,9 +456,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. - let selection = select_victims(&candidates, usage_pre); + let (evicted_amount, usage_planned) = + select_victims(&candidates, usage_pre).into_amount_and_planned(); - let (evicted_amount, usage_planned) = selection.into_amount_and_planned(); + METRICS.layers_selected.inc_by(evicted_amount as u64); // phase2: evict layers @@ -477,9 +488,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( if let Some(next) = next { match next { Ok(Ok(file_size)) => { + METRICS.layers_evicted.inc(); usage_assumed.add_available_bytes(file_size); } - Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => { + Ok(Err(( + file_size, + EvictionError::NotFound + | EvictionError::Downloaded + | EvictionError::Timeout, + ))) => { evictions_failed.file_sizes += file_size; evictions_failed.count += 1; } @@ -495,7 +512,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // calling again when consumed_all is fine as evicted is fused. let Some((_partition, candidate)) = evicted.next() else { - consumed_all = true; + if !consumed_all { + tracing::info!("all evictions started, waiting"); + consumed_all = true; + } continue; }; @@ -503,11 +523,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( EvictionLayer::Attached(layer) => { let file_size = layer.layer_desc().file_size; js.spawn(async move { - layer - .evict_and_wait() - .await - .map(|()| file_size) - .map_err(|e| (file_size, e)) + // have a low eviction waiting timeout because our LRU calculations go stale fast; + // also individual layer evictions could hang because of bugs and we do not want to + // pause disk_usage_based_eviction for such. + let timeout = std::time::Duration::from_secs(5); + + match layer.evict_and_wait(timeout).await { + Ok(()) => Ok(file_size), + Err(e) => Err((file_size, e)), + } }); } EvictionLayer::Secondary(layer) => { @@ -529,6 +553,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( (usage_assumed, evictions_failed) }; + let started_at = std::time::Instant::now(); + + let evict_layers = async move { + let mut evict_layers = std::pin::pin!(evict_layers); + + let maximum_expected = std::time::Duration::from_secs(10); + + let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await; + let tuple = if let Ok(tuple) = res { + tuple + } else { + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing"); + evict_layers.await + }; + + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "completed"); + tuple + }; + + let evict_layers = + evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount)); + let (usage_assumed, evictions_failed) = tokio::select! { tuple = evict_layers => { tuple }, _ = cancel.cancelled() => { @@ -763,6 +811,8 @@ async fn collect_eviction_candidates( eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { + const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10); + // get a snapshot of the list of tenants let tenants = tenant::mgr::list_tenants() .await @@ -791,6 +841,8 @@ async fn collect_eviction_candidates( continue; } + let started_at = std::time::Instant::now(); + // collect layers from all timelines in this tenant // // If one of the timelines becomes `!is_active()` during the iteration, @@ -805,6 +857,7 @@ async fn collect_eviction_candidates( } let info = tl.get_local_layers_for_disk_usage_eviction().await; debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend(info.resident_layers.into_iter()); max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); @@ -870,7 +923,25 @@ async fn collect_eviction_candidates( (partition, candidate) }); + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); + candidates.extend(tenant_candidates); + + let elapsed = started_at.elapsed(); + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); + } } // Note: the same tenant ID might be hit twice, if it transitions from attached to @@ -885,11 +956,11 @@ async fn collect_eviction_candidates( }, ); - for secondary_tenant in secondary_tenants { + for tenant in secondary_tenants { // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is // to prevent repeated disk usage based evictions from completely draining less often // updating secondaries. - let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction(); + let (mut layer_info, total_layers) = tenant.get_layers_for_eviction(); debug_assert!( total_layers >= layer_info.resident_layers.len(), @@ -897,6 +968,8 @@ async fn collect_eviction_candidates( layer_info.resident_layers.len() ); + let started_at = std::time::Instant::now(); + layer_info .resident_layers .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); @@ -918,9 +991,27 @@ async fn collect_eviction_candidates( ) }); + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); candidates.extend(tenant_candidates); tokio::task::yield_now().await; + + let elapsed = started_at.elapsed(); + + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); + } } debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, @@ -997,30 +1088,6 @@ impl VictimSelection { } } -struct TimelineKey(Arc); - -impl PartialEq for TimelineKey { - fn eq(&self, other: &Self) -> bool { - Arc::ptr_eq(&self.0, &other.0) - } -} - -impl Eq for TimelineKey {} - -impl std::hash::Hash for TimelineKey { - fn hash(&self, state: &mut H) { - Arc::as_ptr(&self.0).hash(state); - } -} - -impl std::ops::Deref for TimelineKey { - type Target = Timeline; - - fn deref(&self) -> &Self::Target { - self.0.as_ref() - } -} - /// A totally ordered f32 subset we can use with sorting functions. pub(crate) mod finite_f32 { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 5afb3ba63d..19b5fb7e79 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -579,6 +579,12 @@ paths: required: false schema: type: integer + - name: lazy + in: query + required: false + schema: + type: boolean + description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default. put: description: | Configures a _tenant location_, that is how a particular pageserver handles diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 12bd21fd7b..9d92fbaee0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -816,13 +816,7 @@ async fn tenant_attach_handler( let tenant = state .tenant_manager - .upsert_location( - tenant_shard_id, - location_conf, - None, - SpawnMode::Normal, - &ctx, - ) + .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx) .await?; let Some(tenant) = tenant else { @@ -1418,6 +1412,7 @@ async fn put_tenant_location_config_handler( let request_data: TenantLocationConfigRequest = json_request(&mut request).await?; let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis); + let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); @@ -1448,15 +1443,17 @@ async fn put_tenant_location_config_handler( let location_conf = LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; + // lazy==true queues up for activation or jumps the queue like normal when a compute connects, + // similar to at startup ordering. + let spawn_mode = if lazy { + tenant::SpawnMode::Lazy + } else { + tenant::SpawnMode::Eager + }; + let attached = state .tenant_manager - .upsert_location( - tenant_shard_id, - location_conf, - flush, - tenant::SpawnMode::Normal, - &ctx, - ) + .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx) .await? .is_some(); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1749e02c7f..ce5561b431 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1915,17 +1915,16 @@ impl Drop for TimelineMetrics { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; - let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); - let _ = - RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } - let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { - let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } - let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() @@ -2474,6 +2473,64 @@ pub(crate) mod tenant_throttling { } } +pub(crate) mod disk_usage_based_eviction { + use super::*; + + pub(crate) struct Metrics { + pub(crate) tenant_collection_time: Histogram, + pub(crate) tenant_layer_count: Histogram, + pub(crate) layers_collected: IntCounter, + pub(crate) layers_selected: IntCounter, + pub(crate) layers_evicted: IntCounter, + } + + impl Default for Metrics { + fn default() -> Self { + let tenant_collection_time = register_histogram!( + "pageserver_disk_usage_based_eviction_tenant_collection_seconds", + "Time spent collecting layers from a tenant -- not normalized by collected layer amount", + vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0] + ) + .unwrap(); + + let tenant_layer_count = register_histogram!( + "pageserver_disk_usage_based_eviction_tenant_collected_layers", + "Amount of layers gathered from a tenant", + vec![5.0, 50.0, 500.0, 5000.0, 50000.0] + ) + .unwrap(); + + let layers_collected = register_int_counter!( + "pageserver_disk_usage_based_eviction_collected_layers_total", + "Amount of layers collected" + ) + .unwrap(); + + let layers_selected = register_int_counter!( + "pageserver_disk_usage_based_eviction_select_layers_total", + "Amount of layers selected" + ) + .unwrap(); + + let layers_evicted = register_int_counter!( + "pageserver_disk_usage_based_eviction_evicted_layers_total", + "Amount of layers successfully evicted" + ) + .unwrap(); + + Self { + tenant_collection_time, + tenant_layer_count, + layers_collected, + layers_selected, + layers_evicted, + } + } + } + + pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // @@ -2508,6 +2565,7 @@ pub fn preinitialize_metrics() { Lazy::force(&TENANT_MANAGER); Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS); + Lazy::force(&disk_usage_based_eviction::METRICS); // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 28d2584bf4..529fb9bb07 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -73,7 +73,6 @@ use std::{ collections::{hash_map::Entry, HashMap}, - convert::TryInto, sync::{ atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, Arc, Weak, @@ -262,7 +261,9 @@ pub struct PageCache { size_metrics: &'static PageCacheSizeMetrics, } -struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit); +struct PinnedSlotsPermit { + _permit: tokio::sync::OwnedSemaphorePermit, +} /// /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked @@ -558,9 +559,9 @@ impl PageCache { ) .await { - Ok(res) => Ok(PinnedSlotsPermit( - res.expect("this semaphore is never closed"), - )), + Ok(res) => Ok(PinnedSlotsPermit { + _permit: res.expect("this semaphore is never closed"), + }), Err(_timeout) => { crate::metrics::page_cache_errors_inc( crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 11eb512750..689bc5cb3c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -27,7 +27,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; -use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; @@ -44,7 +44,6 @@ use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; -use tracing::field; use tracing::*; use utils::id::ConnectionId; use utils::sync::gate::GateGuard; @@ -1115,7 +1114,10 @@ impl PageServerHandler { ctx: &RequestContext, ) -> Result { let timeline = match self.get_cached_timeline_for_page(req) { - Ok(tl) => tl, + Ok(tl) => { + set_tracing_field_shard_id(tl); + tl + } Err(key) => { match self .load_timeline_for_page(tenant_id, timeline_id, key) @@ -1140,9 +1142,6 @@ impl PageServerHandler { } }; - // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't - set_tracing_field_shard_id(timeline); - let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetPageAtLsn); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c726139524..9959d105eb 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,7 +37,6 @@ impl Value { mod test { use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; macro_rules! roundtrip { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6a63a2adeb..4158133111 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -109,7 +109,6 @@ pub use pageserver_api::models::TenantState; use tokio::sync::Semaphore; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use toml_edit; use utils::{ crashsafe, generation::Generation, @@ -227,7 +226,11 @@ pub(crate) struct TenantPreload { /// When we spawn a tenant, there is a special mode for tenant creation that /// avoids trying to read anything from remote storage. pub(crate) enum SpawnMode { - Normal, + /// Activate as soon as possible + Eager, + /// Lazy activation in the background, with the option to skip the queue if the need comes up + Lazy, + /// Tenant has been created during the lifetime of this process Create, } @@ -700,41 +703,37 @@ impl Tenant { .and_then(|x| x.initial_tenant_load_remote.take()); enum AttachType<'a> { - // During pageserver startup, we are attaching this tenant lazily in the background - Warmup(tokio::sync::SemaphorePermit<'a>), - // During pageserver startup, we are attaching this tenant as soon as we can, - // because a client tried to access it. + /// We are attaching this tenant lazily in the background. + Warmup { + _permit: tokio::sync::SemaphorePermit<'a>, + during_startup: bool + }, + /// We are attaching this tenant as soon as we can, because for example an + /// endpoint tried to access it. OnDemand, - // During normal operations after startup, we are attaching a tenant. + /// During normal operations after startup, we are attaching a tenant, and + /// eager attach was requested. Normal, } - // Before doing any I/O, wait for either or: - // - A client to attempt to access to this tenant (on-demand loading) - // - A permit to become available in the warmup semaphore (background warmup) - // - // Some-ness of init_order is how we know if we're attaching during startup or later - // in process lifetime. - let attach_type = if init_order.is_some() { + let attach_type = if matches!(mode, SpawnMode::Lazy) { + // Before doing any I/O, wait for at least one of: + // - A client attempting to access to this tenant (on-demand loading) + // - A permit becoming available in the warmup semaphore (background warmup) + tokio::select!( - _ = tenant_clone.activate_now_sem.acquire() => { + permit = tenant_clone.activate_now_sem.acquire() => { + let _ = permit.expect("activate_now_sem is never closed"); tracing::info!("Activating tenant (on-demand)"); AttachType::OnDemand }, - permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { - match permit_result { - Ok(p) => { - tracing::info!("Activating tenant (warmup)"); - AttachType::Warmup(p) - } - Err(_) => { - // This is unexpected: the warmup semaphore should stay alive - // for the lifetime of init_order. Log a warning and proceed. - tracing::warn!("warmup_limit semaphore unexpectedly closed"); - AttachType::Normal - } + permit = conf.concurrent_tenant_warmup.inner().acquire() => { + let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed"); + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup { + _permit, + during_startup: init_order.is_some() } - } _ = tenant_clone.cancel.cancelled() => { // This is safe, but should be pretty rare: it is interesting if a tenant @@ -749,6 +748,8 @@ impl Tenant { }, ) } else { + // SpawnMode::{Create,Eager} always cause jumping ahead of the + // concurrent_tenant_warmup queue AttachType::Normal }; @@ -756,7 +757,7 @@ impl Tenant { (SpawnMode::Create, _) => { None }, - (SpawnMode::Normal, Some(remote_storage)) => { + (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone .preload(remote_storage, task_mgr::shutdown_token()) @@ -769,7 +770,7 @@ impl Tenant { } } } - (SpawnMode::Normal, None) => { + (_, None) => { let _preload_timer = TENANT.preload.start_timer(); None } @@ -828,7 +829,7 @@ impl Tenant { let attached = { let _attach_timer = match mode { SpawnMode::Create => None, - SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), }; tenant_clone.attach(preload, mode, &ctx).await }; @@ -850,7 +851,7 @@ impl Tenant { // It also prevents the warmup proccess competing with the concurrency limit on // logical size calculations: if logical size calculation semaphore is saturated, // then warmup will wait for that before proceeding to the next tenant. - if let AttachType::Warmup(_permit) = attach_type { + if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) { let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect(); tracing::info!("Waiting for initial logical sizes while warming up..."); while futs.next().await.is_some() {} @@ -923,7 +924,7 @@ impl Tenant { deleting: false, timelines: HashMap::new(), }, - (None, SpawnMode::Normal) => { + (None, _) => { anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); } }; @@ -2382,7 +2383,7 @@ impl Tenant { self.tenant_shard_id, self.generation, self.shard_identity, - self.walredo_mgr.as_ref().map(Arc::clone), + self.walredo_mgr.clone(), resources, pg_version, state, @@ -3591,25 +3592,18 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; - use camino::Utf8PathBuf; use once_cell::sync::OnceCell; use pageserver_api::models::ShardParameters; use pageserver_api::shard::ShardIndex; - use std::fs; - use std::sync::Arc; use utils::logging; - use utils::lsn::Lsn; use crate::deletion_queue::mock::MockDeletionQueue; use crate::walredo::apply_neon; - use crate::{ - config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord, - }; + use crate::{repository::Key, walrecord::NeonWalRecord}; use super::*; - use crate::tenant::config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::id::{TenantId, TimelineId}; + use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -3769,7 +3763,7 @@ pub(crate) mod harness { let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) .await?; - tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?; + tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { @@ -3838,10 +3832,8 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; - use once_cell::sync::Lazy; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; - use tokio_util::sync::CancellationToken; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 18c4ea664e..9464324413 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -52,7 +52,10 @@ pub mod defaults { pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 3d138da7af..ffb7206b1e 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -420,7 +420,7 @@ impl DeleteTenantFlow { .expect("cant be stopping or broken"); tenant - .attach(preload, super::SpawnMode::Normal, ctx) + .attach(preload, super::SpawnMode::Eager, ctx) .await .context("attach")?; diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 9f104aff86..ca30b0ac4f 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -21,7 +21,6 @@ use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; -use hex; use std::{cmp::Ordering, io, result}; use thiserror::Error; use tracing::error; @@ -700,8 +699,6 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { use super::*; - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; use rand::Rng; use std::collections::BTreeMap; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 2bedbf7f61..e48b9e83bd 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -300,7 +300,7 @@ mod tests { use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use crate::tenant::block_io::{BlockCursor, BlockReaderRef}; + use crate::tenant::block_io::BlockReaderRef; use rand::{thread_rng, RngCore}; use std::fs; use std::str::FromStr; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 8f0f73d4b5..06b61d4631 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -595,7 +595,7 @@ pub async fn init_tenant_mgr( shard_identity, Some(init_order.clone()), &TENANTS, - SpawnMode::Normal, + SpawnMode::Lazy, &ctx, ) { Ok(tenant) => { @@ -1106,9 +1106,9 @@ impl TenantManager { // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then // the caller thinks they're creating but the tenant already existed. We must switch to - // Normal mode so that when starting this Tenant we properly probe remote storage for timelines, + // Eager mode so that when starting this Tenant we properly probe remote storage for timelines, // rather than assuming it to be empty. - spawn_mode = SpawnMode::Normal; + spawn_mode = SpawnMode::Eager; } Some(TenantSlot::Secondary(state)) => { info!("Shutting down secondary tenant"); @@ -1300,7 +1300,7 @@ impl TenantManager { shard_identity, None, self.tenants, - SpawnMode::Normal, + SpawnMode::Eager, ctx, )?; @@ -1521,7 +1521,7 @@ impl TenantManager { *child_shard, child_location_conf, None, - SpawnMode::Normal, + SpawnMode::Eager, ctx, ) .await?; @@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant( shard_identity, None, &TENANTS, - SpawnMode::Normal, + SpawnMode::Eager, ctx, ) .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; @@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc( let tenant = guard .get(&tenant_shard_id) - .map(Arc::clone) + .cloned() .with_context(|| format!("tenant {tenant_shard_id}")) .map_err(|e| ApiError::NotFound(e.into()))?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 7d30745a0d..40be2ca8f3 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1791,14 +1791,12 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::Layer, - Generation, Tenant, Timeline, + Tenant, Timeline, }, DEFAULT_PG_VERSION, }; use std::collections::HashSet; - use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 962cf5d12e..167e18a829 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>( const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; -pub fn is_temp_download_file(path: &Utf8Path) -> bool { +pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { let extension = path.extension(); match extension { Some(TEMP_DOWNLOAD_EXTENSION) => true, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index c466ac0c24..14e88b836e 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage; use tokio_util::sync::CancellationToken; use tracing::instrument; -use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate}; +use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; enum DownloadCommand { Download(TenantShardId), @@ -121,6 +121,10 @@ impl SecondaryTenant { }) } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { + self.tenant_shard_id + } + pub(crate) async fn shutdown(&self) { self.cancel.cancel(); @@ -164,16 +168,17 @@ impl SecondaryTenant { self.detail.lock().unwrap().get_layers_for_eviction(self) } + /// Cancellation safe, but on cancellation the eviction will go through #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] pub(crate) async fn evict_layer( - &self, + self: &Arc, conf: &PageServerConf, timeline_id: TimelineId, name: LayerFileName, ) { debug_assert_current_span_has_tenant_id(); - let _guard = match self.gate.enter() { + let guard = match self.gate.enter() { Ok(g) => g, Err(_) => { tracing::debug!("Dropping layer evictions, secondary tenant shutting down",); @@ -187,35 +192,57 @@ impl SecondaryTenant { .timeline_path(&self.tenant_shard_id, &timeline_id) .join(name.file_name()); - // We tolerate ENOENT, because between planning eviction and executing - // it, the secondary downloader could have seen an updated heatmap that - // resulted in a layer being deleted. - // Other local I/O errors are process-fatal: these should never happen. - tokio::fs::remove_file(path) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err("Deleting layer during eviction"); + let this = self.clone(); - // Update the timeline's state. This does not have to be synchronized with - // the download process, because: - // - If downloader is racing with us to remove a file (e.g. because it is - // removed from heatmap), then our mutual .remove() operations will both - // succeed. - // - If downloader is racing with us to download the object (this would require - // multiple eviction iterations to race with multiple download iterations), then - // if we remove it from the state, the worst that happens is the downloader - // downloads it again before re-inserting, or we delete the file but it remains - // in the state map (in which case it will be downloaded if this secondary - // tenant transitions to attached and tries to access it) - // - // The important assumption here is that the secondary timeline state does not - // have to 100% match what is on disk, because it's a best-effort warming - // of the cache. - let mut detail = self.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - timeline_detail.on_disk_layers.remove(&name); - timeline_detail.evicted_at.insert(name, now); - } + // spawn it to be cancellation safe + tokio::task::spawn_blocking(move || { + let _guard = guard; + // We tolerate ENOENT, because between planning eviction and executing + // it, the secondary downloader could have seen an updated heatmap that + // resulted in a layer being deleted. + // Other local I/O errors are process-fatal: these should never happen. + let deleted = std::fs::remove_file(path); + + let not_found = deleted + .as_ref() + .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound); + + let deleted = if not_found { + false + } else { + deleted + .map(|()| true) + .fatal_err("Deleting layer during eviction") + }; + + if !deleted { + // skip updating accounting and putting perhaps later timestamp + return; + } + + // Update the timeline's state. This does not have to be synchronized with + // the download process, because: + // - If downloader is racing with us to remove a file (e.g. because it is + // removed from heatmap), then our mutual .remove() operations will both + // succeed. + // - If downloader is racing with us to download the object (this would require + // multiple eviction iterations to race with multiple download iterations), then + // if we remove it from the state, the worst that happens is the downloader + // downloads it again before re-inserting, or we delete the file but it remains + // in the state map (in which case it will be downloaded if this secondary + // tenant transitions to attached and tries to access it) + // + // The important assumption here is that the secondary timeline state does not + // have to 100% match what is on disk, because it's a best-effort warming + // of the cache. + let mut detail = this.detail.lock().unwrap(); + if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { + timeline_detail.on_disk_layers.remove(&name); + timeline_detail.evicted_at.insert(name, now); + } + }) + .await + .expect("secondary eviction should not have panicked"); } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5c4e4fd160..b679077358 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -16,7 +16,8 @@ use crate::{ config::SecondaryLocationConfig, debug_assert_current_span_has_tenant_and_timeline_id, remote_timeline_client::{ - index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, storage_layer::LayerFileName, @@ -788,7 +789,7 @@ async fn init_timeline_state( // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); continue; - } else if crate::is_temporary(&file_path) { + } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) { // Temporary files are frequently left behind from restarting during downloads tracing::info!("Cleaning up temporary file {file_path}"); if let Err(e) = tokio::fs::remove_file(&file_path) diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 147cf683ba..a8b05f4c0e 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -18,7 +18,6 @@ use crate::{ }; use futures::Future; -use md5; use pageserver_api::shard::TenantShardId; use rand::Rng; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9de820912e..299950cc21 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -72,7 +72,7 @@ where /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' /// call, to collect more records. /// -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ValueReconstructState { pub records: Vec<(Lsn, NeonWalRecord)>, pub img: Option<(Lsn, Bytes)>, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 0a707295cc..56cfaeda15 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -43,7 +43,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; -use hex; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 13c9e5c989..247dd1a8e4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -8,7 +8,7 @@ use pageserver_api::shard::ShardIndex; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; use tracing::Instrument; use utils::lsn::Lsn; use utils::sync::heavier_once_cell; @@ -208,10 +208,15 @@ impl Layer { /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is /// re-downloaded, [`EvictionError::Downloaded`] is returned. /// + /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction + /// will happen regardless the future returned by this method completing unless there is a + /// read access (currently including [`Layer::keep_resident`]) before eviction gets to + /// complete. + /// /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation /// of download-evict cycle on retry. - pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> { - self.0.evict_and_wait().await + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { + self.0.evict_and_wait(timeout).await } /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload @@ -363,7 +368,7 @@ impl Layer { /// /// Does not start local deletion, use [`Self::delete_on_drop`] for that /// separatedly. - #[cfg(feature = "testing")] + #[cfg(any(feature = "testing", test))] pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static { let mut rx = self.0.status.subscribe(); @@ -632,7 +637,7 @@ impl LayerInner { /// Cancellation safe, however dropping the future and calling this method again might result /// in a new attempt to evict OR join the previously started attempt. - pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> { + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { use tokio::sync::broadcast::error::RecvError; assert!(self.have_remote_client); @@ -652,16 +657,22 @@ impl LayerInner { if strong.is_some() { // drop the DownloadedLayer outside of the holding the guard drop(strong); + + // idea here is that only one evicter should ever get to witness a strong reference, + // which means whenever get_or_maybe_download upgrades a weak, it must mark up a + // cancelled eviction and signal us, like it currently does. + // + // a second concurrent evict_and_wait will not see a strong reference. LAYER_IMPL_METRICS.inc_started_evictions(); } - match rx.recv().await { - Ok(Status::Evicted) => Ok(()), - Ok(Status::Downloaded) => Err(EvictionError::Downloaded), - Err(RecvError::Closed) => { + match tokio::time::timeout(timeout, rx.recv()).await { + Ok(Ok(Status::Evicted)) => Ok(()), + Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded), + Ok(Err(RecvError::Closed)) => { unreachable!("sender cannot be dropped while we are in &self method") } - Err(RecvError::Lagged(_)) => { + Ok(Err(RecvError::Lagged(_))) => { // this is quite unlikely, but we are blocking a lot in the async context, so // we might be missing this because we are stuck on a LIFO slot on a thread // which is busy blocking for a 1TB database create_image_layers. @@ -674,6 +685,7 @@ impl LayerInner { None => Ok(()), } } + Err(_timeout) => Err(EvictionError::Timeout), } } @@ -1195,6 +1207,9 @@ pub(crate) enum EvictionError { /// Evictions must always lose to downloads in races, and this time it happened. #[error("layer was downloaded instead")] Downloaded, + + #[error("eviction did not happen within timeout")] + Timeout, } /// Error internal to the [`LayerInner::get_or_maybe_download`] diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 01c62b6f83..b43534efd4 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,13 +1,173 @@ use futures::StreamExt; +use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; +use tracing::Instrument; use utils::{ completion::{self, Completion}, id::TimelineId, }; use super::*; -use crate::task_mgr::BACKGROUND_RUNTIME; -use crate::tenant::harness::TenantHarness; +use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME}; +use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; + +/// Used in tests to advance a future to wanted await point, and not futher. +const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); + +/// Used in tests to indicate forever long timeout; has to be longer than the amount of ADVANCE +/// timeout uses to advance futures. +const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7); + +/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions. +#[tokio::test] +async fn smoke_test() { + let handle = BACKGROUND_RUNTIME.handle(); + + let h = TenantHarness::create("smoke_test").unwrap(); + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + let (tenant, _) = h.load().await; + + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.resident_layers().collect::>().await + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // all layers created at pageserver are like `layer`, initialized with strong + // Arc. + + let img_before = { + let mut data = ValueReconstructState::default(); + layer + .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .await + .unwrap(); + data.img + .take() + .expect("tenant harness writes the control file") + }; + + // important part is evicting the layer, which can be done when there are no more ResidentLayer + // instances -- there currently are none, only two `Layer` values, one in the layermap and on + // in scope. + layer.evict_and_wait(FOREVER).await.unwrap(); + + // double-evict returns an error, which is valid if both eviction_task and disk usage based + // eviction would both evict the same layer at the same time. + + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound)); + + // on accesses when the layer is evicted, it will automatically be downloaded. + let img_after = { + let mut data = ValueReconstructState::default(); + layer + .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .instrument(download_span.clone()) + .await + .unwrap(); + data.img.take().unwrap() + }; + + assert_eq!(img_before, img_after); + + // evict_and_wait can timeout, but it doesn't cancel the evicting itself + // + // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to + // artificially slow it down. + let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + + match layer + .evict_and_wait(std::time::Duration::ZERO) + .await + .unwrap_err() + { + EvictionError::Timeout => { + // expected, but note that the eviction is "still ongoing" + helper.release().await; + // exhaust spawn_blocking pool to ensure it is now complete + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle) + .await; + } + other => unreachable!("{other:?}"), + } + + // only way to query if a layer is resident is to acquire a ResidentLayer instance. + // Layer::keep_resident never downloads, but it might initialize if the layer file is found + // downloaded locally. + let none = layer.keep_resident().await.unwrap(); + assert!( + none.is_none(), + "Expected none, because eviction removed the local file, found: {none:?}" + ); + + // plain downloading is rarely needed + layer + .download_and_keep_resident() + .instrument(download_span) + .await + .unwrap(); + + // last important part is deletion on drop: gc and compaction use it for compacted L0 layers + // or fully garbage collected layers. deletion means deleting the local file, and scheduling a + // deletion of the already unlinked from index_part.json remote file. + // + // marking a layer to be deleted on drop is irreversible; there is no technical reason against + // reversiblity, but currently it is not needed so it is not provided. + layer.delete_on_drop(); + + let path = layer.local_path().to_owned(); + + // wait_drop produces an unconnected to Layer future which will resolve when the + // LayerInner::drop has completed. + let mut wait_drop = std::pin::pin!(layer.wait_drop()); + + // paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing + // until here + tokio::time::pause(); + tokio::time::timeout(ADVANCE, &mut wait_drop) + .await + .expect_err("should had timed out because two strong references exist"); + + tokio::fs::metadata(&path) + .await + .expect("the local layer file still exists"); + + let rtc = timeline.remote_client.as_ref().unwrap(); + + { + let layers = &[layer]; + let mut g = timeline.layers.write().await; + g.finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes + rtc.schedule_gc_update(layers).unwrap(); + } + + // when strong references are dropped, the file is deleted and remote deletion is scheduled + wait_drop.await; + + let e = tokio::fs::metadata(&path) + .await + .expect_err("the local file is deleted"); + assert_eq!(e.kind(), std::io::ErrorKind::NotFound); + + rtc.wait_completion().await.unwrap(); + + assert_eq!(rtc.get_remote_physical_size(), 0); +} /// This test demonstrates a previous hang when a eviction and deletion were requested at the same /// time. Now both of them complete per Arc drop semantics. @@ -41,10 +201,10 @@ async fn evict_and_wait_on_wanted_deleted() { let resident = layer.keep_resident().await.unwrap(); { - let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait()); + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); // drive the future to await on the status channel - tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect_err("should had been a timeout since we are holding the layer resident"); @@ -115,10 +275,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { let resident = layer.keep_resident().await.unwrap(); - let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait()); + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); // drive the future to await on the status channel - tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect_err("should had been a timeout since we are holding the layer resident"); assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); @@ -138,7 +298,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { // because the keep_resident check alters wanted evicted without sending a message, we will // never get completed - let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect("no timeout, because keep_resident re-initialized") .expect_err("eviction should not have succeeded because re-initialized"); @@ -158,9 +318,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { .sum::() ); - let mut second_eviction = std::pin::pin!(layer.evict_and_wait()); + let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); - tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction) + // advance to the wait on the queue + tokio::time::timeout(ADVANCE, &mut second_eviction) .await .expect_err("timeout because spawn_blocking is clogged"); @@ -171,7 +332,12 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { helper.release().await; - tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction) + // the second_eviction gets to run here + // + // synchronize to be *strictly* after the second_eviction spawn_blocking run + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + + tokio::time::timeout(ADVANCE, &mut second_eviction) .await .expect("eviction goes through now that spawn_blocking is unclogged") .expect("eviction should succeed, because version matches"); @@ -261,3 +427,49 @@ impl SpawnBlockingPoolHelper { .await } } + +#[test] +fn spawn_blocking_pool_helper_actually_works() { + // create a custom runtime for which we know and control how many blocking threads it has + // + // because the amount is not configurable for our helper, expect the same amount as + // BACKGROUND_RUNTIME using the tokio defaults would have. + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(512) + .enable_all() + .build() + .unwrap(); + + let handle = rt.handle(); + + rt.block_on(async move { + // this will not return until all threads are spun up and actually executing the code + // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d. + let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + + println!("consumed"); + + let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || { + // this will not get to run before we release + })); + + println!("spawned"); + + tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh) + .await + .expect_err("the task should not have gotten to run yet"); + + println!("tried to join"); + + consumed.release().await; + + println!("released"); + + tokio::time::timeout(std::time::Duration::from_secs(1), jh) + .await + .expect("no timeout") + .expect("no join error"); + + println!("joined"); + }); +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fa5e7b3685..206f20306e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1512,10 +1512,14 @@ impl Timeline { return Ok(None); }; - match local_layer.evict_and_wait().await { + // curl has this by default + let timeout = std::time::Duration::from_secs(120); + + match local_layer.evict_and_wait(timeout).await { Ok(()) => Ok(Some(true)), Err(EvictionError::NotFound) => Ok(Some(false)), Err(EvictionError::Downloaded) => Ok(Some(false)), + Err(EvictionError::Timeout) => Ok(Some(false)), } } } @@ -5157,8 +5161,7 @@ mod tests { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); - let ctx = any_context(); - let tenant = harness.do_try_load(&ctx).await.unwrap(); + let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await @@ -5172,8 +5175,10 @@ mod tests { .expect("should had been resident") .drop_eviction_guard(); - let first = async { layer.evict_and_wait().await }; - let second = async { layer.evict_and_wait().await }; + let forever = std::time::Duration::from_secs(120); + + let first = layer.evict_and_wait(forever); + let second = layer.evict_and_wait(forever); let (first, second) = tokio::join!(first, second); @@ -5192,12 +5197,6 @@ mod tests { } } - fn any_context() -> crate::context::RequestContext { - use crate::context::*; - use crate::task_mgr::*; - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) - } - async fn find_some_layer(timeline: &Timeline) -> Layer { let layers = timeline.layers.read().await; let desc = layers diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 950459cbf9..914e3948ef 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -75,14 +75,13 @@ impl Timeline { let keyspace = self.collect_keyspace(end_lsn, ctx).await?; let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); - let ctx_adaptor = RequestContextAdaptor(ctx.clone()); pageserver_compaction::compact_tiered::compact_tiered( &mut adaptor, end_lsn, target_file_size, fanout, - &ctx_adaptor, + ctx, ) .await?; @@ -143,13 +142,13 @@ impl CompactionJobExecutor for TimelineAdaptor { type DeltaLayer = ResidentDeltaLayer; type ImageLayer = ResidentImageLayer; - type RequestContext = RequestContextAdaptor; + type RequestContext = crate::context::RequestContext; async fn get_layers( &mut self, key_range: &Range, lsn_range: &Range, - _ctx: &RequestContextAdaptor, + _ctx: &RequestContext, ) -> anyhow::Result>> { self.flush_updates().await?; @@ -170,7 +169,7 @@ impl CompactionJobExecutor for TimelineAdaptor { &mut self, key_range: &Range, lsn: Lsn, - _ctx: &RequestContextAdaptor, + _ctx: &RequestContext, ) -> anyhow::Result>> { if lsn == self.keyspace.0 { Ok(pageserver_compaction::helpers::intersect_keyspace( @@ -206,7 +205,7 @@ impl CompactionJobExecutor for TimelineAdaptor { &mut self, lsn: Lsn, key_range: &Range, - ctx: &RequestContextAdaptor, + ctx: &RequestContext, ) -> anyhow::Result<()> { Ok(self.create_image_impl(lsn, key_range, ctx).await?) } @@ -216,7 +215,7 @@ impl CompactionJobExecutor for TimelineAdaptor { lsn_range: &Range, key_range: &Range, input_layers: &[ResidentDeltaLayer], - ctx: &RequestContextAdaptor, + ctx: &RequestContext, ) -> anyhow::Result<()> { debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); @@ -287,7 +286,7 @@ impl CompactionJobExecutor for TimelineAdaptor { async fn delete_layer( &mut self, layer: &OwnArc, - _ctx: &RequestContextAdaptor, + _ctx: &RequestContext, ) -> anyhow::Result<()> { self.layers_to_delete.push(layer.clone().0); Ok(()) @@ -299,7 +298,7 @@ impl TimelineAdaptor { &mut self, lsn: Lsn, key_range: &Range, - ctx: &RequestContextAdaptor, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { let timer = self.timeline.metrics.create_images_time_histo.start_timer(); @@ -361,17 +360,7 @@ impl TimelineAdaptor { } } -pub struct RequestContextAdaptor(pub RequestContext); - -impl std::ops::Deref for RequestContextAdaptor { - type Target = RequestContext; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl CompactionRequestContext for RequestContextAdaptor {} +impl CompactionRequestContext for crate::context::RequestContext {} #[derive(Debug, Clone)] pub struct OwnArc(pub Arc); @@ -449,10 +438,7 @@ impl CompactionLayer for ResidentDeltaLayer { impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; - async fn load_keys<'a>( - &self, - ctx: &RequestContextAdaptor, - ) -> anyhow::Result>> { + async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { self.0.load_keys(ctx).await } } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 008f9482c4..dd603135d2 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -204,6 +204,7 @@ impl Timeline { evicted: usize, errors: usize, not_evictable: usize, + timeouts: usize, #[allow(dead_code)] skipped_for_shutdown: usize, } @@ -267,7 +268,11 @@ impl Timeline { let layer = guard.drop_eviction_guard(); if no_activity_for > p.threshold { // this could cause a lot of allocations in some cases - js.spawn(async move { layer.evict_and_wait().await }); + js.spawn(async move { + layer + .evict_and_wait(std::time::Duration::from_secs(5)) + .await + }); stats.candidates += 1; } } @@ -280,6 +285,9 @@ impl Timeline { Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { stats.not_evictable += 1; } + Ok(Err(EvictionError::Timeout)) => { + stats.timeouts += 1; + } Err(je) if je.is_cancelled() => unreachable!("not used"), Err(je) if je.is_panic() => { /* already logged */ @@ -295,7 +303,8 @@ impl Timeline { stats = join_all => { if stats.candidates == stats.not_evictable { debug!(stats=?stats, "eviction iteration complete"); - } else if stats.errors > 0 || stats.not_evictable > 0 { + } else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 { + // reminder: timeouts are not eviction cancellations warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3a2705bb50..63a2b30d09 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1667,8 +1667,6 @@ mod tests { use super::*; use crate::tenant::harness::*; use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::Timeline; - use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; use crate::DEFAULT_PG_VERSION; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 6ce90e0c47..247704e2a5 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -252,8 +252,6 @@ mod test { use super::*; use std::collections::HashMap; - use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord}; - /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile #[test] fn apply_aux_file_deltas() -> anyhow::Result<()> { diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs index ca016408e6..1a0d7039df 100644 --- a/pageserver/src/walredo/process/no_leak_child.rs +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -1,7 +1,5 @@ -use tracing; -use tracing::error; -use tracing::info; use tracing::instrument; +use tracing::{error, info}; use crate::metrics::WalRedoKillCause; use crate::metrics::WAL_REDO_PROCESS_COUNTERS; diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index ef0a79a50c..7ea767ec74 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 11d6f6aec5..25275ef31f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -25,6 +25,8 @@ #include "funcapi.h" #include "miscadmin.h" #include "pagestore_client.h" +#include "common/hashfn.h" +#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -60,6 +62,7 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) +#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -84,6 +87,8 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ + hyperLogLogState wss_estimation; /* estimation of wroking set size */ + uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; } FileCacheControl; static HTAB *lfc_hash; @@ -232,6 +237,14 @@ lfc_shmem_startup(void) lfc_ctl->writes = 0; dlist_init(&lfc_ctl->lru); + /* Initialize hyper-log-log structure for estimating working set size */ + initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); + + /* We need hashes in shared memory */ + pfree(lfc_ctl->wss_estimation.hashesArr); + memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) @@ -529,6 +542,11 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set */ + tag.blockNum = blkno; + addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { /* Page is not cached */ @@ -967,3 +985,21 @@ local_cache_pages(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +PG_FUNCTION_INFO_V1(approximate_working_set_size); + +Datum +approximate_working_set_size(PG_FUNCTION_ARGS) +{ + int32 dc = -1; + if (lfc_size_limit != 0) + { + bool reset = PG_GETARG_BOOL(0); + LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); + dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + if (reset) + memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + LWLockRelease(lfc_lock); + } + PG_RETURN_INT32(dc); +} diff --git a/pgxn/neon/neon--1.2--1.3.sql b/pgxn/neon/neon--1.2--1.3.sql new file mode 100644 index 0000000000..9583008777 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.3.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit + +CREATE FUNCTION approximate_working_set_size(reset bool) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor; + diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 599b54b2ff..cee2f336f2 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,6 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.2' +default_version = '1.3' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 5024ba3744..d5ab66d6aa 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -13,7 +13,7 @@ use proxy::proxy::run_until_cancelled; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; -use clap::{self, Arg}; +use clap::Arg; use futures::TryFutureExt; use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 62015312a9..6e3eb8c1b0 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -358,8 +358,7 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{console::AuthSecret, scram::ServerSecret}; - use std::{sync::Arc, time::Duration}; + use crate::scram::ServerSecret; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 373138b09e..c7a2d467c0 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -4,7 +4,7 @@ use crate::{ }; use anyhow::Context; use once_cell::sync::Lazy; -use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; +use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use std::{convert::Infallible, future}; use tokio::net::{TcpListener, TcpStream}; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 595d9c4979..d866b1820f 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -16,7 +16,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBacken use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{auth, http, sasl, scram}; +use crate::{http, sasl, scram}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index ed89e51754..e0c2d836f4 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -11,7 +11,6 @@ use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::config::SslMode; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 53e7c1c2ee..7d705ba049 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -667,7 +667,6 @@ impl Drop for Client { #[cfg(test)] mod tests { - use env_logger; use std::{mem, sync::atomic::AtomicBool}; use super::*; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index c39c1dbf28..d822c87c0e 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -19,8 +19,6 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; -use std::convert::TryInto; - pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 7; @@ -219,12 +217,9 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::FileStorage; use super::*; - use crate::SafeKeeperConf; - use anyhow::Result; use tokio::fs; - use utils::{id::TenantTimelineId, lsn::Lsn}; + use utils::lsn::Lsn; fn stub_conf() -> SafeKeeperConf { let workdir = camino_tempfile::tempdir().unwrap().into_path(); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 761541168c..f45bfb95fa 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,8 +2,7 @@ //! protocol commands. use anyhow::Context; -use std::str::FromStr; -use std::str::{self}; +use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, info_span, Instrument}; @@ -16,8 +15,8 @@ use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; +use postgres_backend::PostgresBackend; use postgres_backend::QueryError; -use postgres_backend::{self, PostgresBackend}; use postgres_ffi::PG_TLI; use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use regex::Regex; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 71e77334a1..b933d391ab 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2180,6 +2180,11 @@ class NeonAttachmentService(MetricsGetter): self.stop(immediate=True) +@dataclass +class LogCursor: + _line_no: int + + class NeonPageserver(PgProtocol): """ An object representing a running pageserver. @@ -2343,7 +2348,18 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def log_contains(self, pattern: str) -> Optional[str]: + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: """Check that the pageserver log contains a line that matches the given regex""" logfile = self.workdir / "pageserver.log" if not logfile.exists(): @@ -2357,12 +2373,17 @@ class NeonPageserver(PgProtocol): # no guarantee it is already present in the log file. This hasn't # been a problem in practice, our python tests are not fast enough # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 with logfile.open("r") as f: for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue if contains_re.search(line): # found it! - return line - + cur_line_no += 1 + return (line, LogCursor(cur_line_no)) return None def tenant_attach( diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index ad3efb5837..b8e20c451f 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -286,7 +286,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) def tenant_location_conf( - self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None + self, + tenant_id: Union[TenantId, TenantShardId], + location_conf=dict[str, Any], + flush_ms=None, + lazy: Optional[bool] = None, ): body = location_conf.copy() body["tenant_id"] = str(tenant_id) @@ -295,6 +299,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if flush_ms is not None: params["flush_ms"] = str(flush_ms) + if lazy is not None: + params["lazy"] = "true" if lazy else "false" + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", json=body, diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 1415038f69..cf64c86821 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -20,7 +20,7 @@ def assert_tenant_state( tenant: TenantId, expected_state: str, message: Optional[str] = None, -): +) -> None: tenant_status = pageserver_http.tenant_status(tenant) log.info(f"tenant_status: {tenant_status}") assert tenant_status["state"]["slug"] == expected_state, message or tenant_status @@ -206,8 +206,8 @@ def wait_for_last_record_lsn( return current_lsn if i % 10 == 0: log.info( - "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 + "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + tenant, timeline, lsn, current_lsn, i + 1 ) ) time.sleep(0.1) @@ -292,7 +292,7 @@ def timeline_delete_wait_completed( iterations: int = 20, interval: Optional[float] = None, **delete_args, -): +) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) @@ -302,7 +302,7 @@ def assert_prefix_empty( remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, -): +) -> None: assert remote_storage is not None response = list_prefix(remote_storage, prefix) keys = response["KeyCount"] diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 91f33e1196..7fc3bae3af 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -369,7 +369,12 @@ def start_in_background( return spawned_process -def wait_until(number_of_iterations: int, interval: float, func: Fn): +WaitUntilRet = TypeVar("WaitUntilRet") + + +def wait_until( + number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] +) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. @@ -387,6 +392,18 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): raise Exception("timed out while waiting for %s" % func) from last_exception +def assert_eq(a, b) -> None: + assert a == b + + +def assert_gt(a, b) -> None: + assert a > b + + +def assert_ge(a, b) -> None: + assert a >= b + + def run_pg_bench_small(pg_bin: "PgBin", connstr: str): """ Fast way to populate data. diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 6cae663842..7fbce6a10c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -63,10 +63,11 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N ] ) - def log_contains_bad_request(): - env.pageserver.log_contains(".*Error processing HTTP request: Bad request") - - wait_until(50, 0.1, log_contains_bad_request) + wait_until( + 50, + 0.1, + lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), + ) def test_null_body(negative_env: NegativeTests): diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index eb4e370ea7..b83545216d 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -200,7 +200,7 @@ class EvictionEnv: tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id) def statvfs_called(): - assert pageserver.log_contains(".*running mocked statvfs.*") + pageserver.assert_log_contains(".*running mocked statvfs.*") # we most likely have already completed multiple runs wait_until(10, 1, statvfs_called) @@ -533,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E assert actual_change >= target, "eviction must always evict more than target" time.sleep(1) # give log time to flush - assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE) env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) @@ -767,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") + env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO") @@ -801,10 +801,9 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") - - wait_until(10, 1, relieved_log_message) + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) def less_than_max_usage_pct(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) @@ -845,10 +844,9 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") - - wait_until(10, 1, relieved_log_message) + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) def more_than_min_avail_bytes_freed(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index 224e6f50c7..cb4fa43be7 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) time.sleep(10) # let compaction to be performed - assert env.pageserver.log_contains("compact-level0-phase1-return-same") + env.pageserver.assert_log_contains("compact-level0-phase1-return-same") def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py new file mode 100644 index 0000000000..5231dedcda --- /dev/null +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -0,0 +1,84 @@ +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + + +def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_explain_with_lfc_stats" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + + log.info(f"warming up caches with sequential scan in {endpoint.connstr()}") + cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0") + + log.info("running explain analyze without LFC values to verify they do not show up in the plan") + cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0") + rows = cur.fetchall() + plan = "\n".join(r[0] for r in rows) + log.debug(plan) + assert "Seq Scan on pgbench_accounts" in plan + assert "Buffers: shared hit" in plan + assert "File cache: hits=" not in plan + log.info("running explain analyze WITH LFC values to verify they do now show up") + cur.execute( + "EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0" + ) + rows = cur.fetchall() + plan = "\n".join(r[0] for r in rows) + log.debug(plan) + assert "Seq Scan on pgbench_accounts" in plan + assert "Buffers: shared hit" in plan + assert "File cache: hits=" in plan + log.info("running explain analyze WITH LFC values to verify json output") + cur.execute( + "EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0" + ) + jsonplan = cur.fetchall()[0][0] + log.debug(jsonplan) + # Directly access the 'Plan' part of the first element of the JSON array + plan_details = jsonplan[0]["Plan"] + + # Extract "File Cache Hits" and "File Cache Misses" + file_cache_hits = plan_details.get("File Cache Hits") + file_cache_misses = plan_details.get("File Cache Misses") + + # Now you can assert the values + assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}" + assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}" diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 999e077e45..9da47b9fd3 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -184,10 +184,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): # NB: the layer file is unlinked index part now, but, because we made the delete # operation stuck, the layer file itself is still in the remote_storage - def delete_at_pause_point(): - assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}") - - wait_until(10, 0.5, delete_at_pause_point) + wait_until( + 10, + 0.5, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*at failpoint.*{failpoint_name}" + ), + ) future_layer_path = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach ) diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py new file mode 100644 index 0000000000..a6f05fe0f7 --- /dev/null +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -0,0 +1,74 @@ +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_approximate_working_set_size" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + cur.execute("create extension neon") + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + # prepare index access below + cur.execute( + "ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)" + ) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + # ensure correct query plans and stats + cur.execute("vacuum ANALYZE pgbench_accounts") + # determine table size - working set should approximate table size after sequential scan + pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'") + log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero") + cur.execute("select approximate_working_set_size(true)") + cur.execute( + 'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb' + ) + # verify working set size after sequential scan matches table size and reset working set for next test + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after sequential scan on pgbench_accounts {blocks}") + assert pages * 0.8 < blocks < pages * 1.2 + # run a few point queries with index lookup + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242") + # verify working set size after some index access of a few select pages only + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after some index access of a few select pages only {blocks}") + assert blocks < 10 diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index d62b5e531c..bfffad7572 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -34,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): def assert_logged(): if not log_expected: return - assert env.pageserver.log_contains(f".*{msg_id}.*") + env.pageserver.assert_log_contains(f".*{msg_id}.*") wait_until(10, 0.5, assert_logged) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 672f2b495d..1179a3afe9 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -23,7 +23,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.2",) + assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index ca8ada4ddb..e0364dd13f 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,9 +1,12 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.utils import wait_until +@skip_on_postgres( + PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969" +) def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 1070d06ed0..89fc48a49f 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -432,7 +432,7 @@ def test_deletion_queue_recovery( main_pageserver.start() - def assert_deletions_submitted(n: int): + def assert_deletions_submitted(n: int) -> None: assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n # After restart, issue a flush to kick the deletion frontend to do recovery. diff --git a/test_runner/regress/test_pageserver_small_inmemory_layers.py b/test_runner/regress/test_pageserver_small_inmemory_layers.py new file mode 100644 index 0000000000..5d55020e3c --- /dev/null +++ b/test_runner/regress/test_pageserver_small_inmemory_layers.py @@ -0,0 +1,110 @@ +import asyncio +import time +from typing import Tuple + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + tenant_get_shards, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until + +TIMELINE_COUNT = 10 +ENTRIES_PER_TIMELINE = 10_000 +CHECKPOINT_TIMEOUT_SECONDS = 60 + +TENANT_CONF = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. + "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", +} + + +async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF) + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + conn = await ep.connect_async() + try: + await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + await conn.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + ) + finally: + await conn.close(timeout=10) + + last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + return tenant, timeline, last_flush_lsn + + +async def workload( + env: NeonEnv, timelines: int, entries: int +) -> list[Tuple[TenantId, TimelineId, Lsn]]: + workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)] + return await asyncio.gather(*workers) + + +def wait_until_pageserver_is_caught_up( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + ) + assert waited >= last_flush_lsn + + +def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: + def query(): + value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") + assert value is not None + return value + + # The metric gets initialised on the first update. + # Retry a few times, but return 0 if it's stable. + try: + return float(wait_until(3, 0.5, query)) + except Exception: + return 0 + + +@pytest.mark.parametrize("immediate_shutdown", [True, False]) +def test_pageserver_small_inmemory_layers( + neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool +): + """ + Test that open layers get flushed after the `checkpoint_timeout` config + and do not require WAL reingest upon restart. + + The workload creates a number of timelines and writes some data to each, + but not enough to trigger flushes via the `checkpoint_distance` config. + """ + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + ps_http_client = env.pageserver.http_client() + total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info("Sleeping for checkpoint timeout ...") + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5) + + env.pageserver.restart(immediate=immediate_shutdown) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") + log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") + + leeway = total_wal_ingested_before_restart * 5 / 100 + assert total_wal_ingested_after_restart <= leeway diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 73ebe0a76f..f8a0bef954 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -28,7 +28,14 @@ from fixtures.remote_storage import ( available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import print_gc_result, query_scalar, wait_until +from fixtures.utils import ( + assert_eq, + assert_ge, + assert_gt, + print_gc_result, + query_scalar, + wait_until, +) from requests import ReadTimeout @@ -120,10 +127,10 @@ def test_remote_storage_backup_and_restore( log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -292,9 +299,9 @@ def test_remote_storage_upload_queue_retries( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -322,17 +329,17 @@ def test_remote_storage_upload_queue_retries( churn_while_failpoints_active_thread.start() # wait for churn thread's data to get stuck in the upload queue - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0) + wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) + wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts. - wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. @@ -884,26 +891,23 @@ def wait_upload_queue_empty( wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="index", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="delete" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 + ), ) diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 6ed49d7fd6..bc77dfd084 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -116,7 +116,7 @@ def test_sharding_service_smoke( # Marking a pageserver offline should migrate tenants away from it. env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) - def node_evacuated(node_id: int): + def node_evacuated(node_id: int) -> None: counts = get_node_shard_counts(env, tenant_ids) assert counts[node_id] == 0 @@ -146,6 +146,8 @@ def test_sharding_service_smoke( for tid in tenant_ids: tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10) + env.attachment_service.consistency_check() + # Set a scheduling policy on one node, create all the tenants, observe # that the scheduling policy is respected. env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"}) @@ -256,9 +258,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): env.attachment_service.consistency_check() -def test_sharding_service_onboarding( - neon_env_builder: NeonEnvBuilder, -): +@pytest.mark.parametrize("warm_up", [True, False]) +def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): """ We onboard tenants to the sharding service by treating it as a 'virtual pageserver' which provides the /location_config API. This is similar to creating a tenant, @@ -306,6 +307,23 @@ def test_sharding_service_onboarding( }, ) + if warm_up: + origin_ps.http_client().tenant_heatmap_upload(tenant_id) + + # We expect to be called via live migration code, which may try to configure the tenant into secondary + # mode before attaching it. + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + "generation": None, + }, + ) + + virtual_ps_http.tenant_secondary_download(tenant_id) + # Call into attachment service to onboard the tenant generation += 1 virtual_ps_http.tenant_location_conf( @@ -351,7 +369,9 @@ def test_sharding_service_onboarding( assert len(dest_tenants) == 1 assert TenantId(dest_tenants[0]["id"]) == tenant_id - # sharding service advances generation by 1 when it first attaches + # sharding service advances generation by 1 when it first attaches. We started + # with a nonzero generation so this equality also proves that the generation + # was properly carried over during onboarding. assert dest_tenants[0]["generation"] == generation + 1 # The onboarded tenant should survive a restart of sharding service @@ -362,6 +382,31 @@ def test_sharding_service_onboarding( dest_ps.stop() dest_ps.start() + # Having onboarded via /location_config, we should also be able to update the + # TenantConf part of LocationConf, without inadvertently resetting the generation + modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100} + dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id) + + # The generation has moved on since we onboarded + assert generation != dest_tenant_before_conf_change["generation"] + + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": modified_tenant_conf, + # This is intentionally a stale generation + "generation": generation, + }, + ) + dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id) + assert ( + dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] + ) + dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf + env.attachment_service.consistency_check() @@ -405,7 +450,7 @@ def test_sharding_service_compute_hook( env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) - def node_evacuated(node_id: int): + def node_evacuated(node_id: int) -> None: counts = get_node_shard_counts(env, [env.initial_tenant]) assert counts[node_id] == 0 @@ -667,3 +712,41 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): svc.request( "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API) ) + + +def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): + """ + Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without + supplying the whole LocationConf. + """ + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + + http = env.attachment_service.pageserver_api() + + default_value = "7days" + new_value = "1h" + http.set_tenant_config(tenant_id, {"pitr_interval": new_value}) + + # Ensure the change landed on the storage controller + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] == new_value + assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value + + # Ensure the change made it down to the pageserver + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == new_value + assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value + + # Omitting a value clears it. This looks different in storage controller + # vs. pageserver API calls, because pageserver has defaults. + http.set_tenant_config(tenant_id, {}) + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] is None + assert readback_controller.tenant_specific_overrides["pitr_interval"] is None + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == default_value + assert "pitr_interval" not in readback_ps.tenant_specific_overrides + + env.attachment_service.consistency_check() diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index a2ffd200a6..fc099297e1 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "period": "20s", "threshold": "23h", } - assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024 + assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024 # restart the pageserver and ensure that the config is still correct env.pageserver.stop() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 8c7d332e1d..c4b4e5fb77 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -505,10 +505,10 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") def hit_run_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") with concurrent.futures.ThreadPoolExecutor() as executor: background_200_req = executor.submit(delete_tenant) @@ -612,12 +612,12 @@ def test_tenant_delete_races_timeline_creation( Thread(target=timeline_create).start() def hit_initdb_upload_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") wait_until(100, 0.1, hit_initdb_upload_failpoint) def creation_connection_timed_out(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( "POST.*/timeline.* request was dropped before completing" ) @@ -636,7 +636,7 @@ def test_tenant_delete_races_timeline_creation( Thread(target=tenant_delete).start() def deletion_arrived(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" ) @@ -663,7 +663,7 @@ def test_tenant_delete_races_timeline_creation( ) # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file - assert env.pageserver.log_contains(CANCELLED_ERROR) + env.pageserver.assert_log_contains(CANCELLED_ERROR) assert not env.pageserver.log_contains( ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion" ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 4752699abb..d3f24cb06e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -92,10 +92,10 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index b70131472a..9def3ad1c2 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -495,7 +495,7 @@ def test_emergency_relocate_with_branches_slow_replay( assert cur.fetchall() == [("before pause",), ("after pause",)] # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up @@ -632,7 +632,7 @@ def test_emergency_relocate_with_branches_createdb( assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200 # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 1c693a0df5..d16978d02a 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -147,10 +147,10 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 5f72cfd747..7bf49a0874 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -179,6 +179,6 @@ def test_threshold_based_eviction( assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized" assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident" - assert env.pageserver.log_contains( - metrics_refused_log_line + assert ( + env.pageserver.log_contains(metrics_refused_log_line) is not None ), "ensure the metrics collection worker ran" diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index a6a6fb47cc..795110d90b 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -89,6 +89,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver + # TODO: review whether this wait_until is actually necessary, we do an await() internally wait_until( number_of_iterations=3, interval=0.2, @@ -531,7 +532,7 @@ def test_concurrent_timeline_delete_stuck_on( try: def first_call_hit_failpoint(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) @@ -602,7 +603,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*" def hit_failpoint(): - assert env.pageserver.log_contains(at_failpoint_log_message) + env.pageserver.assert_log_contains(at_failpoint_log_message) wait_until(50, 0.1, hit_failpoint) @@ -612,7 +613,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.append(hangup_log_message) def got_hangup_log_message(): - assert env.pageserver.log_contains(hangup_log_message) + env.pageserver.assert_log_contains(hangup_log_message) wait_until(50, 0.1, got_hangup_log_message) @@ -624,7 +625,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def first_request_finished(): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" - assert env.pageserver.log_contains(message) + env.pageserver.assert_log_contains(message) wait_until(50, 0.1, first_request_finished) @@ -759,7 +760,7 @@ def test_delete_orphaned_objects( for orphan in orphans: assert not orphan.exists() - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f"deleting a file not referenced from index_part.json name={orphan.stem}" ) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 327e5abe26..cbf7059c92 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -2,6 +2,7 @@ import concurrent.futures import math import random import time +from collections import defaultdict from contextlib import closing from pathlib import Path from typing import Optional @@ -14,6 +15,7 @@ from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + NeonPageserver, PgBin, VanillaPostgres, wait_for_last_flush_lsn, @@ -839,22 +841,40 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): ) # Deleting a stuck tenant should prompt it to go active + # in some cases, it has already been activated because it's behind the detach + delete_lazy_activating(delete_tenant_id, env.pageserver, expect_attaching=False) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one + # we detached) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 2 + + +def delete_lazy_activating( + delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool +): + pageserver_http = pageserver.http_client() + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating + # logical size is paused in a failpoint. So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" + + if expect_attaching: + assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching" + with concurrent.futures.ThreadPoolExecutor() as executor: log.info("Starting background delete") + def activated_on_demand(): + assert pageserver.log_contains(log_match) is not None + def delete_tenant(): - env.pageserver.http_client().tenant_delete(delete_tenant_id) + pageserver_http.tenant_delete(delete_tenant_id) background_delete = executor.submit(delete_tenant) - # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating - # logical size is paused in a failpoint. So instead we will use a log observation to check that - # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" - - def activated_on_demand(): - assert env.pageserver.log_contains(log_match) is not None - log.info(f"Waiting for activation message '{log_match}'") try: wait_until(10, 1, activated_on_demand) @@ -868,12 +888,6 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Poll for deletion to complete wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) - tenant_ids.remove(delete_tenant_id) - - # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one - # we detached) - wait_until(10, 1, all_active) - assert len(get_tenant_states()) == n_tenants - 2 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): @@ -939,3 +953,159 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): client.configure_failpoints( [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] ) + + +def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # the supporting_second does nothing except queue behind env.initial_tenant + # for purposes of showing that eager_tenant breezes past the queue + supporting_second, _ = env.neon_cli.create_tenant() + eager_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + eager_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + tenant_ids = [env.initial_tenant, supporting_second] + + def get_tenant_states() -> dict[str, list[TenantId]]: + states = defaultdict(list) + for id in tenant_ids: + state = client.tenant_status(id)["state"]["slug"] + states[state].append(id) + return dict(states) + + def one_is_active(): + states = get_tenant_states() + log.info(f"{states}") + assert len(states["Active"]) == 1 + + wait_until(10, 1, one_is_active) + + def other_is_attaching(): + states = get_tenant_states() + assert len(states["Attaching"]) == 1 + + wait_until(10, 1, other_is_attaching) + + def eager_tenant_is_active(): + resp = client.tenant_status(eager_tenant) + assert resp["state"]["slug"] == "Active" + + gen = env.attachment_service.attach_hook_issue(eager_tenant, env.pageserver.id) + client.tenant_location_conf( + eager_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=False, + ) + wait_until(10, 1, eager_tenant_is_active) + + other_is_attaching() + + client.configure_failpoints( + [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] + ) + + +@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) +def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): + # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue + lazy_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + lazy_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + def initial_tenant_is_active(): + resp = client.tenant_status(env.initial_tenant) + assert resp["state"]["slug"] == "Active" + + wait_until(10, 1, initial_tenant_is_active) + + # even though the initial tenant is now active, because it was startup time + # attach, it will consume the only permit because logical size calculation + # is paused. + + gen = env.attachment_service.attach_hook_issue(lazy_tenant, env.pageserver.id) + client.tenant_location_conf( + lazy_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=True, + ) + + def lazy_tenant_is_attaching(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Attaching" + + # paused logical size calculation of env.initial_tenant is keeping it attaching + wait_until(10, 1, lazy_tenant_is_attaching) + + for _ in range(5): + lazy_tenant_is_attaching() + time.sleep(0.5) + + def lazy_tenant_is_active(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Active" + + if activation_method == "endpoint": + with env.endpoints.create_start("main", tenant_id=lazy_tenant): + # starting up the endpoint should make it jump the queue + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "branch": + env.neon_cli.create_timeline("second_branch", lazy_tenant) + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "delete": + delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) + else: + raise RuntimeError(activation_method)