diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 810c61de2d..2bcda7cc8e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -461,6 +461,7 @@ jobs: - name: Pytest regression tests uses: ./.github/actions/run-python-test-set + timeout-minutes: 60 with: build_type: ${{ matrix.build_type }} test_selection: regress diff --git a/Cargo.lock b/Cargo.lock index e35fa564b9..625958fdb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,8 +282,10 @@ dependencies = [ "control_plane", "diesel", "diesel_migrations", + "fail", "futures", "git-version", + "hex", "humantime", "hyper", "metrics", @@ -1344,6 +1346,7 @@ dependencies = [ "futures", "git-version", "hex", + "humantime", "hyper", "nix 0.27.1", "once_cell", @@ -3527,6 +3530,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", + "procfs", "rand 0.8.5", "regex", "remote_storage", @@ -5342,13 +5346,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", "digest", + "sha2-asm", +] + +[[package]] +name = "sha2-asm" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e" +dependencies = [ + "cc", ] [[package]] @@ -5884,7 +5898,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" dependencies = [ "futures", "nix 0.26.4", @@ -6434,7 +6448,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" dependencies = [ "bytes", "io-uring", @@ -6477,6 +6491,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arc-swap", + "async-compression", "async-trait", "bincode", "byteorder", @@ -6515,6 +6530,7 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", + "tokio-tar", "tokio-test", "tokio-util", "tracing", @@ -6522,6 +6538,7 @@ dependencies = [ "tracing-subscriber", "url", "uuid", + "walkdir", "workspace_hack", ] @@ -7039,6 +7056,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sha2", "smallvec", "subtle", "syn 1.0.109", diff --git a/README.md b/README.md index c44ae695d6..00a90f4483 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,14 @@ If you encounter errors during setting up the initial tenant, it's best to stop ## Running tests +### Rust unit tests + +We use [`cargo-nextest`](https://nexte.st/) to run the tests in GitHub Actions workflows. +Some crates no longer support plain `cargo test`; prefer `cargo nextest run` instead. +You can install `cargo-nextest` with `cargo install cargo-nextest`.
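For example, a usage sketch of the commands this README hunk describes (the crate name is illustrative):

```sh
# Install the runner once, then use it in place of `cargo test`
cargo install cargo-nextest

# Run the whole workspace's unit tests, or a single crate's
cargo nextest run
cargo nextest run -p pageserver
```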
+ +### Integration tests + Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). ```sh diff --git a/clippy.toml b/clippy.toml index 5f7dc66152..4c0c04f9a1 100644 --- a/clippy.toml +++ b/clippy.toml @@ -2,6 +2,8 @@ disallowed-methods = [ "tokio::task::block_in_place", # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", + # use tokio_epoll_uring_ext instead + "tokio_epoll_uring::thread_local_system", ] disallowed-macros = [ diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index ba3a84cda8..3b596a88ff 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -743,19 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // which may happen in two cases: // - extension was just installed // - extension was already installed and is up to date - let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension version with query: {}", query); - client.simple_query(query)?; + // DISABLED due to compute node unpinning epic + // let query = "ALTER EXTENSION neon UPDATE"; + // info!("update neon extension version with query: {}", query); + // client.simple_query(query)?; Ok(()) } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade"); - let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension version with query: {}", query); - client.simple_query(query)?; +pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade (not really)"); + // DISABLED due to compute node unpinning epic + // let query = "ALTER EXTENSION neon UPDATE"; + // info!("update neon extension version with query: {}", query); + // client.simple_query(query)?; Ok(()) } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 75e5dcb7f8..b544a8c587 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -12,6 +12,7 @@ clap.workspace = true comfy-table.workspace = true futures.workspace = true git-version.workspace = true +humantime.workspace = true nix.workspace = true once_cell.workspace = true postgres.workspace = true diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index a5fad7216c..f78f56c480 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -19,8 +19,10 @@ aws-config.workspace = true aws-sdk-secretsmanager.workspace = true camino.workspace = true clap.workspace = true +fail.workspace = true futures.workspace = true git-version.workspace = true +hex.workspace = true hyper.workspace = true humantime.workspace = true once_cell.workspace = true diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql new file mode 100644 index 0000000000..897c7e0d01 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}'; +UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}'; \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql 
b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql new file mode 100644 index 0000000000..c898ac9aee --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}'; +UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"'; \ No newline at end of file diff --git a/control_plane/attachment_service/src/heartbeater.rs b/control_plane/attachment_service/src/heartbeater.rs new file mode 100644 index 0000000000..e15de28920 --- /dev/null +++ b/control_plane/attachment_service/src/heartbeater.rs @@ -0,0 +1,227 @@ +use futures::{stream::FuturesUnordered, StreamExt}; +use std::{ + collections::HashMap, + sync::Arc, + time::{Duration, Instant}, +}; +use tokio_util::sync::CancellationToken; + +use pageserver_api::{ + controller_api::{NodeAvailability, UtilizationScore}, + models::PageserverUtilization, +}; + +use thiserror::Error; +use utils::id::NodeId; + +use crate::node::Node; + +struct HeartbeaterTask { + receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>, + cancel: CancellationToken, + + state: HashMap<NodeId, PageserverState>, + + max_unavailable_interval: Duration, + jwt_token: Option<String>, +} + +#[derive(Debug, Clone)] +pub(crate) enum PageserverState { + Available { + last_seen_at: Instant, + utilization: PageserverUtilization, + }, + Offline, +} + +#[derive(Debug)] +pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>); + +#[derive(Debug, Error)] +pub(crate) enum HeartbeaterError { + #[error("Cancelled")] + Cancel, } + +struct HeartbeatRequest { + pageservers: Arc<HashMap<NodeId, Node>>, + reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>, +} + +pub(crate) struct Heartbeater { + sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>, +} + +impl Heartbeater { + pub(crate) fn new( + jwt_token: Option<String>, + max_unavailable_interval: Duration, + cancel: CancellationToken, + ) -> Self { + let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>(); + let mut heartbeater = + HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel); + tokio::task::spawn(async move { heartbeater.run().await }); + + Self { sender } + } + + pub(crate) async fn heartbeat( + &self, + pageservers: Arc<HashMap<NodeId, Node>>, + ) -> Result<AvailablityDeltas, HeartbeaterError> { + let (sender, receiver) = tokio::sync::oneshot::channel(); + self.sender + .send(HeartbeatRequest { + pageservers, + reply: sender, + }) + .unwrap(); + + receiver.await.unwrap() + } +} + +impl HeartbeaterTask { + fn new( + receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>, + jwt_token: Option<String>, + max_unavailable_interval: Duration, + cancel: CancellationToken, + ) -> Self { + Self { + receiver, + cancel, + state: HashMap::new(), + max_unavailable_interval, + jwt_token, + } + } + + async fn run(&mut self) { + loop { + tokio::select! { + request = self.receiver.recv() => { + match request { + Some(req) => { + let res = self.heartbeat(req.pageservers).await; + req.reply.send(res).unwrap(); + }, + None => { return; } + } + }, + _ = self.cancel.cancelled() => return + } + } + } + + async fn heartbeat( + &mut self, + pageservers: Arc<HashMap<NodeId, Node>>, + ) -> Result<AvailablityDeltas, HeartbeaterError> { + let mut new_state = HashMap::new(); + + let mut heartbeat_futs = FuturesUnordered::new(); + for (node_id, node) in &*pageservers { + heartbeat_futs.push({ + let jwt_token = self.jwt_token.clone(); + let cancel = self.cancel.clone(); + + // Clone the node and mark it as available such that the request + // goes through to the pageserver even when the node is marked offline.
+ // This doesn't impact the availability observed by [`crate::service::Service`]. + let mut node = node.clone(); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + + async move { + let response = node + .with_client_retries( + |client| async move { client.get_utilization().await }, + &jwt_token, + 2, + 3, + Duration::from_secs(1), + &cancel, + ) + .await; + + let response = match response { + Some(r) => r, + None => { + // This indicates cancellation of the request. + // We ignore the node in this case. + return None; + } + }; + + let status = if let Ok(utilization) = response { + PageserverState::Available { + last_seen_at: Instant::now(), + utilization, + } + } else { + PageserverState::Offline + }; + + Some((*node_id, status)) + } + }); + + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; + + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); + } + } + } + + let mut deltas = Vec::new(); + let now = Instant::now(); + for (node_id, ps_state) in new_state { + use std::collections::hash_map::Entry::*; + let entry = self.state.entry(node_id); + + let mut needs_update = false; + match entry { + Occupied(ref occ) => match (occ.get(), &ps_state) { + (PageserverState::Offline, PageserverState::Offline) => {} + (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => { + if now - *last_seen_at >= self.max_unavailable_interval { + deltas.push((node_id, ps_state.clone())); + needs_update = true; + } + } + _ => { + deltas.push((node_id, ps_state.clone())); + needs_update = true; + } + }, + Vacant(_) => { + deltas.push((node_id, ps_state.clone())); + } + } + + match entry { + Occupied(mut occ) if needs_update => { + (*occ.get_mut()) = ps_state; + } + Vacant(vac) => { + vac.insert(ps_state); + } + _ => {} + } + } + + Ok(AvailablityDeltas(deltas)) + } +} diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 27ba5bdb65..076b3a2f70 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -10,9 +10,11 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use std::sync::Arc; use std::time::{Duration, Instant}; +use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; +use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; -use utils::http::request::{must_get_query_param, parse_request_param}; +use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param}; use utils::id::{TenantId, TimelineId}; use utils::{ @@ -26,7 +28,7 @@ use utils::{ }; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, }; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; @@ -174,14 +176,14 @@ async fn handle_tenant_location_config( service: Arc<Service>, mut req: Request<Body>, ) -> Result<Response<Body>, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
json_response( StatusCode::OK, service - .tenant_location_config(tenant_id, config_req) + .tenant_location_config(tenant_shard_id, config_req) .await?, ) } @@ -246,8 +248,10 @@ async fn handle_tenant_secondary_download( req: Request<Body>, ) -> Result<Response<Body>, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - service.tenant_secondary_download(tenant_id).await?; - json_response(StatusCode::OK, ()) + let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + + let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; + json_response(status, progress) } async fn handle_tenant_delete( @@ -349,6 +353,16 @@ async fn handle_tenant_locate( json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } +async fn handle_tenant_describe( + service: Arc<Service>, + req: Request<Body>, +) -> Result<Response<Body>, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) +} + async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -387,7 +401,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, json_response( StatusCode::OK, - state.service.node_configure(config_req).await?, + state + .service + .node_configure( + config_req.node_id, + config_req.availability.map(NodeAvailability::from), + config_req.scheduling, + ) + .await?, ) } @@ -548,14 +569,17 @@ pub fn make_router( request_span(r, handle_node_drop) }) .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump)) + .get("/debug/v1/tenant/:tenant_id/locate", |r| { + tenant_service_handler(r, handle_tenant_locate) + }) .get("/debug/v1/scheduler", |r| { request_span(r, handle_scheduler_dump) }) .post("/debug/v1/consistency_check", |r| { request_span(r, handle_consistency_check) }) - .get("/control/v1/tenant/:tenant_id/locate", |r| { - tenant_service_handler(r, handle_tenant_locate) + .put("/debug/v1/failpoints", |r| { + request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) // Node operations .post("/control/v1/node", |r| { @@ -572,6 +596,9 @@ .put("/control/v1/tenant/:tenant_id/shard_split", |r| { tenant_service_handler(r, handle_tenant_shard_split) }) + .get("/control/v1/tenant/:tenant_id", |r| { + tenant_service_handler(r, handle_tenant_describe) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. @@ -587,7 +614,7 @@ .get("/v1/tenant/:tenant_id/config", |r| { tenant_service_handler(r, handle_tenant_config_get) }) - .put("/v1/tenant/:tenant_id/location_config", |r| { + .put("/v1/tenant/:tenant_shard_id/location_config", |r| { tenant_service_handler(r, handle_tenant_location_config) }) .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/control_plane/attachment_service/src/id_lock_map.rs new file mode 100644 index 0000000000..b03700b50c --- /dev/null +++ b/control_plane/attachment_service/src/id_lock_map.rs @@ -0,0 +1,54 @@ +use std::{collections::HashMap, sync::Arc}; + +/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't +/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
+/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking +/// is needed at a tenant-wide granularity. +pub(crate) struct IdLockMap<T> +where + T: Eq + PartialEq + std::hash::Hash, +{ + /// A synchronous lock for getting/setting the async locks that our callers will wait on. + entities: std::sync::Mutex<HashMap<T, Arc<tokio::sync::RwLock<()>>>>, +} + +impl<T> IdLockMap<T> +where + T: Eq + PartialEq + std::hash::Hash, +{ + pub(crate) fn shared( + &self, + key: T, + ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().read_owned() + } + + pub(crate) fn exclusive( + &self, + key: T, + ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().write_owned() + } + + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do + /// periodic housekeeping to avoid the map growing indefinitely + pub(crate) fn housekeeping(&self) { + let mut locked = self.entities.lock().unwrap(); + locked.retain(|_k, lock| lock.try_write().is_err()) + } +} + +impl<T> Default for IdLockMap<T> +where + T: Eq + PartialEq + std::hash::Hash, +{ + fn default() -> Self { + Self { + entities: std::sync::Mutex::new(HashMap::new()), + } + } +} diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 796b465c10..4aff29f15b 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -3,7 +3,9 @@ use utils::seqwait::MonotonicCounter; mod auth; mod compute_hook; +mod heartbeater; pub mod http; +mod id_lock_map; pub mod metrics; mod node; pub mod persistence; diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 333c3911e3..0a925a63f6 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -2,7 +2,7 @@ use anyhow::{anyhow, Context}; use attachment_service::http::make_router; use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; -use attachment_service::service::{Config, Service}; +use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use aws_config::{BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; @@ -54,6 +54,10 @@ struct Cli { /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service #[arg(long)] database_url: Option<String>, + + /// Grace period before marking an unresponsive pageserver offline + #[arg(long)] + max_unavailable_interval: Option<humantime::Duration>, } /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this @@ -206,6 +210,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> { } fn main() -> anyhow::Result<()> { + let default_panic = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + default_panic(info); + std::process::exit(1); + })); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections.
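As a usage sketch of the `IdLockMap` added in `id_lock_map.rs` above (assumptions: plain `u64` keys stand in for `TenantId`/`NodeId`, and this is a trimmed, hypothetical copy rather than the controller's real call sites):

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

/// Trimmed-down copy of the IdLockMap idea above, keyed by plain u64.
#[derive(Default)]
struct IdLockMap {
    entities: Mutex<HashMap<u64, Arc<tokio::sync::RwLock<()>>>>,
}

impl IdLockMap {
    /// Return a future for an exclusive (write) lock on `key`. The outer
    /// Mutex is held only long enough to clone the Arc, so callers never
    /// await while holding it.
    fn exclusive(
        &self,
        key: u64,
    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
        let mut locked = self.entities.lock().unwrap();
        locked.entry(key).or_default().clone().write_owned()
    }

    /// Drop entries whose lock is currently unheld (try_write succeeds).
    fn housekeeping(&self) {
        let mut locked = self.entities.lock().unwrap();
        locked.retain(|_k, lock| lock.try_write().is_err())
    }
}

#[tokio::main]
async fn main() {
    let locks = IdLockMap::default();

    // Hold the per-id lock across an async mutation, the way the storage
    // controller guards tenant create/delete/split.
    let guard = locks.exclusive(42).await;
    // ... mutate state for id 42; concurrent writers on 42 wait here ...
    drop(guard);

    locks.housekeeping(); // the id-42 entry is unheld now, so it is dropped
}
```

The design point is that `exclusive()` returns an owned guard future, so the synchronous map lock is never held across an `.await`; stale entries are reclaimed lazily by `housekeeping()` instead of on guard drop.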
@@ -243,6 +253,10 @@ async fn async_main() -> anyhow::Result<()> { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, compute_hook_url: args.compute_hook_url, + max_unavailable_interval: args + .max_unavailable_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 27b03608fa..4167782715 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -12,7 +12,7 @@ use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; -use crate::persistence::NodePersistence; +use crate::{persistence::NodePersistence, scheduler::MaySchedule}; /// Represents the in-memory description of a Node. /// @@ -83,29 +83,38 @@ impl Node { } } - pub(crate) fn set_availability( - &mut self, - availability: NodeAvailability, - ) -> AvailabilityTransition { - use NodeAvailability::*; - let transition = match (self.availability, availability) { - (Offline, Active) => { + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + match self.get_availability_transition(availability) { + AvailabilityTransition::ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation // state. For example, Reconcilers in flight will have to complete and be spawned // again to realize that the node has become available. self.cancel = CancellationToken::new(); - AvailabilityTransition::ToActive } - (Active, Offline) => { + AvailabilityTransition::ToOffline => { // Fire the node's cancellation token to cancel any in-flight API requests to it self.cancel.cancel(); - AvailabilityTransition::ToOffline } - _ => AvailabilityTransition::Unchanged, - }; + AvailabilityTransition::Unchanged => {} + } self.availability = availability; - transition + } + + /// Without modifying the availability of the node, convert the intended availability + /// into a description of the transition. + pub(crate) fn get_availability_transition( + &self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use AvailabilityTransition::*; + use NodeAvailability::*; + + match (self.availability, availability) { + (Offline, Active(_)) => ToActive, + (Active(_), Offline) => ToOffline, + _ => Unchanged, + } } /// Whether we may send API requests to this node. @@ -114,21 +123,21 @@ impl Node { // a reference to the original Node's cancellation status. Checking both of these results // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable // when we cloned it, or if the original Node instance's cancellation token was fired. - matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled() + matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled() } /// Is this node elegible to have work scheduled onto it? 
- pub(crate) fn may_schedule(&self) -> bool { - match self.availability { - NodeAvailability::Active => {} - NodeAvailability::Offline => return false, - } + pub(crate) fn may_schedule(&self) -> MaySchedule { + let score = match self.availability { + NodeAvailability::Active(score) => score, + NodeAvailability::Offline => return MaySchedule::No, + }; match self.scheduling { - NodeSchedulingPolicy::Active => true, - NodeSchedulingPolicy::Draining => false, - NodeSchedulingPolicy::Filling => true, - NodeSchedulingPolicy::Pause => false, + NodeSchedulingPolicy::Active => MaySchedule::Yes(score), + NodeSchedulingPolicy::Draining => MaySchedule::No, + NodeSchedulingPolicy::Filling => MaySchedule::Yes(score), + NodeSchedulingPolicy::Pause => MaySchedule::No, } } @@ -146,8 +155,7 @@ impl Node { listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Filling, - // TODO: we shouldn't really call this Active until we've heartbeated it. - availability: NodeAvailability::Active, + availability: NodeAvailability::Offline, cancel: CancellationToken::new(), } } diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index aa08945834..209d8ff075 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -11,6 +11,9 @@ use diesel::prelude::*; use diesel::Connection; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; +use pageserver_api::shard::ShardConfigError; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; use utils::generation::Generation; @@ -72,6 +75,14 @@ pub(crate) enum DatabaseError { Logical(String), } +#[must_use] +pub(crate) enum AbortShardSplitStatus { + /// We aborted the split in the database by reverting to the parent shards + Aborted, + /// The split had already been persisted. + Complete, +} + pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>; impl Persistence { @@ -200,15 +211,10 @@ let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes) .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for (tenant_id, tenant) in &mut decoded.tenants { - // Backward compat: an old attachments.json from before PR #6251, replace - // empty strings with proper defaults. - if tenant.tenant_id.is_empty() { - tenant.tenant_id = tenant_id.to_string(); - tenant.config = serde_json::to_string(&TenantConfig::default()) - .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single) - .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + for shard in decoded.tenants.values_mut() { + if shard.placement_policy == "\"Single\"" { + // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 + shard.placement_policy = "{\"Attached\":0}".to_string(); } } @@ -570,6 +576,51 @@ }) .await } + + /// Used when the remote part of a shard split failed: we will revert the database state to have only + /// the parent shards, with SplitState::Idle.
+ pub(crate) async fn abort_shard_split( + &self, + split_tenant_id: TenantId, + new_shard_count: ShardCount, + ) -> DatabaseResult<AbortShardSplitStatus> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> { + let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> { + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; + + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } + + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ + count {new_shard_count:?} on tenant {split_tenant_id}" + ))); + } + + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; + + Ok(AbortShardSplitStatus::Aborted) + })?; + + Ok(aborted) + }) + .await + } } /// Parts of [`crate::tenant_state::TenantState`] that are stored durably @@ -604,6 +655,28 @@ pub(crate) struct TenantShardPersistence { pub(crate) config: String, } +impl TenantShardPersistence { + pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> { + if self.shard_count == 0 { + Ok(ShardIdentity::unsharded()) + } else { + Ok(ShardIdentity::new( + ShardNumber(self.shard_number as u8), + ShardCount::new(self.shard_count as u8), + ShardStripeSize(self.shard_stripe_size as u32), + )?) + } + } + + pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + /// Parts of [`crate::node::Node`] that are stored durably #[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)] #[diesel(table_name = crate::schema::nodes)] diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 603da9bf02..f00f35c74b 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,5 +1,6 @@ use crate::persistence::Persistence; use crate::service; +use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -7,7 +8,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use std::collections::HashMap; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; @@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; +const DEFAULT_HEATMAP_PERIOD: &str = "60s"; + /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler { @@ -255,22 +258,81 @@ impl Reconciler { tenant_shard_id: TenantShardId, node: &Node, ) -> Result<(), ReconcileError> { - match node - .with_client_retries( - |client| async move { client.tenant_secondary_download(tenant_shard_id).await }, - &self.service_config.jwt_token, - 1, - 1, - Duration::from_secs(60), - &self.cancel, - ) - .await - { - None => Err(ReconcileError::Cancel), - Some(Ok(_)) => Ok(()), - Some(Err(e)) => { - tracing::info!(" (skipping destination download: {})", e); - Ok(()) + // This is not the timeout for a request, but the total amount of time we're willing to wait + // for a secondary location to get up to date before giving up and proceeding with the migration anyway. + const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + + // This is the long-polling interval for the secondary download requests we send to the destination pageserver + // during a migration. + const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + + let started_at = Instant::now(); + + loop { + let (status, progress) = match node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(REQUEST_DOWNLOAD_TIMEOUT), + ) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + REQUEST_DOWNLOAD_TIMEOUT * 2, + &self.cancel, + ) + .await + { + None => Err(ReconcileError::Cancel), + Some(Ok(v)) => Ok(v), + Some(Err(e)) => { + // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before + // attaching, but we should not let an issue with a secondary location stop us proceeding + // with a live migration. + tracing::warn!("Failed to prepare by downloading layers on node {node}: {e}"); + return Ok(()); + } + }?; + + if status == StatusCode::OK { + tracing::info!( + "Downloads to {} complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + return Ok(()); + } else if status == StatusCode::ACCEPTED { + let total_runtime = started_at.elapsed(); + if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + total_runtime.as_millis(), + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working, + // it just makes the I/O performance for users less good. + return Ok(()); + } + + // Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call + // to the pageserver is a long-poll. + tracing::info!( + "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); } } } @@ -413,7 +475,7 @@ impl Reconciler { } } - // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then + // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then // this location will be deleted in the general case reconciliation that runs after this.
let origin_secondary_conf = build_location_config( &self.shard, @@ -485,17 +547,29 @@ impl Reconciler { ) .await { - Some(Ok(observed)) => observed, + Some(Ok(observed)) => Some(observed), + Some(Err(mgmt_api::Error::ApiError(status, _msg))) + if status == StatusCode::NOT_FOUND => + { + None + } Some(Err(e)) => return Err(e.into()), None => return Err(ReconcileError::Cancel), }; tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); - self.observed.locations.insert( - attached_node.get_id(), - ObservedStateLocation { - conf: observed_conf, - }, - ); + match observed_conf { + Some(conf) => { + // Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state, + // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running) + self.observed + .locations + .insert(attached_node.get_id(), ObservedStateLocation { conf }); + } + None => { + // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver. + self.observed.locations.remove(&attached_node.get_id()); + } + } } Ok(()) @@ -525,7 +599,12 @@ impl Reconciler { ))); }; - let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + let mut wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do @@ -662,10 +741,26 @@ impl Reconciler { } } +/// We tweak the externally-set TenantConfig while configuring +/// locations, using our awareness of whether secondary locations +/// are in use to automatically enable/disable heatmap uploads. 
+fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig { + let mut config = config.clone(); + if has_secondaries { + if config.heatmap_period.is_none() { + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + } + } else { + config.heatmap_period = None; + } + config +} + pub(crate) fn attached_location_conf( generation: Generation, shard: &ShardIdentity, config: &TenantConfig, + has_secondaries: bool, ) -> LocationConfig { LocationConfig { mode: LocationConfigMode::AttachedSingle, @@ -674,7 +769,7 @@ shard_number: shard.number.0, shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), + tenant_conf: ha_aware_config(config, has_secondaries), } } @@ -689,6 +784,6 @@ shard_number: shard.number.0, shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), + tenant_conf: ha_aware_config(config, true), } } diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 26a2707e8d..981ba26cce 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -1,4 +1,5 @@ use crate::{node::Node, tenant_state::TenantState}; +use pageserver_api::controller_api::UtilizationScore; use serde::Serialize; use std::collections::HashMap; use utils::{http::error::ApiError, id::NodeId}; @@ -19,15 +20,34 @@ impl From<ScheduleError> for ApiError { } #[derive(Serialize, Eq, PartialEq)] +pub enum MaySchedule { + Yes(UtilizationScore), + No, +} + +#[derive(Serialize)] struct SchedulerNode { /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived /// from a node's availability state and scheduling policy). - may_schedule: bool, + may_schedule: MaySchedule, } +impl PartialEq for SchedulerNode { + fn eq(&self, other: &Self) -> bool { + let may_schedule_matches = matches!( + (&self.may_schedule, &other.may_schedule), + (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No) + ); + + may_schedule_matches && self.shard_count == other.shard_count + } +} + +impl Eq for SchedulerNode {} + /// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver /// on which to run. /// @@ -186,13 +206,15 @@ impl Scheduler { return None; } + // TODO: When the utilization score returned by the pageserver becomes meaningful, + // schedule based on that instead of the shard count.
let node = nodes .iter() .map(|node_id| { let may_schedule = self .nodes .get(node_id) - .map(|n| n.may_schedule) + .map(|n| n.may_schedule != MaySchedule::No) .unwrap_or(false); (*node_id, may_schedule) }) @@ -211,7 +233,7 @@ impl Scheduler { .nodes .iter() .filter_map(|(k, v)| { - if hard_exclude.contains(k) || !v.may_schedule { + if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { None } else { Some((*k, v.shard_count)) @@ -230,7 +252,7 @@ impl Scheduler { for (node_id, node) in &self.nodes { tracing::info!( "Node {node_id}: may_schedule={} shards={}", - node.may_schedule, + node.may_schedule != MaySchedule::No, node.shard_count ); } @@ -255,6 +277,7 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; + use pageserver_api::controller_api::{NodeAvailability, UtilizationScore}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. @@ -264,13 +287,14 @@ pub(crate) mod test_utils { (1..n + 1) .map(|i| { (NodeId(i), { - let node = Node::new( + let mut node = Node::new( NodeId(i), format!("httphost-{i}"), 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, ); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); assert!(node.is_available()); node }) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index ea301d0372..e38007c7af 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -7,6 +7,9 @@ use std::{ time::{Duration, Instant}, }; +use crate::{ + id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError, +}; use anyhow::Context; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, @@ -16,18 +19,21 @@ use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, UtilizationScore, }, - models::TenantConfigRequest, + models::{SecondaryProgress, TenantConfigRequest}, }; + use pageserver_api::{ models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters, - TenantConfig, TenantCreateRequest, TenantLocationConfigRequest, - TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, + PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, + TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest, + TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, upcall_api::{ @@ -36,6 +42,7 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; +use tokio::sync::OwnedRwLockWriteGuard; use tokio_util::sync::CancellationToken; use 
tracing::instrument; use utils::{ @@ -49,6 +56,7 @@ use utils::{ use crate::{ compute_hook::{self, ComputeHook}, + heartbeater::{Heartbeater, PageserverState}, node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, @@ -76,6 +84,8 @@ const INITIAL_GENERATION: Generation = Generation::new(0); /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); + // Top level state available to all HTTP handlers struct ServiceState { tenants: BTreeMap<TenantShardId, TenantState>, @@ -123,6 +133,11 @@ pub struct Config { /// (this URL points to the control plane in prod). If this is None, the compute hook will /// assume it is running in a test environment and try to update neon_local. pub compute_hook_url: Option<String>, + + /// Grace period within which a pageserver may fail to respond to heartbeats and still be + /// considered active. Once the grace period elapses, the next heartbeat failure will + /// mark the pageserver offline. + pub max_unavailable_interval: Duration, } impl From<DatabaseError> for ApiError { @@ -147,6 +162,20 @@ pub struct Service { compute_hook: Arc<ComputeHook>, result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>, + heartbeater: Heartbeater, + + // Channel for background cleanup of failed operations that require cleanup, such as shard split + abort_tx: tokio::sync::mpsc::UnboundedSender<TenantShardSplitAbort>, + + // Locking on a tenant granularity (covers all shards in the tenant): + // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) + // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) + tenant_op_locks: IdLockMap<TenantId>, + + // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or + // that transition it to/from Active. + node_op_locks: IdLockMap<NodeId>, + // Process shutdown will fire this token cancel: CancellationToken, @@ -174,6 +203,50 @@ enum TenantCreateOrUpdate { Update(Vec<ShardUpdate>), } +struct ShardSplitParams { + old_shard_count: ShardCount, + new_shard_count: ShardCount, + new_stripe_size: Option<ShardStripeSize>, + targets: Vec<ShardSplitTarget>, + policy: PlacementPolicy, + shard_ident: ShardIdentity, +} + +// When preparing for a shard split, we may either choose to proceed with the split, +// or find that the work is already done and return NoOp. +enum ShardSplitAction { + Split(ShardSplitParams), + NoOp(TenantShardSplitResponse), +} + +// A parent shard which will be split +struct ShardSplitTarget { + parent_id: TenantShardId, + node: Node, + child_ids: Vec<TenantShardId>, +} + +/// When a tenant shard split operation fails, we may not be able to clean up immediately, because nodes +/// might not be available. We therefore use a queue of abort operations processed in the background.
+struct TenantShardSplitAbort { + tenant_id: TenantId, + /// The target values from the request that failed + new_shard_count: ShardCount, + new_stripe_size: Option<ShardStripeSize>, + /// Until this abort op is complete, no other operations may be done on the tenant + _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, +} + +#[derive(thiserror::Error, Debug)] +enum TenantShardSplitAbortError { + #[error(transparent)] + Database(#[from] DatabaseError), + #[error(transparent)] + Remote(#[from] mgmt_api::Error), + #[error("Unavailable")] + Unavailable, +} + struct ShardUpdate { tenant_shard_id: TenantShardId, placement_policy: PlacementPolicy, @@ -197,8 +270,6 @@ impl Service { let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> = HashMap::new(); - let mut nodes_online = HashSet::new(); - // Startup reconciliation does I/O to other services: whether they // are responsive or not, we should aim to finish within our deadline, because: // - If we don't, a k8s readiness hook watching /ready will kill us. @@ -220,6 +291,9 @@ let mut cleanup = Vec::new(); let node_listings = self.scan_node_locations(node_scan_deadline).await; + // Send initial heartbeat requests to nodes that replied to the location listing above. + let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await; + for (node_id, list_response) in node_listings { let tenant_shards = list_response.tenant_shards; tracing::info!( @@ -227,7 +301,6 @@ tenant_shards.len(), node_id ); - nodes_online.insert(node_id); for (tenant_shard_id, conf_opt) in tenant_shards { let shard_observations = observed.entry(tenant_shard_id).or_default(); @@ -246,8 +319,10 @@ // Mark nodes online if they responded to us: nodes are offline by default after a restart. let mut new_nodes = (**nodes).clone(); for (node_id, node) in new_nodes.iter_mut() { - if nodes_online.contains(node_id) { - node.set_availability(NodeAvailability::Active); + if let Some(utilization) = nodes_online.get(node_id) { + node.set_availability(NodeAvailability::Active(UtilizationScore( + utilization.utilization_score, + ))); scheduler.node_upsert(node); } } @@ -336,6 +411,49 @@ tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); } + async fn initial_heartbeat_round<'a>( + &self, + node_ids: impl Iterator<Item = &'a NodeId>, + ) -> HashMap<NodeId, PageserverUtilization> { + assert!(!self.startup_complete.is_ready()); + + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut nodes_to_heartbeat = HashMap::new(); + for node_id in node_ids { + match all_nodes.get(node_id) { + Some(node) => { + nodes_to_heartbeat.insert(*node_id, node.clone()); + } + None => { + tracing::warn!("Node {node_id} was removed during start-up"); + } + } + } + + let res = self + .heartbeater + .heartbeat(Arc::new(nodes_to_heartbeat)) + .await; + + let mut online_nodes = HashMap::new(); + if let Ok(deltas) = res { + for (node_id, status) in deltas.0 { + match status { + PageserverState::Available { utilization, .. } => { + online_nodes.insert(node_id, utilization); + } + PageserverState::Offline => {} + } + } + } + + online_nodes + } + /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline.
/// /// The result includes only nodes which responded within the deadline @@ -356,7 +474,7 @@ impl Service { node_list_futs.push({ async move { tracing::info!("Scanning shards on node {node}..."); - let timeout = Duration::from_secs(5); + let timeout = Duration::from_secs(1); let response = node .with_client_retries( |client| async move { client.list_location_config().await }, @@ -551,6 +669,56 @@ impl Service { } } } + #[instrument(skip_all)] + async fn spawn_heartbeat_driver(&self) { + self.startup_complete.clone().wait().await; + + const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); + + let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + while !self.cancel.is_cancelled() { + tokio::select! { + _ = interval.tick() => { } + _ = self.cancel.cancelled() => return + }; + + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let res = self.heartbeater.heartbeat(nodes).await; + if let Ok(deltas) = res { + for (node_id, state) in deltas.0 { + let new_availability = match state { + PageserverState::Available { utilization, .. } => NodeAvailability::Active( + UtilizationScore(utilization.utilization_score), + ), + PageserverState::Offline => NodeAvailability::Offline, + }; + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; + + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!("Node {} was not found after heartbeat round", node_id); + } + Err(err) => { + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); + } + } + } + } + } + } /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation /// was successful, this will update the observed state of the tenant such that subsequent @@ -591,7 +759,19 @@ impl Service { tenant.waiter.advance(result.sequence); } Err(e) => { - tracing::warn!("Reconcile error: {}", e); + match e { + ReconcileError::Cancel => { + tracing::info!("Reconciler was cancelled"); + } + ReconcileError::Remote(mgmt_api::Error::Cancelled) => { + // This might be due to the reconciler getting cancelled, or it might + // be due to the `Node` being marked offline. + tracing::info!("Reconciler cancelled during pageserver API call"); + } + _ => { + tracing::warn!("Reconcile error: {}", e); + } + } // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. @@ -627,8 +807,52 @@ impl Service { } } + async fn process_aborts( + &self, + mut abort_rx: tokio::sync::mpsc::UnboundedReceiver<TenantShardSplitAbort>, + ) { + loop { + // Wait for the next result, or for cancellation + let op = tokio::select! { + r = abort_rx.recv() => { + match r { + Some(op) => {op}, + None => {break;} + } + } + _ = self.cancel.cancelled() => { + break; + } + }; + + // Retry until shutdown: we must keep this request object alive until it is properly + // processed, as it holds a lock guard that prevents other operations trying to do things + // to the tenant while it is in a weird part-split state. + while !self.cancel.is_cancelled() { + match self.abort_tenant_shard_split(&op).await { + Ok(_) => break, + Err(e) => { + tracing::warn!( + "Failed to abort shard split on {}, will retry: {e}", + op.tenant_id + ); + + // If a node is unavailable, we hope that it has been properly marked Offline + // when we retry, so that the abort op will succeed.
If the abort op is failing + // for some other reason, we will keep retrying forever, or until a human notices + // and does something about it (either fixing a pageserver or restarting the controller). + tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) + .await + .ok(); + } + } + } + } + } + pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> { let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel(); tracing::info!("Loading nodes from database..."); let nodes = persistence @@ -641,12 +865,62 @@ impl Service { tracing::info!("Loaded {} nodes from database.", nodes.len()); tracing::info!("Loading shards from database..."); - let tenant_shard_persistence = persistence.list_tenant_shards().await?; + let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; tracing::info!( "Loaded {} shards from database.", tenant_shard_persistence.len() ); + // If any shard splits were in progress, reset the database state to abort them + let mut tenant_shard_count_min_max: HashMap<TenantId, (ShardCount, ShardCount)> = + HashMap::new(); + for tsp in &mut tenant_shard_persistence { + let shard = tsp.get_shard_identity()?; + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let entry = tenant_shard_count_min_max + .entry(tenant_shard_id.tenant_id) + .or_insert_with(|| (shard.count, shard.count)); + entry.0 = std::cmp::min(entry.0, shard.count); + entry.1 = std::cmp::max(entry.1, shard.count); + } + + for (tenant_id, (count_min, count_max)) in tenant_shard_count_min_max { + if count_min != count_max { + // Aborting the split in the database and dropping the child shards is sufficient: the reconciliation in + // [`Self::startup_reconcile`] will implicitly drop the child shards on remote pageservers, or they'll + // be dropped later in [`Self::node_activate_reconcile`] if it isn't available right now. + tracing::info!("Aborting shard split {tenant_id} {count_min:?} -> {count_max:?}"); + let abort_status = persistence.abort_shard_split(tenant_id, count_max).await?; + + // We may never see the Complete status here: if the split was complete, we wouldn't have + // identified this tenant as having mismatching min/max counts. + assert!(matches!(abort_status, AbortShardSplitStatus::Aborted)); + + // Clear the splitting status in-memory, to reflect that we just aborted in the database + tenant_shard_persistence.iter_mut().for_each(|tsp| { + // Set idle split state on those shards that we will retain. + let tsp_tenant_id = TenantId::from_str(tsp.tenant_id.as_str()).unwrap(); + if tsp_tenant_id == tenant_id + && tsp.get_shard_identity().unwrap().count == count_min + { + tsp.splitting = SplitState::Idle; + } else if tsp_tenant_id == tenant_id { + // Leave the splitting state on the child shards: this will be used next to + // drop them. + tracing::info!( + "Shard {tsp_tenant_id} will be dropped after shard split abort", + ); + } + }); + + // Drop shards for this tenant which we didn't just mark idle (i.e.
child shards of the aborted split) + tenant_shard_persistence.retain(|tsp| { + TenantId::from_str(tsp.tenant_id.as_str()).unwrap() != tenant_id + || tsp.splitting == SplitState::Idle + }); + } + } + let mut tenants = BTreeMap::new(); let mut scheduler = Scheduler::new(nodes.values()); @@ -676,21 +950,8 @@ impl Service { } } for tsp in tenant_shard_persistence { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - let shard_identity = if tsp.shard_count == 0 { - ShardIdentity::unsharded() - } else { - ShardIdentity::new( - ShardNumber(tsp.shard_number as u8), - ShardCount::new(tsp.shard_count as u8), - ShardStripeSize(tsp.shard_stripe_size as u32), - )? - }; - + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let shard_identity = tsp.get_shard_identity()?; // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); @@ -720,6 +981,12 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + let cancel = CancellationToken::new(); + let heartbeater = Heartbeater::new( + config.jwt_token.clone(), + config.max_unavailable_interval, + cancel.clone(), + ); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, tenants, scheduler, @@ -728,9 +995,13 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config)), result_tx, + heartbeater, + abort_tx, startup_complete: startup_complete.clone(), - cancel: CancellationToken::new(), + cancel, gate: Gate::default(), + tenant_op_locks: Default::default(), + node_op_locks: Default::default(), }); let result_task_this = this.clone(); @@ -741,6 +1012,33 @@ impl Service { } }); + tokio::task::spawn({ + let this = this.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = this.gate.enter() { + this.process_aborts(abort_rx).await + } + } + }); + + tokio::task::spawn({ + let this = this.clone(); + async move { + if let Ok(_gate) = this.gate.enter() { + loop { + tokio::select! 
{ + _ = this.cancel.cancelled() => { + break; + }, + _ = tokio::time::sleep(Duration::from_secs(60)) => {} + }; + this.tenant_op_locks.housekeeping(); + } + } + } + }); + tokio::task::spawn({ let this = this.clone(); // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`] @@ -753,13 +1051,28 @@ impl Service { }; this.startup_reconcile().await; - drop(startup_completion); + } + }); + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; this.background_reconcile().await; } }); + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; + this.spawn_heartbeat_driver().await; + } + }); + Ok(this) } @@ -782,7 +1095,7 @@ impl Service { shard_stripe_size: 0, generation: Some(0), generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(), + placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), }; @@ -809,7 +1122,7 @@ impl Service { TenantState::new( attach_req.tenant_shard_id, ShardIdentity::unsharded(), - PlacementPolicy::Single, + PlacementPolicy::Attached(0), ), ); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); @@ -818,11 +1131,37 @@ impl Service { } let new_generation = if let Some(req_node_id) = attach_req.node_id { - Some( - self.persistence - .increment_generation(attach_req.tenant_shard_id, req_node_id) - .await?, - ) + let maybe_tenant_conf = { + let locked = self.inner.write().unwrap(); + locked + .tenants + .get(&attach_req.tenant_shard_id) + .map(|t| t.config.clone()) + }; + + match maybe_tenant_conf { + Some(conf) => { + let new_generation = self + .persistence + .increment_generation(attach_req.tenant_shard_id, req_node_id) + .await?; + + // Persist the placement policy update. This is required + // when we are reattaching a detached tenant. + self.persistence + .update_tenant_shard( + attach_req.tenant_shard_id, + PlacementPolicy::Attached(0), + conf, + None, + ) + .await?; + Some(new_generation) + } + None => { + anyhow::bail!("Attach hook handling raced with tenant removal") + } + } } else { self.persistence.detach(attach_req.tenant_shard_id).await?; None @@ -837,6 +1176,7 @@ impl Service { if let Some(new_generation) = new_generation { tenant_state.generation = Some(new_generation); + tenant_state.policy = PlacementPolicy::Attached(0); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. @@ -889,6 +1229,7 @@ impl Service { tenant_state.generation.unwrap(), &tenant_state.shard, &tenant_state.config, + false, )), }, )]); @@ -918,6 +1259,118 @@ impl Service { } } + // When the availability state of a node transitions to active, we must do a full reconciliation + // of LocationConfigs on that node. This is because while a node was offline: + // - we might have proceeded through startup_reconcile without checking for extraneous LocationConfigs on this node + // - aborting a tenant shard split might have left rogue child shards behind on this node.
+ // + // This function must complete _before_ setting a `Node` to Active: once it is set to Active, other + // Reconcilers might communicate with the node, and these must not overlap with the work we do in + // this function. + // + // The reconciliation logic in here is very similar to what [`Self::startup_reconcile`] does, but + // is written for a single node rather than as a batch job for all nodes. + #[tracing::instrument(skip_all, fields(node_id=%node.get_id()))] + async fn node_activate_reconcile( + &self, + mut node: Node, + _lock: &OwnedRwLockWriteGuard<()>, + ) -> Result<(), ApiError> { + // This Node is a mutable local copy: we will set it active so that we can use its + // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated + // later. + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + + let configs = match node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // This node didn't succeed listing its locations: it may not proceed to active state + // as it is apparently unavailable. + return Err(ApiError::PreconditionFailed( + format!("Failed to query node location configs, cannot activate ({e})").into(), + )); + } + Some(Ok(configs)) => configs, + }; + tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len()); + + let mut cleanup = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + for (tenant_shard_id, observed_loc) in configs.tenant_shards { + let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + cleanup.push(tenant_shard_id); + continue; + }; + tenant_state + .observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); + } + } + + for tenant_shard_id in cleanup { + tracing::info!("Detaching {tenant_shard_id}"); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: tenant_shard_id.shard_number.0, + shard_count: tenant_shard_id.shard_count.literal(), + shard_stripe_size: 0, + tenant_conf: models::TenantConfig::default(), + }; + client + .location_config(tenant_shard_id, config, None, false) + .await + }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // Do not let the node proceed to Active state if it is not responsive to requests + // to detach. This could happen if e.g. a shutdown bug in the pageserver is preventing + // detach completing: we should not let this node back into the set of nodes considered + // okay for scheduling.
+ return Err(ApiError::Conflict(format!( + "Node {node} failed to detach {tenant_shard_id}: {e}" + ))); + } + Some(Ok(_)) => {} + }; + } + + Ok(()) + } + pub(crate) async fn re_attach( &self, reattach_req: ReAttachRequest, @@ -926,15 +1379,6 @@ impl Service { self.node_register(register_req).await?; } - // Take a re-attach as indication that the node is available: this is a precursor to proper - // heartbeating in https://github.com/neondatabase/neon/issues/6844 - self.node_configure(NodeConfigureRequest { - node_id: reattach_req.node_id, - availability: Some(NodeAvailability::Active), - scheduling: None, - }) - .await?; - // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; @@ -946,6 +1390,7 @@ impl Service { // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let mut response = ReAttachResponse { tenants: Vec::new(), @@ -957,7 +1402,7 @@ impl Service { gen: new_gen.into().unwrap(), }); // Apply the new generation number to our in-memory state - let shard_state = locked.tenants.get_mut(&tenant_shard_id); + let shard_state = tenants.get_mut(&tenant_shard_id); let Some(shard_state) = shard_state else { // Not fatal. This edge case requires a re-attach to happen // between inserting a new tenant shard in to the database, and updating our in-memory @@ -1008,6 +1453,26 @@ impl Service { // request in flight over the network: TODO handle that by making location_conf API refuse // to go backward in generations. } + + // We consider a node Active once we have composed a re-attach response, but we + // do not call [`Self::node_activate_reconcile`]: the handling of the re-attach response + // implicitly synchronizes the LocationConfigs on the node. + // + // Setting a node active unblocks any Reconcilers that might write to the location config API, + // but those requests will not be accepted by the node until it has finished processing + // the re-attach response. + if let Some(node) = nodes.get(&reattach_req.node_id) { + if !node.is_available() { + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + scheduler.node_upsert(node); + } + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } + } + Ok(response) } @@ -1048,6 +1513,12 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { + // Exclude any concurrent attempts to create/access the same tenant ID + let _tenant_lock = self + .tenant_op_locks + .exclusive(create_req.new_tenant_id.tenant_id) + .await; + let (response, waiters) = self.do_tenant_create(create_req).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; @@ -1058,11 +1529,11 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result<(TenantCreateResponse, Vec), ApiError> { - // As a default, single is convenient for tests that don't choose a policy. let placement_policy = create_req .placement_policy .clone() - .unwrap_or(PlacementPolicy::Single); + // As a default, zero secondaries is convenient for tests that don't choose a policy. + .unwrap_or(PlacementPolicy::Attached(0)); // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. 
@@ -1262,6 +1733,7 @@ impl Service { let mut updates = Vec::new(); let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); // Use location config mode as an indicator of policy. let placement_policy = match req.config.mode { @@ -1271,11 +1743,11 @@ impl Service { | LocationConfigMode::AttachedSingle | LocationConfigMode::AttachedStale => { if nodes.len() > 1 { - PlacementPolicy::Double(1) + PlacementPolicy::Attached(1) } else { // Convenience for dev/test: if we just have one pageserver, import - // tenants into Single mode so that scheduling will succeed. - PlacementPolicy::Single + // tenants into non-HA mode so that scheduling will succeed. + PlacementPolicy::Attached(0) } } }; @@ -1326,12 +1798,10 @@ impl Service { TenantCreateOrUpdate::Create( // Synthesize a creation request TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), + new_tenant_id: tenant_shard_id, generation, shard_parameters: ShardParameters { - // Must preserve the incoming shard_count do distinguish unsharded (0) - // from single-sharded (1): this distinction appears in the S3 keys of the tenant. - count: req.tenant_id.shard_count, + count: tenant_shard_id.shard_count, // We only import un-sharded or single-sharded tenants, so stripe // size can be made up arbitrarily here. stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, @@ -1360,17 +1830,23 @@ impl Service { /// - Call with mode Detached to switch to PolicyMode::Detached pub(crate) async fn tenant_location_config( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, req: TenantLocationConfigRequest, ) -> Result { - if !req.tenant_id.is_unsharded() { + // We require an exclusive lock, because we are updating both persistent and in-memory state + let _tenant_lock = self + .tenant_op_locks + .exclusive(tenant_shard_id.tenant_id) + .await; + + if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); } // First check if this is a creation or an update - let create_or_update = self.tenant_location_config_prepare(tenant_id, req); + let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req); let mut result = TenantLocationConfigResponse { shards: Vec::new(), @@ -1476,6 +1952,9 @@ impl Service { } pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let tenant_id = req.tenant_id; let config = req.config; @@ -1557,6 +2036,8 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> Result<(), ApiError> { + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let node = { let locked = self.inner.read().unwrap(); // Just a sanity check to prevent misuse: the API expects that the tenant is fully @@ -1641,7 +2122,10 @@ impl Service { pub(crate) async fn tenant_secondary_download( &self, tenant_id: TenantId, - ) -> Result<(), ApiError> { + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress), ApiError> { + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { let locked = self.inner.read().unwrap(); @@ -1662,35 +2146,76 @@ impl Service { 
targets }; - // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running - // downloads, they can return a clean 202 response instead of the HTTP client timing out. - // Issue concurrent requests to all shards' locations let mut futs = FuturesUnordered::new(); for (tenant_shard_id, node) in targets { let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); futs.push(async move { - let result = client.tenant_secondary_download(tenant_shard_id).await; - (result, node) + let result = client + .tenant_secondary_download(tenant_shard_id, wait) + .await; + (result, node, tenant_shard_id) }) } // Handle any errors returned by pageservers. This includes cases like this request racing with // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as // well as more general cases like 503s, 500s, or timeouts. - while let Some((result, node)) = futs.next().await { - let Err(e) = result else { continue }; - - // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever - // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache - // than they had hoped for. - tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",); + let mut aggregate_progress = SecondaryProgress::default(); + let mut aggregate_status: Option = None; + let mut error: Option = None; + while let Some((result, node, tenant_shard_id)) = futs.next().await { + match result { + Err(e) => { + // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever + // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache + // than they had hoped for. + tracing::warn!("Secondary download error from pageserver {node}: {e}",); + error = Some(e) + } + Ok((status_code, progress)) => { + tracing::info!(%tenant_shard_id, "Shard status={status_code} progress: {progress:?}"); + aggregate_progress.layers_downloaded += progress.layers_downloaded; + aggregate_progress.layers_total += progress.layers_total; + aggregate_progress.bytes_downloaded += progress.bytes_downloaded; + aggregate_progress.bytes_total += progress.bytes_total; + aggregate_progress.heatmap_mtime = + std::cmp::max(aggregate_progress.heatmap_mtime, progress.heatmap_mtime); + aggregate_status = match aggregate_status { + None => Some(status_code), + Some(StatusCode::OK) => Some(status_code), + Some(cur) => { + // Other status codes (e.g. 202) -- do not overwrite. + Some(cur) + } + }; + } + } } - Ok(()) + // If any of the shards return 202, indicate our result as 202. 
+ match aggregate_status { + None => { + match error { + Some(e) => { + // No successes, and an error: surface it + Err(ApiError::Conflict(format!("Error from pageserver: {e}"))) + } + None => { + // No shards found + Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )) + } + } + } + Some(aggregate_status) => Ok((aggregate_status, aggregate_progress)), + } } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + self.ensure_attached_wait(tenant_id).await?; // TODO: refactor into helper @@ -1787,10 +2312,10 @@ impl Service { create_req.new_timeline_id, ); + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + self.ensure_attached_wait(tenant_id).await?; - // TODO: refuse to do this if shard splitting is in progress - // (https://github.com/neondatabase/neon/issues/6676) let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -1912,11 +2437,10 @@ impl Service { timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; self.ensure_attached_wait(tenant_id).await?; - // TODO: refuse to do this if shard splitting is in progress - // (https://github.com/neondatabase/neon/issues/6676) let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -2041,9 +2565,6 @@ impl Service { let locked = self.inner.read().unwrap(); tracing::info!("Locating shards for tenant {tenant_id}"); - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); - let mut result = Vec::new(); let mut shard_params: Option = None; @@ -2057,7 +2578,8 @@ impl Service { "Cannot locate a tenant that is not attached" )))?; - let node = pageservers + let node = locked + .nodes .get(&node_id) .expect("Pageservers may not be deleted while referenced"); @@ -2105,21 +2627,360 @@ impl Service { }) } + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + + let mut shard_zero = None; + let mut shards = Vec::new(); + + for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + if tenant_shard_id.is_zero() { + shard_zero = Some(shard); + } + + let response_shard = TenantDescribeResponseShard { + tenant_shard_id: *tenant_shard_id, + node_attached: *shard.intent.get_attached(), + node_secondary: shard.intent.get_secondary().to_vec(), + last_error: shard.last_error.lock().unwrap().clone(), + is_reconciling: shard.reconciler.is_some(), + is_pending_compute_notification: shard.pending_compute_notification, + is_splitting: matches!(shard.splitting, SplitState::Splitting), + }; + shards.push(response_shard); + } + + let Some(shard_zero) = shard_zero else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + Ok(TenantDescribeResponse { + shards, + stripe_size: shard_zero.shard.stripe_size, + policy: shard_zero.policy.clone(), + config: shard_zero.config.clone(), + }) + } + + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] + async fn abort_tenant_shard_split( + &self, + op: &TenantShardSplitAbort, + ) -> Result<(), TenantShardSplitAbortError> { + // Cleaning up a split: + // - Parent shards are not destroyed during a split, just detached. 
+ // - Failed pageserver split API calls can leave the remote node with just the parent attached, + // just the children attached, or both. + // + // Therefore our work to do is to: + // 1. Clean up storage controller's internal state to just refer to parents, no children + // 2. Call out to pageservers to ensure that children are detached + // 3. Call out to pageservers to ensure that parents are attached. + // + // Crash safety: + // - If the storage controller stops running during this cleanup *after* clearing the splitting state + // from our database, then [`Self::startup_reconcile`] will regard child attachments as garbage + // and detach them. + // - TODO: If the storage controller stops running during this cleanup *before* clearing the splitting state + // from our database, then we will re-enter this cleanup routine on startup. + + let TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + .. + } = op; + + // First abort persistent state, if any exists. + match self + .persistence + .abort_shard_split(*tenant_id, *new_shard_count) + .await? + { + AbortShardSplitStatus::Aborted => { + // Proceed to roll back any child shards created on pageservers + } + AbortShardSplitStatus::Complete => { + // The split completed (we might hit that path if e.g. our database transaction + // to write the completion landed in the database, but we dropped connection + // before seeing the result). + // + // We must update in-memory state to reflect the successful split. + self.tenant_shard_split_commit_inmem( + *tenant_id, + *new_shard_count, + *new_stripe_size, + ); + return Ok(()); + } + } + + // Clean up in-memory state, and accumulate the list of child locations that need detaching + let detach_locations: Vec<(Node, TenantShardId)> = { + let mut detach_locations = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + + for (tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) + { + if shard.shard.count == op.new_shard_count { + // Surprising: the phase of [`Self::do_tenant_shard_split`] which inserts child shards in-memory + // is infallible, so if we got an error we shouldn't have got that far. + tracing::warn!( + "During split abort, child shard {tenant_shard_id} found in-memory" + ); + continue; + } + + // Add the children of this shard to this list of things to detach + if let Some(node_id) = shard.intent.get_attached() { + for child_id in tenant_shard_id.split(*new_shard_count) { + detach_locations.push(( + nodes + .get(node_id) + .expect("Intent references nonexistent node") + .clone(), + child_id, + )); + } + } else { + tracing::warn!( + "During split abort, shard {tenant_shard_id} has no attached location" + ); + } + + tracing::info!("Restoring parent shard {tenant_shard_id}"); + shard.splitting = SplitState::Idle; + self.maybe_reconcile_shard(shard, nodes); + } + + // We don't expect any new_shard_count shards to exist here, but drop them just in case + tenants.retain(|_id, s| s.shard.count != *new_shard_count); + + detach_locations + }; + + for (node, child_id) in detach_locations { + if !node.is_available() { + // An unavailable node cannot be cleaned up now: to avoid blocking forever, we will permit this, and + // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have + // removed child shards from our in-memory state and database, the reconciliation will implicitly remove + // them from the node. 
+ tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + continue; + } + + // Detach the remote child. If the pageserver split API call is still in progress, this call will get + // a 503 and retry, up to our limit. + tracing::info!("Detaching {child_id} on {node}..."); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: child_id.shard_number.0, + shard_count: child_id.shard_count.literal(), + // Stripe size and tenant config don't matter when detaching + shard_stripe_size: 0, + tenant_conf: TenantConfig::default(), + }; + + client.location_config(child_id, config, None, false).await + }, + &self.config.jwt_token, + 1, + 10, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => { + // We failed to communicate with the remote node. This is problematic: we may be + // leaving it with a rogue child shard. + tracing::warn!( + "Failed to detach child {child_id} from node {node} during abort" + ); + return Err(e.into()); + } + None => { + // Cancellation: we were shutdown or the node went offline. Shutdown is fine, we'll + // clean up on restart. The node going offline requires a retry. + return Err(TenantShardSplitAbortError::Unavailable); + } + }; + } + + tracing::info!("Successfully aborted split"); + Ok(()) + } + + /// Infallible final stage of [`Self::tenant_shard_split`]: update the contents + /// of the tenant map to reflect the child shards that exist after the split. + fn tenant_shard_split_commit_inmem( + &self, + tenant_id: TenantId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ) -> ( + TenantShardSplitResponse, + Vec<(TenantShardId, NodeId, ShardStripeSize)>, + ) { + let mut response = TenantShardSplitResponse { + new_shards: Vec::new(), + }; + let mut child_locations = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + let parent_ids = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(shard_id, _)| *shard_id) + .collect::>(); + + let (_nodes, tenants, scheduler) = locked.parts_mut(); + for parent_id in parent_ids { + let child_ids = parent_id.split(new_shard_count); + + let (pageserver, generation, policy, parent_ident, config) = { + let mut old_state = tenants + .remove(&parent_id) + .expect("It was present, we just split it"); + + // A non-splitting state is impossible, because [`Self::tenant_shard_split`] holds + // a TenantId lock and passes it through to [`TenantShardSplitAbort`] in case of cleanup: + // nothing else can clear this. 
+ assert!(matches!(old_state.splitting, SplitState::Splitting)); + + let old_attached = old_state.intent.get_attached().unwrap(); + old_state.intent.clear(scheduler); + let generation = old_state.generation.expect("Shard must have been attached"); + ( + old_attached, + generation, + old_state.policy, + old_state.shard, + old_state.config, + ) + }; + + for child in child_ids { + let mut child_shard = parent_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + if let Some(stripe_size) = new_stripe_size { + child_shard.stripe_size = stripe_size; + } + + let mut child_observed: HashMap = HashMap::new(); + child_observed.insert( + pageserver, + ObservedStateLocation { + conf: Some(attached_location_conf( + generation, + &child_shard, + &config, + matches!(policy, PlacementPolicy::Attached(n) if n > 0), + )), + }, + ); + + let mut child_state = TenantState::new(child, child_shard, policy.clone()); + child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.observed = ObservedState { + locations: child_observed, + }; + child_state.generation = Some(generation); + child_state.config = config.clone(); + + // The child's TenantState::splitting is intentionally left at the default value of Idle, + // as at this point in the split process we have succeeded and this part is infallible: + // we will never need to do any special recovery from this state. + + child_locations.push((child, pageserver, child_shard.stripe_size)); + + if let Err(e) = child_state.schedule(scheduler) { + // This is not fatal, because we've implicitly already got an attached + // location for the child shard. Failure here just means we couldn't + // find a secondary (e.g. because cluster is overloaded). + tracing::warn!("Failed to schedule child shard {child}: {e}"); + } + + tenants.insert(child, child_state); + response.new_shards.push(child); + } + } + + (response, child_locations) + } + } + pub(crate) async fn tenant_shard_split( &self, tenant_id: TenantId, split_req: TenantShardSplitRequest, ) -> Result { + // TODO: return 503 if we get stuck waiting for this lock + // (issue https://github.com/neondatabase/neon/issues/7108) + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + + let new_shard_count = ShardCount::new(split_req.new_shard_count); + let new_stripe_size = split_req.new_stripe_size; + + // Validate the request and construct parameters. This phase is fallible, but does not require + // rollback on errors, as it does no I/O and mutates no state. + let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? { + ShardSplitAction::NoOp(resp) => return Ok(resp), + ShardSplitAction::Split(params) => params, + }; + + // Execute this split: this phase mutates state and does remote I/O on pageservers. If it fails, + // we must roll back. + let r = self + .do_tenant_shard_split(tenant_id, shard_split_params) + .await; + + match r { + Ok(r) => Ok(r), + Err(e) => { + // Split might be part-done, we must do work to abort it. + tracing::warn!("Enqueuing background abort of split on {tenant_id}"); + self.abort_tx + .send(TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + _tenant_lock, + }) + // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. 
+ .ok(); + Err(e) + } + } + } + + fn prepare_tenant_shard_split( + &self, + tenant_id: TenantId, + split_req: TenantShardSplitRequest, + ) -> Result { + fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest( + anyhow::anyhow!("failpoint") + ))); + let mut policy = None; let mut shard_ident = None; - - // A parent shard which will be split - struct SplitTarget { - parent_id: TenantShardId, - node: Node, - child_ids: Vec, - } - // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -2195,7 +3056,7 @@ impl Service { // TODO: if any reconciliation is currently in progress for this shard, wait for it. - targets.push(SplitTarget { + targets.push(ShardSplitTarget { parent_id: *tenant_shard_id, node: node.clone(), child_ids: tenant_shard_id @@ -2205,9 +3066,9 @@ impl Service { if targets.is_empty() { if children_found.len() == split_req.new_shard_count as usize { - return Ok(TenantShardSplitResponse { + return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse { new_shards: children_found, - }); + })); } else { // No shards found to split, and no existing children found: the // tenant doesn't exist at all. @@ -2229,19 +3090,45 @@ impl Service { if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); } + shard_ident.stripe_size = new_stripe_size; + tracing::info!("applied stripe size {}", shard_ident.stripe_size.0); shard_ident } else { shard_ident.unwrap() }; let policy = policy.unwrap(); + Ok(ShardSplitAction::Split(ShardSplitParams { + old_shard_count, + new_shard_count: ShardCount::new(split_req.new_shard_count), + new_stripe_size: split_req.new_stripe_size, + targets, + policy, + shard_ident, + })) + } + + async fn do_tenant_shard_split( + &self, + tenant_id: TenantId, + params: ShardSplitParams, + ) -> Result { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the // same database transaction rather than pre-check in-memory and then maybe-fail the database write. // (https://github.com/neondatabase/neon/issues/6676) + let ShardSplitParams { + old_shard_count, + new_shard_count, + new_stripe_size, + targets, + policy, + shard_ident, + } = params; + // Before creating any new child shards in memory or on the pageservers, persist them: this // enables us to ensure that we will always be able to clean up if something goes wrong. This also // acts as the protection against two concurrent attempts to split: one of them will get a database @@ -2254,6 +3141,11 @@ impl Service { child_shard.number = child.shard_number; child_shard.count = child.shard_count; + tracing::info!( + "Create child shard persistence with stripe size {}", + shard_ident.stripe_size.0 + ); + this_child_tsps.push(TenantShardPersistence { tenant_id: child.tenant_id.to_string(), shard_number: child.shard_number.0 as i32, @@ -2292,6 +3184,9 @@ impl Service { _ => return Err(ApiError::InternalServerError(e.into())), } } + fail::fail_point!("shard-split-post-begin", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); // Now that I have persisted the splitting state, apply it in-memory. 
This is infallible, so // callers may assume that if splitting is set in memory, then it was persisted, and if splitting @@ -2301,20 +3196,21 @@ impl Service { for target in &targets { if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) { parent_shard.splitting = SplitState::Splitting; + // Put the observed state to None, to reflect that it is indeterminate once we start the + // split operation. + parent_shard + .observed + .locations + .insert(target.node.get_id(), ObservedStateLocation { conf: None }); } } } - // FIXME: we have now committed the shard split state to the database, so any subsequent - // failure needs to roll it back. We will later wrap this function in logic to roll back - // the split if it fails. - // (https://github.com/neondatabase/neon/issues/6676) - // TODO: issue split calls concurrently (this only matters once we're splitting // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). for target in &targets { - let SplitTarget { + let ShardSplitTarget { parent_id, node, child_ids, @@ -2324,13 +3220,17 @@ impl Service { .tenant_shard_split( *parent_id, TenantShardSplitRequest { - new_shard_count: split_req.new_shard_count, - new_stripe_size: split_req.new_stripe_size, + new_shard_count: new_shard_count.literal(), + new_stripe_size, }, ) .await .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict( + "failpoint".to_string() + ))); + tracing::info!( "Split {} into {}", parent_id, @@ -2365,62 +3265,13 @@ impl Service { .complete_shard_split(tenant_id, old_shard_count) .await?; + fail::fail_point!("shard-split-post-complete", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); + // Replace all the shards we just split with their children: this phase is infallible. - let mut response = TenantShardSplitResponse { - new_shards: Vec::new(), - }; - let mut child_locations = Vec::new(); - { - let mut locked = self.inner.write().unwrap(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); - for target in targets { - let SplitTarget { - parent_id, - node: _node, - child_ids, - } = target; - let (pageserver, generation, config) = { - let mut old_state = tenants - .remove(&parent_id) - .expect("It was present, we just split it"); - let old_attached = old_state.intent.get_attached().unwrap(); - old_state.intent.clear(scheduler); - let generation = old_state.generation.expect("Shard must have been attached"); - (old_attached, generation, old_state.config.clone()) - }; - - for child in child_ids { - let mut child_shard = shard_ident; - child_shard.number = child.shard_number; - child_shard.count = child.shard_count; - - let mut child_observed: HashMap = HashMap::new(); - child_observed.insert( - pageserver, - ObservedStateLocation { - conf: Some(attached_location_conf(generation, &child_shard, &config)), - }, - ); - - let mut child_state = TenantState::new(child, child_shard, policy.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); - child_state.observed = ObservedState { - locations: child_observed, - }; - child_state.generation = Some(generation); - child_state.config = config.clone(); - - // The child's TenantState::splitting is intentionally left at the default value of Idle, - // as at this point in the split process we have succeeded and this part is infallible: - // we will never need to do any special recovery from this state. 
- - child_locations.push((child, pageserver, child_shard.stripe_size)); - - tenants.insert(child, child_state); - response.new_shards.push(child); - } - } - } + let (response, child_locations) = + self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); @@ -2485,17 +3336,15 @@ impl Service { let old_attached = *shard.intent.get_attached(); match shard.policy { - PlacementPolicy::Single => { - shard.intent.clear_secondary(scheduler); - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); - } - PlacementPolicy::Double(_n) => { + PlacementPolicy::Attached(n) => { // If our new attached node was a secondary, it no longer should be. shard.intent.remove_secondary(scheduler, migrate_req.node_id); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { - shard.intent.push_secondary(scheduler, old_attached); + if n > 0 { + shard.intent.push_secondary(scheduler, old_attached); + } } shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); @@ -2709,6 +3558,8 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { + let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); @@ -2736,6 +3587,30 @@ impl Service { } } + // We do not require that a node is actually online when registered (it will start life + // with its availability set to Offline), but we _do_ require that its DNS record exists. We're + // therefore not immune to asymmetric L3 connectivity issues, but we are protected against nodes + // that register themselves with a broken DNS config. We check only the HTTP hostname, because + // the postgres hostname might only be resolvable to clients (e.g. if we're on a different VPC than clients). + if tokio::net::lookup_host(format!( + "{}:{}", + register_req.listen_http_addr, register_req.listen_http_port + )) + .await + .is_err() + { + // If we have a transient DNS issue, it's up to the caller to retry their registration. Because + // we can't robustly distinguish between an intermittent issue and a totally bogus DNS situation, + // we return a soft 503 error, to encourage callers to retry past transient issues. + return Err(ApiError::ResourceUnavailable( + format!( + "Node {} tried to register with unknown DNS name '{}'", + register_req.node_id, register_req.listen_http_addr + ) + .into(), + )); + } + + // Ordering: we must persist the new node _before_ adding it to in-memory state. + // This ensures that before we use it for anything or expose it via any external + // API, it is guaranteed to be available after a restart.
@@ -2768,34 +3643,65 @@ impl Service { pub(crate) async fn node_configure( &self, - config_req: NodeConfigureRequest, + node_id: NodeId, + availability: Option, + scheduling: Option, ) -> Result<(), ApiError> { - if let Some(scheduling) = config_req.scheduling { + let _node_lock = self.node_op_locks.exclusive(node_id).await; + + if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence - .update_node(config_req.node_id, scheduling) - .await?; + self.persistence.update_node(node_id, scheduling).await?; } + // If we're activating a node, then before setting it active we must reconcile any shard locations + // on that node, in case it is out of sync, e.g. due to being unavailable during controller startup, + // by calling [`Self::node_activate_reconcile`] + // + // The transition we calculate here remains valid later in the function because we hold the op lock on the node: + // nothing else can mutate its availability while we run. + let availability_transition = if let Some(input_availability) = availability { + let (activate_node, availability_transition) = { + let locked = self.inner.read().unwrap(); + let Some(node) = locked.nodes.get(&node_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + )); + }; + + ( + node.clone(), + node.get_availability_transition(input_availability), + ) + }; + + if matches!(availability_transition, AvailabilityTransition::ToActive) { + self.node_activate_reconcile(activate_node, &_node_lock) + .await?; + } + availability_transition + } else { + AvailabilityTransition::Unchanged + }; + + // Apply changes from the request to our in-memory state for the Node let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); let mut new_nodes = (**nodes).clone(); - let Some(node) = new_nodes.get_mut(&config_req.node_id) else { + let Some(node) = new_nodes.get_mut(&node_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Node not registered").into(), )); }; - let availability_transition = if let Some(availability) = &config_req.availability { - node.set_availability(*availability) - } else { - AvailabilityTransition::Unchanged - }; + if let Some(availability) = &availability { + node.set_availability(*availability); + } - if let Some(scheduling) = config_req.scheduling { + if let Some(scheduling) = scheduling { node.set_scheduling(scheduling); // TODO: once we have a background scheduling ticker for fill/drain, kick it @@ -2807,27 +3713,33 @@ impl Service { let new_nodes = Arc::new(new_nodes); + // Modify scheduling state for any Tenants that are affected by a change in the node's availability state. 
match availability_transition { AvailabilityTransition::ToOffline => { - tracing::info!("Node {} transition to offline", config_req.node_id); + tracing::info!("Node {} transition to offline", node_id); let mut tenants_affected: usize = 0; for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { + if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { // When a node goes offline, we set its observed configuration to None, indicating unknown: we will // not assume our knowledge of the node's configuration is accurate until it comes back online observed_loc.conf = None; } - if tenant_state.intent.demote_attached(config_req.node_id) { + if new_nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } + + if tenant_state.intent.demote_attached(node_id) { tenant_state.sequence = tenant_state.sequence.next(); match tenant_state.schedule(scheduler) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. // TODO: give TenantState a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { if self @@ -2843,17 +3755,15 @@ impl Service { tracing::info!( "Launched {} reconciler tasks for tenants affected by node {} going offline", tenants_affected, - config_req.node_id + node_id ) } AvailabilityTransition::ToActive => { - tracing::info!("Node {} transition to active", config_req.node_id); + tracing::info!("Node {} transition to active", node_id); // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { + if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { self.maybe_reconcile_shard(tenant_state, &new_nodes); } @@ -2863,7 +3773,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::info!("Node {} no change during config", config_req.node_id); + tracing::info!("Node {} no change during config", node_id); } } @@ -2942,17 +3852,23 @@ impl Service { ) } - /// Check all tenants for pending reconciliation work, and reconcile those in need + /// Check all tenants for pending reconciliation work, and reconcile those in need. + /// Additionally, reschedule tenants that require it. 
/// /// Returns how many reconciliation tasks were started fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let pageservers = locked.nodes.clone(); - locked - .tenants - .iter_mut() - .filter_map(|(_tenant_shard_id, shard)| self.maybe_reconcile_shard(shard, &pageservers)) - .count() + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let pageservers = nodes.clone(); + + let mut reconciles_spawned = 0; + for (_tenant_shard_id, shard) in tenants.iter_mut() { + if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + reconciles_spawned += 1; + } + } + + reconciles_spawned } pub async fn shutdown(&self) { diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 3c91e09ac3..9dd368bf41 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -457,22 +457,7 @@ impl TenantState { // Add/remove nodes to fulfil policy use PlacementPolicy::*; match self.policy { - Single => { - // Should have exactly one attached, and zero secondaries - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); - modified = true; - } - - let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?; - modified |= modified_attached; - - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); - modified = true; - } - } - Double(secondary_count) => { + Attached(secondary_count) => { let retain_secondaries = if self.intent.attached.is_none() && scheduler.node_preferred(&self.intent.secondary).is_some() { @@ -577,7 +562,12 @@ impl TenantState { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + let wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { @@ -890,7 +880,7 @@ pub(crate) mod tests { let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state .schedule(&mut scheduler) .expect("we have enough nodes, scheduling should work"); @@ -938,7 +928,7 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state.observed.locations.insert( NodeId(3), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6c722f36b4..401feae706 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -437,7 +437,7 @@ async fn handle_tenant( let placement_policy = match create_match.get_one::("placement-policy") { Some(s) if !s.is_empty() => serde_json::from_str::(s)?, - _ => PlacementPolicy::Single, + _ => PlacementPolicy::Attached(0), }; let tenant_conf = PageServerNode::parse_config(tenant_conf)?; @@ -523,88 +523,6 @@ async fn handle_tenant( .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } - Some(("migrate", matches)) => { - let 
tenant_shard_id = get_tenant_shard_id(matches, env)?; - let new_pageserver = get_pageserver(env, matches)?; - let new_pageserver_id = new_pageserver.conf.id; - - let storage_controller = StorageController::from_env(env); - storage_controller - .tenant_migrate(tenant_shard_id, new_pageserver_id) - .await?; - - println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id); - } - Some(("status", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - - let mut shard_table = comfy_table::Table::new(); - shard_table.set_header(["Shard", "Pageserver", "Physical Size"]); - - let mut tenant_synthetic_size = None; - - let storage_controller = StorageController::from_env(env); - for shard in storage_controller.tenant_locate(tenant_id).await?.shards { - let pageserver = - PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?); - - let size = pageserver - .http_client - .tenant_details(shard.shard_id) - .await? - .tenant_info - .current_physical_size - .unwrap(); - - shard_table.add_row([ - format!("{}", shard.shard_id.shard_slug()), - format!("{}", shard.node_id.0), - format!("{} MiB", size / (1024 * 1024)), - ]); - - if shard.shard_id.is_zero() { - tenant_synthetic_size = - Some(pageserver.tenant_synthetic_size(shard.shard_id).await?); - } - } - - let Some(synthetic_size) = tenant_synthetic_size else { - bail!("Shard 0 not found") - }; - - let mut tenant_table = comfy_table::Table::new(); - tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]); - tenant_table.add_row([ - "Synthetic size".to_string(), - format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)), - ]); - - println!("{tenant_table}"); - println!("{shard_table}"); - } - Some(("shard-split", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - let shard_count: u8 = matches.get_one::("shard-count").cloned().unwrap_or(0); - let shard_stripe_size: Option = matches - .get_one::>("shard-stripe-size") - .cloned() - .unwrap(); - - let storage_controller = StorageController::from_env(env); - let result = storage_controller - .tenant_split(tenant_id, shard_count, shard_stripe_size) - .await?; - println!( - "Split tenant {} into shards {}", - tenant_id, - result - .new_shards - .iter() - .map(|s| format!("{:?}", s)) - .collect::>() - .join(",") - ); - } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -1578,19 +1496,6 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) - .subcommand(Command::new("migrate") - .about("Migrate a tenant from one pageserver to another") - .arg(tenant_id_arg.clone()) - .arg(pageserver_id_arg.clone())) - .subcommand(Command::new("status") - .about("Human readable summary of the tenant's shards and attachment locations") - .arg(tenant_id_arg.clone())) - .subcommand(Command::new("shard-split") - .about("Increase the number of shards in the tenant") - .arg(tenant_id_arg.clone()) - .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) - .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) - ) ) .subcommand( Command::new("pageserver") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2e64489432..bd3dbef453 100644 
--- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -114,7 +114,7 @@ impl NeonBroker { } #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct PageServerConf { // node id pub id: NodeId, @@ -126,6 +126,9 @@ pub struct PageServerConf { // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + + pub(crate) virtual_file_io_engine: Option, + pub(crate) get_vectored_impl: Option, } impl Default for PageServerConf { @@ -136,6 +139,8 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, + virtual_file_io_engine: None, + get_vectored_impl: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 06ec942895..c5eabc46db 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -78,18 +78,39 @@ impl PageServerNode { /// /// These all end up on the command line of the `pageserver` binary. fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec { - let id = format!("id={}", self.conf.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", self.env.pg_distrib_dir_raw().display() ); - let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr); + let PageServerConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + virtual_file_io_engine, + get_vectored_impl, + } = &self.conf; - let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr); + let id = format!("id={}", id); + + let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type); + let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr); + + let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); + let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); + let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { + format!("virtual_file_io_engine='{virtual_file_io_engine}'") + } else { + String::new() + }; + let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { + format!("get_vectored_impl='{get_vectored_impl}'") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -101,6 +122,8 @@ impl PageServerNode { listen_http_addr_param, listen_pg_addr_param, broker_endpoint_param, + virtual_file_io_engine, + get_vectored_impl, ]; if let Some(control_plane_api) = &self.env.control_plane_api { @@ -111,7 +134,7 @@ impl PageServerNode { // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. 
- if matches!(self.conf.http_auth_type, AuthType::NeonJWT) { + if matches!(http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) @@ -129,8 +152,7 @@ impl PageServerNode { )); } - if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust - { + if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); @@ -554,13 +576,6 @@ impl PageServerNode { Ok(self.http_client.list_timelines(*tenant_shard_id).await?) } - pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> { - Ok(self - .http_client - .tenant_secondary_download(*tenant_id) - .await?) - } - pub async fn timeline_create( &self, tenant_shard_id: TenantShardId, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index d7673f1b26..e7697ecac8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -38,6 +38,9 @@ const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +// Use a shorter pageserver unavailability interval than the default to speed up tests. +const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -269,6 +272,8 @@ impl StorageController { // Run migrations on every startup, in case something changed. let database_url = self.setup_database().await?; + let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into(); + let mut args = vec![ "-l", &self.listen, @@ -276,6 +281,8 @@ impl StorageController { self.path.as_ref(), "--database-url", &database_url, + "--max-unavailable-interval", + &max_unavailable.to_string(), ] .into_iter() .map(|s| s.to_string()) @@ -468,7 +475,7 @@ impl StorageController { pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { self.dispatch::<(), _>( Method::GET, - format!("control/v1/tenant/{tenant_id}/locate"), + format!("debug/v1/tenant/{tenant_id}/locate"), None, ) .await diff --git a/docs/rfcs/031-sharding-static.md b/docs/rfcs/031-sharding-static.md new file mode 100644 index 0000000000..fe009b8660 --- /dev/null +++ b/docs/rfcs/031-sharding-static.md @@ -0,0 +1,408 @@ +# Sharding Phase 1: Static Key-space Sharding + +## Summary + +To enable databases with sizes approaching the capacity of a pageserver's disk, +it is necessary to break up the storage for the database, or _shard_ it. + +Sharding in general is a complex area. This RFC aims to define an initial +capability that will permit creating large-capacity databases using a static configuration +defined at time of Tenant creation. + +## Motivation + +Currently, all data for a Tenant, including all its timelines, is stored on a single +pageserver. The local storage required may be several times larger than the actual +database size, due to LSM write inflation. + +If a database is larger than what one pageserver can hold, then it becomes impossible +for the pageserver to hold it in local storage, as it must do to provide service to +clients. 
+
+### Prior art
+
+In Neon:
+
+- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
+- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
+- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
+
+Prior art in other distributed systems is too broad to capture here: pretty much
+any scale-out storage system does something like this.
+
+## Requirements
+
+- Enable creating a large (for example, 16TiB) database without requiring dedicated
+  pageserver nodes.
+- Share read/write bandwidth costs for large databases across pageservers, as well
+  as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
+  that disrupt service to other tenants.
+- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
+  does not write out a single contiguous range of page numbers.
+
+_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
+that a user might create on a current-gen enterprise SSD should also work well on
+Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
+pageserver backend is not the limiting factor in the database size_.
+
+## Non Goals
+
+- Independently distributing timelines within the same tenant. If a tenant has many
+  timelines, then sharding may be a less efficient mechanism for distributing load than
+  sharing out timelines between pageservers.
+- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
+  based on the idea that separate mechanisms will make sense for each dimension.
+
+## Impacted Components
+
+pageserver, control plane, postgres/smgr
+
+## Terminology
+
+**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
+the page number is the key in that store. `Key` is a literal data type in existing code.
+
+**LSN dimension**: this just means the range of LSNs (history), when talking about the range
+of keys and LSNs as a two dimensional space.
+
+## Implementation
+
+### Key sharding vs. LSN sharding
+
+When we think of sharding across the two dimensional key/lsn space, this is an
+opportunity to think about how the two dimensions differ:
+
+- Sharding the key space distributes the _write_ workload of ingesting data
+  and compacting. This work must be carefully managed so that exactly one
+  node owns a given key.
+- Sharding the LSN space distributes the _historical read_ workload. This work
+  can be done by anyone without any special coordination, as long as they can
+  see the remote index and layers.
+
+The key sharding is the harder part, and also the more urgent one, to support larger
+capacity databases. Because distributing historical LSN read work is a relatively
+simpler problem that most users don't have, we defer it to future work. It is anticipated
+that some quite simple P2P offload model will enable distributing work for historical
+reads: a node which is low on space can call out to a peer to ask it to download and
+serve reads from a historical layer.
+
+### Key mapping scheme
+
+Having decided to focus on key sharding, we must next decide how we will map
+keys to shards.
It is proposed to use a "wide striping" approach, to obtain a good compromise
+between data locality and avoiding entire large relations mapping to the same shard.
+
+We will define two spaces:
+
+- Key space: unsigned integer
+- Shard space: integer from 0 to N-1, where we have N shards.
+
+### Key -> Shard mapping
+
+Keys are currently defined in the pageserver's getpage@lsn interface as follows:
+
+```
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+```
+
+_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
+shards. For distribution purposes, we only care about user data keys._
+
+The properties we want from our Key->Shard mapping are:
+
+- Locality in `blknum`, such that adjacent `blknum` will usually map to
+  the same stripe and consequently land on the same shard, even though the overall
+  collection of blocks in a relation will be spread over many stripes and therefore
+  many shards.
+- Avoid the same blknum on different relations landing on the same stripe, so that
+  with many small relations we do not end up aliasing data to the same stripe/shard.
+- Avoid vulnerability to aliasing in the values of relation identity fields, such that
+  if there are patterns in the value of `relnode`, these do not manifest as patterns
+  in data placement.
+
+To accomplish this, the blknum is used to select a stripe, and stripes are
+assigned to shards in a pseudorandom order via a hash. The motivation for
+pseudo-random distribution (rather than sequential mapping of stripe to shard)
+is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
+all relations' stripes to touch pageservers in the same order.
+
+To map a `Key` to a shard (a code sketch of this mapping follows the placement
+examples below):
+
+- Hash the `Key` field 4 (relNode).
+- Divide field 6 (`blknum`) by the stripe size in pages, and combine the
+  hash of this with the hash from the previous step.
+- The total hash modulo the shard count gives the shard holding this key.
+
+Why don't we use the other fields in the Key?
+
+- We ignore `forknum` for key mapping, because it distinguishes different classes of data
+  in the same relation, and we would like to keep the data in a relation together.
+- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
+  database's blocks differ only by spcNode and dbNode from the original. To enable running
+  this type of creation without cross-pageserver communication, we must ensure that these
+  blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
+
+### Data placement examples
+
+For example, consider the extreme cases of postgres data layout in a large database, in a system with 8 shards
+and a stripe size of 32k pages:
+
+- A single large relation: `blknum` division will break the data up into 4096
+  stripes, which will be scattered across the shards.
+- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
+  and that stripe will be placed according to the hash of key field 4. The
+  data placement will be statistically uniform across shards.
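+
+As a concrete illustration of the mapping rules above, here is a minimal sketch
+(using the `Key` struct above; the hash function and helper names are placeholders,
+not the final implementation -- a production scheme must use a stable, versioned hash):
+
+```
+use std::hash::{Hash, Hasher};
+
+fn key_to_shard(key: &Key, shard_count: u32, stripe_size: u32) -> u32 {
+    // Placeholder hash: DefaultHasher is not stable across Rust releases.
+    let mut hasher = std::collections::hash_map::DefaultHasher::new();
+    key.field4.hash(&mut hasher);                 // relnode
+    (key.field6 / stripe_size).hash(&mut hasher); // stripe number derived from blknum
+    // forknum, spcnode and dbnode are deliberately excluded (see above).
+    (hasher.finish() % shard_count as u64) as u32
+}
+```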
+
+Data placement will be more uneven on smaller databases:
+
+- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
+  that both relations land on the same shard and no data lands on the other shard.
+- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
+  the data of the other four shards.
+
+These uneven cases for small amounts of data do not matter, as long as the stripe size
+is an order of magnitude smaller than the amount of data we are comfortable holding
+in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
+a tenant has some shards with 256MB size and some shards with 512MB size, even though
+the standard deviation of shard size within the tenant is very high. Our key mapping
+scheme provides a statistical guarantee that as the tenant's overall data size increases,
+uniformity of placement will improve.
+
+### Important Types
+
+#### `ShardIdentity`
+
+Provides the information needed to know whether a particular key belongs
+to a particular shard:
+
+- Layout version
+- Stripe size
+- Shard count
+- Shard index
+
+This structure's size is constant. Note that if we had used a different key
+mapping scheme such as consistent hashing with explicit hash ranges assigned
+to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
+key mapping scheme used here enables a small fixed-size ShardIdentity.
+
+### Pageserver changes
+
+#### Structural
+
+Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
+`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
+of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
+covers the whole keyspace.
+
+When the pageserver writes layers and index_part.json to remote storage, it must
+include the shard index & count in the name, to avoid collisions (the count is
+necessary for future-proofing: the count will vary in time). These keys
+will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
+exactly the same for TenantShards as it does for Tenants today: each shard will have
+its own generation number.
+
+#### Storage Format: Keys
+
+For tenants with >1 shard, layer files implicitly become sparse: within the key
+range described in the layer name, the layer file for a shard will only hold the
+content relevant to stripes assigned to the shard.
+
+For this reason, the LayerFileName within a tenant is no longer unique: different shards
+may use the same LayerFileName to refer to different data. We may solve this simply
+by including the shard number in the keys used for layers.
+
+The shard number will be included as a prefix (as part of tenant ID), like this:
+
+`pageserver/v1/tenants/<tenant_id>-<shard>/timelines/<timeline_id>/<layer_file_name>-<generation>`
+
+`pageserver/v1/tenants/<tenant_id>-<shard>/timelines/<timeline_id>/index_part.json-<generation>`
+
+Reasons for this particular format:
+
+- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
+  we construct a layer file name), and enables efficient listing of index_parts within
+  a particular shard-timeline prefix.
+- Including the shard _count_ as well as shard number means that in future when we implement
+  shard splitting, it will be possible for a parent shard and one of its children to write
+  the same layer file without a name collision.
For example, a parent shard 0_1 might split
+  into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
+  that is distinct from what shard 0_1 would have written at the same place.
+
+In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
+and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
+for example a single-shard tenant's prefix will be `0001`.
+
+For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
+and use this as a cue to construct paths with no prefix at all.
+
+#### Storage Format: Indices
+
+In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
+when we implement shard splitting in future, it will be useful to enable shards to reference layers
+written by other shards (specifically the parent shard during a split), so that shards don't
+have to exhaustively copy all data into their own shard-prefixed keys.
+
+To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
+tuple on each layer, such that it can construct paths for layers written by other shards. This
+naturally raises the question of who "owns" such layers written by ancestral shards: this problem
+will be addressed in phase 2.
+
+For backward compatibility, any index entry without shard information will be assumed to be
+in the legacy `ShardIdentity`.
+
+#### WAL Ingest
+
+In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
+it down to the pages relevant to their shard:
+
+- For ordinary user data writes, only retain a write if it matches the ShardIdentity.
+- For metadata describing relations etc, all shards retain these writes.
+
+The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
+one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
+and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
+expensive: if the safekeeper can be made shard-aware then it could be taught to use
+the min() of all shards' remote_consistent_lsns to decide when to trim the WAL, since
+WAL up to an LSN may only be trimmed once every shard has persisted it.
+
+#### Compaction/GC
+
+No changes needed.
+
+The pageserver doesn't have to do anything special during compaction
+or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
+This will result in sparse layer files, containing keys only in the stripes that this
+shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
+the key range, these should be updated to ignore gaps that are due to sharding, to
+avoid spuriously splitting up layers into stripe-sized pieces.
+
+### Compute Endpoints
+
+Compute endpoints will need to:
+
+- Accept a vector of connection strings as part of their configuration from the control plane
+- Route pageserver requests by mapping each key to a shard, and picking the corresponding
+  entry in the vector of connection strings (a short routing sketch follows below).
+
+Doing this in compute rather than routing requests via a single pageserver is
+necessary to enable sharding tenants without adding latency from extra hops.
+
+### Control Plane
+
+Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
+be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
+tenants.
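+
+To make the compute-side routing above concrete, a minimal sketch (hypothetical
+names; the real change lives in the smgr layer of the compute, and `key_to_shard`
+is the placeholder mapping sketched earlier in this RFC):
+
+```
+struct ShardMap {
+    stripe_size: u32,
+    // One connection string per shard, indexed by shard number.
+    connstrings: Vec<String>,
+}
+
+impl ShardMap {
+    fn connstring_for(&self, key: &Key) -> &str {
+        let shard = key_to_shard(key, self.connstrings.len() as u32, self.stripe_size);
+        &self.connstrings[shard as usize]
+    }
+}
+```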
+
+Tenant lifecycle operations like deletion will require fanning out to all the shards
+in the tenant. The same goes for timeline creation and deletion: a timeline should
+not be considered created until it has been created in all shards.
+
+#### Selectively enabling sharding for large tenants
+
+Initially, we will explicitly enable sharding for large tenants only.
+
+In future, this hint mechanism will become optional when we implement automatic
+re-sharding of tenants.
+
+## Future Phases
+
+This section exists to indicate what will likely come next after this phase.
+
+Phases 2a and 2b are amenable to execution in parallel.
+
+### Phase 2a: WAL fan-out
+
+**Problem**: when all shards consume the whole WAL, the network bandwidth used
+for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
+of the shard count.
+
+Network bandwidth is not our most pressing bottleneck, but it is likely to become
+a problem if we set a modest shard count (~8) on a significant number of tenants,
+especially as those larger tenants which we shard are also likely to have higher
+write bandwidth than average.
+
+### Phase 2b: Shard Splitting
+
+**Problem**: the number of shards in a tenant is defined at creation time and cannot
+be changed. This causes excessive sharding for most small tenants, and an upper
+bound on scale for very large tenants.
+
+To address this, a _splitting_ feature will later be added. One shard can split its
+data into a number of children by doing a special compaction operation to generate
+image layers broken up child-shard-wise, and then writing out an `index_part.json` for
+each child. This will then require external coordination (by the control plane) to
+safely attach these new child shards and then move them around to distribute work.
+The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
+once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
+the risk/complexity of implementing such a rarely-encountered scenario.
+
+### Phase N (future): distributed historical reads
+
+**Problem**: while sharding based on key is good for handling changes in overall
+database size, it is less suitable for spiky/unpredictable changes in the read
+workload to historical layers. Sudden increases in historical reads could result
+in sudden increases in local disk capacity required for a TenantShard.
+
+Example: the extreme case of this would be to run a tenant for a year, then create branches
+with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
+the on-disk capacity footprint of a TenantShard, since it would be serving reads
+from all those disparate historical layers.
+
+If we can respond fast enough, then key-sharding a tenant more finely can help with
+this, but splitting may be a relatively expensive operation and the increased historical
+read load may be transient.
+
+A separate mechanism for handling heavy historical reads could be something like
+a gossip mechanism for pageservers to communicate
+about their workload, and then a getpageatlsn offload mechanism where one pageserver can
+ask another to go read the necessary layers from remote storage to serve the read. This
+requires relatively little coordination because it is read-only: any node can service any
+read. All reads to a particular shard would still flow through one node, but the
+disk capacity & I/O impact of servicing the read would be distributed.
+
+## FAQ/Alternatives
+
+### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
+
+When a database is growing under a write workload, writes may predominantly hit the
+end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
+is intensively re-writing a particular relation and that relation lived in a particular
+shard, then it would not achieve our goal of distributing the write work across shards.
+
+### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
+
+1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
+   database would still cause a load hotspot on the pageserver routing its read requests.
+2. The additional hop through the "proxy" pageserver would add latency and overall
+   resource cost (CPU, network bandwidth).
+
+### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
+
+In this model, there would be no explicit sharding of work, but the pageserver to which
+a tenant is attached would not hold all layers on its disk: instead, it would call out
+to peers to have them store some layers, and call out to those peers to request reads
+in those layers.
+
+This mechanism will work well for distributing work in the LSN dimension, but in the key
+space dimension it has the major limitation of requiring one node to handle all
+incoming writes, and compactions. Even if the write workload for a large database
+fits in one pageserver, it will still be a hotspot and such tenants may still
+de-facto require their own pageserver.
diff --git a/docs/rfcs/032-shard-splitting.md b/docs/rfcs/032-shard-splitting.md
new file mode 100644
index 0000000000..d5fbda8415
--- /dev/null
+++ b/docs/rfcs/032-shard-splitting.md
@@ -0,0 +1,479 @@
+# Shard splitting
+
+## Summary
+
+This RFC describes a new pageserver API for splitting an existing tenant shard into
+multiple shards, and describes how to use this API to safely increase the total
+shard count of a tenant.
+
+## Motivation
+
+In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
+tenants beyond the capacity of a single pageserver by breaking up the key space
+into stripes, and distributing these stripes across many pageservers. However,
+the shard count was defined once at tenant creation time and not varied thereafter.
+
+In practice, the expected size of a database is rarely known at creation time, and
+it is inefficient to enable sharding for very small tenants: we need to be
+able to create a tenant with a small number of shards (such as 1), and later expand
+when it becomes clear that the tenant has grown in size to a point where sharding
+is beneficial.
+
+### Prior art
+
+Many distributed systems have the problem of choosing how many shards to create for
+tenants that do not specify an expected size up-front. There are a couple of general
+approaches:
+
+- Write to a key space in order, and start a new shard when the highest key advances
+  past some point. This doesn't work well for Neon, because we write to our key space
+  in many different contiguous ranges (per relation), rather than in one contiguous
+  range. To adapt to this kind of model, we would need a sharding scheme where each
+  relation had its own range of shards, which would be inefficient for the common
+  case of databases with many small relations.
+- Monitor the system, and automatically re-shard at some size threshold.
For
+  example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
+  component monitors the size of each RADOS Pool, and adjusts the number of Placement
+  Groups (Ceph's shard equivalent).
+
+## Requirements
+
+- A configurable capacity limit per shard is enforced.
+- Changes in shard count do not interrupt service beyond requiring postgres
+  to reconnect (i.e. milliseconds).
+- A human being does not have to choose the shard count.
+
+## Non Goals
+
+- Shard splitting is always a tenant-global operation: we will not enable splitting
+  one shard while leaving others intact.
+- The inverse operation (shard merging) is not described in this RFC. This is a lower
+  priority than splitting, because databases grow more often than they shrink, and
+  a database with many shards will still work properly if the stored data shrinks, just
+  with slightly more overhead (e.g. redundant WAL replication).
+- Shard splitting is only initiated based on capacity bounds, not load. Splitting
+  a tenant based on load will make sense for some medium-capacity, high-load workloads,
+  but is more complex to reason about and likely is not desirable until we have
+  shard merging to reduce the shard count again if the database becomes less busy.
+
+## Impacted Components
+
+pageserver, storage controller
+
+(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
+
+## Terminology
+
+**Parent** shards are the shards that exist before a split. **Child** shards are
+the new shards created during a split.
+
+**Shard** is synonymous with _tenant shard_.
+
+**Shard Index** is the 2-tuple of shard number and shard count, written in
+paths as {:02x}{:02x}, e.g. `0001`.
+
+## Background
+
+In the implementation section, a couple of existing aspects of sharding are important
+to remember:
+
+- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
+  a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
+  storage paths, and remote index metadata.
+- Remote layer file paths contain the shard index of the shard that created them, and
+  remote indices contain the same index to enable building the layer file path. A shard's
+  index may reference layers that were created by another shard.
+- Local tenant shard directories include the shard index. All layers downloaded by
+  a tenant shard are stored in this shard-prefixed path, even if those layers were
+  initially created by another shard: tenant shards do not read and write one another's
+  paths.
+- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
+  This is for historical reasons and will be cleaned up in future, but the existing
+  name is used here to help comprehension when reading code.
+
+## Implementation
+
+Note: this section focuses on the correctness of the core split process. This will
+be fairly inefficient in a naive implementation, and several important optimizations
+are described in a later section.
+
+There are broadly two parts to the implementation:
+
+1. The pageserver split API, which splits one shard on one pageserver
+2. The overall tenant split process which is coordinated by the storage controller,
+   and calls into the pageserver split API as needed.
+
+### Pageserver Split API
+
+The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
+that takes the new total shard count in the body.
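+
+For illustration, a client-side call might look like the sketch below. The HTTP
+method and the body field name are assumptions for illustration, not a confirmed
+contract:
+
+```
+// Hypothetical reqwest-based client sketch; `pageserver_api` and
+// `tenant_shard_id` are assumed to be in scope.
+let client = reqwest::Client::new();
+let response = client
+    .put(format!("{pageserver_api}/v1/tenant/{tenant_shard_id}/shard_split"))
+    .json(&serde_json::json!({ "new_shard_count": 4 }))
+    .send()
+    .await?;
+assert!(response.status().is_success());
+```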
+
+The pageserver split API operates on one tenant shard, on one pageserver. External
+coordination is required to use it safely; this is described in the later
+'Split procedure' section.
+
+#### Preparation
+
+First identify the shard indices for the new child shards. These are deterministic,
+calculated from the parent shard's index, and the number of children being created (this
+is an input to the API, and validated to be a power of two). In a trivial example, splitting
+0001 in two always results in 0002 and 0102. (A sketch of this calculation appears at the
+end of this section.)
+
+Child shard indices are chosen such that the children's parts of the keyspace will
+be subsets of the parent's parts of the keyspace.
+
+#### Step 1: write new remote indices
+
+In remote storage, splitting is very simple: we may just write new index_part.json
+objects for each child shard, containing exactly the same layers as the parent shard.
+
+The children will have more data than they need, but this avoids any exhaustive
+re-writing or copying of layer files.
+
+The index key path includes a generation number: the parent shard's current
+attached generation number will also be used for the child shards' indices. This
+makes the operation safely retryable: if everything crashes and restarts, we may
+call the split API again on the parent shard, and the result will be some new remote
+indices for the child shards, under a higher generation number.
+
+#### Step 2: start new `Tenant` objects
+
+A new `Tenant` object may be instantiated for each child shard, while the parent
+shard still exists. When calling the tenant_spawn function for this object,
+the remote index from step 1 will be read, and the child shard will start
+to ingest WAL to catch up from whatever was in the remote storage at step 1.
+
+We now wait for child shards' WAL ingestion to catch up with the parent shard,
+so that we can safely tear down the parent shard without risking an availability
+gap to clients reading recent LSNs.
+
+#### Step 3: tear down parent `Tenant` object
+
+Once child shards are running and have caught up with WAL ingest, we no longer
+need the parent shard. Note that clients may still be using it -- when we
+shut it down, any page_service handlers will also shut down, causing clients
+to disconnect. When the client reconnects, it will look up the tenant again,
+and hit the child shard instead of the parent (shard lookup from page_service
+should bias toward higher ShardCount shards).
+
+Note that at this stage the page service client has not yet been notified of
+any split. In the trivial single split example:
+
+- Shard 0001 is gone: Tenant object torn down
+- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
+- Clients will continue to connect to that server thinking that shard 0001 is there,
+  and all requests will work, because any key that was in shard 0001 is definitely
+  available in either shard 0002 or shard 0102.
+- Eventually, the storage controller (not the pageserver) will decide to migrate
+  some child shards away: at that point it will do a live migration, ensuring
+  that the client has an updated configuration before it detaches anything
+  from the original server.
+
+#### Complete
+
+When we send a 200 response to the split request, we are promising the caller:
+
+- That the child shards are persistent in remote storage
+- That the parent shard has been shut down
+
+This enables the caller to proceed with the overall shard split operation, which
+may involve other shards on other pageservers.
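+
+A sketch of the deterministic child-index calculation used in the Preparation step
+(names illustrative). Each child keeps the parent's number modulo the old count,
+which is what makes its keyspace a subset of the parent's under the
+hash-modulo-shard-count mapping:
+
+```
+fn child_shard_numbers(parent_number: u8, old_count: u8, new_count: u8) -> Vec<u8> {
+    // The API validates the split factor; assert the expected invariants here.
+    assert!(new_count % old_count == 0);
+    assert!((new_count / old_count).is_power_of_two());
+    (0..new_count)
+        .filter(|child| child % old_count == parent_number)
+        .collect()
+}
+
+// Splitting 0001 (number 0 of count 1) into two yields numbers [0, 1],
+// i.e. shards 0002 and 0102, matching the example above.
+assert_eq!(child_shard_numbers(0, 1, 2), vec![0, 1]);
+```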
+
+### Storage Controller Split procedure
+
+Splitting a tenant requires calling the pageserver split API, and tracking
+enough state to ensure recovery + completion in the event of any component (pageserver
+or storage controller) crashing (or request timing out) during the split.
+
+1. Call the split API on all existing shards. Ensure that the resulting
+   child shards are pinned to their pageservers until _all_ the split calls are done.
+   This pinning may be implemented as a "split bit" on the tenant shards that
+   blocks any migrations, and also acts as a sign that if we restart, we must go
+   through some recovery steps to resume the split.
+2. Once all the split calls are done, we may unpin the child shards (clear
+   the split bit). The split is now complete: subsequent steps are just migrations,
+   not strictly part of the split.
+3. Try to schedule new pageserver locations for the child shards, using
+   a soft anti-affinity constraint to place shards from the same tenant onto different
+   pageservers.
+
+Updating computes about the new shard count is not necessary until we migrate
+any of the child shards away from the parent's location.
+
+### Recovering from failures
+
+#### Rolling back an incomplete split
+
+An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
+and detaching child shards. This will lose any WAL ingested into the children after the parents
+were detached earlier, but the parents will catch up.
+
+No special pageserver API is needed for this. From the storage controller's point of view, the
+procedure is:
+
+1. For all parent shards in the tenant, ensure they are attached.
+2. For all child shards, ensure they are not attached.
+3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
+
+Any remote storage content for child shards is left behind. This is similar to other cases where
+we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
+index that references it). Future online scrub/cleanup functionality can remove these objects, or
+they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
+which would include any child shards that were rolled back.
+
+If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
+this, we will **block timeline creation during splitting**, so that we can safely roll back until
+the split is complete, without risking losing timelines.
+
+Rolling back an incomplete split will happen automatically if a split fails due to some fatal
+reason, and will not be accessible via an API:
+
+- A pageserver fails to complete its split API request after too many retries
+- A pageserver returns a fatal unexpected error such as 400 or 500
+- The storage controller database returns a non-retryable error
+- Some internal invariant is violated in the storage controller split code
+
+#### Rolling back a complete split
+
+A complete shard split may be rolled back similarly to an incomplete split, with the following
+modifications:
+
+- The parent shards will no longer exist in the storage controller database, so these must
+  be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations. This
+  may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
+  shards in the storage controller database.
+- Any timelines that were created after the split completed will disappear when rolling back
+  to the parent shards. For this reason, rolling back after a complete split should only
+  be done due to serious issues where loss of recently created timelines is acceptable, or
+  in cases where we have confirmed that no timelines were created in the intervening period.
+- Parent shards' layers must not have been deleted: this property will come "for free" when
+  we first roll out sharding, by simply not implementing deletion of parent layers after
+  a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
+  Optimizations section), it should apply a TTL to layers such that we have a
+  defined walltime window in which rollback will be possible.
+
+The storage controller will expose an API for rolling back a complete split, for use
+in the field if we encounter some critical bug with a post-split tenant.
+
+#### Retrying API calls during Pageserver Restart
+
+When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
+child shards from an ongoing split. This does not intrinsically break anything, and the
+pageserver may include all these shards in its `/re-attach` request to the storage controller.
+
+In order to support such restarts, it is important that the storage controller stores
+persistent records of each child shard before it calls into a pageserver, as these child shards
+may require generation increments via a `/re-attach` request.
+
+The pageserver restart will also result in a failed API call from the storage controller's point
+of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
+complete, and all shards must remain pinned to their current pageserver locations until the
+split is done.
+
+The pageserver API calls during splitting will retry on transient errors, so that
+short availability gaps do not result in a failure of the overall operation. The
+split in progress will be automatically rolled back if the threshold for API
+retries is reached (e.g. if a pageserver stays offline for longer than a typical
+restart).
+
+#### Rollback on Storage Controller Restart
+
+On startup, the storage controller will inspect the split bit for tenant shards that
+it loads from the database. If any splits are in progress:
+
+- Database content will be reverted to the parent shards
+- Child shards will be dropped from memory
+- The parent and child shards will be included in the general startup reconciliation that
+  the storage controller does: any child shards will be detached from pageservers because
+  they don't exist in the storage controller's expected set of shards, and parent shards
+  will be attached if they aren't already.
+
+#### Storage controller API request failures/retries
+
+The split request handler will implement idempotency: if the [`Tenant`] requested to split
+doesn't exist, we will check for the would-be child shards, and if they already exist,
+we consider the request complete.
+
+If a request is retried while the original request is still underway, then the split
+request handler will notice an InProgress marker in TenantManager, and return 503
+to encourage the client to back off and retry. This is the same as the general pageserver
+API handling for calls that try to act on an InProgress shard.
+
+#### Compute start/restart during a split
+
+If a compute starts up during a split, it will be configured with the old sharding
+configuration.
This will work for reads irrespective of the progress of the split
+as long as no child shards have been migrated away from their original location, and
+this is guaranteed in the split procedure (see earlier section).
+
+#### Pageserver fails permanently during a split
+
+If a pageserver permanently fails (i.e. the storage controller availability state for it
+goes to Offline) while a split is in progress, the splitting operation will roll back, and
+during the roll back it will skip any API calls to the offline pageserver. If the offline
+pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
+
+### Handling secondary locations
+
+For correctness, it is not necessary to split secondary locations. We can simply detach
+the secondary locations for parent shards, and then attach new secondary locations
+for child shards.
+
+Clearly this is not optimal, as it will result in re-downloads of layer files that
+were already present on disk. See "Splitting secondary locations" below.
+
+### Conditions to trigger a split
+
+The pageserver will expose a new API for reporting on shards that are candidates
+for split: this will return a top-N report of the largest tenant shards by
+physical size (remote size). This should exclude any tenants that are already
+at the maximum configured shard count.
+
+The API would look something like:
+`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
+
+The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
+
+A split operation will be started when the tenant exceeds some threshold. This threshold
+should be _less than_ how large we actually want shards to be, perhaps much less. That's to
+minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
+wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
+tenant size distribution may be useful here: if we can make a statement like "usually, if
+a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
+make our policy to split a tenant at 20GiB.
+
+The finest split we can do is by factors of two, but we can do higher-cardinality splits
+too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
+as it grows. An example of a very simple heuristic for early deployment of the splitting
+feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
+would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
+split a tenant, it will not need re-splitting soon after.
+
+## Optimizations
+
+### Flush parent shard to remote storage during split
+
+Any data that is in WAL but not remote storage at the time of the split will need
+to be replayed by child shards when they start for the first time. To minimize
+this work, we may flush the parent shard to remote storage before writing the
+remote indices for child shards.
+
+It is important that this flush is subject to some time bounds: we may be splitting
+in response to a surge of write ingest, so it may be time-critical to split. A
+few seconds to flush the latest data should be sufficient to optimize common cases without
+running the risk of holding up a split for a harmful length of time when a parent
+shard is being written heavily. If the flush doesn't complete in time, we may proceed
+to shut down the parent shard and carry on with the split.
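+
+A minimal sketch of the time-bounded flush, assuming a hypothetical
+`flush_to_remote` future on the parent tenant returning `anyhow::Result<()>`;
+the key point is that a timeout is tolerated rather than treated as a fatal error:
+
+```
+const SPLIT_FLUSH_BUDGET: std::time::Duration = std::time::Duration::from_secs(5);
+
+match tokio::time::timeout(SPLIT_FLUSH_BUDGET, parent_tenant.flush_to_remote()).await {
+    Ok(Ok(())) => tracing::info!("parent shard flushed before split"),
+    Ok(Err(e)) => tracing::warn!("flush failed, continuing with split anyway: {e}"),
+    Err(_elapsed) => tracing::warn!("flush timed out, continuing with split anyway"),
+}
+```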
+
+### Hard linking parent layers into child shard directories
+
+Before we start the Tenant objects for child shards, we may pre-populate their
+local storage directories with hard links to the layer files already present
+in the parent shard's local directory. When the child shard starts and downloads
+its remote index, it will find all those layer files already present on local disk.
+
+This avoids wasting download capacity and makes splitting faster, but more importantly
+it avoids taking up a factor of N more disk space when splitting 1 shard into N.
+
+This mechanism will work well in typical flows where shards are migrated away
+promptly after a split, but for the general case including what happens when
+layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
+section below.
+
+### Filtering during compaction
+
+Compaction, especially image layer generation, should skip any keys that are
+present in a shard's layer files, but do not match the shard's ShardIdentity's
+is_key_local() check. This avoids carrying around data for longer than necessary
+in post-split compactions.
+
+This was already implemented in https://github.com/neondatabase/neon/pull/6246.
+
+### Proactive compaction
+
+In remote storage, there is little reason to rewrite any data on a shard split:
+all the children can reference parent layers via the very cheap write of the child
+index_part.json.
+
+In local storage, things are more nuanced. During the initial split there is no
+capacity cost to duplicating parent layers, if we implement the hard linking
+optimization described above. However, as soon as any layers are evicted from
+local disk and re-downloaded, the downloaded layers will not be hard-links any more:
+they'll have a real capacity footprint. That isn't a problem if we migrate child shards
+away from the parent node swiftly, but it risks a significant over-use of local disk
+space if we do not.
+
+For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
+the shards elsewhere, then churned all the layers in all the shards via eviction,
+then we would blow up the storage capacity used on the node by 8x. If we're splitting
+a 100GB shard, that could take the pageserver to the point of exhausting disk space.
+
+To avoid this scenario, we could implement a special compaction mode where we just
+read historic layers, drop unwanted keys, and write back the layer file. This
+is pretty expensive, but useful if we have split a large shard and are not going to
+migrate the child shards away.
+
+The heuristic conditions for triggering such a compaction are:
+
+- A) eviction plus time: if a child shard
+  has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
+- B) resident size plus time: we may inspect the resident layers and calculate how
+  many of them include the overhead of storing pre-split keys. If, after some time
+  threshold (different to the one in case A), we still have such layers occupying
+  local disk space, then we should proactively compact them.
+
+### Cleaning up parent-shard layers
+
+It is functionally harmless to leave parent shard layers in remote storage indefinitely.
+They would be cleaned up in the event of the tenant's deletion.
+
+As an optimization to avoid leaking remote storage capacity (which costs money), we may
+lazily clean up parent shard layers once no child shards reference them.
+
+This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
+
+- List all the key prefixes beginning with the tenant ID, and select those shard prefixes
+  which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e.
+  `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_).
+- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
+  may drop out now.
+- Find the latest-generation index for each _current shard_, read them all, and accumulate the set of layers belonging to ancestral shards referenced by these indices.
+- For all ancestral shards, list objects in the prefix and delete any layer which was not
+  referenced by a current shard.
+
+If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
+
+The cleanup may be done by the scrubber (external process), or we may choose to have
+the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
+reading the other shards' indices at runtime, and we do not require visibility of the
+latest index writes.
+
+Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
+that we retain the option to roll back a split in case of bugs.
+
+### Splitting secondary locations
+
+We may implement a pageserver API similar to the main splitting API, which does a simpler
+operation for secondary locations: it would not write anything to S3; instead it would simply
+create the child shard directory on local disk, hard link in directories from the parent,
+and set up the in-memory (TenantSlot) state for the children.
+
+Similar to attached locations, a subset of secondary locations will probably need re-locating
+after the split is complete, to avoid leaving multiple child shards on the same pageservers,
+where they may use excessive space for the tenant.
+
+## FAQ/Alternatives
+
+### What should the thresholds be set to?
+
+Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
+
+Max shard count:
+
+- The safekeeper overhead to sharding is currently O(N) network bandwidth because
+  the un-filtered WAL is sent to all shards. To avoid this growing out of control,
+  a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
+  on the safekeeper.
+- There is also little benefit to increasing the shard count beyond the number
+  of pageservers in a region.
+
+### Is it worth just rewriting all the data during a split to simplify reasoning about space?
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index c172354e9f..e33bd0f486 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -6,7 +6,10 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{models::ShardParameters, shard::TenantShardId}; +use crate::{ + models::{ShardParameters, TenantConfig}, + shard::{ShardStripeSize, TenantShardId}, +}; #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { @@ -35,7 +38,7 @@ pub struct NodeRegisterRequest { pub struct NodeConfigureRequest { pub node_id: NodeId, - pub availability: Option, + pub availability: Option, pub scheduling: Option, } @@ -57,6 +60,31 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponse { + pub shards: Vec, + pub stripe_size: ShardStripeSize, + pub policy: PlacementPolicy, + pub config: TenantConfig, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponseShard { + pub tenant_shard_id: TenantShardId, + + pub node_attached: Option, + pub node_secondary: Vec, + + pub last_error: String, + + /// A task is currently running to reconcile this tenant's intent state with the state on pageservers + pub is_reconciling: bool, + /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending. + pub is_pending_compute_notification: bool, + /// A shard split is currently underway + pub is_splitting: bool, +} + /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g. asking it to get off particular node(s) @@ -66,22 +94,76 @@ pub struct TenantShardMigrateRequest { pub node_id: NodeId, } -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +/// Utilisation score indicating how good a candidate a pageserver +/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. +/// Lower values are better. +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +pub struct UtilizationScore(pub u64); + +impl UtilizationScore { + pub fn worst() -> Self { + UtilizationScore(u64::MAX) + } +} + +#[derive(Serialize, Clone, Copy)] +#[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state - Active, + Active(UtilizationScore), // Offline: Tenants shouldn't try to attach here, but they may assume that their // secondary locations on this node still exist. Newly added nodes are in this // state until we successfully contact them. Offline, } +impl PartialEq for NodeAvailability { + fn eq(&self, other: &Self) -> bool { + use NodeAvailability::*; + matches!((self, other), (Active(_), Active(_)) | (Offline, Offline)) + } +} + +impl Eq for NodeAvailability {} + +// This wrapper provides serde functionality and it should only be used to +// communicate with external callers which don't know or care about the +// utilisation score of the pageserver it is targeting. +#[derive(Serialize, Deserialize, Clone)] +pub enum NodeAvailabilityWrapper { + Active, + Offline, +} + +impl From for NodeAvailability { + fn from(val: NodeAvailabilityWrapper) -> Self { + match val { + // Assume the worst utilisation score to begin with. It will later be updated by + // the heartbeats. 
+ NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, + } + } +} + +impl From for NodeAvailabilityWrapper { + fn from(val: NodeAvailability) -> Self { + match val { + NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, + NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, + } + } +} + impl FromStr for NodeAvailability { type Err = anyhow::Error; fn from_str(s: &str) -> Result { match s { - "active" => Ok(Self::Active), + // This is used when parsing node configuration requests from neon-local. + // Assume the worst possible utilisation score + // and let it get updated via the heartbeats. + "active" => Ok(Self::Active(UtilizationScore::worst())), "offline" => Ok(Self::Offline), _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), } @@ -127,11 +209,8 @@ impl From for String { /// to create secondary locations. #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] pub enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. - Double(usize), + /// Normal live state: one attached pageserver and zero or more secondaries. + Attached(usize), /// Create one secondary mode locations. This is useful when onboarding /// a tenant, or for an idle tenant that we might want to bring online quickly. Secondary, @@ -153,14 +232,14 @@ mod test { /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { - let v = PlacementPolicy::Double(1); + let v = PlacementPolicy::Attached(1); let encoded = serde_json::to_string(&v)?; - assert_eq!(encoded, "{\"Double\":1}"); + assert_eq!(encoded, "{\"Attached\":1}"); assert_eq!(serde_json::from_str::(&encoded)?, v); - let v = PlacementPolicy::Single; + let v = PlacementPolicy::Detached; let encoded = serde_json::to_string(&v)?; - assert_eq!(encoded, "\"Single\""); + assert_eq!(encoded, "\"Detached\""); assert_eq!(serde_json::from_str::(&encoded)?, v); Ok(()) } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a96cc09158..aad4cc97fc 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -4,6 +4,7 @@ pub mod utilization; pub use utilization::PageserverUtilization; use std::{ + borrow::Cow, collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, @@ -426,7 +427,7 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - pub tenant_id: TenantShardId, + pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -577,7 +578,7 @@ pub struct TimelineInfo { pub walreceiver_status: String, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerMapInfo { pub in_memory_layers: Vec, pub historic_layers: Vec, @@ -595,7 +596,7 @@ pub enum LayerAccessKind { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStatFullDetails { pub when_millis_since_epoch: u64, - pub task_kind: &'static str, + pub task_kind: Cow<'static, str>, pub access_kind: LayerAccessKind, } @@ -654,23 +655,23 @@ impl LayerResidenceEvent { } } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub 
struct LayerAccessStats { pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec<&'static str>, + pub task_kind_access_flag: Vec>, pub first: Option, pub accesses_history: HistoryBufferWithDropCounter, pub residence_events_history: HistoryBufferWithDropCounter, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum InMemoryLayerInfo { Open { lsn_start: Lsn }, Frozen { lsn_start: Lsn, lsn_end: Lsn }, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum HistoricLayerInfo { Delta { @@ -692,6 +693,32 @@ pub enum HistoricLayerInfo { }, } +impl HistoricLayerInfo { + pub fn layer_file_name(&self) -> &str { + match self { + HistoricLayerInfo::Delta { + layer_file_name, .. + } => layer_file_name, + HistoricLayerInfo::Image { + layer_file_name, .. + } => layer_file_name, + } + } + pub fn is_remote(&self) -> bool { + match self { + HistoricLayerInfo::Delta { remote, .. } => *remote, + HistoricLayerInfo::Image { remote, .. } => *remote, + } + } + pub fn set_remote(&mut self, value: bool) { + let field = match self { + HistoricLayerInfo::Delta { remote, .. } => remote, + HistoricLayerInfo::Image { remote, .. } => remote, + }; + *field = value; + } +} + #[derive(Debug, Serialize, Deserialize)] pub struct DownloadRemoteLayersTaskSpawnRequest { pub max_concurrent_downloads: NonZeroUsize, @@ -724,6 +751,52 @@ pub struct WalRedoManagerStatus { pub pid: Option, } +/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating +/// a download job, timing out while waiting for it to run, and then inspecting this status to understand +/// what's happening. +#[derive(Default, Debug, Serialize, Deserialize, Clone)] +pub struct SecondaryProgress { + /// The remote storage LastModified time of the heatmap object we last downloaded. + #[serde( + serialize_with = "opt_ser_rfc3339_millis", + deserialize_with = "opt_deser_rfc3339_millis" + )] + pub heatmap_mtime: Option, + + /// The number of layers currently on-disk + pub layers_downloaded: usize, + /// The number of layers in the most recently seen heatmap + pub layers_total: usize, + + /// The number of layer bytes currently on-disk + pub bytes_downloaded: u64, + /// The number of layer bytes in the most recently seen heatmap + pub bytes_total: u64, +} + +fn opt_ser_rfc3339_millis( + ts: &Option, + serializer: S, +) -> Result { + match ts { + Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)), + None => serializer.serialize_none(), + } +} + +fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result, D::Error> +where + D: serde::de::Deserializer<'de>, +{ + let s: Option = serde::de::Deserialize::deserialize(deserializer)?; + match s { + None => Ok(None), + Some(s) => humantime::parse_rfc3339(&s) + .map_err(serde::de::Error::custom) + .map(Some), + } +} + pub mod virtual_file { #[derive( Copy, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 7195a12395..f5984dff5d 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -7,7 +7,7 @@ use std::time::SystemTime; /// /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might /// not handle full u64 values properly. 
-#[derive(serde::Serialize, Debug)] +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] pub struct PageserverUtilization { /// Used disk space #[serde(serialize_with = "ser_saturating_u63")] @@ -21,7 +21,10 @@ pub struct PageserverUtilization { /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - #[serde(serialize_with = "ser_rfc3339_millis")] + #[serde( + serialize_with = "ser_rfc3339_millis", + deserialize_with = "deser_rfc3339_millis" + )] pub captured_at: SystemTime, } @@ -32,6 +35,14 @@ fn ser_rfc3339_millis( serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) } +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// /// Instead of newtype, use this because a newtype would get require handling deserializing values diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 15f3cd3b80..4a53f485ca 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -18,6 +18,7 @@ camino.workspace = true humantime.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true +rand.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 12ec680cb6..5fff3e25c9 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -157,9 +157,8 @@ impl AzureBlobStorage { let mut bufs = Vec::new(); while let Some(part) = response.next().await { let part = part?; - let etag_str: &str = part.blob.properties.etag.as_ref(); if etag.is_none() { - etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + etag = Some(part.blob.properties.etag); } if last_modified.is_none() { last_modified = Some(part.blob.properties.last_modified.into()); @@ -174,6 +173,16 @@ impl AzureBlobStorage { .map_err(|e| DownloadError::Other(e.into()))?; bufs.push(data); } + + if bufs.is_empty() { + return Err(DownloadError::Other(anyhow::anyhow!( + "Azure GET response contained no buffers" + ))); + } + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already + let etag = etag.unwrap(); + let last_modified = last_modified.unwrap(); + Ok(Download { download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), etag, diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index b0b69f9155..ab2035f19a 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -42,6 +42,9 @@ pub use self::{ }; use s3_bucket::RequestKind; +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. 
+pub use azure_core::Etag; + pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; /// Currently, sync happens with AWS S3, that has two limits on requests per second: @@ -291,9 +294,9 @@ pub type DownloadStream = pub struct Download { pub download_stream: DownloadStream, /// The last time the file was modified (`last-modified` HTTP header) - pub last_modified: Option, + pub last_modified: SystemTime, /// A way to identify this specific version of the resource (`etag` HTTP header) - pub etag: Option, + pub etag: Etag, /// Extra key-value data, associated with the current remote file. pub metadata: Option, } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 478ad81dc1..313d8226b1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -10,7 +10,7 @@ use std::{ io::ErrorKind, num::NonZeroU32, pin::Pin, - time::{Duration, SystemTime}, + time::{Duration, SystemTime, UNIX_EPOCH}, }; use anyhow::{bail, ensure, Context}; @@ -30,6 +30,7 @@ use crate::{ }; use super::{RemoteStorage, StorageMetadata}; +use crate::Etag; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; @@ -406,35 +407,37 @@ impl RemoteStorage for LocalFs { cancel: &CancellationToken, ) -> Result { let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let source = ReaderStream::new( - fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?, - ); - let metadata = self - .read_storage_metadata(&target_path) + let file_metadata = file_metadata(&target_path).await?; + + let source = ReaderStream::new( + fs::OpenOptions::new() + .read(true) + .open(&target_path) .await - .map_err(DownloadError::Other)?; + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") + }) + .map_err(DownloadError::Other)?, + ); - let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); - let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream: Box::pin(source), - }) - } else { - Err(DownloadError::NotFound) - } + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) } async fn download_byte_range( @@ -452,50 +455,51 @@ impl RemoteStorage for LocalFs { return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } + let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? 
{ - let mut source = tokio::fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?; - - let len = source - .metadata() - .await - .context("query file length") - .map_err(DownloadError::Other)? - .len(); - - source - .seek(io::SeekFrom::Start(start_inclusive)) - .await - .context("Failed to seek to the range start in a local storage file") - .map_err(DownloadError::Other)?; - - let metadata = self - .read_storage_metadata(&target_path) - .await - .map_err(DownloadError::Other)?; - - let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); - let source = ReaderStream::new(source); - - let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); - let source = crate::support::DownloadStream::new(cancel_or_timeout, source); - - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream: Box::pin(source), + let file_metadata = file_metadata(&target_path).await?; + let mut source = tokio::fs::OpenOptions::new() + .read(true) + .open(&target_path) + .await + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") }) - } else { - Err(DownloadError::NotFound) - } + .map_err(DownloadError::Other)?; + + let len = source + .metadata() + .await + .context("query file length") + .map_err(DownloadError::Other)? + .len(); + + source + .seek(io::SeekFrom::Start(start_inclusive)) + .await + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; + + let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); + let source = ReaderStream::new(source); + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) } async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> { @@ -610,13 +614,22 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result< Ok(()) } -fn file_exists(file_path: &Utf8Path) -> anyhow::Result { - if file_path.exists() { - ensure!(file_path.is_file(), "file path '{file_path}' is not a file"); - Ok(true) - } else { - Ok(false) - } +async fn file_metadata(file_path: &Utf8Path) -> Result { + tokio::fs::metadata(&file_path).await.map_err(|e| { + if e.kind() == ErrorKind::NotFound { + DownloadError::NotFound + } else { + DownloadError::BadInput(e.into()) + } + }) +} + +// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we +// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests +// quickly, with less overhead than using a mock S3 server. 
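To make the tradeoff concrete before the `mock_etag` helper below: an mtime-derived ETag is stable while the file is untouched and changes on rewrite, even if the bytes are identical, which is acceptable for the tests this backend serves. A standalone sketch (the file name is a placeholder):

```rust
use std::time::UNIX_EPOCH;

// Same derivation as the helper below: milliseconds since the epoch of mtime.
fn mtime_millis(meta: &std::fs::Metadata) -> u128 {
    meta.modified()
        .expect("filesystem does not report mtime")
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_millis()
}

fn main() -> std::io::Result<()> {
    std::fs::write("demo.txt", b"hello")?;
    let a = mtime_millis(&std::fs::metadata("demo.txt")?);
    let b = mtime_millis(&std::fs::metadata("demo.txt")?);
    assert_eq!(a, b); // unchanged file => stable mock ETag
    std::fs::write("demo.txt", b"hello")?; // same bytes, but a fresh mtime
    // The mock ETag may now differ even though the content did not.
    Ok(())
}
```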
+fn mock_etag(meta: &std::fs::Metadata) -> Etag { + let mtime = meta.modified().expect("Filesystem mtime missing"); + format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into() } #[cfg(test)] diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 438f45fbde..1cb85cfb1b 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -35,8 +35,8 @@ use aws_sdk_s3::{ }; use aws_smithy_async::rt::sleep::TokioSleep; -use aws_smithy_types::byte_stream::ByteStream; use aws_smithy_types::{body::SdkBody, DateTime}; +use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; use hyper::Body; @@ -287,8 +287,17 @@ impl S3Bucket { let remaining = self.timeout.saturating_sub(started_at.elapsed()); let metadata = object_output.metadata().cloned().map(StorageMetadata); - let etag = object_output.e_tag; - let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); + let etag = object_output + .e_tag + .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))? + .into(); + let last_modified = object_output + .last_modified + .ok_or(DownloadError::Other(anyhow::anyhow!( + "Missing LastModified header" + )))? + .try_into() + .map_err(|e: ConversionError| DownloadError::Other(e.into()))?; let body = object_output.body; let body = ByteStreamAsStream::from(body); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index d8b9824d99..bc5e40e70f 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -118,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // A little check to ensure that our clock is not too far off from the S3 clock { let dl = retry(|| ctx.client.download(&path2, &cancel)).await?; - let last_modified = dl.last_modified.unwrap(); + let last_modified = dl.last_modified; let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index de27ae4e28..b3b50461da 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -13,6 +13,7 @@ testing = ["fail/failpoints"] [dependencies] arc-swap.workspace = true sentry.workspace = true +async-compression.workspace = true async-trait.workspace = true anyhow.workspace = true bincode.workspace = true @@ -36,6 +37,7 @@ serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-tar.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-error.workspace = true @@ -46,6 +48,7 @@ strum.workspace = true strum_macros.workspace = true url.workspace = true uuid.workspace = true +walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs index 1f07f5560f..bd35e2bad6 100644 --- a/libs/utils/src/history_buffer.rs +++ b/libs/utils/src/history_buffer.rs @@ -47,9 +47,10 @@ impl ops::Deref for HistoryBufferWithDropCounter { } } -#[derive(serde::Serialize)] +#[derive(serde::Serialize, serde::Deserialize)] struct SerdeRepr { buffer: Vec, + buffer_size: usize, drop_count: u64, } @@ -61,6 +62,7 @@ where let HistoryBufferWithDropCounter { buffer, drop_count } = value; SerdeRepr { buffer: buffer.iter().cloned().collect(), + buffer_size: L, drop_count: *drop_count, } } @@ -78,19 +80,52 @@ 
where } } +impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter +where + T: Clone + serde::Deserialize<'de>, +{ + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let SerdeRepr { + buffer: des_buffer, + drop_count, + buffer_size, + } = SerdeRepr::::deserialize(deserializer)?; + if buffer_size != L { + use serde::de::Error; + return Err(D::Error::custom(format!( + "invalid buffer_size, expecting {L} got {buffer_size}" + ))); + } + let mut buffer = HistoryBuffer::new(); + buffer.extend(des_buffer); + Ok(HistoryBufferWithDropCounter { buffer, drop_count }) + } +} + #[cfg(test)] mod test { use super::HistoryBufferWithDropCounter; #[test] fn test_basics() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); + let mut b = HistoryBufferWithDropCounter::::default(); b.write(1); b.write(2); b.write(3); assert!(b.iter().any(|e| *e == 2)); assert!(b.iter().any(|e| *e == 3)); assert!(!b.iter().any(|e| *e == 1)); + + // round-trip serde + let round_tripped: HistoryBufferWithDropCounter = + serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap(); + assert_eq!( + round_tripped.iter().cloned().collect::>(), + b.iter().cloned().collect::>() + ); } #[test] diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index fd4e068b39..336ee87570 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -87,6 +87,8 @@ pub mod failpoint_support; pub mod yielding_loop; +pub mod zstd; + pub mod poison; /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index bc8fa7362e..3ddfa44f41 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -29,12 +29,10 @@ pub struct PageserverFeedback { // Serialize with RFC3339 format. #[serde(with = "serde_systemtime")] pub replytime: SystemTime, + /// Used to track feedbacks from different shards. Always zero for unsharded tenants. + pub shard_number: u32, } -// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. -// Do not remove previously available fields because this might be backwards incompatible. -pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; - impl PageserverFeedback { pub fn empty() -> PageserverFeedback { PageserverFeedback { @@ -43,6 +41,7 @@ impl PageserverFeedback { remote_consistent_lsn: Lsn::INVALID, disk_consistent_lsn: Lsn::INVALID, replytime: *PG_EPOCH, + shard_number: 0, } } @@ -59,17 +58,26 @@ impl PageserverFeedback { // // TODO: change serialized fields names once all computes migrate to rename. 
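What the new `buffer_size` field buys: a payload recorded from a buffer of one size cannot be silently loaded into a buffer of another size. A hedged sketch of the rejection path, assuming the crate-internal types above are in scope; the `PageserverFeedback::serialize` body that the TODO above annotates follows.

```rust
#[test]
fn buffer_size_mismatch_is_rejected() {
    // Serialize from a 2-slot buffer: the payload carries "buffer_size": 2.
    let small = HistoryBufferWithDropCounter::<u32, 2>::default();
    let json = serde_json::to_string(&small).unwrap();
    // Deserializing into a 4-slot buffer trips the guard added above.
    let err = serde_json::from_str::<HistoryBufferWithDropCounter<u32, 4>>(&json).unwrap_err();
    assert!(err.to_string().contains("invalid buffer_size"));
}
```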
pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys + let buf_ptr = buf.len(); + buf.put_u8(0); // # of keys, will be filled later + let mut nkeys = 0; + + nkeys += 1; buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); + nkeys += 1; buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.last_received_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.disk_consistent_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.remote_consistent_lsn.0); @@ -80,9 +88,19 @@ impl PageserverFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; + nkeys += 1; buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); + + if self.shard_number > 0 { + nkeys += 1; + buf.put_slice(b"shard_number\0"); + buf.put_i32(4); + buf.put_u32(self.shard_number); + } + + buf[buf_ptr] = nkeys; } // Deserialize PageserverFeedback message @@ -125,9 +143,8 @@ impl PageserverFeedback { } b"shard_number" => { let len = buf.get_i32(); - // TODO: this will be implemented in the next update, - // for now, we just skip the value. - buf.advance(len as usize); + assert_eq!(len, 4); + rf.shard_number = buf.get_u32(); } _ => { let len = buf.get_i32(); @@ -200,10 +217,7 @@ mod tests { rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys - if let Some(first) = data.first_mut() { - *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; - } - + data[0] += 1; data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 703a6dfd52..a3aee45b58 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -110,6 +110,49 @@ impl OnceCell { } } + /// Returns a guard to an existing initialized value, or returns an unique initialization + /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`. + pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + // It looks like OnceCell::get_or_init could be implemented using this method instead of + // duplication. However, that makes the future be !Send due to possibly holding on to the + // MutexGuard over an await point. + loop { + let sem = { + let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); + } + + let permit = InitPermit(sem); + return Err(permit); + } + } + /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used /// to complete initializing the inner value. 
///
@@ -481,4 +524,39 @@ mod tests {
         assert_eq!("t1", *cell.get().unwrap());
     }
+
+    #[tokio::test(start_paused = true)]
+    async fn detached_init_smoke() {
+        let target = OnceCell::default();
+
+        let Err(permit) = target.get_or_init_detached().await else {
+            unreachable!("it is not initialized")
+        };
+
+        tokio::time::timeout(
+            std::time::Duration::from_secs(3600 * 24 * 7 * 365),
+            target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
+        )
+        .await
+        .expect_err("should timeout since we are already holding the permit");
+
+        target.set(42, permit);
+
+        let (_answer, permit) = {
+            let mut guard = target
+                .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
+                .await
+                .unwrap();
+
+            assert_eq!(*guard, 42);
+
+            guard.take_and_deinit()
+        };
+
+        assert!(target.get().is_none());
+
+        target.set(11, permit);
+
+        assert_eq!(*target.get().unwrap(), 11);
+    }
 }
diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs
new file mode 100644
index 0000000000..be2dcc00f5
--- /dev/null
+++ b/libs/utils/src/zstd.rs
@@ -0,0 +1,78 @@
+use std::io::SeekFrom;
+
+use anyhow::{Context, Result};
+use async_compression::{
+    tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
+    zstd::CParameter,
+    Level,
+};
+use camino::Utf8Path;
+use nix::NixPath;
+use tokio::{
+    fs::{File, OpenOptions},
+    io::AsyncBufRead,
+    io::AsyncSeekExt,
+    io::AsyncWriteExt,
+};
+use tokio_tar::{Archive, Builder, HeaderMode};
+use walkdir::WalkDir;
+
+/// Creates a Zstandard tarball.
+pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
+    let file = OpenOptions::new()
+        .create(true)
+        .truncate(true)
+        .read(true)
+        .write(true)
+        .open(&tarball)
+        .await
+        .with_context(|| format!("tempfile creation {tarball}"))?;
+
+    let mut paths = Vec::new();
+    for entry in WalkDir::new(path) {
+        let entry = entry?;
+        let metadata = entry.metadata().expect("error getting dir entry metadata");
+        // Also allow directories so that we also get empty directories
+        if !(metadata.is_file() || metadata.is_dir()) {
+            continue;
+        }
+        let path = entry.into_path();
+        paths.push(path);
+    }
+    // Do a sort to get a more consistent listing
+    paths.sort_unstable();
+    let zstd = ZstdEncoder::with_quality_and_params(
+        file,
+        Level::Default,
+        &[CParameter::enable_long_distance_matching(true)],
+    );
+    let mut builder = Builder::new(zstd);
+    // Use reproducible header mode
+    builder.mode(HeaderMode::Deterministic);
+    for p in paths {
+        let rel_path = p.strip_prefix(path)?;
+        if rel_path.is_empty() {
+            // The top directory should not be compressed,
+            // the tar crate doesn't like that
+            continue;
+        }
+        builder.append_path_with_name(&p, rel_path).await?;
+    }
+    let mut zstd = builder.into_inner().await?;
+    zstd.shutdown().await?;
+    let mut compressed = zstd.into_inner();
+    let compressed_len = compressed.metadata().await?.len();
+    compressed.seek(SeekFrom::Start(0)).await?;
+    Ok((compressed, compressed_len))
+}
+
+/// Extracts a Zstandard tarball.
+pub async fn extract_zst_tarball( + path: &Utf8Path, + tarball: impl AsyncBufRead + Unpin, +) -> Result<()> { + let decoder = Box::pin(ZstdDecoder::new(tarball)); + let mut archive = Archive::new(decoder); + archive.unpack(path).await?; + Ok(()) +} diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index f5ed6ebb97..906302e46e 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { } } -extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) { +extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).process_safekeeper_feedback(&mut (*wp)) + (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk)); } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 734967da3f..14cc3e05a2 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -142,7 +142,7 @@ pub trait ApiImpl { todo!() } - fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) { + fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) { todo!() } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 5adeaffe1a..f304294591 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -89,6 +89,9 @@ enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true + [dev-dependencies] criterion.workspace = true hex-literal.workspace = true diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 732eb951c9..ab55d2b0a3 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -169,7 +169,7 @@ impl Client { self.request(Method::GET, uri, ()).await } - async fn request( + async fn request_noerror( &self, method: Method, uri: U, @@ -181,7 +181,16 @@ impl Client { } else { req }; - let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?; + req.json(&body).send().await.map_err(Error::ReceiveBody) + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; Ok(response) } @@ -240,13 +249,26 @@ impl Client { Ok(()) } - pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> { - let uri = format!( + pub async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/secondary/download", self.mgmt_api_endpoint, tenant_id - ); - self.request(Method::POST, &uri, ()).await?; - Ok(()) + )) + .expect("Cannot build URL"); + + if let Some(wait) = wait { + path.query_pairs_mut() + .append_pair("wait_ms", &format!("{}", wait.as_millis())); + } + + let response = self.request(Method::POST, path, ()).await?; + let status = response.status(); + let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?; + Ok((status, progress)) } pub async fn location_config( @@ -257,7 +279,7 @@ impl Client { lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { - tenant_id: tenant_shard_id, + 
tenant_id: Some(tenant_shard_id), config, }; @@ -416,4 +438,77 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn get_utilization(&self) -> Result { + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); + self.get(uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_map_info( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_evict( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::DELETE, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn layer_ondemand_download( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::GET, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } } diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs new file mode 100644 index 0000000000..197e782dca --- /dev/null +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -0,0 +1,272 @@ +use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; + +use pageserver_client::mgmt_api; +use rand::seq::SliceRandom; +use tracing::{debug, info}; +use utils::id::{TenantTimelineId, TimelineId}; + +use tokio::{ + sync::{mpsc, OwnedSemaphorePermit}, + task::JoinSet, +}; + +use std::{ + num::NonZeroUsize, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant}, +}; + +/// Evict & on-demand download random layers. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long)] + runtime: Option, + #[clap(long, default_value = "1")] + tasks_per_target: NonZeroUsize, + #[clap(long, default_value = "1")] + concurrency_per_target: NonZeroUsize, + /// Probability for sending `latest=true` in the request (uniform distribution). + #[clap(long)] + limit_to_first_n_targets: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. 
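A hypothetical caller of the reworked `tenant_secondary_download` (the helper name and shard id here are assumptions, not part of this diff): poll with a bounded `wait_ms`, treating 202 as "job started, keep waiting" and anything else as terminal. The benchmark's `Args` struct continues below.

```rust
use std::time::Duration;

use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use reqwest::StatusCode;

async fn wait_for_secondary_download(
    client: &mgmt_api::Client,
    shard: TenantShardId,
) -> Result<(), mgmt_api::Error> {
    loop {
        // Ask the pageserver to wait up to 5s before answering.
        let (status, progress) = client
            .tenant_secondary_download(shard, Some(Duration::from_millis(5_000)))
            .await?;
        println!(
            "downloaded {}/{} layers, {}/{} bytes",
            progress.layers_downloaded,
            progress.layers_total,
            progress.bytes_downloaded,
            progress.bytes_total,
        );
        if status != StatusCode::ACCEPTED {
            // 200: the download job ran to completion.
            return Ok(());
        }
    }
}
```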
+ #[clap(long)] + set_io_engine: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let task = rt.spawn(main_impl(args)); + rt.block_on(task).unwrap().unwrap(); + Ok(()) +} + +#[derive(Debug, Default)] +struct LiveStats { + evictions: AtomicU64, + downloads: AtomicU64, + timeline_restarts: AtomicU64, +} + +impl LiveStats { + fn eviction_done(&self) { + self.evictions.fetch_add(1, Ordering::Relaxed); + } + fn download_done(&self) { + self.downloads.fetch_add(1, Ordering::Relaxed); + } + fn timeline_restart_done(&self) { + self.timeline_restarts.fetch_add(1, Ordering::Relaxed); + } +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let mut tasks = JoinSet::new(); + + let live_stats = Arc::new(LiveStats::default()); + tasks.spawn({ + let live_stats = Arc::clone(&live_stats); + async move { + let mut last_at = Instant::now(); + loop { + tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; + let now = Instant::now(); + let delta: Duration = now - last_at; + last_at = now; + + let LiveStats { + evictions, + downloads, + timeline_restarts, + } = &*live_stats; + let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); + info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}"); + } + } + }); + + for tl in timelines { + for _ in 0..args.tasks_per_target.get() { + tasks.spawn(timeline_actor( + args, + Arc::clone(&mgmt_api_client), + tl, + Arc::clone(&live_stats), + )); + } + } + + while let Some(res) = tasks.join_next().await { + res.unwrap(); + } + Ok(()) +} + +async fn timeline_actor( + args: &'static Args, + mgmt_api_client: Arc, + timeline: TenantTimelineId, + live_stats: Arc, +) { + // TODO: support sharding + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + + struct Timeline { + joinset: JoinSet<()>, + layers: Vec>, + concurrency: Arc, + } + loop { + debug!("restarting timeline"); + let layer_map_info = mgmt_api_client + .layer_map_info(tenant_shard_id, timeline.timeline_id) + .await + .unwrap(); + let concurrency = Arc::new(tokio::sync::Semaphore::new( + args.concurrency_per_target.get(), + )); + + let mut joinset = JoinSet::new(); + let layers = layer_map_info + .historic_layers + .into_iter() + .map(|historic_layer| { + let (tx, rx) = mpsc::channel(1); + joinset.spawn(layer_actor( + tenant_shard_id, + timeline.timeline_id, + historic_layer, + rx, + Arc::clone(&mgmt_api_client), + Arc::clone(&live_stats), + )); + tx + }) + .collect::>(); + + let mut timeline = Timeline { + joinset, + layers, + concurrency, + }; + + live_stats.timeline_restart_done(); + + loop { + assert!(!timeline.joinset.is_empty()); + if let Some(res) = 
timeline.joinset.try_join_next() { + debug!(?res, "a layer actor exited, should not happen"); + timeline.joinset.shutdown().await; + break; + } + + let mut permit = Some( + Arc::clone(&timeline.concurrency) + .acquire_owned() + .await + .unwrap(), + ); + + loop { + let layer_tx = { + let mut rng = rand::thread_rng(); + timeline.layers.choose_mut(&mut rng).expect("no layers") + }; + match layer_tx.try_send(permit.take().unwrap()) { + Ok(_) => break, + Err(e) => match e { + mpsc::error::TrySendError::Full(back) => { + // TODO: retrying introduces bias away from slow downloaders + permit.replace(back); + } + mpsc::error::TrySendError::Closed(_) => panic!(), + }, + } + } + } + } +} + +async fn layer_actor( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + mut layer: HistoricLayerInfo, + mut rx: mpsc::Receiver, + mgmt_api_client: Arc, + live_stats: Arc, +) { + #[derive(Clone, Copy)] + enum Action { + Evict, + OnDemandDownload, + } + + while let Some(_permit) = rx.recv().await { + let action = if layer.is_remote() { + Action::OnDemandDownload + } else { + Action::Evict + }; + + let did_it = match action { + Action::Evict => { + let did_it = mgmt_api_client + .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.eviction_done(); + did_it + } + Action::OnDemandDownload => { + let did_it = mgmt_api_client + .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.download_done(); + did_it + } + }; + if !did_it { + debug!("local copy of layer map appears out of sync, re-downloading"); + return; + } + debug!("did it"); + layer.set_remote(match action { + Action::Evict => true, + Action::OnDemandDownload => false, + }); + } +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 5d688ed2d1..743102d853 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -16,6 +16,7 @@ mod util { mod cmd { pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; + pub(super) mod ondemand_download_churn; pub(super) mod trigger_initial_size_calculation; } @@ -25,6 +26,7 @@ enum Args { Basebackup(cmd::basebackup::Args), GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), + OndemandDownloadChurn(cmd::ondemand_download_churn::Args), } fn main() { @@ -43,6 +45,7 @@ fn main() { Args::TriggerInitialSizeCalculation(args) => { cmd::trigger_initial_size_calculation::main(args) } + Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), } .unwrap() } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2f172bd384..1fd7c775d5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,3 +1,5 @@ +#![recursion_limit = "300"] + //! Main entry point for the Page Server executable. 
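One detail of the churn loop above worth calling out before the `pageserver.rs` hunk continues: the permit hand-off relies on `try_send` over a bounded channel of size 1, so a busy layer actor hands the permit back to the caller instead of queueing it (the bias the TODO above mentions). A self-contained sketch of that pattern:

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(1);
    tx.try_send(1).unwrap(); // the single slot is now taken
    match tx.try_send(2) {
        // Full returns the value, so the caller can pick another worker.
        Err(mpsc::error::TrySendError::Full(returned)) => assert_eq!(returned, 2),
        other => panic!("unexpected: {other:?}"),
    }
    assert_eq!(rx.recv().await, Some(1));
}
```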
use std::env::{var, VarError}; @@ -118,6 +120,9 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); + // after setting up logging, log the effective IO engine choice + info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + let tenants_path = conf.tenants_path(); if !tenants_path.exists() { utils::crashsafe::create_dir_all(conf.tenants_path()) @@ -312,6 +317,7 @@ fn start_pageserver( let http_listener = tcp_listener::bind(http_addr)?; let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; @@ -544,7 +550,7 @@ fn start_pageserver( let router_state = Arc::new( http::routes::State::new( conf, - tenant_manager, + tenant_manager.clone(), http_auth.clone(), remote_storage.clone(), broker_client.clone(), @@ -688,6 +694,7 @@ fn start_pageserver( let bg_remote_storage = remote_storage.clone(); let bg_deletion_queue = deletion_queue.clone(); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + &tenant_manager, bg_remote_storage.map(|_| bg_deletion_queue), 0, )); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 70aa30d24e..89fda958ff 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,18 +30,17 @@ use utils::{ logging::LogFormat, }; -use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; -use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; -use crate::virtual_file; +use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{tenant::config::TenantConf, virtual_file}; use crate::{ IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, - TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, + TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -287,16 +286,23 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); // use dedicated enum for builder to better indicate the intention // and avoid possible confusion with nested options +#[derive(Clone, Default)] pub enum BuilderValue { Set(T), + #[default] NotSet, } -impl BuilderValue { - pub fn ok_or(self, err: E) -> Result { +impl BuilderValue { + pub fn ok_or(&self, field_name: &'static str, default: BuilderValue) -> anyhow::Result { match self { - Self::Set(v) => Ok(v), - Self::NotSet => Err(err), + Self::Set(v) => Ok(v.clone()), + Self::NotSet => match default { + BuilderValue::Set(v) => Ok(v.clone()), + BuilderValue::NotSet => { + anyhow::bail!("missing config value {field_name:?}") + } + }, } } } @@ -322,6 +328,7 @@ pub(crate) struct NodeMetadata { } // needed to simplify config construction +#[derive(Default)] struct PageServerConfigBuilder { listen_pg_addr: BuilderValue, @@ -388,8 +395,9 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, } -impl Default for PageServerConfigBuilder { - fn default() -> Self { +impl PageServerConfigBuilder { + #[inline(always)] + fn default_values() -> Self { use self::BuilderValue::*; use defaults::*; Self { @@ -636,122 +644,95 @@ impl PageServerConfigBuilder { } pub fn build(self) -> anyhow::Result { - let concurrent_tenant_warmup = self - .concurrent_tenant_warmup - 
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?; - let concurrent_tenant_size_logical_size_queries = self - .concurrent_tenant_size_logical_size_queries - .ok_or(anyhow!( - "missing concurrent_tenant_size_logical_size_queries" - ))?; - Ok(PageServerConf { - listen_pg_addr: self - .listen_pg_addr - .ok_or(anyhow!("missing listen_pg_addr"))?, - listen_http_addr: self - .listen_http_addr - .ok_or(anyhow!("missing listen_http_addr"))?, - availability_zone: self - .availability_zone - .ok_or(anyhow!("missing availability_zone"))?, - wait_lsn_timeout: self - .wait_lsn_timeout - .ok_or(anyhow!("missing wait_lsn_timeout"))?, - superuser: self.superuser.ok_or(anyhow!("missing superuser"))?, - page_cache_size: self - .page_cache_size - .ok_or(anyhow!("missing page_cache_size"))?, - max_file_descriptors: self - .max_file_descriptors - .ok_or(anyhow!("missing max_file_descriptors"))?, - workdir: self.workdir.ok_or(anyhow!("missing workdir"))?, - pg_distrib_dir: self - .pg_distrib_dir - .ok_or(anyhow!("missing pg_distrib_dir"))?, - http_auth_type: self - .http_auth_type - .ok_or(anyhow!("missing http_auth_type"))?, - pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?, - auth_validation_public_key_path: self - .auth_validation_public_key_path - .ok_or(anyhow!("missing auth_validation_public_key_path"))?, - remote_storage_config: self - .remote_storage_config - .ok_or(anyhow!("missing remote_storage_config"))?, - id: self.id.ok_or(anyhow!("missing id"))?, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - broker_endpoint: self - .broker_endpoint - .ok_or(anyhow!("No broker endpoints provided"))?, - broker_keepalive_interval: self - .broker_keepalive_interval - .ok_or(anyhow!("No broker keepalive interval provided"))?, - log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, - concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - metric_collection_interval: self - .metric_collection_interval - .ok_or(anyhow!("missing metric_collection_interval"))?, - cached_metric_collection_interval: self - .cached_metric_collection_interval - .ok_or(anyhow!("missing cached_metric_collection_interval"))?, - metric_collection_endpoint: self - .metric_collection_endpoint - .ok_or(anyhow!("missing metric_collection_endpoint"))?, - synthetic_size_calculation_interval: self - .synthetic_size_calculation_interval - .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, - disk_usage_based_eviction: self - .disk_usage_based_eviction - .ok_or(anyhow!("missing disk_usage_based_eviction"))?, - test_remote_failures: self - .test_remote_failures - .ok_or(anyhow!("missing test_remote_failuers"))?, - ondemand_download_behavior_treat_error_as_warn: self - .ondemand_download_behavior_treat_error_as_warn - .ok_or(anyhow!( - "missing ondemand_download_behavior_treat_error_as_warn" - ))?, - background_task_maximum_delay: self - .background_task_maximum_delay - .ok_or(anyhow!("missing background_task_maximum_delay"))?, - control_plane_api: self - .control_plane_api - .ok_or(anyhow!("missing control_plane_api"))?, - control_plane_api_token: self - .control_plane_api_token - .ok_or(anyhow!("missing control_plane_api_token"))?, - control_plane_emergency_mode: self - 
.control_plane_emergency_mode - .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self - .heatmap_upload_concurrency - .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, - secondary_download_concurrency: self - .secondary_download_concurrency - .ok_or(anyhow!("missing secondary_download_concurrency"))?, - ingest_batch_size: self - .ingest_batch_size - .ok_or(anyhow!("missing ingest_batch_size"))?, - virtual_file_io_engine: self - .virtual_file_io_engine - .ok_or(anyhow!("missing virtual_file_io_engine"))?, - get_vectored_impl: self - .get_vectored_impl - .ok_or(anyhow!("missing get_vectored_impl"))?, - max_vectored_read_bytes: self - .max_vectored_read_bytes - .ok_or(anyhow!("missing max_vectored_read_bytes"))?, - validate_vectored_get: self - .validate_vectored_get - .ok_or(anyhow!("missing validate_vectored_get"))?, - }) + let default = Self::default_values(); + + macro_rules! conf { + (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => { + PageServerConf { + $( + $field: self.$field.ok_or(stringify!($field), default.$field)?, + )* + $( + $custom_field: $custom_value, + )* + } + }; + } + + Ok(conf!( + USING DEFAULT + { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + superuser, + page_cache_size, + max_file_descriptors, + workdir, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config, + id, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + cached_metric_collection_interval, + metric_collection_endpoint, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + get_vectored_impl, + max_vectored_read_bytes, + validate_vectored_get, + } + CUSTOM LOGIC + { + // TenantConf is handled separately + default_tenant_conf: TenantConf::default(), + concurrent_tenant_warmup: ConfigurableSemaphore::new({ + self + .concurrent_tenant_warmup + .ok_or("concurrent_tenant_warmpup", + default.concurrent_tenant_warmup)? + }), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + self + .concurrent_tenant_size_logical_size_queries + .ok_or("concurrent_tenant_size_logical_size_queries", + default.concurrent_tenant_size_logical_size_queries.clone())? + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + self + .concurrent_tenant_size_logical_size_queries + .ok_or("eviction_task_immitated_concurrent_logical_size_queries", + default.concurrent_tenant_size_logical_size_queries.clone())?, + ), + virtual_file_io_engine: match self.virtual_file_io_engine { + BuilderValue::Set(v) => v, + BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. 
+ eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine + } + }, + }, + } + )) } } @@ -831,18 +812,7 @@ impl PageServerConf { .join(timeline_id.to_string()) } - pub fn timeline_uninit_mark_file_path( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - ) -> Utf8PathBuf { - path_with_suffix_extension( - self.timeline_path(&tenant_shard_id, &timeline_id), - TIMELINE_UNINIT_MARK_SUFFIX, - ) - } - - pub fn timeline_delete_mark_file_path( + pub(crate) fn timeline_delete_mark_file_path( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -853,7 +823,10 @@ impl PageServerConf { ) } - pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + pub(crate) fn tenant_deleted_mark_file_path( + &self, + tenant_shard_id: &TenantShardId, + ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_DELETED_MARKER_FILE_NAME) } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 6a070e2135..0771229845 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -567,9 +567,9 @@ paths: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/location_config: + /v1/tenant/{tenant_shard_id}/location_config: parameters: - - name: tenant_id + - name: tenant_shard_id in: path required: true schema: @@ -965,12 +965,28 @@ paths: required: true schema: type: string + - name: wait_ms + description: If set, we will wait this long for download to complete, and if it isn't complete then return 202 + in: query + required: false + schema: + type: integer post: description: | If the location is in secondary mode, download latest heatmap and layers responses: "200": description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" + "202": + description: Download has started but not yet finished + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" "500": description: Generic operation error content: @@ -1367,10 +1383,11 @@ components: TenantLocationConfigRequest: type: object required: - - tenant_id + - mode properties: tenant_id: type: string + description: Not used, scheduled for removal. mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] @@ -1622,6 +1639,37 @@ components: Lower is better score for how good this pageserver would be for the next tenant. The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. 
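Summing up the `config.rs` rewrite a few hunks above (the new `SecondaryProgress` schema follows): the `conf!` macro collapses a long run of hand-written `ok_or(anyhow!(...))` chains into a field list, with the new two-argument `ok_or` falling back to the compiled-in default. A standalone model of that fallback, mirroring the logic in this diff:

```rust
use anyhow::bail;

#[derive(Clone)]
enum BuilderValue<T> {
    Set(T),
    NotSet,
}

impl<T: Clone> BuilderValue<T> {
    // Prefer the explicitly set value, fall back to the default,
    // and error out only when both are unset.
    fn ok_or(&self, field: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
        match self {
            Self::Set(v) => Ok(v.clone()),
            Self::NotSet => match default {
                BuilderValue::Set(v) => Ok(v),
                BuilderValue::NotSet => bail!("missing config value {field:?}"),
            },
        }
    }
}

fn main() -> anyhow::Result<()> {
    let unset: BuilderValue<u16> = BuilderValue::NotSet;
    assert_eq!(unset.ok_or("listen_port", BuilderValue::Set(6400))?, 6400);
    assert!(unset.ok_or("listen_port", BuilderValue::NotSet).is_err());
    Ok(())
}
```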
+ SecondaryProgress: + type: object + required: + - heatmap_mtime + - layers_downloaded + - layers_total + - bytes_downloaded + - bytes_total + properties: + heatmap_mtime: + type: string + format: date-time + description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format) + layers_downloaded: + type: integer + format: int64 + description: How many layers from the latest layer heatmap are present on disk + bytes_downloaded: + type: integer + format: int64 + description: How many bytes of layer content from the latest layer heatmap are present on disk + layers_total: + type: integer + format: int64 + description: How many layers were in the latest layer heatmap + bytes_total: + type: integer + format: int64 + description: How many bytes of layer content were in the latest layer heatmap + + Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bb8b1bb7e5..229f3ae98f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -535,9 +535,9 @@ async fn timeline_create_handler( ) } Err( - tenant::CreateTimelineError::Conflict - | tenant::CreateTimelineError::AlreadyCreating, - ) => json_response(StatusCode::CONFLICT, ()), + e @ tenant::CreateTimelineError::Conflict + | e @ tenant::CreateTimelineError::AlreadyCreating, + ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())), Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(format!("{err:#}")), @@ -885,14 +885,16 @@ async fn tenant_detach_handler( let state = get_state(&request); let conf = state.conf; - mgr::detach_tenant( - conf, - tenant_shard_id, - detach_ignored.unwrap_or(false), - &state.deletion_queue_client, - ) - .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) - .await?; + state + .tenant_manager + .detach_tenant( + conf, + tenant_shard_id, + detach_ignored.unwrap_or(false), + &state.deletion_queue_client, + ) + .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) + .await?; json_response(StatusCode::OK, ()) } @@ -1403,7 +1405,9 @@ async fn update_tenant_config_handler( TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let state = get_state(&request); - mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id) + state + .tenant_manager + .set_new_tenant_config(tenant_conf, tenant_id) .instrument(info_span!("tenant_config", %tenant_id)) .await?; @@ -1428,13 +1432,14 @@ async fn put_tenant_location_config_handler( // The `Detached` state is special, it doesn't upsert a tenant, it removes // its local disk content and drops it from memory. 
if let LocationConfigMode::Detached = request_data.config.mode { - if let Err(e) = - mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) - .instrument(info_span!("tenant_detach", - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug() - )) - .await + if let Err(e) = state + .tenant_manager + .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) + .instrument(info_span!("tenant_detach", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + )) + .await { match e { TenantStateError::SlotError(TenantSlotError::NotFound(_)) => { @@ -1648,8 +1653,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = - mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; + let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?; let gc_result = wait_task_done .await .context("wait for gc task") @@ -1987,13 +1991,42 @@ async fn secondary_download_handler( ) -> Result, ApiError> { let state = get_state(&request); let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; - state - .secondary_controller - .download_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis); - json_response(StatusCode::OK, ()) + // We don't need this to issue the download request, but: + // - it enables us to cleanly return 404 if we get a request for an absent shard + // - we will use this to provide status feedback in the response + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let timeout = wait.unwrap_or(Duration::MAX); + + let status = match tokio::time::timeout( + timeout, + state.secondary_controller.download_tenant(tenant_shard_id), + ) + .await + { + // Download job ran to completion. + Ok(Ok(())) => StatusCode::OK, + // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered + // okay. We could get an error here in the unlikely edge case that the tenant + // was detached between our check above and executing the download job. + Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + // A timeout is not an error: we have started the download, we're just not done + // yet. The caller will get a response body indicating status. + Err(_) => StatusCode::ACCEPTED, + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(status, progress) } async fn handler_404(_: Request) -> Result, ApiError> { @@ -2053,6 +2086,10 @@ async fn get_utilization( r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { + fail::fail_point!("get-utilization-http-handler", |_| { + Err(ApiError::ResourceUnavailable("failpoint".into())) + }); + // this probably could be completely public, but lets make that change later. 
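For the failpoint just added (and the `api-503`/`api-500` ones below), a test can trip it through the standard `fail` crate API, or remotely through the `/v1/failpoints` endpoint that the in-handler check below special-cases. A sketch of the in-process variant, assuming the failpoints feature is enabled; `check_permission` continues below.

```rust
fn exercise_utilization_failpoint() {
    // While active, GET /v1/utilization answers 503 (ResourceUnavailable).
    fail::cfg("get-utilization-http-handler", "return").unwrap();

    // ... issue the request and assert on the 503 here ...

    // Disable the failpoint; the handler behaves normally again.
    fail::cfg("get-utilization-http-handler", "off").unwrap();
}
```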
check_permission(&r, None)?; @@ -2108,6 +2145,16 @@ where R: std::future::Future, ApiError>> + Send + 'static, H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, { + if request.uri() != &"/v1/failpoints".parse::().unwrap() { + fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable( + "failpoint".into() + ))); + + fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError( + anyhow::anyhow!("failpoint") + ))); + } + // Spawn a new task to handle the request, to protect the handler from unexpected // async cancellations. Most pageserver functions are not async cancellation safe. // We arm a drop-guard, so that if Hyper drops the Future, we signal the task diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d66df36b3a..343dec2ca1 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,28 +2,20 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! -use std::io::SeekFrom; use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; -use async_compression::tokio::bufread::ZstdDecoder; -use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; -use nix::NixPath; -use tokio::fs::{File, OpenOptions}; -use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; -use tokio_tar::Builder; -use tokio_tar::HeaderMode; use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; -use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; @@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result reader.read_to_end(&mut buf).await?; Ok(Bytes::from(buf)) } - -pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> { - let file = OpenOptions::new() - .create(true) - .truncate(true) - .read(true) - .write(true) - .open(&tmp_path) - .await - .with_context(|| format!("tempfile creation {tmp_path}"))?; - - let mut paths = Vec::new(); - for entry in WalkDir::new(pgdata_path) { - let entry = entry?; - let metadata = entry.metadata().expect("error getting dir entry metadata"); - // Also allow directories so that we also get empty directories - if !(metadata.is_file() || metadata.is_dir()) { - continue; - } - let path = entry.into_path(); - paths.push(path); - } - // Do a sort to get a more consistent listing - paths.sort_unstable(); - let zstd = ZstdEncoder::with_quality_and_params( - file, - Level::Default, - &[CParameter::enable_long_distance_matching(true)], - ); - let mut builder = Builder::new(zstd); - // Use reproducible header mode - builder.mode(HeaderMode::Deterministic); - for path in paths { - let rel_path = path.strip_prefix(pgdata_path)?; - if rel_path.is_empty() { - // The top directory should not be compressed, - // the tar crate doesn't like that - continue; - } - builder.append_path_with_name(&path, rel_path).await?; - } - let mut zstd = builder.into_inner().await?; - zstd.shutdown().await?; - let mut compressed = zstd.into_inner(); - let compressed_len = compressed.metadata().await?.len(); - const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; - if compressed_len > INITDB_TAR_ZST_WARN_LIMIT { - 
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."); - } - compressed.seek(SeekFrom::Start(0)).await?; - Ok((compressed, compressed_len)) -} - -pub async fn extract_tar_zst( - pgdata_path: &Utf8Path, - tar_zst: impl AsyncBufRead + Unpin, -) -> Result<()> { - let tar = Box::pin(ZstdDecoder::new(tar_zst)); - let mut archive = Archive::new(tar); - archive.unpack(pgdata_path).await?; - Ok(()) -} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 02a690d4e1..f947a75f61 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -31,6 +31,7 @@ pub mod walredo; use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; +use tenant::mgr::TenantManager; use tracing::info; /// Current storage format version @@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; #[tracing::instrument(skip_all, fields(%exit_code))] -pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) { +pub async fn shutdown_pageserver( + tenant_manager: &TenantManager, + deletion_queue: Option, + exit_code: i32, +) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option, exit_cod // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. timed( - tenant::mgr::shutdown_all_tenants(), + tenant_manager.shutdown(), "shutdown all tenants", Duration::from_secs(5), ) @@ -114,27 +119,27 @@ pub const METADATA_FILE_NAME: &str = "metadata"; /// Per-tenant configuration file. /// Full path: `tenants//config`. -pub const TENANT_CONFIG_NAME: &str = "config"; +pub(crate) const TENANT_CONFIG_NAME: &str = "config"; /// Per-tenant configuration file. /// Full path: `tenants//config`. -pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; +pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; /// Per-tenant copy of their remote heatmap, downloaded into the local /// tenant path while in secondary mode. -pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; +pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. -pub const TEMP_FILE_SUFFIX: &str = "___temp"; +pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; /// A marker file to mark that a timeline directory was not fully initialized. /// If a timeline directory with this marker is encountered at pageserver startup, /// the timeline directory and the marker file are both removed. /// Full path: `tenants//timelines/___uninit`. -pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; -pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; +pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; /// A marker file to prevent pageserver from loading a certain tenant on restart. /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding @@ -161,11 +166,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { // from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once // from the name. 
-pub fn is_uninit_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) } -pub fn is_delete_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 814b3e1f96..075bb76a1b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -167,7 +167,7 @@ impl GetVectoredLatency { pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", - "Time spent in get_vectored", + "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -2465,7 +2465,8 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use metrics::UIntGauge; + use metrics::{register_int_counter, UIntGauge}; + use once_cell::sync::Lazy; pub struct Collector { descs: Vec, @@ -2473,15 +2474,13 @@ pub mod tokio_epoll_uring { systems_destroyed: UIntGauge, } - const NMETRICS: usize = 2; - impl metrics::core::Collector for Collector { fn desc(&self) -> Vec<&metrics::core::Desc> { self.descs.iter().collect() } fn collect(&self) -> Vec { - let mut mfs = Vec::with_capacity(NMETRICS); + let mut mfs = Vec::with_capacity(Self::NMETRICS); let tokio_epoll_uring::metrics::Metrics { systems_created, systems_destroyed, @@ -2495,6 +2494,8 @@ pub mod tokio_epoll_uring { } impl Collector { + const NMETRICS: usize = 2; + #[allow(clippy::new_without_default)] pub fn new() -> Self { let mut descs = Vec::new(); @@ -2528,6 +2529,22 @@ pub mod tokio_epoll_uring { } } } + + pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count", + "Number of times where thread_local_system creation spanned multiple executor threads", + ) + .unwrap() + }); + + pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count", + "Number of times thread_local_system creation failed and was retried after back-off.", + ) + .unwrap() + }); } pub(crate) mod tenant_throttling { @@ -2656,6 +2673,8 @@ pub fn preinitialize_metrics() { &WALRECEIVER_BROKER_UPDATES, &WALRECEIVER_CANDIDATES_ADDED, &WALRECEIVER_CANDIDATES_REMOVED, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 275a72c0b0..69e163effa 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -50,8 +50,6 @@ use once_cell::sync::Lazy; use utils::id::TimelineId; -use crate::shutdown_pageserver; - // // There are four runtimes: // @@ -453,7 +451,7 @@ async fn task_finish( } if shutdown_process { - shutdown_pageserver(None, 1).await; + std::process::exit(1); } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f0996328c0..7a6ddd6a4e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -43,6 +43,8 @@ use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; use utils::timeout::timeout_cancellable; use utils::timeout::TimeoutCancellableError; +use utils::zstd::create_zst_tarball; +use utils::zstd::extract_zst_tarball; use self::config::AttachedLocationConfig; use 
self::config::AttachmentMode; @@ -55,8 +57,8 @@ use self::mgr::GetTenantError; use self::mgr::TenantsMap; use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; @@ -565,9 +567,8 @@ impl Tenant { // avoiding holding it across awaits let mut timelines_accessor = self.timelines.lock().unwrap(); match timelines_accessor.entry(timeline_id) { + // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { - // The uninit mark file acts as a lock that prevents another task from - // initializing the timeline at the same time. unreachable!( "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" ); @@ -1064,8 +1065,7 @@ impl Tenant { let entry_path = entry.path(); let purge = if crate::is_temporary(entry_path) - // TODO: uninit_mark isn't needed any more, since uninitialized timelines are already - // covered by the check that the timeline must exist in remote storage. + // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) || is_uninit_mark(entry_path) || crate::is_delete_mark(entry_path) { @@ -1298,11 +1298,6 @@ impl Tenant { /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0)) /// and the timeline will fail to load at a restart. /// - /// That's why we add an uninit mark file, and wrap it together witht the Timeline - /// in-memory object into UninitializedTimeline. - /// Once the caller is done setting up the timeline, they should call - /// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark. - /// /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the /// minimum amount of keys required to get a writable timeline. /// (Without it, `put` might fail due to `repartition` failing.) @@ -1318,7 +1313,9 @@ impl Tenant { "Cannot create empty timelines on inactive tenant" ); - let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?; + // Protect against concurrent attempts to use this TimelineId + let create_guard = self.create_timeline_create_guard(new_timeline_id)?; + let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() @@ -1333,7 +1330,7 @@ impl Tenant { self.prepare_new_timeline( new_timeline_id, &new_metadata, - timeline_uninit_mark, + create_guard, initdb_lsn, None, ) @@ -1421,9 +1418,8 @@ impl Tenant { .map_err(|_| CreateTimelineError::ShuttingDown)?; // Get exclusive access to the timeline ID: this ensures that it does not already exist, - // and that no other creation attempts will be allowed in while we are working. The - // uninit_mark is a guard. - let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) { + // and that no other creation attempts will be allowed in while we are working. 
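The `TimelineCreateGuard` that replaces the on-disk uninit mark is a purely in-memory exclusion: creating a timeline registers its ID in `Tenant::timelines_creating`, and dropping the guard releases it. A minimal sketch of that guard shape, with hypothetical stand-ins for the real types in `timeline::uninit`:

```rust
use std::collections::HashSet;
use std::sync::Mutex;

type TimelineId = u128; // stand-in for the real ID type

pub struct CreateGuard<'a> {
    creating: &'a Mutex<HashSet<TimelineId>>,
    id: TimelineId,
}

impl<'a> CreateGuard<'a> {
    /// Fails if another creation of the same ID is already in flight.
    pub fn new(
        creating: &'a Mutex<HashSet<TimelineId>>,
        id: TimelineId,
    ) -> Result<Self, &'static str> {
        if creating.lock().unwrap().insert(id) {
            Ok(Self { creating, id })
        } else {
            Err("timeline creation already in progress")
        }
    }
}

impl Drop for CreateGuard<'_> {
    fn drop(&mut self) {
        // Success or failure, dropping the guard re-admits the ID; there is
        // no mark file left behind to clean up after a crash.
        self.creating.lock().unwrap().remove(&self.id);
    }
}
```

Unlike the uninit mark file, the exclusion is lost on process crash, which is fine here: per the TODO above, startup now trusts remote storage, not mark files, to decide which timelines really exist.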
+ let create_guard = match self.create_timeline_create_guard(new_timeline_id) { Ok(m) => m, Err(TimelineExclusionError::AlreadyCreating) => { // Creation is in progress, we cannot create it again, and we cannot @@ -1466,6 +1462,8 @@ impl Tenant { } }; + pausable_failpoint!("timeline-creation-after-uninit"); + let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = self @@ -1513,7 +1511,7 @@ impl Tenant { &ancestor_timeline, new_timeline_id, ancestor_start_lsn, - uninit_mark, + create_guard, ctx, ) .await? @@ -1523,7 +1521,7 @@ impl Tenant { new_timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await? @@ -2870,9 +2868,9 @@ impl Tenant { start_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap(); + let create_guard = self.create_timeline_create_guard(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx) + .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) @@ -2886,10 +2884,10 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx) + self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx) .await } @@ -2898,7 +2896,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, _ctx: &RequestContext, ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; @@ -2982,7 +2980,7 @@ impl Tenant { .prepare_new_timeline( dst_id, &metadata, - timeline_uninit_mark, + timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), ) @@ -3014,12 +3012,12 @@ impl Tenant { load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { - let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap(); + let create_guard = self.create_timeline_create_guard(timeline_id).unwrap(); self.bootstrap_timeline( timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await @@ -3046,8 +3044,13 @@ impl Tenant { } } - let (pgdata_zstd, tar_zst_size) = - import_datadir::create_tar_zst(pgdata_path, &temp_path).await?; + let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?; + const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; + if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT { + warn!( + "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}." 
+ ); + } pausable_failpoint!("before-initdb-upload"); @@ -3083,7 +3086,7 @@ impl Tenant { timeline_id: TimelineId, pg_version: u32, load_existing_initdb: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, ) -> anyhow::Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` @@ -3095,13 +3098,14 @@ impl Tenant { TEMP_FILE_SUFFIX, ); - // an uninit mark was placed before, nothing else can access this timeline files - // current initdb was not run yet, so remove whatever was left from the previous runs + // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees + // we won't race with other creations or existent timelines with the same path. if pgdata_path.exists() { fs::remove_dir_all(&pgdata_path).with_context(|| { format!("Failed to remove already existing initdb directory: {pgdata_path}") })?; } + // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it scopeguard::defer! { if let Err(e) = fs::remove_dir_all(&pgdata_path) { @@ -3146,7 +3150,7 @@ impl Tenant { let buf_read = BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst); - import_datadir::extract_tar_zst(&pgdata_path, buf_read) + extract_zst_tarball(&pgdata_path, buf_read) .await .context("extract initdb tar")?; } else { @@ -3178,7 +3182,7 @@ impl Tenant { .prepare_new_timeline( timeline_id, &new_metadata, - timeline_uninit_mark, + timeline_create_guard, pgdata_lsn, None, ) @@ -3250,13 +3254,12 @@ impl Tenant { /// /// An empty layer map is initialized, and new data and WAL can be imported starting /// at 'disk_consistent_lsn'. After any initial data has been imported, call - /// `finish_creation` to insert the Timeline into the timelines map and to remove the - /// uninit mark file. + /// `finish_creation` to insert the Timeline into the timelines map. async fn prepare_new_timeline<'a>( &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - uninit_mark: TimelineUninitMark<'a>, + create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result { @@ -3279,9 +3282,12 @@ impl Tenant { timeline_struct.init_empty_layer_map(start_lsn); - if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await { + if let Err(e) = self + .create_timeline_files(&create_guard.timeline_path) + .await + { error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); - cleanup_timeline_directory(uninit_mark); + cleanup_timeline_directory(create_guard); return Err(e); } @@ -3292,41 +3298,31 @@ impl Tenant { Ok(UninitializedTimeline::new( self, new_timeline_id, - Some((timeline_struct, uninit_mark)), + Some((timeline_struct, create_guard)), )) } async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> { crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?; - fail::fail_point!("after-timeline-uninit-mark-creation", |_| { - anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); + fail::fail_point!("after-timeline-dir-creation", |_| { + anyhow::bail!("failpoint after-timeline-dir-creation"); }); Ok(()) } - /// Attempts to create an uninit mark file for the timeline initialization. - /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists. 
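The renamed `after-timeline-dir-creation` failpoint follows the same `fail` crate shape used throughout this diff (compare the `api-503`/`api-500` and `shard-split-*` points): a closure that is inert until a test configures an action for that name. Roughly, and assuming the crate's stock `fail::cfg` API:

```rust
fn create_timeline_files(timeline_path: &std::path::Path) -> anyhow::Result<()> {
    std::fs::create_dir_all(timeline_path)?;
    // No-op in production builds; when the failpoint is armed with a
    // "return" action, the closure runs and this function errors out here.
    fail::fail_point!("after-timeline-dir-creation", |_| {
        anyhow::bail!("failpoint after-timeline-dir-creation")
    });
    Ok(())
}

#[cfg(test)]
fn arm_failpoint() {
    // Simulates a crash between directory creation and timeline registration.
    fail::cfg("after-timeline-dir-creation", "return").unwrap();
}
```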
- /// - /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init. - fn create_timeline_uninit_mark( + /// Get a guard that provides exclusive access to the timeline directory, preventing + /// concurrent attempts to create the same timeline. + fn create_timeline_create_guard( &self, timeline_id: TimelineId, - ) -> Result { + ) -> Result { let tenant_shard_id = self.tenant_shard_id; - let uninit_mark_path = self - .conf - .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id); let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); - let uninit_mark = TimelineUninitMark::new( - self, - timeline_id, - uninit_mark_path.clone(), - timeline_path.clone(), - )?; + let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?; // At this stage, we have got exclusive access to in-memory state for this timeline ID // for creation. @@ -3342,23 +3338,7 @@ impl Tenant { ))); } - // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees - // that during process runtime, colliding creations will be caught in-memory without getting - // as far as failing to write a file. - fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&uninit_mark_path) - .context("Failed to create uninit mark file") - .and_then(|_| { - crashsafe::fsync_file_and_parent(&uninit_mark_path) - .context("Failed to fsync uninit mark file") - }) - .with_context(|| { - format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}") - })?; - - Ok(uninit_mark) + Ok(create_guard) } /// Gathers inputs from all of the timelines to produce a sizing model input. @@ -5099,15 +5079,15 @@ mod tests { } #[tokio::test] - async fn test_uninit_mark_crash() -> anyhow::Result<()> { - let name = "test_uninit_mark_crash"; + async fn test_create_guard_crash() -> anyhow::Result<()> { + let name = "test_create_guard_crash"; let harness = TenantHarness::create(name)?; { let (tenant, ctx) = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - // Keeps uninit mark in place + // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() @@ -5135,11 +5115,6 @@ mod tests { .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) .exists()); - assert!(!harness - .conf - .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID) - .exists()); - Ok(()) } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index ffb7206b1e..cab60c3111 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -296,6 +296,7 @@ impl DeleteTenantFlow { remote_storage: Option, tenants: &'static std::sync::RwLock, tenant: Arc, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { span::debug_assert_current_span_has_tenant_id(); @@ -303,7 +304,9 @@ impl DeleteTenantFlow { let mut guard = Self::prepare(&tenant).await?; - if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { + if let Err(e) = + Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await + { tenant.set_broken(format!("{e:#}")).await; return Err(e); } @@ -322,6 +325,7 @@ impl DeleteTenantFlow { conf: &'static PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant: &Tenant, + cancel: &CancellationToken, ) -> Result<(), 
DeleteTenantError> { guard.mark_in_progress()?; @@ -335,15 +339,9 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark( - conf, - remote_storage, - &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), - ) - .await - .context("remote_mark")? + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) + .await + .context("remote_mark")? } fail::fail_point!("tenant-delete-before-create-local-mark", |_| { @@ -546,8 +544,7 @@ impl DeleteTenantFlow { conf, remote_storage.as_ref(), &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), + &task_mgr::shutdown_token(), ) .await?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 26fcce1f38..f456ca3006 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -102,7 +102,7 @@ pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. /// New tenants can be added using [`tenant_map_acquire_slot`]. Open(BTreeMap), - /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`]. /// Existing tenants are still accessible, but no new tenants can be created. ShuttingDown(BTreeMap), } @@ -261,6 +261,12 @@ pub struct TenantManager { // See https://github.com/neondatabase/neon/issues/5796 tenants: &'static std::sync::RwLock, resources: TenantSharedResources, + + // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token. + // This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime), + // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or + // when the tenant detaches. + cancel: CancellationToken, } fn emergency_generations( @@ -620,13 +626,14 @@ pub async fn init_tenant_mgr( conf, tenants: &TENANTS, resources, + cancel: CancellationToken::new(), }) } /// Wrapper for Tenant::spawn that checks invariants before running, and inserts /// a broken tenant in the map if Tenant::spawn fails. #[allow(clippy::too_many_arguments)] -pub(crate) fn tenant_spawn( +fn tenant_spawn( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, tenant_path: &Utf8Path, @@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn( Ok(tenant) } -/// -/// Shut down all tenants. This runs as part of pageserver shutdown. -/// -/// NB: We leave the tenants in the map, so that they remain accessible through -/// the management API until we shut it down. If we removed the shut-down tenants -/// from the tenants map, the management API would return 404 for these tenants, -/// because TenantsMap::get() now returns `None`. -/// That could be easily misinterpreted by control plane, the consumer of the -/// management API. For example, it could attach the tenant on a different pageserver. -/// We would then be in split-brain once this pageserver restarts. 
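Threading `cancel: &CancellationToken` into `run_inner` fixes a subtle problem at both call sites: a freshly minted `CancellationToken::new()` has no owner that could ever cancel it, so the remote operations behind it were effectively uncancellable during shutdown. The behavioral difference, assuming `tokio_util`'s token:

```rust
use tokio_util::sync::CancellationToken;

async fn remote_op_with_cancel(cancel: &CancellationToken) -> Result<(), &'static str> {
    tokio::select! {
        // With a throwaway CancellationToken::new() this arm can never fire,
        // so shutdown has to wait out the whole remote operation.
        _ = cancel.cancelled() => Err("cancelled"),
        _ = do_remote_io() => Ok(()),
    }
}

async fn do_remote_io() { /* elided */ }
```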
-#[instrument(skip_all)] -pub(crate) async fn shutdown_all_tenants() { - shutdown_all_tenants0(&TENANTS).await -} - async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { let mut join_set = JoinSet::new(); @@ -833,40 +825,6 @@ pub(crate) enum SetNewTenantConfigError { Other(anyhow::Error), } -pub(crate) async fn set_new_tenant_config( - conf: &'static PageServerConf, - new_tenant_conf: TenantConfOpt, - tenant_id: TenantId, -) -> Result<(), SetNewTenantConfigError> { - // Legacy API: does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_shard_id, true)?; - - if !tenant.tenant_shard_id().shard_count.is_unsharded() { - // Note that we use ShardParameters::default below. - return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( - "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" - ))); - } - - // This is a legacy API that only operates on attached tenants: the preferred - // API to use is the location_config/ endpoint, which lets the caller provide - // the full LocationConf. - let location_conf = LocationConf::attached_single( - new_tenant_conf.clone(), - tenant.generation, - &ShardParameters::default(), - ); - - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) - .await - .map_err(SetNewTenantConfigError::Persist)?; - tenant.set_new_tenant_config(new_tenant_conf); - Ok(()) -} - #[derive(thiserror::Error, Debug)] pub(crate) enum UpsertLocationError { #[error("Bad config request: {0}")] @@ -1428,6 +1386,7 @@ impl TenantManager { self.resources.remote_storage.clone(), &TENANTS, tenant, + &self.cancel, ) .await; @@ -1443,6 +1402,35 @@ impl TenantManager { new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, + ) -> anyhow::Result> { + let r = self + .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx) + .await; + if r.is_err() { + // Shard splitting might have left the original shard in a partially shut down state (it + // stops the shard's remote timeline client). Reset it to ensure we leave things in + // a working state. + if self.get(tenant_shard_id).is_some() { + tracing::warn!("Resetting {tenant_shard_id} after shard split failure"); + if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { + // Log this error because our return value will still be the original error, not this one. This is + // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional + // (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or + // setting it broken probably won't help either. 
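In the shard-split change just below, `shard_split` becomes a thin wrapper: it runs `do_shard_split`, and on failure attempts a best-effort `reset_tenant` while still surfacing the original error. Reduced to its control-flow shape:

```rust
async fn with_reset_on_failure<T, E: std::fmt::Display>(
    op: impl std::future::Future<Output = Result<T, E>>,
    reset: impl std::future::Future<Output = Result<(), E>>,
) -> Result<T, E> {
    let r = op.await;
    if r.is_err() {
        // Best effort: log the cleanup failure, but return the original
        // error so the caller learns why the operation itself failed.
        if let Err(e) = reset.await {
            tracing::error!("reset after failed operation also failed: {e}");
        }
    }
    r
}
```

The failpoints sprinkled through `do_shard_split` (`shard-split-pre-prepare`, `-post-prepare`, `-pre-hardlink`, and so on) exist precisely so tests can exercise this wrapper at each phase boundary.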
+ tracing::error!("Failed to reset {tenant_shard_id}: {e}"); + } + } + } + + r + } + + pub(crate) async fn do_shard_split( + &self, + tenant_shard_id: TenantShardId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, ) -> anyhow::Result> { let tenant = get_tenant(tenant_shard_id, true)?; @@ -1477,6 +1465,10 @@ impl TenantManager { .join(",") ); + fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + let parent_shard_identity = tenant.shard_identity; let parent_tenant_conf = tenant.get_tenant_conf(); let parent_generation = tenant.generation; @@ -1490,6 +1482,10 @@ impl TenantManager { return Err(e); } + fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + self.resources.deletion_queue_client.flush_advisory(); // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant @@ -1511,11 +1507,16 @@ impl TenantManager { anyhow::bail!("Detached parent shard in the middle of split!") } }; - + fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); // Optimization: hardlink layers from the parent into the children, so that they don't have to // re-download & duplicate the data referenced in their initial IndexPart self.shard_split_hardlink(parent, child_shards.clone()) .await?; + fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); // Take a snapshot of where the parent's WAL ingest had got to: we will wait for // child shards to reach this point. @@ -1555,6 +1556,10 @@ impl TenantManager { .await?; } + fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + // Phase 4: wait for child chards WAL ingest to catch up to target LSN for child_shard_id in &child_shards { let child_shard_id = *child_shard_id; @@ -1587,6 +1592,10 @@ impl TenantManager { timeline.timeline_id, target_lsn ); + + fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( + "failpoint" + ))); if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { // Failure here might mean shutdown, in any case this part is an optimization // and we shouldn't hold up the split operation. @@ -1618,19 +1627,11 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - None, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); + self.spawn_background_purge(tmp_path); + + fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( + "failpoint" + ))); parent_slot_guard.drop_old_value()?; @@ -1763,6 +1764,151 @@ impl TenantManager { Ok(()) } + + /// + /// Shut down all tenants. This runs as part of pageserver shutdown. + /// + /// NB: We leave the tenants in the map, so that they remain accessible through + /// the management API until we shut it down. If we removed the shut-down tenants + /// from the tenants map, the management API would return 404 for these tenants, + /// because TenantsMap::get() now returns `None`. + /// That could be easily misinterpreted by control plane, the consumer of the + /// management API. For example, it could attach the tenant on a different pageserver. 
+ /// We would then be in split-brain once this pageserver restarts. + #[instrument(skip_all)] + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + shutdown_all_tenants0(self.tenants).await + } + + /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in + /// the background, and thereby avoid blocking any API requests on this deletion completing. + fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) { + // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. + // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. + let task_tenant_id = None; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::MgmtRequest, + task_tenant_id, + None, + "tenant_files_delete", + false, + async move { + fs::remove_dir_all(tmp_path.as_path()) + .await + .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) + }, + ); + } + + pub(crate) async fn detach_tenant( + &self, + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + detach_ignored: bool, + deletion_queue_client: &DeletionQueueClient, + ) -> Result<(), TenantStateError> { + let tmp_path = self + .detach_tenant0( + conf, + &TENANTS, + tenant_shard_id, + detach_ignored, + deletion_queue_client, + ) + .await?; + self.spawn_background_purge(tmp_path); + + Ok(()) + } + + async fn detach_tenant0( + &self, + conf: &'static PageServerConf, + tenants: &std::sync::RwLock, + tenant_shard_id: TenantShardId, + detach_ignored: bool, + deletion_queue_client: &DeletionQueueClient, + ) -> Result { + let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { + let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); + safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + }) + }; + + let removal_result = remove_tenant_from_memory( + tenants, + tenant_shard_id, + tenant_dir_rename_operation(tenant_shard_id), + ) + .await; + + // Flush pending deletions, so that they have a good chance of passing validation + // before this tenant is potentially re-attached elsewhere. + deletion_queue_client.flush_advisory(); + + // Ignored tenants are not present in memory and will bail the removal from memory operation. + // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. + if detach_ignored + && matches!( + removal_result, + Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) + ) + { + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); + if tenant_ignore_mark.exists() { + info!("Detaching an ignored tenant"); + let tmp_path = tenant_dir_rename_operation(tenant_shard_id) + .await + .with_context(|| { + format!("Ignored tenant {tenant_shard_id} local directory rename") + })?; + return Ok(tmp_path); + } + } + + removal_result + } + + pub(crate) async fn set_new_tenant_config( + &self, + new_tenant_conf: TenantConfOpt, + tenant_id: TenantId, + ) -> Result<(), SetNewTenantConfigError> { + // Legacy API: does not support sharding + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + info!("configuring tenant {tenant_id}"); + let tenant = get_tenant(tenant_shard_id, true)?; + + if !tenant.tenant_shard_id().shard_count.is_unsharded() { + // Note that we use ShardParameters::default below. 
+ return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( + "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" + ))); + } + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full LocationConf. + let location_conf = LocationConf::attached_single( + new_tenant_conf.clone(), + tenant.generation, + &ShardParameters::default(), + ); + + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf) + .await + .map_err(SetNewTenantConfigError::Persist)?; + tenant.set_new_tenant_config(new_tenant_conf); + Ok(()) + } } #[derive(Debug, thiserror::Error)] @@ -1964,87 +2110,6 @@ pub(crate) enum TenantStateError { Other(#[from] anyhow::Error), } -pub(crate) async fn detach_tenant( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result<(), TenantStateError> { - let tmp_path = detach_tenant0( - conf, - &TENANTS, - tenant_shard_id, - detach_ignored, - deletion_queue_client, - ) - .await?; - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. - let task_tenant_id = None; - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - Ok(()) -} - -async fn detach_tenant0( - conf: &'static PageServerConf, - tenants: &std::sync::RwLock, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result { - let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { - let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); - safe_rename_tenant_dir(&local_tenant_directory) - .await - .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename")) - }; - - let removal_result = remove_tenant_from_memory( - tenants, - tenant_shard_id, - tenant_dir_rename_operation(tenant_shard_id), - ) - .await; - - // Flush pending deletions, so that they have a good chance of passing validation - // before this tenant is potentially re-attached elsewhere. - deletion_queue_client.flush_advisory(); - - // Ignored tenants are not present in memory and will bail the removal from memory operation. - // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. 
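Worth calling out in `detach_tenant0` (and in the shard-split cleanup earlier): deletion is split into a fast synchronous rename that frees the canonical path immediately, plus a lazy background `remove_dir_all` via `spawn_background_purge`. A condensed sketch of the two phases, with hypothetical paths and plain `tokio::spawn` standing in for `task_mgr::spawn`:

```rust
use std::path::{Path, PathBuf};

async fn detach_dir(tenant_dir: &Path) -> std::io::Result<()> {
    // Phase 1: rename to a temporary name. This is quick, and the canonical
    // path is free for re-attachment as soon as it returns.
    let tmp: PathBuf = tenant_dir.with_extension("___temp");
    tokio::fs::rename(tenant_dir, &tmp).await?;

    // Phase 2: delete the renamed tree lazily, off the request path.
    tokio::spawn(async move {
        if let Err(e) = tokio::fs::remove_dir_all(&tmp).await {
            tracing::warn!("background purge of {tmp:?} failed: {e}");
        }
    });
    Ok(())
}
```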
- if detach_ignored - && matches!( - removal_result, - Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) - ) - { - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - info!("Detaching an ignored tenant"); - let tmp_path = tenant_dir_rename_operation(tenant_shard_id) - .await - .with_context(|| { - format!("Ignored tenant {tenant_shard_id} local directory rename") - })?; - return Ok(tmp_path); - } - } - - removal_result -} - pub(crate) async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, @@ -2665,7 +2730,7 @@ use { utils::http::error::ApiError, }; -pub(crate) async fn immediate_gc( +pub(crate) fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, @@ -2687,6 +2752,8 @@ pub(crate) async fn immediate_gc( // Run in task_mgr to avoid race with tenant_detach operation let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + // TODO: spawning is redundant now, need to hold the gate task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -2701,16 +2768,15 @@ pub(crate) async fn immediate_gc( #[allow(unused_mut)] let mut result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. #[cfg(feature = "testing")] { + // we need to synchronize with drop completion for python tests without polling for + // log messages if let Ok(result) = result.as_mut() { - // why not futures unordered? it seems it needs very much the same task structure - // but would only run on single task. 
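The `immediate_gc` change also moves the `manual_gc` span from a single inner `.instrument(...)` on `gc_iteration` to the entire spawned future, so the testing-only JoinSet that waits for layer drops logs under the same tenant/timeline fields. The minimal form of that pattern:

```rust
use tracing::{info_span, Instrument};

fn spawn_instrumented() {
    let tenant_id = "hypothetical-tenant";
    let span = info_span!("manual_gc", %tenant_id);
    tokio::spawn(
        async move {
            // Every event in this block inherits the span's fields,
            // not just the single await that used to be instrumented.
            tracing::info!("gc iteration starting");
        }
        .instrument(span),
    );
}
```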
let mut js = tokio::task::JoinSet::new(); for layer in std::mem::take(&mut result.doomed_layers) { js.spawn(layer.wait_drop()); @@ -2726,7 +2792,7 @@ pub(crate) async fn immediate_gc( if let Some(rtc) = rtc { // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care just exit fast about the shutdown error + // deletions; don't care about the shutdown error, just exit fast drop(rtc.wait_completion().await); } } @@ -2737,6 +2803,7 @@ pub(crate) async fn immediate_gc( } Ok(()) } + .instrument(span) ); // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6fff6e78e2..6ee8ad7155 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; use utils::id::TimelineId; @@ -73,55 +73,13 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); - let (mut destination_file, bytes_amount) = download_retry( - || async { - let destination_file = tokio::fs::File::create(&temp_file_path) - .await - .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) - .map_err(DownloadError::Other)?; - - let download = storage.download(&remote_path, cancel).await?; - - let mut destination_file = - tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); - - let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - - let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await; - - match bytes_amount { - Ok(bytes_amount) => { - let destination_file = destination_file.into_inner(); - Ok((destination_file, bytes_amount)) - } - Err(e) => { - if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { - on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); - } - - Err(e.into()) - } - } - }, + let bytes_amount = download_retry( + || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, &format!("download {remote_path:?}"), cancel, ) .await?; - // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: - // A file will not be closed immediately when it goes out of scope if there are any IO operations - // that have not yet completed. To ensure that a file is closed immediately when it is dropped, - // you should call flush before dropping it. - // - // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because - // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. - // But for additional safety lets check/wait for any pending operations. 
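After the refactor below, `download_layer_file` keeps only the retry policy and delegates one attempt's worth of work to `download_object`; because the closure recreates the temp file from scratch, every retry starts clean. A generic sketch of a retry combinator of that shape (the project's actual `download_retry` signature is assumed, not quoted):

```rust
use std::future::Future;

async fn retry<T, E, F, Fut>(mut op: F, attempts: usize) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    assert!(attempts > 0);
    let mut last_err = None;
    for _ in 0..attempts {
        match op().await {
            Ok(v) => return Ok(v),
            // The next call to `op()` rebuilds all per-attempt state
            // (temp file, stream), so partial writes never leak across tries.
            Err(e) => last_err = Some(e),
        }
    }
    Err(last_err.unwrap())
}
```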
- destination_file - .flush() - .await - .with_context(|| format!("flush source file at {temp_file_path}")) - .map_err(DownloadError::Other)?; - let expected = layer_metadata.file_size(); if expected != bytes_amount { return Err(DownloadError::Other(anyhow!( @@ -129,14 +87,6 @@ pub async fn download_layer_file<'a>( ))); } - // not using sync_data because it can lose file size update - destination_file - .sync_all() - .await - .with_context(|| format!("failed to fsync source file at {temp_file_path}")) - .map_err(DownloadError::Other)?; - drop(destination_file); - fail::fail_point!("remote-storage-download-pre-rename", |_| { Err(DownloadError::Other(anyhow!( "remote-storage-download-pre-rename failpoint triggered" @@ -169,6 +119,128 @@ pub async fn download_layer_file<'a>( Ok(bytes_amount) } +/// Download the object `src_path` in the remote `storage` to local path `dst_path`. +/// +/// If Ok() is returned, the download succeeded and the inode & data have been made durable. +/// (Note that the directory entry for the inode is not made durable.) +/// The file size in bytes is returned. +/// +/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. +/// The unlinking has _not_ been made durable. +async fn download_object<'a>( + storage: &'a GenericRemoteStorage, + src_path: &RemotePath, + dst_path: &Utf8PathBuf, + cancel: &CancellationToken, +) -> Result { + let res = match crate::virtual_file::io_engine::get() { + crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), + crate::virtual_file::io_engine::IoEngine::StdFs => { + async { + let destination_file = tokio::fs::File::create(dst_path) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let download = storage.download(src_path, cancel).await?; + + let mut buf_writer = + tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); + + let mut reader = tokio_util::io::StreamReader::new(download.download_stream); + + let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?; + buf_writer.flush().await?; + + let mut destination_file = buf_writer.into_inner(); + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because + // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. + // But for additional safety lets check/wait for any pending operations. 
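The `StdFs` branch keeps the long-standing durability choreography intact: copy through a `BufWriter`, explicitly `flush`, then `sync_all` on the file before the caller renames the temp file into place. Boiled down to the essential sequence:

```rust
use tokio::io::{AsyncWriteExt, BufWriter};

async fn write_durably(file: tokio::fs::File, data: &[u8]) -> std::io::Result<()> {
    let mut writer = BufWriter::new(file);
    writer.write_all(data).await?;
    // Flush pushes buffered bytes down into the File; dropping a BufWriter
    // without flushing can silently lose the tail of the data.
    writer.flush().await?;
    let file = writer.into_inner();
    // sync_all rather than sync_data, so the file-size update is durable too.
    file.sync_all().await?;
    Ok(())
}
```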
+ destination_file + .flush() + .await + .with_context(|| format!("flush source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + #[cfg(target_os = "linux")] + crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { + use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + async { + let destination_file = VirtualFile::create(dst_path) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let mut download = storage.download(src_path, cancel).await?; + + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. + // There's chunks_vectored() on the stream. + let (bytes_amount, destination_file) = async { + let size_tracking = size_tracking_writer::Writer::new(destination_file); + let mut buffered = owned_buffers_io::write::BufferedWriter::< + { super::BUFFER_SIZE }, + _, + >::new(size_tracking); + while let Some(res) = + futures::StreamExt::next(&mut download.download_stream).await + { + let chunk = match res { + Ok(chunk) => chunk, + Err(e) => return Err(e), + }; + buffered + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .await?; + } + let size_tracking = buffered.flush_and_into_inner().await?; + Ok(size_tracking.into_inner()) + } + .await?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + }; + + // in case the download failed, clean up + match res { + Ok(bytes_amount) => Ok(bytes_amount), + Err(e) => { + if let Err(e) = tokio::fs::remove_file(dst_path).await { + if e.kind() != std::io::ErrorKind::NotFound { + on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}")); + } + } + Err(e) + } + } +} + const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 14e88b836e..19f36c722e 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -95,7 +95,11 @@ pub(crate) struct SecondaryTenant { shard_identity: ShardIdentity, tenant_conf: std::sync::Mutex, + // Internal state used by the Downloader. detail: std::sync::Mutex, + + // Public state indicating overall progress of downloads relative to the last heatmap seen + pub(crate) progress: std::sync::Mutex, } impl SecondaryTenant { @@ -118,6 +122,8 @@ impl SecondaryTenant { tenant_conf: std::sync::Mutex::new(tenant_conf), detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + + progress: std::sync::Mutex::default(), }) } @@ -247,9 +253,12 @@ impl SecondaryTenant { } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, -/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, -/// where we want to immediately upload/download for a particular tenant. In normal operation -/// uploads & downloads are autonomous and not driven by this interface. +/// and heatmap uploads. 
This is not a hot data path: it's used for: +/// - Live migrations, where we want to ensure a migration destination has the freshest possible +/// content before trying to cut over. +/// - Tests, where we want to immediately upload/download for a particular tenant. +/// +/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface. pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, download_req_tx: tokio::sync::mpsc::Sender>, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index b679077358..a595096133 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -41,14 +41,16 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use rand::Rng; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, + id::TimelineId, }; use super::{ @@ -128,6 +130,7 @@ pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, last_download: Option, + last_etag: Option, next_download: Option, pub(super) timelines: HashMap, } @@ -138,11 +141,26 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat> { datetime.format("%d/%m/%Y %T") } +/// Information returned from download function when it detects the heatmap has changed +struct HeatMapModified { + etag: Etag, + last_modified: SystemTime, + bytes: Vec, +} + +enum HeatMapDownload { + // The heatmap's etag has changed: return the new etag, mtime and the body bytes + Modified(HeatMapModified), + // The heatmap's etag is unchanged + Unmodified, +} + impl SecondaryDetail { pub(super) fn new(config: SecondaryLocationConfig) -> Self { Self { config, last_download: None, + last_etag: None, next_download: None, timelines: HashMap::new(), } @@ -477,11 +495,31 @@ impl<'a> TenantDownloader<'a> { }; let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + + // We will use the etag from last successful download to make the download conditional on changes + let last_etag = self + .secondary_state + .detail + .lock() + .unwrap() + .last_etag + .clone(); + // Download the tenant's heatmap - let heatmap_bytes = tokio::select!( - bytes = self.download_heatmap() => {bytes?}, + let HeatMapModified { + last_modified: heatmap_mtime, + etag: heatmap_etag, + bytes: heatmap_bytes, + } = match tokio::select!( + bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) - ); + ) { + HeatMapDownload::Unmodified => { + tracing::info!("Heatmap unchanged since last successful download"); + return Ok(()); + } + HeatMapDownload::Modified(m) => m, + }; let heatmap = serde_json::from_slice::(&heatmap_bytes)?; @@ -498,6 +536,14 @@ impl<'a> TenantDownloader<'a> { tracing::debug!("Wrote local heatmap to {}", heatmap_path); + // Clean up any local layers that aren't in the heatmap. 
We do this first for all timelines, on the general + // principle that deletions should be done before writes wherever possible, and so that we can use this + // phase to initialize our SecondaryProgress. + { + *self.secondary_state.progress.lock().unwrap() = + self.prepare_timelines(&heatmap, heatmap_mtime).await?; + } + // Download the layers in the heatmap for timeline in heatmap.timelines { if self.secondary_state.cancel.is_cancelled() { @@ -515,30 +561,159 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Only update last_etag after a full successful download: this way will not skip + // the next download, even if the heatmap's actual etag is unchanged. + self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag); + Ok(()) } - async fn download_heatmap(&self) -> Result, UpdateError> { + /// Do any fast local cleanup that comes before the much slower process of downloading + /// layers from remote storage. In the process, initialize the SecondaryProgress object + /// that will later be updated incrementally as we download layers. + async fn prepare_timelines( + &self, + heatmap: &HeatMapTenant, + heatmap_mtime: SystemTime, + ) -> Result { + let heatmap_stats = heatmap.get_stats(); + // We will construct a progress object, and then populate its initial "downloaded" numbers + // while iterating through local layer state in [`Self::prepare_timelines`] + let mut progress = SecondaryProgress { + layers_total: heatmap_stats.layers, + bytes_total: heatmap_stats.bytes, + heatmap_mtime: Some(heatmap_mtime), + layers_downloaded: 0, + bytes_downloaded: 0, + }; + // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock + let mut delete_layers = Vec::new(); + let mut delete_timelines = Vec::new(); + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + for (timeline_id, timeline_state) in &mut detail.timelines { + let Some(heatmap_timeline_index) = heatmap + .timelines + .iter() + .position(|t| t.timeline_id == *timeline_id) + else { + // This timeline is no longer referenced in the heatmap: delete it locally + delete_timelines.push(*timeline_id); + continue; + }; + + let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); + + let layers_in_heatmap = heatmap_timeline + .layers + .iter() + .map(|l| &l.name) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| l.0) + .collect::>(); + + let mut layer_count = layers_on_disk.len(); + let mut layer_byte_count: u64 = timeline_state + .on_disk_layers + .values() + .map(|l| l.metadata.file_size()) + .sum(); + + // Remove on-disk layers that are no longer present in heatmap + for layer in layers_on_disk.difference(&layers_in_heatmap) { + layer_count -= 1; + layer_byte_count -= timeline_state + .on_disk_layers + .get(layer) + .unwrap() + .metadata + .file_size(); + + delete_layers.push((*timeline_id, (*layer).clone())); + } + + progress.bytes_downloaded += layer_byte_count; + progress.layers_downloaded += layer_count; + } + } + + // Execute accumulated deletions + for (timeline_id, layer_name) in delete_layers { + let timeline_path = self + .conf + .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); + let local_path = timeline_path.join(layer_name.to_string()); + tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing 
secondary layer")?; + + // Update in-memory housekeeping to reflect the absence of the deleted layer + let mut detail = self.secondary_state.detail.lock().unwrap(); + let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { + continue; + }; + timeline_state.on_disk_layers.remove(&layer_name); + } + + for timeline_id in delete_timelines { + let timeline_path = self + .conf + .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); + tracing::info!(timeline_id=%timeline_id, + "Timeline no longer in heatmap, removing from secondary location" + ); + tokio::fs::remove_dir_all(&timeline_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary timeline")?; + } + + Ok(progress) + } + + /// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object + /// still matches `prev_etag`. + async fn download_heatmap( + &self, + prev_etag: Option<&Etag>, + ) -> Result { debug_assert_current_span_has_tenant_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - // TODO: make download conditional on ETag having changed since last download + // TODO: pull up etag check into the request, to do a conditional GET rather than + // issuing a GET and then maybe ignoring the response body // (https://github.com/neondatabase/neon/issues/6199) tracing::debug!("Downloading heatmap for secondary tenant",); let heatmap_path = remote_heatmap_path(tenant_shard_id); let cancel = &self.secondary_state.cancel; - let heatmap_bytes = backoff::retry( + backoff::retry( || async { let download = self .remote_storage .download(&heatmap_path, cancel) .await .map_err(UpdateError::from)?; - let mut heatmap_bytes = Vec::new(); - let mut body = tokio_util::io::StreamReader::new(download.download_stream); - let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - Ok(heatmap_bytes) + + if Some(&download.etag) == prev_etag { + Ok(HeatMapDownload::Unmodified) + } else { + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; + SECONDARY_MODE.download_heatmap.inc(); + Ok(HeatMapDownload::Modified(HeatMapModified { + etag: download.etag, + last_modified: download.last_modified, + bytes: heatmap_bytes, + })) + } }, |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), FAILED_DOWNLOAD_WARN_THRESHOLD, @@ -548,11 +723,7 @@ impl<'a> TenantDownloader<'a> { ) .await .ok_or_else(|| UpdateError::Cancelled) - .and_then(|x| x)?; - - SECONDARY_MODE.download_heatmap.inc(); - - Ok(heatmap_bytes) + .and_then(|x| x) } async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { @@ -593,27 +764,6 @@ impl<'a> TenantDownloader<'a> { } }; - let layers_in_heatmap = timeline - .layers - .iter() - .map(|l| &l.name) - .collect::>(); - let layers_on_disk = timeline_state - .on_disk_layers - .iter() - .map(|l| l.0) - .collect::>(); - - // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { - let local_path = timeline_path.join(layer.to_string()); - tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found) - .maybe_fatal_err("Removing secondary layer")?; - } - // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. 
for layer in timeline.layers { @@ -662,6 +812,12 @@ impl<'a> TenantDownloader<'a> { } } + // Failpoint for simulating slow remote storage + failpoint_support::sleep_millis_async!( + "secondary-layer-download-sleep", + &self.secondary_state.cancel + ); + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally let downloaded_bytes = match download_layer_file( self.conf, @@ -701,6 +857,11 @@ impl<'a> TenantDownloader<'a> { tokio::fs::remove_file(&local_path) .await .or_else(fs_ext::ignore_not_found)?; + } else { + tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.bytes_downloaded += downloaded_bytes; + progress.layers_downloaded += 1; } SECONDARY_MODE.download_layer.inc(); diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 99aaaeb8c8..73cdf6c6d4 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -62,3 +62,25 @@ impl HeatMapTimeline { } } } + +pub(crate) struct HeatMapStats { + pub(crate) bytes: u64, + pub(crate) layers: usize, +} + +impl HeatMapTenant { + pub(crate) fn get_stats(&self) -> HeatMapStats { + let mut stats = HeatMapStats { + bytes: 0, + layers: 0, + }; + for timeline in &self.timelines { + for layer in &timeline.layers { + stats.layers += 1; + stats.bytes += layer.metadata.file_size; + } + } + + stats + } +} diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e0b1652d98..ad79b74d8b 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -183,7 +183,13 @@ pub(super) async fn gather_inputs( // new gc run, which we have no control over. however differently from `Timeline::gc` // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. - let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + // + // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather + // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // PITR interval, they will see synthetic size decrease, even if we are still storing data inside + // horizon_cutoff. + let mut next_gc_cutoff = gc_info.pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
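The `HeatMapTenant::get_stats` helper added a few hunks above aggregates with nested `for` loops; an equivalent iterator-style sketch, with stub types mirroring the fields used there, shows the same computation as a single fold:

```rust
struct HeatMapLayer {
    file_size: u64, // stand-in: the real layer reads metadata.file_size
}
struct HeatMapTimeline {
    layers: Vec<HeatMapLayer>,
}
struct HeatMapTenant {
    timelines: Vec<HeatMapTimeline>,
}
struct HeatMapStats {
    bytes: u64,
    layers: usize,
}

impl HeatMapTenant {
    fn get_stats(&self) -> HeatMapStats {
        self.timelines
            .iter()
            .flat_map(|t| t.layers.iter())
            .fold(HeatMapStats { bytes: 0, layers: 0 }, |mut stats, layer| {
                // Each heatmap layer contributes one count and its size.
                stats.layers += 1;
                stats.bytes += layer.file_size;
                stats
            })
    }
}
```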
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 299950cc21..5c3bab9868 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -20,6 +20,7 @@ use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::models::{ LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, }; +use std::borrow::Cow; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; @@ -427,7 +428,7 @@ impl LayerAccessStatFullDetails { } = self; pageserver_api::models::LayerAccessStatFullDetails { when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: task_kind.into(), // into static str, powered by strum_macros + task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros access_kind: *access_kind, } } @@ -525,7 +526,7 @@ impl LayerAccessStats { .collect(), task_kind_access_flag: task_kind_flag .iter() - .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros + .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros .collect(), first: first_access.as_ref().map(|a| a.as_api_model()), accesses_history: last_accesses.map(|m| m.as_api_model()), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 959065bc4c..f37d7e6449 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -710,10 +710,6 @@ impl LayerInner { // disable any scheduled but not yet running eviction deletions for this let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the evict_and_wait wait for the actual download to complete drop(self.status.send(Status::Downloaded)); @@ -722,7 +718,9 @@ impl LayerInner { .upgrade() .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let can_ever_evict = timeline.remote_client.as_ref().is_some(); @@ -731,9 +729,17 @@ impl LayerInner { let needs_download = self .needs_download() .await - .map_err(DownloadError::PreStatFailed)?; + .map_err(DownloadError::PreStatFailed); - let permit = if let Some(reason) = needs_download { + let needs_download = match needs_download { + Ok(reason) => reason, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + let (permit, downloaded) = if let Some(reason) = needs_download { if let NeedsDownload::NotFile(ft) = reason { return Err(DownloadError::NotFile(ft)); } @@ -744,36 +750,59 @@ impl LayerInner { self.wanted_evicted.store(false, Ordering::Release); if !can_ever_evict { + scopeguard::ScopeGuard::into_inner(init_cancelled); return Err(DownloadError::NoRemoteStorage); } if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; + let res = self.check_expected_download(ctx); + if let Err(e) = res { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } } if !allow_download { // this does look weird, but for LayerInner the "downloading" 
means also changing // internal once related state ... + scopeguard::ScopeGuard::into_inner(init_cancelled); return Err(DownloadError::DownloadRequired); } tracing::info!(%reason, "downloading on-demand"); - self.spawn_download_and_wait(timeline, permit).await? + let permit = self.spawn_download_and_wait(timeline, permit).await; + + let permit = match permit { + Ok(permit) => permit, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + (permit, true) } else { // the file is present locally, probably by a previous but cancelled call to // get_or_maybe_download. alternatively we might be running without remote storage. LAYER_IMPL_METRICS.inc_init_needed_no_download(); - permit + (permit, false) }; - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + scopeguard::ScopeGuard::into_inner(init_cancelled); + + if downloaded { + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + + if let Some(since_last_eviction) = since_last_eviction { + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } } let res = Arc::new(DownloadedLayer { @@ -795,8 +824,6 @@ impl LayerInner { ); } - scopeguard::ScopeGuard::into_inner(init_cancelled); - Ok((ResidentOrWantedEvicted::Resident(res), permit)) } .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) @@ -1457,7 +1484,7 @@ impl ResidentLayer { } /// Loads all keys stored in the layer. Returns key, lsn and value size. - #[tracing::instrument(skip_all, fields(layer=%self))] + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] pub(crate) async fn load_keys<'a>( &'a self, ctx: &RequestContext, @@ -1477,9 +1504,9 @@ impl ResidentLayer { // while it's being held. delta_layer::DeltaLayerInner::load_keys(d, ctx) .await - .context("Layer index is corrupted") + .with_context(|| format!("Layer index is corrupted for {self}")) } - Image(_) => anyhow::bail!("cannot load_keys on a image layer"), + Image(_) => anyhow::bail!(format!("cannot load_keys on an image layer {self}")), } } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 280773e9c3..f3f3d5e3ae 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -130,10 +130,10 @@ where self.inner.load().config.steady_rps() } - pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) { + pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> { let inner = self.inner.load_full(); // clones the `Inner` Arc if !inner.task_kinds.contains(ctx.task_kind()) { - return; + return None; }; let start = std::time::Instant::now(); let mut did_throttle = false; @@ -170,6 +170,9 @@ where }); } } + Some(wait_time) + } else { + None } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a733a3b1a7..2ab7301cce 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -634,6 +634,8 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// + /// This method enforces [`Self::timeline_get_throttle`] internally. + /// /// NOTE: It is considered an error to 'get' a key that doesn't exist.
The /// abstraction above this needs to store suitable metadata to track what /// data exists with what keys, in separate metadata entries. If a @@ -644,18 +646,27 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. + #[inline(always)] pub(crate) async fn get( &self, key: Key, lsn: Lsn, ctx: &RequestContext, + ) -> Result<Bytes, PageReconstructError> { + self.timeline_get_throttle.throttle(ctx, 1).await; + self.get_impl(key, lsn, ctx).await + } + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, ) -> Result<Bytes, PageReconstructError> { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } - self.timeline_get_throttle.throttle(ctx, 1).await; - // This check is debug-only because of the cost of hashing, and because it's a double-check: we // already checked the key against the shard_identity when looking up the Timeline from // page_service. @@ -752,10 +763,6 @@ impl Timeline { return Err(GetVectoredError::Oversized(key_count)); } - self.timeline_get_throttle - .throttle(ctx, key_count as usize) - .await; - for range in &keyspace.ranges { let mut key = range.start; while key != range.end { @@ -772,11 +779,18 @@ impl Timeline { self.conf.get_vectored_impl ); - let _timer = crate::metrics::GET_VECTORED_LATENCY + let start = crate::metrics::GET_VECTORED_LATENCY .for_task_kind(ctx.task_kind()) - .map(|t| t.start_timer()); + .map(|metric| (metric, Instant::now())); - match self.conf.get_vectored_impl { + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + .throttle(ctx, key_count as usize) + .await; + + let res = match self.conf.get_vectored_impl { + GetVectoredImpl::Sequential => { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } @@ -790,9 +804,33 @@ impl Timeline { vectored_res } + }; + + if let Some((metric, start)) = start { + let elapsed = start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + + if let Some(ex_throttled) = ex_throttled { + metric.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy<Mutex<RateLimit>> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } } + + res } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn get_vectored_sequential_impl( &self, keyspace: KeySpace, @@ -803,7 +841,7 @@ impl Timeline { for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get(key, lsn, ctx).await; + let block = self.get_impl(key, lsn, ctx).await; use PageReconstructError::*; match block { @@ -853,6 +891,7 @@ impl Timeline { Ok(results) } + /// Not subject to [`Self::timeline_get_throttle`].
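The bookkeeping above is the core of the change: time spent waiting in `timeline_get_throttle` is deducted before the latency is observed, so `GET_VECTORED_LATENCY` measures only actual work. The same pattern in isolation (a sketch with illustrative names, not code from this PR):

    use std::time::{Duration, Instant};

    fn latency_ex_throttled(start: Instant, throttled: Option<Duration>) -> Option<Duration> {
        let elapsed = start.elapsed();
        match throttled {
            // checked_sub returns None if the clock readings are inconsistent,
            // i.e. the throttle wait appears longer than the total elapsed time.
            Some(waited) => elapsed.checked_sub(waited),
            None => Some(elapsed),
        }
    }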
pub(super) async fn validate_get_vectored_impl( &self, vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>, @@ -2967,7 +3006,6 @@ impl Timeline { } trace!("waking up"); - let timer = self.metrics.flush_time_histo.start_timer(); let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { if self.cancel.is_cancelled() { @@ -2978,6 +3016,8 @@ impl Timeline { return; } + let timer = self.metrics.flush_time_histo.start_timer(); + let layer_to_flush = { let guard = self.layers.read().await; guard.layer_map().frozen_layers.front().cloned() @@ -2999,13 +3039,12 @@ impl Timeline { break err; } } + timer.stop_and_record(); }; // Notify any listeners that we're done let _ = self .layer_flush_done_tx .send_replace((flush_counter, result)); - - timer.stop_and_record(); } } @@ -3073,6 +3112,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result<(), FlushLayerError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -3744,8 +3784,11 @@ impl Timeline { // The timestamp is in the future. That sounds impossible, // but what it really means is that there hasn't been // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. debug!("future({})", lsn); - cutoff_horizon + self.get_last_record_lsn() } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 27d6fd9c28..e1034a9fe2 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -2,8 +2,8 @@ use std::{collections::hash_map::Entry, fs, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; -use tracing::{error, info, info_span, warn}; -use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn}; +use tracing::{error, info, info_span}; +use utils::{fs_ext, id::TimelineId, lsn::Lsn}; use crate::{context::RequestContext, import_datadir, tenant::Tenant}; @@ -11,22 +11,22 @@ use super::Timeline; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or -/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory -/// to be removed on next restart. +/// its local files are removed. If we crash while this struct exists, then the timeline's local +/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage. /// /// The caller is responsible for proper timeline data filling before the final init.
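To make the new creation protocol concrete, the intended guard lifecycle is roughly as follows (a sketch assuming a `tenant: &Tenant` and a pre-computed `timeline_path`; only `TimelineCreateGuard::new` and `cleanup_timeline_directory` appear in this diff):

    // Registers timeline_id in Tenant::timelines_creating; a concurrent creation
    // attempt for the same ID fails with a TimelineExclusionError.
    let guard = TimelineCreateGuard::new(tenant, timeline_id, timeline_path)?;
    // ...write local files, upload the index to remote storage...
    // On success, finish_creation() inserts the Timeline into the tenant's map and
    // drops the guard; on failure or drop, cleanup_timeline_directory(guard) removes
    // the partial directory and releases the TimelineId for later attempts.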
#[must_use] pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>, + raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>, + raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>, ) -> Self { Self { owning_tenant, @@ -35,8 +35,7 @@ impl<'t> UninitializedTimeline<'t> { } } - /// Finish timeline creation: insert it into the Tenant's timelines map and remove the - /// uninit mark file. + /// Finish timeline creation: insert it into the Tenant's timelines map /// /// This function launches the flush loop if not already done. /// @@ -72,16 +71,9 @@ impl<'t> UninitializedTimeline<'t> { Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion - let (new_timeline, uninit_mark) = + let (new_timeline, _create_guard) = self.raw_timeline.take().expect("already checked"); - // this is the mutual exclusion between different retries to create the timeline; - // this should be an assertion. - uninit_mark.remove_uninit_mark().with_context(|| { - format!( - "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}" - ) - })?; v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); @@ -120,8 +112,7 @@ impl<'t> UninitializedTimeline<'t> { .await .context("Failed to flush after basebackup import")?; - // All the data has been imported. Insert the Timeline into the tenant's timelines - // map and remove the uninit mark file. + // All the data has been imported.
Insert the Timeline into the tenant's timelines map let tl = self.finish_creation()?; tl.activate(broker_client, None, ctx); Ok(tl) @@ -143,37 +134,35 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, uninit_mark)) = self.raw_timeline.take() { + if let Some((_, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); error!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(uninit_mark); + cleanup_timeline_directory(create_guard); } } } -pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { - let timeline_path = &uninit_mark.timeline_path; +pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) { + let timeline_path = &create_guard.timeline_path; match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { Ok(()) => { - info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + info!("Timeline dir {timeline_path:?} removed successfully") } Err(e) => { error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") } } - drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists + // Having cleaned up, we can release this TimelineId in [`Tenant::timelines_creating`] to allow other + // timeline creation attempts under this TimelineId to proceed + drop(create_guard); } -/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, -/// or gets removed eventually. -/// -/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. +/// A guard for timeline creations in progress: as long as this object exists, the timeline ID +/// is kept in [`Tenant::timelines_creating`] to exclude concurrent attempts to create the same timeline. #[must_use] -pub(crate) struct TimelineUninitMark<'t> { +pub(crate) struct TimelineCreateGuard<'t> { owning_tenant: &'t Tenant, timeline_id: TimelineId, - uninit_mark_deleted: bool, - uninit_mark_path: Utf8PathBuf, pub(crate) timeline_path: Utf8PathBuf, } @@ -190,11 +179,10 @@ pub(crate) enum TimelineExclusionError { Other(#[from] anyhow::Error), } -impl<'t> TimelineUninitMark<'t> { +impl<'t> TimelineCreateGuard<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf, ) -> Result<Self, TimelineExclusionError> { // Lock order: this is the only place we take both locks.
During drop() we only @@ -214,56 +202,14 @@ impl<'t> TimelineUninitMark<'t> { Ok(Self { owning_tenant, timeline_id, - uninit_mark_deleted: false, - uninit_mark_path, timeline_path, }) } } - - fn remove_uninit_mark(mut self) -> anyhow::Result<()> { - if !self.uninit_mark_deleted { - self.delete_mark_file_if_present()?; - } - - Ok(()) - } - - fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> { - let uninit_mark_file = &self.uninit_mark_path; - let uninit_mark_parent = uninit_mark_file - .parent() - .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { - format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") - })?; - crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; - self.uninit_mark_deleted = true; - - Ok(()) - } } -impl Drop for TimelineUninitMark<'_> { +impl Drop for TimelineCreateGuard<'_> { fn drop(&mut self) { - if !self.uninit_mark_deleted { - if self.timeline_path.exists() { - error!( - "Uninit mark {} is not removed, timeline {} stays uninitialized", - self.uninit_mark_path, self.timeline_path - ) - } else { - // unblock later timeline creation attempts - warn!( - "Removing intermediate uninit mark file {}", - self.uninit_mark_path - ); - if let Err(e) = self.delete_mark_file_if_present() { - error!("Failed to remove the uninit mark file: {e}") - } - } - } - self.owning_tenant .timelines_creating .lock() diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 8297ca6563..d9f780cfd1 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -448,6 +448,7 @@ pub(super) async fn handle_walreceiver_connection( disk_consistent_lsn, remote_consistent_lsn, replytime: ts, + shard_number: timeline.tenant_shard_id.shard_number.0 as u32, }; debug!("neon_status_update {status_update:?}"); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 6d4774cf75..dee36d8afd 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -28,12 +28,31 @@ use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; +pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] +pub(crate) mod owned_buffers_io { + //! Abstractions for IO with owned buffers. + //! + //! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary + //! reason we need this abstraction. + //! + //! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`, + //! but for the time being we're proving out the primitives in the neon.git repo + //! for faster iteration. + + pub(crate) mod write; + pub(crate) mod util { + pub(crate) mod size_tracking_writer; + } +} + /// /// A virtual file descriptor. 
You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index e369d28711..7a27be2ca1 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -6,6 +6,11 @@ //! Initialize using [`init`]. //! //! Then use [`get`] and [`super::OpenOptions`]. +//! +//! + +#[cfg(target_os = "linux")] +pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::{IoBuf, Slice}; use tracing::Instrument; @@ -145,7 +150,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.read(file_guard, offset, buf).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -160,7 +165,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.fsync(file_guard).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -178,7 +183,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.fdatasync(file_guard).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -197,7 +202,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.statx(file_guard).await; ( resources, @@ -220,7 +225,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.write(file_guard, offset, buf).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -253,3 +258,82 @@ impl IoEngine { } } } + +pub enum FeatureTestResult { + PlatformPreferred(IoEngineKind), + Worse { + engine: IoEngineKind, + remark: String, + }, +} + +impl FeatureTestResult { + #[cfg(target_os = "linux")] + const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring; + #[cfg(not(target_os = "linux"))] + const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs; +} + +impl From<FeatureTestResult> for IoEngineKind { + fn from(val: FeatureTestResult) -> Self { + match val { + FeatureTestResult::PlatformPreferred(e) => e, + FeatureTestResult::Worse { engine, .. } => engine, + } + } +} + +/// Somewhat costly under the hood, do only once. +/// Panics if we can't set up the feature test.
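A plausible startup use of `feature_test` and `FeatureTestResult` (a sketch; the logging is illustrative, only the names exported by this diff are real):

    let engine: IoEngineKind = match io_engine_feature_test()? {
        FeatureTestResult::PlatformPreferred(kind) => kind,
        FeatureTestResult::Worse { engine, remark } => {
            // Keep running, but make the degradation visible.
            tracing::warn!(%remark, "falling back to a non-preferred io engine");
            engine
        }
    };

Where the remark is irrelevant, the `From` impl above allows a plain `IoEngineKind::from(result)` instead.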
+pub fn feature_test() -> anyhow::Result<FeatureTestResult> { + std::thread::spawn(|| { + + #[cfg(not(target_os = "linux"))] + { + Ok(FeatureTestResult::PlatformPreferred( + FeatureTestResult::PLATFORM_PREFERRED, + )) + } + #[cfg(target_os = "linux")] + { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + Ok(match rt.block_on(tokio_epoll_uring::System::launch()) { + Ok(_) => FeatureTestResult::PlatformPreferred({ + assert!(matches!( + IoEngineKind::TokioEpollUring, + FeatureTestResult::PLATFORM_PREFERRED + )); + FeatureTestResult::PLATFORM_PREFERRED + }), + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => { + let remark = match e.raw_os_error() { + Some(nix::libc::EPERM) => { + // fall back + "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled " + .to_string() + } + Some(nix::libc::EFAULT) => { + // fail feature test + anyhow::bail!( + "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory" + ); + } + Some(_) | None => { + // fall back + format!("creating tokio-epoll-uring fails with error: {e:#}") + } + }; + FeatureTestResult::Worse { + engine: IoEngineKind::StdFs, + remark, + } + } + }) + } + }) + .join() + .unwrap() +} diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs new file mode 100644 index 0000000000..6ea19d6b2d --- /dev/null +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -0,0 +1,194 @@ +//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific +//! handling in case the instance can't be launched. +//! +//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation +//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. +//! See for more details. + +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::Arc; + +use tokio_util::sync::CancellationToken; +use tracing::{error, info, info_span, warn, Instrument}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; + +use tokio_epoll_uring::{System, SystemHandle}; + +use crate::virtual_file::on_fatal_io_error; + +use crate::metrics::tokio_epoll_uring as metrics; + +#[derive(Clone)] +struct ThreadLocalState(Arc<ThreadLocalStateInner>); + +struct ThreadLocalStateInner { + cell: tokio::sync::OnceCell<SystemHandle>, + launch_attempts: AtomicU32, + /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`] + thread_local_state_id: u64, +} + +impl ThreadLocalState { + pub fn new() -> Self { + Self(Arc::new(ThreadLocalStateInner { + cell: tokio::sync::OnceCell::default(), + launch_attempts: AtomicU32::new(0), + thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed), + })) + } + + pub fn make_id_string(&self) -> String { + format!("{}", self.0.thread_local_state_id) + } } +} + +static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0); + +thread_local! { + static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new(); +} + +/// Panics if we cannot [`System::launch`].
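Call sites never observe the transient ENOMEM failures handled below; they simply await the handle and use it like a `SystemHandle` via the `Deref` impl at the bottom of this file, e.g. (mirroring the io_engine.rs call sites in this diff):

    let system = tokio_epoll_uring_ext::thread_local_system().await;
    let (resources, res) = system.read(file_guard, offset, buf).await;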
+pub async fn thread_local_system() -> Handle { + let fake_cancel = CancellationToken::new(); + loop { + let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone()); + let inner = &thread_local_state.0; + let get_or_init_res = inner + .cell + .get_or_try_init(|| async { + let attempt_no = inner + .launch_attempts + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no); + async { + // Rate-limit retries per thread-local. + // NB: doesn't yield to executor at attempt_no=0. + utils::backoff::exponential_backoff( + attempt_no, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + &fake_cancel, + ) + .await; + let res = System::launch() + // this might move us to another executor thread => loop outside the get_or_try_init, not inside it + .await; + match res { + Ok(system) => { + info!("successfully launched system"); + metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc(); + Ok(system) + } + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => { + warn!("not enough locked memory to launch tokio-epoll-uring, will retry"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc(); + Err(()) + } + // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere. + // This is equivalent to a fatal IO error. + Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => { + error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring"); + }, + } + } + .instrument(span) + .await + }) + .await; + if get_or_init_res.is_ok() { + return Handle(thread_local_state); + } + } +} + +fn emit_launch_failure_process_stats() { + // tokio-epoll-uring stats + // vmlck + rlimit + // number of threads + // rss / system memory usage generally + + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + info!(systems_created, systems_destroyed, "tokio-epoll-uring"); + + match procfs::process::Process::myself() { + Ok(myself) => { + match myself.limits() { + Ok(limits) => { + info!(?limits.max_locked_memory, "/proc/self/limits"); + } + Err(error) => { + info!(%error, "no limit stats due to error"); + } + } + + match myself.status() { + Ok(status) => { + let procfs::process::Status { + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + ..
+ } = status; + info!( + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + "/proc/self/status" + ); + } + Err(error) => { + info!(%error, "no status stats due to error"); + } + } + } + Err(error) => { + info!(%error, "no process stats due to error"); + } + }; +} + +#[derive(Clone)] +pub struct Handle(ThreadLocalState); + +impl std::ops::Deref for Handle { + type Target = SystemHandle; + + fn deref(&self) -> &Self::Target { + self.0 + .0 + .cell + .get() + .expect("must be already initialized when using this") + } +} diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index f75edb0bac..7f951270d1 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -98,7 +98,7 @@ impl OpenOptions { OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()), #[cfg(target_os = "linux")] OpenOptions::TokioEpollUring(x) => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; system.open(path, x).await.map_err(|e| match e { tokio_epoll_uring::Error::Op(e) => e, tokio_epoll_uring::Error::System(system) => { diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs new file mode 100644 index 0000000000..7505b7487e --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -0,0 +1,34 @@ +use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile}; +use tokio_epoll_uring::{BoundedBuf, IoBuf}; + +pub struct Writer { + dst: VirtualFile, + bytes_amount: u64, +} + +impl Writer { + pub fn new(dst: VirtualFile) -> Self { + Self { + dst, + bytes_amount: 0, + } + } + /// Returns the wrapped `VirtualFile` object as well as the number + /// of bytes that were written to it through this object. + pub fn into_inner(self) -> (u64, VirtualFile) { + (self.bytes_amount, self.dst) + } +} + +impl OwnedAsyncWriter for Writer { + #[inline(always)] + async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let (buf, res) = self.dst.write_all(buf).await; + let nwritten = res?; + self.bytes_amount += u64::try_from(nwritten).unwrap(); + Ok((nwritten, buf)) + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs new file mode 100644 index 0000000000..f1812d9b51 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -0,0 +1,206 @@ +use bytes::BytesMut; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + +/// A trait for doing owned-buffer write IO. +/// Think [`tokio::io::AsyncWrite`] but with owned buffers. +pub trait OwnedAsyncWriter { + async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)>; +} + +/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes +/// into `BUFFER_SIZE`-sized writes. +/// +/// # Passthrough Of Large Writes +/// +/// Buffered writes larger than the `BUFFER_SIZE` cause the internal +/// buffer to be flushed, even if it is not full yet. Then, the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
+/// +/// This pass-through is generally beneficial for throughput, but if +/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, +/// unlimited large writes may cause latency or fairness issues. +/// +/// In such cases, a different implementation that always buffers in memory +/// may be preferable. +pub struct BufferedWriter<const BUFFER_SIZE: usize, W> { + writer: W, + // invariant: always remains Some(buf) + // with buf.capacity() == BUFFER_SIZE except + // - while IO is ongoing => goes back to Some() once the IO completed successfully + // - after an IO error => stays `None` forever + // In these exceptional cases, it's `None`. + buf: Option<BytesMut>, +} + +impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W> +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + Self { + writer, + buf: Some(BytesMut::with_capacity(BUFFER_SIZE)), + } + } + + pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> { + self.flush().await?; + let Self { buf, writer } = self; + assert!(buf.is_some()); + Ok(writer) + } + + pub async fn write_buffered<B>(&mut self, chunk: Slice<B>) -> std::io::Result<()> + where + B: IoBuf + Send, + { + // avoid memcpy for the middle of the chunk + if chunk.len() >= BUFFER_SIZE { + self.flush().await?; + // do a big write, bypassing `buf` + assert_eq!( + self.buf + .as_ref() + .expect("must not use after an error") + .len(), + 0 + ); + let chunk_len = chunk.len(); + let (nwritten, chunk) = self.writer.write_all(chunk).await?; + assert_eq!(nwritten, chunk_len); + drop(chunk); + return Ok(()); + } + // in-memory copy the < BUFFER_SIZE tail of the chunk + assert!(chunk.len() < BUFFER_SIZE); + let mut chunk = &chunk[..]; + while !chunk.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = BUFFER_SIZE - buf.len(); + let have = chunk.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&chunk[..n]); + chunk = &chunk[n..]; + if buf.len() >= BUFFER_SIZE { + assert_eq!(buf.len(), BUFFER_SIZE); + self.flush().await?; + } + } + assert!(chunk.is_empty(), "by now we should have drained the chunk"); + Ok(()) + } + + async fn flush(&mut self) -> std::io::Result<()> { + let buf = self.buf.take().expect("must not use after an error"); + if buf.is_empty() { + self.buf = Some(buf); + return std::io::Result::Ok(()); + } + let buf_len = buf.len(); + let (nwritten, mut buf) = self.writer.write_all(buf).await?; + assert_eq!(nwritten, buf_len); + buf.clear(); + self.buf = Some(buf); + Ok(()) + } +} + +impl OwnedAsyncWriter for Vec<u8> { + async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + return Ok((0, Slice::into_inner(buf.slice_full()))); + } + let buf = buf.slice(0..nbytes); + self.extend_from_slice(&buf[..]); + Ok((buf.len(), Slice::into_inner(buf))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Default)] + struct RecorderWriter { + writes: Vec<Vec<u8>>, + } + impl OwnedAsyncWriter for RecorderWriter { + async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + self.writes.push(vec![]); + return Ok((0, Slice::into_inner(buf.slice_full()))); + } + let buf = buf.slice(0..nbytes); + self.writes.push(Vec::from(&buf[..])); + Ok((buf.len(), Slice::into_inner(buf))) + } + } + + macro_rules! 
write { + ($writer:ident, $data:literal) => {{ + $writer + .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .await?; + }}; + } + + #[tokio::test] + async fn test_buffered_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"a"); + write!(writer, b"b"); + write!(writer, b"c"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"abc"); + write!(writer, b"de"); + write!(writer, b""); + write!(writer, b"fghijk"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"a"); + write!(writer, b"bc"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] + ); + Ok(()) + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 63a2b30d09..9c7e8748d5 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -109,6 +109,8 @@ impl WalIngest { self.checkpoint_modified = true; } + failpoint_support::sleep_millis_async!("wal-ingest-record-sleep"); + match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 9ff0493352..d7987954d4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); -static void HandleSafekeeperResponse(WalProposer *wp); +static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); @@ -1405,7 +1405,6 @@ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; - XLogRecPtr newCommitLsn; bool readAnything = false; while (true) @@ -1425,6 +1424,8 @@ RecvAppendResponses(Safekeeper *sk) LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port); + readAnything = true; + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1438,35 +1439,28 @@ RecvAppendResponses(Safekeeper *sk) sk->appendResponse.term, wp->propTerm); } - readAnything = true; + HandleSafekeeperResponse(wp, sk); } if (!readAnything) return sk->state == SS_ACTIVE; - /* update commit_lsn */ - newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); - /* - * Send the new value to all safekeepers. 
- */ - if (newCommitLsn > wp->commitLsn) - { - wp->commitLsn = newCommitLsn; - BroadcastAppendRequest(wp); - } - - HandleSafekeeperResponse(wp); - return sk->state == SS_ACTIVE; } +#define psfeedback_log(fmt, key, ...) \ + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__) + /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ static void -ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) +ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback) { uint8 nkeys; int i; - int32 len; + + /* initialize the struct before parsing */ + memset(ps_feedback, 0, sizeof(PageserverFeedback)); + ps_feedback->present = true; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1474,66 +1468,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->currentClusterSize = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + Assert(value_len == sizeof(int64)); + ps_feedback->currentClusterSize = pq_getmsgint64(reply_message); + psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->last_received_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->last_received_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->replytime = pq_getmsgint64(reply_message); - { - char *replyTimeStr; - - /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - wp_log(DEBUG2, 
"ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); - - pfree(replyTimeStr); - } + Assert(value_len == sizeof(int64)); + ps_feedback->replytime = pq_getmsgint64(reply_message); + psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime)); + } + else if (strcmp(key, "shard_number") == 0) + { + Assert(value_len == sizeof(uint32)); + ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32)); + psfeedback_log("%u", key, ps_feedback->shard_number); } else { - len = pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - /* * Skip unknown keys to support backward compatibile protocol * changes */ - wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); - pq_getmsgbytes(reply_message, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len); + pq_getmsgbytes(reply_message, value_len); }; } } @@ -1630,12 +1610,30 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) return donor; } +/* + * Process AppendResponse message from safekeeper. + */ static void -HandleSafekeeperResponse(WalProposer *wp) +HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { XLogRecPtr candidateTruncateLsn; + XLogRecPtr newCommitLsn; - wp->api.process_safekeeper_feedback(wp); + newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); + if (newCommitLsn > wp->commitLsn) + { + wp->commitLsn = newCommitLsn; + /* Send new value to all safekeepers. */ + BroadcastAppendRequest(wp); + } + + /* + * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). + * The last one will terminate the process if the shutdown is requested + * and WAL is committed by the quorum. BroadcastAppendRequest() should be + * called to notify safekeepers about the new commitLsn. 
+ */ + wp->api.process_safekeeper_feedback(wp, sk); /* * Try to advance truncateLsn -- the last record flushed to all @@ -1811,8 +1809,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.ts = pq_getmsgint64_le(&s); msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParsePageserverFeedbackMessage(wp, &s, &msg->rf); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; pq_getmsgend(&s); return true; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index bc674fd979..28585eb4e7 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,6 +10,7 @@ #include "libpqwalproposer.h" #include "neon_walreader.h" +#include "pagestore_client.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback typedef struct PageserverFeedback { + /* true if AppendResponse contains this feedback */ + bool present; /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ @@ -276,14 +279,21 @@ typedef struct PageserverFeedback XLogRecPtr disk_consistent_lsn; XLogRecPtr remote_consistent_lsn; TimestampTz replytime; + uint32 shard_number; } PageserverFeedback; typedef struct WalproposerShmemState { slock_t mutex; - PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; + + /* last feedback from each shard */ + PageserverFeedback shard_ps_feedback[MAX_SHARDS]; + int num_shards; + + /* aggregated feedback with min LSNs across shards */ + PageserverFeedback min_ps_feedback; } WalproposerShmemState; /* @@ -307,12 +317,12 @@ typedef struct AppendResponse /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - PageserverFeedback rf; + PageserverFeedback ps_feedback; } AppendResponse; /* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) +#define APPENDRESPONSE_FIXEDPART_SIZE 56 struct WalProposer; typedef struct WalProposer WalProposer; @@ -560,11 +570,11 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every new message from the safekeeper. Used to propagate + * Called after every AppendResponse from the safekeeper. Used to propagate * backpressure feedback and to confirm WAL persistence (has been commited * on the quorum of safekeepers). */ - void (*process_safekeeper_feedback) (WalProposer *wp); + void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); /* * Write a log message to the internal log processor. 
This is used only diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 8eec2f02c1..c46fd9b3ec 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,6 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -static AppendResponse quorumFeedback; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; @@ -71,6 +70,10 @@ static const walproposer_api walprop_pg; static volatile sig_atomic_t got_SIGUSR2 = false; static bool reported_sigusr2 = false; +static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; +static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; +static HotStandbyFeedback agg_hs_feedback; + static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void nwp_prepare_shmem(void); @@ -402,21 +405,58 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -static void -replication_feedback_set(PageserverFeedback *rf) +/* + * Record new ps_feedback in the array with shards and update min_feedback. + */ +static PageserverFeedback +record_pageserver_feedback(PageserverFeedback *ps_feedback) { + PageserverFeedback min_feedback; + + Assert(ps_feedback->present); + Assert(ps_feedback->shard_number < MAX_SHARDS); + SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); + + /* Update the number of shards */ + if (ps_feedback->shard_number + 1 > walprop_shared->num_shards) + walprop_shared->num_shards = ps_feedback->shard_number + 1; + + /* Update the feedback */ + memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback)); + + /* Calculate min LSNs */ + memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback)); + for (int i = 0; i < walprop_shared->num_shards; i++) + { + PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; + if (feedback->present) + { + if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) + min_feedback.last_received_lsn = feedback->last_received_lsn; + + if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) + min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; + + if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) + min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; + } + } + /* Copy min_feedback back to shmem */ + memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback)); + SpinLockRelease(&walprop_shared->mutex); + + return min_feedback; } void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.last_received_lsn; - *flushLsn = walprop_shared->feedback.disk_consistent_lsn; - *applyLsn = walprop_shared->feedback.remote_consistent_lsn; + *writeLsn = walprop_shared->min_ps_feedback.last_received_lsn; + *flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } @@ -1869,39 +1909,6 @@ CheckGracefulShutdown(WalProposer *wp) } } -/* - * Choose most advanced PageserverFeedback and set it to *rf. 
- */ -static void -GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) -{ - int latest_safekeeper = 0; - XLogRecPtr last_received_lsn = InvalidXLogRecPtr; - - for (int i = 0; i < wp->n_safekeepers; i++) - { - if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) - { - latest_safekeeper = i; - last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn; - } - } - - rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; - rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; - rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; - rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - - wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); -} - /* * Combine hot standby feedbacks from all safekeepers. */ @@ -1949,26 +1956,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) * None of that is functional in sync-safekeepers. */ static void -walprop_pg_process_safekeeper_feedback(WalProposer *wp) +walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { - HotStandbyFeedback hsFeedback; - XLogRecPtr oldDiskConsistentLsn; + HotStandbyFeedback hsFeedback; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; - oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; - - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - replication_feedback_set(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - - if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + /* handle fresh ps_feedback */ + if (sk->appendResponse.ps_feedback.present) { - if (wp->commitLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = wp->commitLsn; + PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback); + /* Only one main shard sends non-zero currentClusterSize */ + if (sk->appendResponse.ps_feedback.currentClusterSize > 0) + SetZenithCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + + if (min_feedback.disk_consistent_lsn != standby_apply_lsn) + { + standby_apply_lsn = min_feedback.disk_consistent_lsn; + needToAdvanceSlot = true; + } + } + + if (wp->commitLsn > standby_flush_lsn) + { + standby_flush_lsn = wp->commitLsn; + needToAdvanceSlot = true; + } + + if (needToAdvanceSlot) + { /* * Advance the replication slot to commitLsn. WAL before it is * hardened and will be fetched from one of safekeepers by @@ -1977,23 +1996,23 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp) * Also wakes up syncrep waiters. */ ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* write_lsn - This is what durably stored in safekeepers quorum. 
*/ + standby_flush_lsn, + /* flush_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, /* * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.disk_consistent_lsn, + standby_apply_lsn, walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { - quorumFeedback.hs = hsFeedback; + agg_hs_feedback = hsFeedback; ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(hsFeedback.xmin), EpochFromFullTransactionId(hsFeedback.xmin), diff --git a/poetry.lock b/poetry.lock index 832d7c4334..7b49daf42a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2182,7 +2182,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2529,6 +2528,87 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "12.0" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, + {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, + {file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"}, + 
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"}, + {file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"}, + {file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"}, + {file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"}, + {file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"}, + {file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"}, + {file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"}, + {file = 
"websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"}, + {file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"}, + {file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"}, + {file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"}, + {file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"}, + {file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"}, + {file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"}, + {file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"}, + {file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"}, + {file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"}, + {file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"}, + {file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"}, + {file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"}, + {file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"}, + {file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"}, + {file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"}, + {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"}, +] + [[package]] name = "werkzeug" version = "3.0.1" @@ -2572,16 +2652,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = 
"wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2819,4 +2889,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9" +content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d8112c8bf0..b3a5bf873e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -59,7 +59,7 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2.workspace = true +sha2 = { workspace = true, features = ["asm"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 11af85caa4..bc307230dd 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -254,7 +254,7 @@ async fn authenticate_with_secret( config: &'static AuthenticationConfig, ) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret)?; + let auth_outcome = validate_password_and_exchange(&password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { diff --git a/proxy/src/auth/flow.rs 
b/proxy/src/auth/flow.rs index 788381b6c0..f26dcb7c9a 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -126,7 +126,7 @@ impl AuthFlow<'_, S, CleartextPassword> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0)?; + let outcome = validate_password_and_exchange(password, self.state.0).await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -180,7 +180,7 @@ impl AuthFlow<'_, S, Scram<'_>> { } } -pub(crate) fn validate_password_and_exchange( +pub(crate) async fn validate_password_and_exchange( password: &[u8], secret: AuthSecret, ) -> super::Result> { @@ -200,7 +200,8 @@ pub(crate) fn validate_password_and_exchange( &scram_secret, sasl_client, crate::config::TlsServerEndPoint::Undefined, - )?; + ) + .await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3b2e0cc204..b36663518d 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -250,6 +250,8 @@ impl super::Api for Api { // which means that we might cache it to reduce the load and latency. if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + info!("cold_start_info=warm"); + ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(cached); } @@ -260,6 +262,7 @@ impl super::Api for Api { if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + info!("cold_start_info=warm"); ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(cached); } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index f476cb9b37..700c8c8681 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -341,7 +341,14 @@ impl Accept for ProxyProtocolAccept { cx: &mut Context<'_>, ) -> Poll>> { let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - tracing::info!(protocol = self.protocol, "accepted new TCP connection"); + + let conn_id = uuid::Uuid::new_v4(); + let span = tracing::info_span!("http_conn", ?conn_id); + { + let _enter = span.enter(); + tracing::info!("accepted new TCP connection"); + } + let Some(conn) = conn else { return Poll::Ready(None); }; @@ -354,6 +361,7 @@ impl Accept for ProxyProtocolAccept { .with_label_values(&[self.protocol]) .guard(), )), + span, }))) } } @@ -364,6 +372,14 @@ pin_project! 
{ pub inner: T, pub connection_id: Uuid, pub gauge: Mutex>, + pub span: tracing::Span, + } + + impl PinnedDrop for WithConnectionGuard { + fn drop(this: Pin<&mut Self>) { + let _enter = this.span.enter(); + tracing::info!("HTTP connection closed") + } } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index a95e734d06..df4b3ec8d7 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -113,7 +113,7 @@ mod tests { ); } - fn run_round_trip_test(server_password: &str, client_password: &str) { + async fn run_round_trip_test(server_password: &str, client_password: &str) { let scram_secret = ServerSecret::build(server_password).unwrap(); let sasl_client = ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported()); @@ -123,6 +123,7 @@ mod tests { sasl_client, crate::config::TlsServerEndPoint::Undefined, ) + .await .unwrap(); match outcome { @@ -131,14 +132,14 @@ mod tests { } } - #[test] - fn round_trip() { - run_round_trip_test("pencil", "pencil") + #[tokio::test] + async fn round_trip() { + run_round_trip_test("pencil", "pencil").await } - #[test] + #[tokio::test] #[should_panic(expected = "password doesn't match")] - fn failure() { - run_round_trip_test("pencil", "eraser") + async fn failure() { + run_round_trip_test("pencil", "eraser").await } } diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 9af7db5201..16575d5d98 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -71,7 +71,7 @@ impl<'a> Exchange<'a> { } } -pub fn exchange( +pub async fn exchange( secret: &ServerSecret, mut client: ScramSha256, tls_server_end_point: config::TlsServerEndPoint, @@ -86,7 +86,14 @@ pub fn exchange( .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; let sent = match init.transition(secret, &tls_server_end_point, client_first)? { Continue(sent, server_first) => { - client.update(server_first.as_bytes())?; + // `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread. 
+ // TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl + client = tokio::task::spawn_blocking(move || { + client.update(server_first.as_bytes())?; + Ok::(client) + }) + .await + .expect("should not panic while performing password hash")?; sent } Success(x, _) => match x {}, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 68f68eaba1..be9f90acde 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -19,6 +19,7 @@ use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; +use tracing::instrument::Instrumented; use crate::context::RequestMonitoring; use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; @@ -30,13 +31,12 @@ use hyper::{ Body, Method, Request, Response, }; -use std::convert::Infallible; use std::net::IpAddr; use std::sync::Arc; use std::task::Poll; use tls_listener::TlsListener; use tokio::net::TcpListener; -use tokio_util::sync::CancellationToken; +use tokio_util::sync::{CancellationToken, DropGuard}; use tracing::{error, info, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; @@ -100,12 +100,7 @@ pub async fn task_main( let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); ws_connections.close(); // allows `ws_connections.wait` to complete - let tls_listener = TlsListener::new( - tls_acceptor, - addr_incoming, - "http", - config.handshake_timeout, - ); + let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); let make_svc = hyper::service::make_service_fn( |stream: &tokio_rustls::server::TlsStream< @@ -121,6 +116,11 @@ pub async fn task_main( .take() .expect("gauge should be set on connection start"); + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let cancel_connection = http_cancellation_token.clone().drop_guard(); + + let span = conn.span.clone(); let client_addr = conn.inner.client_addr(); let remote_addr = conn.inner.inner.remote_addr(); let backend = backend.clone(); @@ -136,27 +136,43 @@ pub async fn task_main( Ok(MetricService::new( hyper::service::service_fn(move |req: Request| { let backend = backend.clone(); - let ws_connections = ws_connections.clone(); + let ws_connections2 = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); let cancellation_handler = cancellation_handler.clone(); + let http_cancellation_token = http_cancellation_token.child_token(); - async move { - Ok::<_, Infallible>( - request_handler( + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. + ws_connections.spawn( + async move { + // Cancel the current inflight HTTP request if the request stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. 
+ let _cancel_session = http_cancellation_token.clone().drop_guard(); + + let res = request_handler( req, config, backend, - ws_connections, + ws_connections2, cancellation_handler, peer_addr.ip(), endpoint_rate_limiter, + http_cancellation_token, ) .await - .map_or_else(|e| e.into_response(), |r| r), - ) - } + .map_or_else(|e| e.into_response(), |r| r); + + _cancel_session.disarm(); + + res + } + .in_current_span(), + ) }), gauge, + cancel_connection, + span, )) } }, @@ -176,11 +192,23 @@ pub async fn task_main( struct MetricService { inner: S, _gauge: IntCounterPairGuard, + _cancel: DropGuard, + span: tracing::Span, } impl MetricService { - fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService { - MetricService { inner, _gauge } + fn new( + inner: S, + _gauge: IntCounterPairGuard, + _cancel: DropGuard, + span: tracing::Span, + ) -> MetricService { + MetricService { + inner, + _gauge, + _cancel, + span, + } } } @@ -190,14 +218,16 @@ where { type Response = S::Response; type Error = S::Error; - type Future = S::Future; + type Future = Instrumented; fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { self.inner.poll_ready(cx) } fn call(&mut self, req: Request) -> Self::Future { - self.inner.call(req) + self.span + .in_scope(|| self.inner.call(req)) + .instrument(self.span.clone()) } } @@ -210,6 +240,8 @@ async fn request_handler( cancellation_handler: Arc, peer_addr: IpAddr, endpoint_rate_limiter: Arc, + // used to cancel in-flight HTTP requests. not used to cancel websockets + http_cancellation_token: CancellationToken, ) -> Result, ApiError> { let session_id = uuid::Uuid::new_v4(); @@ -253,7 +285,7 @@ async fn request_handler( let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); let span = ctx.span.clone(); - sql_over_http::handle(config, ctx, request, backend) + sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 9b3ca8d447..72b55c45f0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -50,7 +50,7 @@ impl PoolingBackend { } }; let auth_outcome = - crate::auth::validate_password_and_exchange(&conn_info.password, secret)?; + crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => Ok(key), crate::sasl::Outcome::Failure(reason) => { @@ -84,6 +84,7 @@ impl PoolingBackend { }; if let Some(client) = maybe_client { + info!("cold_start_info=warm"); ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(client); } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 86c278030f..f675375ff1 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -217,8 +217,8 @@ pub async fn handle( mut ctx: RequestMonitoring, request: Request, backend: Arc, + cancel: CancellationToken, ) -> Result, ApiError> { - let cancel = CancellationToken::new(); let cancel2 = cancel.clone(); let handle = tokio::spawn(async move { time::sleep(config.http_config.request_timeout).await; diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs index cce02e3850..33f194dd59 100644 --- a/proxy/src/serverless/tls_listener.rs +++ b/proxy/src/serverless/tls_listener.rs @@ -13,7 +13,7 @@ use tokio::{ time::timeout, }; use 
tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn}; +use tracing::{info, warn, Instrument}; use crate::{ metrics::TLS_HANDSHAKE_FAILURES, @@ -29,24 +29,17 @@ pin_project! { tls: TlsAcceptor, waiting: JoinSet>>, timeout: Duration, - protocol: &'static str, } } impl TlsListener { /// Create a `TlsListener` with default options. - pub(crate) fn new( - tls: TlsAcceptor, - listener: A, - protocol: &'static str, - timeout: Duration, - ) -> Self { + pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { TlsListener { listener, tls, waiting: JoinSet::new(), timeout, - protocol, } } } @@ -73,7 +66,7 @@ where Poll::Ready(Some(Ok(mut conn))) => { let t = *this.timeout; let tls = this.tls.clone(); - let protocol = *this.protocol; + let span = conn.span.clone(); this.waiting.spawn(async move { let peer_addr = match conn.inner.wait_for_addr().await { Ok(Some(addr)) => addr, @@ -86,21 +79,24 @@ where let accept = tls.accept(conn); match timeout(t, accept).await { - Ok(Ok(conn)) => Some(conn), + Ok(Ok(conn)) => { + info!(%peer_addr, "accepted new TLS connection"); + Some(conn) + }, // The handshake failed, try getting another connection from the queue Ok(Err(e)) => { TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}"); + warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); None } // The handshake timed out, try getting another connection from the queue Err(_) => { TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout"); + warn!(%peer_addr, "failed to accept TLS connection: timeout"); None } } - }); + }.instrument(span)); } Poll::Ready(Some(Err(e))) => { tracing::error!("error accepting TCP connection: {e}"); @@ -112,10 +108,7 @@ where loop { return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => { - info!(protocol = this.protocol, "accepted new TLS connection"); - Poll::Ready(Some(Ok(conn))) - } + Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), // The handshake failed to complete, try getting another connection from the queue Poll::Ready(Some(Ok(None))) => continue, // The handshake panicked or was cancelled. 
ignore and get another connection diff --git a/pyproject.toml b/pyproject.toml index 6dff112a5e..e347d47cbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ pytest-split = "^0.8.1" zstandard = "^0.21.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" +websockets = "^12.0" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 5c79e9082b..42340ba1df 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -224,6 +224,16 @@ impl SimulationApi { }) .collect::>(); + let empty_feedback = PageserverFeedback { + present: false, + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + shard_number: 0, + }; + Self { os: args.os, safekeepers: RefCell::new(sk_conns), @@ -232,15 +242,11 @@ impl SimulationApi { last_logged_commit_lsn: 0, shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { mutex: 0, - feedback: PageserverFeedback { - currentClusterSize: 0, - last_received_lsn: 0, - disk_consistent_lsn: 0, - remote_consistent_lsn: 0, - replytime: 0, - }, mineLastElectedTerm: 0, backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, + shard_ps_feedback: [empty_feedback; 128], + num_shards: 0, + min_ps_feedback: empty_feedback, }), config: args.config, event_set: RefCell::new(None), @@ -598,7 +604,11 @@ impl ApiImpl for SimulationApi { } } - fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) { + fn process_safekeeper_feedback( + &mut self, + wp: &mut walproposer::bindings::WalProposer, + _sk: &mut walproposer::bindings::Safekeeper, + ) { debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn); if wp.commitLsn > self.last_logged_commit_lsn { self.os.log_event(format!("commit_lsn;{}", wp.commitLsn)); diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store index 4cca3a9857..1f88f252eb 100755 --- a/scripts/ps_ec2_setup_instance_store +++ b/scripts/ps_ec2_setup_instance_store @@ -40,7 +40,7 @@ To run your local neon.git build on the instance store volume, run the following commands from the top of the neon.git checkout # raise file descriptor limit of your shell and its child processes - sudo prlimit -p $$ --nofile=800000:800000 + sudo prlimit -p \$\$ --nofile=800000:800000 # test suite run export TEST_OUTPUT="$TEST_OUTPUT" diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 200c9c3740..4b0c9ac71d 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -2,6 +2,7 @@ pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", "fixtures.httpserver", + "fixtures.compute_reconfigure", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py new file mode 100644 index 0000000000..9dd66fe636 --- /dev/null +++ b/test_runner/fixtures/compute_reconfigure.py @@ -0,0 +1,62 @@ +import concurrent.futures +from typing import Any + +import pytest +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.log_helper import log +from fixtures.types import TenantId + + +class ComputeReconfigure: + def __init__(self, server): + self.server = server + self.control_plane_compute_hook_api = 
f"http://{server.host}:{server.port}/notify-attach" + self.workloads = {} + + def register_workload(self, workload): + self.workloads[workload.tenant_id] = workload + + +@pytest.fixture(scope="function") +def compute_reconfigure_listener(make_httpserver): + """ + This fixture exposes an HTTP listener for the storage controller to submit + compute notifications to us, instead of updating neon_local endpoints itself. + + Although storage controller can use neon_local directly, this causes problems when + the test is also concurrently modifying endpoints. Instead, configure storage controller + to send notifications up to this test code, which will route all endpoint updates + through Workload, which has a mutex to make concurrent updates safe. + """ + server = make_httpserver + + self = ComputeReconfigure(server) + + # Do neon_local endpoint reconfiguration in the background so that we can + # accept a healthy rate of calls into notify-attach. + reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1) + + def handler(request: Request): + assert request.json is not None + body: dict[str, Any] = request.json + log.info(f"notify-attach request: {body}") + + try: + workload = self.workloads[TenantId(body["tenant_id"])] + except KeyError: + pass + else: + # This causes the endpoint to query storage controller for its location, which + # is redundant since we already have it here, but this avoids extending the + # neon_local CLI to take full lists of locations + reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return] + + return Response(status=200) + + self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + + yield self + reconfigure_threads.shutdown() + server.clear() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b3f460c7fe..1d30c45278 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -51,7 +51,7 @@ from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, - scan_pageserver_log_for_errors, + DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.types import IndexPartDump @@ -77,6 +77,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, allure_attach_from_dir, + assert_no_errors, get_self_dir, subprocess_capture, wait_until, @@ -944,6 +945,8 @@ class NeonEnvBuilder: for pageserver in self.env.pageservers: pageserver.assert_no_errors() + self.env.storage_controller.assert_no_errors() + try: self.overlay_cleanup_teardown() except Exception as e: @@ -1525,6 +1528,7 @@ class NeonCli(AbstractNeonCli): conf: Optional[Dict[str, Any]] = None, shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, + placement_policy: Optional[str] = None, set_default: bool = False, ) -> Tuple[TenantId, TimelineId]: """ @@ -1558,6 +1562,9 @@ class NeonCli(AbstractNeonCli): if shard_stripe_size is not None: args.extend(["--shard-stripe-size", str(shard_stripe_size)]) + if placement_policy is not None: + args.extend(["--placement-policy", str(placement_policy)]) + res = self.raw_cli(args) res.check_returncode() return tenant_id, timeline_id @@ -1885,19 +1892,6 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(args, check_return_code=True) - def tenant_migrate( - self, 
tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int] - ): - args = [ - "tenant", - "migrate", - "--tenant-id", - str(tenant_shard_id), - "--id", - str(new_pageserver), - ] - return self.raw_cli(args, check_return_code=True, timeout=timeout_secs) - def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": return self.raw_cli(["start"], check_return_code=check_return_code) @@ -1957,6 +1951,7 @@ class NeonStorageController(MetricsGetter): self.env = env self.running = False self.auth_enabled = auth_enabled + self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS def start(self): assert not self.running @@ -1981,6 +1976,11 @@ class NeonStorageController(MetricsGetter): msg = "" raise StorageControllerApiException(msg, res.status_code) from e + def assert_no_errors(self): + assert_no_errors( + self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors + ) + def pageserver_api(self) -> PageserverHttpClient: """ The storage controller implements a subset of the pageserver REST API, for mapping @@ -2088,6 +2088,14 @@ class NeonStorageController(MetricsGetter): ) return response.json() + def tenant_list(self): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/debug/v1/tenant", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + def node_configure(self, node_id, body: dict[str, Any]): log.info(f"node_configure({node_id}, {body})") body["node_id"] = node_id @@ -2135,7 +2143,7 @@ class NeonStorageController(MetricsGetter): """ response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate", headers=self.headers(TokenScope.ADMIN), ) body = response.json() @@ -2177,6 +2185,23 @@ class NeonStorageController(MetricsGetter): ) log.info("storage controller passed consistency check") + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.request( + "PUT", + f"{self.env.storage_controller_api}/debug/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + headers=self.headers(TokenScope.ADMIN), + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + def __enter__(self) -> "NeonStorageController": return self @@ -2328,18 +2353,9 @@ class NeonPageserver(PgProtocol): return self.env.repo_dir / f"pageserver_{self.id}" def assert_no_errors(self): - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return - - with logfile.open("r") as f: - errors = scan_pageserver_log_for_errors(f, self.allowed_errors) - - for _lineno, error in errors: - log.info(f"not allowed error: {error.strip()}") - - assert not errors + assert_no_errors( + self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors + ) def assert_no_metric_errors(self): """ @@ -2944,6 +2960,7 @@ class NeonProxy(PgProtocol): user = quote(kwargs["user"]) password = quote(kwargs["password"]) expected_code = kwargs.get("expected_code") + timeout = kwargs.get("timeout") log.info(f"Executing http query: {query}") @@ -2957,6 +2974,7 @@ class NeonProxy(PgProtocol): "Neon-Pool-Opt-In": "true", }, verify=str(self.test_output_dir 
/ "proxy.crt"), + timeout=timeout, ) if expected_code is not None: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 8ff4341cc0..ec0f81b380 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -55,7 +55,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # FIXME: These need investigation ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", - ".*Removing intermediate uninit mark file.*", # Tenant::delete_timeline() can cause any of the four following errors. # FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed @@ -90,6 +89,16 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ) +DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ + # Many tests will take pageservers offline, resulting in log warnings on the controller + # failing to connect to them. + ".*Call to node.*management API.*failed.*receive body.*", + ".*Call to node.*management API.*failed.*ReceiveBody.*", + # Many tests will start up with a node offline + ".*startup_reconcile: Could not scan node.*", +] + + def _check_allowed_errors(input): allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 6e082374d7..99ec894106 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -34,7 +34,7 @@ class TimelineCreate406(PageserverApiException): class TimelineCreate409(PageserverApiException): def __init__(self, res: requests.Response): assert res.status_code == 409 - super().__init__("", res.status_code) + super().__init__(res.json()["msg"], res.status_code) @dataclass @@ -357,9 +357,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) - def tenant_secondary_download(self, tenant_id: Union[TenantId, TenantShardId]): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + def tenant_secondary_download( + self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None + ) -> tuple[int, dict[Any, Any]]: + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download" + if wait_ms is not None: + url = url + f"?wait_ms={wait_ms}" + res = self.post(url) self.verbose_error(res) + return (res.status_code, res.json()) def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index b28da83508..c8ab550ad7 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -28,7 +28,7 @@ def platform() -> Optional[str]: @pytest.fixture(scope="function", autouse=True) def pageserver_virtual_file_io_engine() -> Optional[str]: - return None + return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") def pytest_generate_tests(metafunc: Metafunc): diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index ea648e460d..80c9b9ce9a 100644 --- 
a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -158,6 +158,9 @@ class TenantShardId: def __str__(self): return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + def __repr__(self): + return self.__str__() + def _tuple(self) -> tuple[TenantId, int, int]: return (self.tenant_id, self.shard_number, self.shard_count) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7fc3bae3af..9365d65fc9 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -11,6 +11,7 @@ from typing import ( Any, Callable, Dict, + Iterable, List, Optional, Tuple, @@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float: ) return round(total_ms, 3) + + +def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + error_or_warn = re.compile(r"\s(ERROR|WARN)") + errors = [] + for lineno, line in enumerate(input, start=1): + if len(line) == 0: + continue + + if error_or_warn.search(line): + # Is this a torn log line? This happens when force-killing a process and restarting + # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192" + if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line): + continue + + # It's an ERROR or WARN. Is it in the allow-list? + for a in allowed_errors: + if re.match(a, line): + break + else: + errors.append((lineno, line)) + return errors + + +def assert_no_errors(log_file, service, allowed_errors): + if not log_file.exists(): + log.warning(f"Skipping {service} log check: {log_file} does not exist") + return + + with log_file.open("r") as f: + errors = scan_log_for_errors(f, allowed_errors) + + for _lineno, error in errors: + log.info(f"not allowed {service} error: {error.strip()}") + + assert not errors, f"Log errors on {service}: {errors[0]}" diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1d5394dc1d..ab8717de54 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,4 +1,5 @@ -from typing import Optional +import threading +from typing import Any, Optional from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -11,6 +12,10 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import TenantId, TimelineId +# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex +# to ensure we don't do that: this enables running lots of Workloads in parallel safely. 
+ENDPOINT_LOCK = threading.Lock() + class Workload: """ @@ -27,6 +32,7 @@ class Workload: tenant_id: TenantId, timeline_id: TimelineId, branch_name: Optional[str] = None, + endpoint_opts: Optional[dict[str, Any]] = None, ): self.env = env self.tenant_id = tenant_id @@ -40,18 +46,33 @@ class Workload: self.churn_cursor = 0 self._endpoint: Optional[Endpoint] = None + self._endpoint_opts = endpoint_opts or {} + + def reconfigure(self): + """ + Request the endpoint to reconfigure based on location reported by storage controller + """ + if self._endpoint is not None: + with ENDPOINT_LOCK: + self._endpoint.reconfigure() def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: - if self._endpoint is None: - self._endpoint = self.env.endpoints.create( - self.branch_name, - tenant_id=self.tenant_id, - pageserver_id=pageserver_id, - endpoint_id="ep-workload", - ) - self._endpoint.start(pageserver_id=pageserver_id) - else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + # We may be running alongside other Workloads for different tenants. Full TTID is + # obnoxiously long for use here, but a cut-down version is still unique enough for tests. + endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" + + with ENDPOINT_LOCK: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + self.branch_name, + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id=endpoint_id, + **self._endpoint_opts, + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -94,7 +115,7 @@ class Workload: else: return False - def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True): + def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True): assert self.expect_rows >= n max_iters = 10 @@ -132,22 +153,28 @@ class Workload: ] ) - for tenant_shard_id, pageserver in tenant_get_shards( - self.env, self.tenant_id, pageserver_id - ): - last_flush_lsn = wait_for_last_flush_lsn( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) - ps_http = pageserver.http_client() - wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + if ingest: + # Wait for written data to be ingested by the pageserver + for tenant_shard_id, pageserver in tenant_get_shards( + self.env, self.tenant_id, pageserver_id + ): + last_flush_lsn = wait_for_last_flush_lsn( + self.env, + endpoint, + self.tenant_id, + self.timeline_id, + pageserver_id=pageserver_id, + ) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - if upload: - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) - wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") - else: - log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + if upload: + # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) + ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) + wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") 
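For context beyond the diff itself, here is a minimal usage sketch (not part of the patch) showing how the new `ingest` and `upload` flags on `Workload.churn_rows` compose; the `env`, `tenant_id`, and `timeline_id` values are assumed to come from the usual fixture setup:

```python
# Illustrative sketch only: the three wait modes of Workload.churn_rows.
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(256)

workload.churn_rows(64)  # default: wait for pageserver ingest, then checkpoint + S3 upload
workload.churn_rows(64, upload=False)  # wait for ingest, but skip the checkpoint/upload wait
workload.churn_rows(64, upload=False, ingest=False)  # don't block on the pageserver at all,
                                                     # e.g. while the attached pageserver is down
```
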
def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index c98fa44b1a..324ef0d516 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -1,6 +1,5 @@ import asyncio import json -import os from pathlib import Path from typing import Any, Dict, Tuple @@ -20,10 +19,6 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("n_tenants", [10]) @pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006", -) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index bdc944f352..ddd02238ea 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - env.pageserver.allowed_errors.extend( - [ - ".*invalid branch start lsn: less than latest GC cutoff.*", - ".*invalid branch start lsn: less than planned GC cutoff.*", - ] - ) + error_regexes = [ + ".*invalid branch start lsn: less than latest GC cutoff.*", + ".*invalid branch start lsn: less than planned GC cutoff.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 46c74a26b8..b79cad979f 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] - ) + error_regexes = [ + ".*invalid branch start lsn.*", + ".*invalid start lsn .* for ancestor timeline.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9a0b91b54e..2a7a3c41ac 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -347,6 +347,64 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB ps_http.timeline_detail(env.initial_tenant, branch_id) +def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + env.pageserver.tenant_create(env.initial_tenant) + + success_timeline = TimelineId.generate() + log.info(f"Creating timeline 
{success_timeline}") + ps_http = env.pageserver.http_client() + success_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + + ps_http.configure_failpoints(("timeline-creation-after-uninit", "pause")) + + def start_creating_timeline(): + log.info(f"Creating (expect failure) timeline {env.initial_timeline}") + with pytest.raises(RequestException): + ps_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 + ) + + t = threading.Thread(target=start_creating_timeline) + try: + t.start() + + wait_until_paused(env, "timeline-creation-after-uninit") + + # While timeline creation is in progress, trying to create a timeline + # again with the same ID should return 409 + with pytest.raises( + PageserverApiException, match="creation of timeline with the given ID is in progress" + ): + ps_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 + ) + + # Creation of a timeline already successfully created is idempotent, and is not impeded by some + # other timeline creation with a different TimelineId being stuck. + repeat_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + assert repeat_result == success_result + finally: + env.pageserver.stop(immediate=True) + t.join() + + # now without a failpoint + env.pageserver.start() + + wait_until_tenant_active(ps_http, env.initial_tenant) + + with pytest.raises(PageserverApiException, match="not found"): + ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + # The one successfully created timeline should still be there. + assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index b046ed7f1b..804ad135ce 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -204,7 +204,7 @@ def test_timeline_init_break_before_checkpoint_recreate( assert timeline_id == new_timeline_id -def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder): +def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -214,9 +214,9 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] - # Introduce failpoint when creating a new timeline uninit mark, before any other files were created - pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return")) - with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"): + # Introduce failpoint when creating a new timeline, right after creating its directory + pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return")) + with pytest.raises(Exception, match="after-timeline-dir-creation"): _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5f815d3e6c..e0bb4c2062 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -238,6 +238,10 @@ def test_forward_compatibility( pg_distrib_dir=compatibility_postgres_distrib_dir, ) + # TODO: remove this workaround after release-5090 is no longer the most recent release. + # There was a bug in that code that generates a warning in the storage controller log. + env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*") + # Use current neon_local even though we're using old binaries for # everything else: our test code is written for latest CLI args. env.neon_local_binpath = neon_local_binpath diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index ec57860033..132427ba2d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -90,7 +90,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build [ ".*error importing base backup .*", ".*Timeline got dropped without initializing, cleaning its files.*", - ".*Removing intermediate uninit mark file.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 877deee08f..81aed704bb 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -37,23 +37,18 @@ def test_pageserver_init_node_id( assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" - assert "missing id" in bad_init.stderr + assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - completed_init = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) + good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 ), "pageserver should be able to create a new config with the node id given" assert pageserver_config.exists(), "config file should be created successfully" - bad_reinit = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) - assert ( - bad_reinit.returncode == 1 - ), "pageserver should not be able to init new config without the node id" + bad_reinit = run_pageserver(good_init_cmd) + assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" assert "already exists, cannot init it" in bad_reinit.stderr bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 3ca13a904d..56b4548b64 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -209,10 +209,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.storage_controller.node_register(env.pageserver) env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) env.neon_cli.create_tenant( tenant_id=env.initial_tenant, 
conf=TENANT_CONF, timeline_id=env.initial_timeline
     )
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)


 def parse_generation_suffix(key):
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 79145f61b3..e664547b69 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -1,4 +1,5 @@
 import json
+import os
 import random
 from pathlib import Path
 from typing import Any, Dict, Optional
@@ -553,3 +554,103 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
             )
         ),
     )
+
+
+@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
+@pytest.mark.parametrize("via_controller", [True, False])
+def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):
+    """
+    Test use of secondary download API for slow downloads, where slow means either a healthy
+    system with a large capacity shard, or some unhealthy remote storage.
+
+    The download API is meant to respect a client-supplied time limit, and return 200 or 202
+    selectively based on whether the download completed.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+    )
+
+    attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
+    ps_attached = env.get_pageserver(attached_to_id)
+    ps_secondary = next(p for p in env.pageservers if p != ps_attached)
+
+    # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis)
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+
+    # Expect lots of layers
+    assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10
+
+    # Simulate large data by making layer downloads artificially slow
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])
+
+    # Upload a heatmap, so that secondaries have something to download
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+
+    if via_controller:
+        http_client = env.storage_controller.pageserver_api()
+        http_client.tenant_location_conf(
+            tenant_id,
+            {
+                "mode": "Secondary",
+                "secondary_conf": {"warm": True},
+                "tenant_conf": {},
+                "generation": None,
+            },
+        )
+    else:
+        http_client = ps_secondary.http_client()
+
+    # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms
+    (status, progress_1) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000)
+    assert status == 202
+    assert progress_1["heatmap_mtime"] is not None
+    assert progress_1["layers_downloaded"] > 0
+    assert progress_1["bytes_downloaded"] > 0
+    assert progress_1["layers_total"] > progress_1["layers_downloaded"]
+    assert progress_1["bytes_total"] > progress_1["bytes_downloaded"]
+
+    # Multiple polls should work: use a shorter wait period this time
+    (status, progress_2) = http_client.tenant_secondary_download(tenant_id, wait_ms=1000)
+    assert status == 202
+    assert progress_2["heatmap_mtime"] is not None
+    assert progress_2["layers_downloaded"] > 0
+    assert progress_2["bytes_downloaded"] > 0
+    assert progress_2["layers_total"] > progress_2["layers_downloaded"]
+    assert progress_2["bytes_total"] > progress_2["bytes_downloaded"]
+
+    # Progress should be >= the first poll: this can only go backward if we see a new heatmap,
+    # and the heatmap period on the attached node is much longer than the runtime of this test, so no
+    # new heatmap should have been uploaded.
+    assert progress_2["layers_downloaded"] >= progress_1["layers_downloaded"]
+    assert progress_2["bytes_downloaded"] >= progress_1["bytes_downloaded"]
+    assert progress_2["layers_total"] == progress_1["layers_total"]
+    assert progress_2["bytes_total"] == progress_1["bytes_total"]
+
+    # Make downloads fast again: when the download completes within this last request, we
+    # get a 200 instead of a 202
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")])
+    (status, progress_3) = http_client.tenant_secondary_download(tenant_id, wait_ms=20000)
+    assert status == 200
+    assert progress_3["heatmap_mtime"] is not None
+    assert progress_3["layers_total"] == progress_3["layers_downloaded"]
+    assert progress_3["bytes_total"] == progress_3["bytes_downloaded"]
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 078589d8eb..3e986a8f7b 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -596,3 +596,39 @@ def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy):
     assert (
         "duplicate key value violates unique constraint" in res["message"]
     ), "HTTP query should conflict"
+
+
+def test_sql_over_http_connection_cancel(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    static_proxy.safe_psql("create table test_table ( id int primary key )")
+
+    # insert into a table, with a unique constraint, after sleeping for n seconds
+    query = "WITH temp AS ( \
+        SELECT pg_sleep($1) as sleep, $2::int as id \
+    ) INSERT INTO test_table (id) SELECT id FROM temp"
+
+    try:
+        # The request should complete before the proxy HTTP timeout triggers.
+        # Timeout and cancel the request on the client side before the query completes.
+ static_proxy.http_query( + query, + [static_proxy.http_timeout_seconds - 1, 1], + user="http", + password="http", + timeout=2, + ) + except requests.exceptions.ReadTimeout: + pass + + # wait until the query _would_ have been complete + time.sleep(static_proxy.http_timeout_seconds) + + res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) + assert res["command"] == "INSERT", "HTTP query should insert" + assert res["rowCount"] == 1, "HTTP query should insert" + + res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) + assert ( + "duplicate key value violates unique constraint" in res["message"] + ), "HTTP query should conflict" diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py new file mode 100644 index 0000000000..6d1cb9765a --- /dev/null +++ b/test_runner/regress/test_proxy_websockets.py @@ -0,0 +1,189 @@ +import ssl + +import pytest +import websockets +from fixtures.neon_fixtures import NeonProxy + + +@pytest.mark.asyncio +async def test_websockets(static_proxy: NeonProxy): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + await websocket.send([length, startup_message]) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_message = password.encode("utf-8") + b"\0" + length = (4 + len(auth_message)).to_bytes(4, byteorder="big") + await websocket.send([b"p", length, auth_message]) + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_message = "SELECT 1".encode("utf-8") + b"\0" + length = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send([b"Q", length, query_message]) + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + 
query_response[:row_description_len], + query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" + assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_pipelined(static_proxy: NeonProxy): + """ + Test whether we can send the startup + auth + query all in one go + """ + + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length0 = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + auth_message = password.encode("utf-8") + b"\0" + length1 = (4 + len(auth_message)).to_bytes(4, byteorder="big") + query_message = "SELECT 1".encode("utf-8") + b"\0" + length2 = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send( + [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message] + ) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert 
query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + query_response[:row_description_len], + query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" + assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9309af066b..cb58c640c3 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,10 +1,15 @@ import os -from typing import Dict, List, Union +import time +from typing import Dict, List, Optional, Union import pytest +import requests +from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, + StorageControllerApiException, tenant_get_shards, ) from fixtures.remote_storage import s3_storage @@ -259,7 +264,7 @@ def test_sharding_split_smoke( destination = migrate_to_pageserver_ids.pop() log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") - env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10) + env.storage_controller.tenant_shard_migrate(migrate_shard, destination) workload.validate() @@ -294,7 +299,7 @@ def test_sharding_split_smoke( locations = pageserver.http_client().tenant_list_locations() shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) - log.info("Shards after split: {shards_exist}") + log.info(f"Shards after split: {shards_exist}") assert len(shards_exist) == split_shard_count # Ensure post-split pageserver locations survive a restart (i.e. the child shards @@ -495,3 +500,482 @@ def test_sharding_ingest( # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. 
     assert huge_layer_count <= shard_count
+
+
+class Failure:
+    pageserver_id: Optional[int]
+
+    def apply(self, env: NeonEnv):
+        raise NotImplementedError()
+
+    def clear(self, env: NeonEnv):
+        """
+        Clear the failure, in a way that should enable the system to proceed
+        to a totally clean state (all nodes online and reconciled)
+        """
+        raise NotImplementedError()
+
+    def expect_available(self):
+        raise NotImplementedError()
+
+    def can_mitigate(self):
+        """Whether self.mitigate is available for use"""
+        return False
+
+    def mitigate(self, env: NeonEnv):
+        """
+        Mitigate the failure in a way that should allow shard split to
+        complete and service to resume, but does not guarantee to leave
+        the whole world in a clean state (e.g. an Offline node might have
+        junk LocationConfigs on it)
+        """
+        raise NotImplementedError()
+
+    def fails_forward(self, env: NeonEnv):
+        """
+        If true, this failure results in a state that eventually completes the split.
+        """
+        return False
+
+    def expect_exception(self):
+        """
+        How do we expect a call to the split API to fail?
+        """
+        return StorageControllerApiException
+
+
+class PageserverFailpoint(Failure):
+    def __init__(self, failpoint, pageserver_id, mitigate):
+        self.failpoint = failpoint
+        self.pageserver_id = pageserver_id
+        self._mitigate = mitigate
+
+    def apply(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.allowed_errors.extend(
+            [".*failpoint.*", ".*Resetting.*after shard split failure.*"]
+        )
+        pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))
+
+    def clear(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.http_client().configure_failpoints((self.failpoint, "off"))
+        if self._mitigate:
+            env.storage_controller.node_configure(self.pageserver_id, {"availability": "Active"})
+
+    def expect_available(self):
+        return True
+
+    def can_mitigate(self):
+        return self._mitigate
+
+    def mitigate(self, env):
+        env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"})
+
+
+class StorageControllerFailpoint(Failure):
+    def __init__(self, failpoint, action):
+        self.failpoint = failpoint
+        self.pageserver_id = None
+        self.action = action
+
+    def apply(self, env: NeonEnv):
+        env.storage_controller.configure_failpoints((self.failpoint, self.action))
+
+    def clear(self, env: NeonEnv):
+        if "panic" in self.action:
+            log.info("Restarting storage controller after panic")
+            env.storage_controller.stop()
+            env.storage_controller.start()
+        else:
+            env.storage_controller.configure_failpoints((self.failpoint, "off"))
+
+    def expect_available(self):
+        # Controller panics _do_ leave pageservers available, but our test code relies
+        # on using the locate API to update configurations in Workload, so we must skip
+        # these actions when the controller has been panicked.
+        return "panic" not in self.action
+
+    def can_mitigate(self):
+        return False
+
+    def fails_forward(self, env):
+        # Edge case: the very last failpoint that simulates a DB connection error, where
+        # the abort path will fail-forward and result in a complete split.
+        fail_forward = self.failpoint == "shard-split-post-complete"
+
+        # If the failure was a panic, then if we expect split to eventually (after restart)
+        # complete, we must restart before checking that.
+        if fail_forward and "panic" in self.action:
+            log.info("Restarting storage controller after panic")
+            env.storage_controller.stop()
+            env.storage_controller.start()
+
+        return fail_forward
+
+    def expect_exception(self):
+        if "panic" in self.action:
+            return requests.exceptions.ConnectionError
+        else:
+            return StorageControllerApiException
+
+
+class NodeKill(Failure):
+    def __init__(self, pageserver_id, mitigate):
+        self.pageserver_id = pageserver_id
+        self._mitigate = mitigate
+
+    def apply(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.stop(immediate=True)
+
+    def clear(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.start()
+
+    def expect_available(self):
+        return False
+
+    def mitigate(self, env):
+        env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"})
+
+
+class CompositeFailure(Failure):
+    """
+    Wrapper for failures in multiple components (e.g. a failpoint in the storage controller, *and*
+    stop a pageserver to interfere with rollback)
+    """
+
+    def __init__(self, failures: list[Failure]):
+        self.failures = failures
+
+        self.pageserver_id = None
+        for f in failures:
+            if f.pageserver_id is not None:
+                self.pageserver_id = f.pageserver_id
+                break
+
+    def apply(self, env: NeonEnv):
+        for f in self.failures:
+            f.apply(env)
+
+    def clear(self, env):
+        for f in self.failures:
+            f.clear(env)
+
+    def expect_available(self):
+        return all(f.expect_available() for f in self.failures)
+
+    def mitigate(self, env):
+        for f in self.failures:
+            f.mitigate(env)
+
+    def expect_exception(self):
+        expect = set(f.expect_exception() for f in self.failures)
+
+        # We can't give a sensible response if our failures have different expectations
+        assert len(expect) == 1
+
+        return list(expect)[0]
+
+
+@pytest.mark.parametrize(
+    "failure",
+    [
+        PageserverFailpoint("api-500", 1, False),
+        NodeKill(1, False),
+        PageserverFailpoint("api-500", 1, True),
+        NodeKill(1, True),
+        PageserverFailpoint("shard-split-pre-prepare", 1, False),
+        PageserverFailpoint("shard-split-post-prepare", 1, False),
+        PageserverFailpoint("shard-split-pre-hardlink", 1, False),
+        PageserverFailpoint("shard-split-post-hardlink", 1, False),
+        PageserverFailpoint("shard-split-post-child-conf", 1, False),
+        PageserverFailpoint("shard-split-lsn-wait", 1, False),
+        PageserverFailpoint("shard-split-pre-finish", 1, False),
+        StorageControllerFailpoint("shard-split-validation", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-begin", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-remote", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-complete", "return(1)"),
+        StorageControllerFailpoint("shard-split-validation", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-begin", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-remote", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-complete", "panic(failpoint)"),
+        CompositeFailure(
+            [NodeKill(1, True), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
+        ),
+        CompositeFailure(
+            [NodeKill(1, False), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
+        ),
+    ],
+)
+def test_sharding_split_failures(
+    neon_env_builder: NeonEnvBuilder,
+    compute_reconfigure_listener: ComputeReconfigure,
+    failure: Failure,
+):
+    neon_env_builder.num_pageservers = 4
+    neon_env_builder.control_plane_compute_hook_api = (
+        compute_reconfigure_listener.control_plane_compute_hook_api
+    )
+    initial_shard_count = 2
+    split_shard_count = 4
+
+    env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when they enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # We exercise failure cases where abort itself will also fail (node offline)
+            ".*abort_tenant_shard_split.*",
+            ".*Failed to abort.*",
+            # Tolerate any error logs that mention a failpoint
+            ".*failpoint.*",
+            # Node offline cases will fail to send requests
+            ".*Reconcile error: receive body: error sending request for url.*",
+        ]
+    )
+
+    for ps in env.pageservers:
+        # When we do node failures and abandon a shard, it will de-facto have old generation and
+        # thereby be unable to publish remote consistent LSN updates
+        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
+        # If we're using a failure that will panic the storage controller, all background
+        # upcalls from the pageserver can fail
+        ps.allowed_errors.append(".*calling control plane generation validation API failed.*")
+
+    # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything
+    assert (
+        failure.pageserver_id is None
+        or len(
+            env.get_pageserver(failure.pageserver_id)
+            .http_client()
+            .tenant_list_locations()["tenant_shards"]
+        )
+        > 0
+    )
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(100)
+
+    # Put the environment into a failing state (exact meaning depends on `failure`)
+    failure.apply(env)
+
+    with pytest.raises(failure.expect_exception()):
+        env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
+
+    # We expect that the overall operation will fail, but some split requests
+    # will have succeeded: the net result should be to return to a clean state, including
+    # detaching any child shards.
+    def assert_rolled_back(exclude_ps_id=None) -> None:
+        count = 0
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
+                assert tenant_shard_id.shard_count == initial_shard_count
+                count += 1
+        assert count == initial_shard_count
+
+    def assert_split_done(exclude_ps_id=None) -> None:
+        count = 0
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
+                assert tenant_shard_id.shard_count == split_shard_count
+                count += 1
+        assert count == split_shard_count
+
+    def finish_split():
+        # Having failed+rolled back, we should be able to split again
+        # No failures this time; it will succeed
+        env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)
+
+        workload.churn_rows(10)
+        workload.validate()
+
+    if failure.expect_available():
+        # Even though the split failed partway through, this should not have interrupted
+        # clients. Disable waiting for pageservers in the workload helper, because our
+        # failpoints may prevent API access.
+        # This only applies for failure modes that leave pageserver page_service API available.
+        workload.churn_rows(10, upload=False, ingest=False)
+        workload.validate()
+
+    if failure.fails_forward(env):
+        log.info("Fail-forward failure, checking split eventually completes...")
+        # A failure type which results in eventual completion of the split
+        wait_until(30, 1, assert_split_done)
+    elif failure.can_mitigate():
+        log.info("Mitigating failure...")
+        # Mitigation phase: we expect to be able to proceed with a successful shard split
+        failure.mitigate(env)
+
+        # The split should appear to be rolled back from the point of view of all pageservers
+        # apart from the one that is offline
+        wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
+
+        finish_split()
+        wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))
+
+        # Having cleared the failure, everything should converge to a pristine state
+        failure.clear(env)
+        wait_until(30, 1, assert_split_done)
+    else:
+        # Once we restore the faulty pageserver's API to good health, rollback should
+        # eventually complete.
+        log.info("Clearing failure...")
+        failure.clear(env)
+
+        wait_until(30, 1, assert_rolled_back)
+
+        # Having rolled back, the tenant should be working
+        workload.churn_rows(10)
+        workload.validate()
+
+        # Splitting again should work, since we cleared the failure
+        finish_split()
+        assert_split_done()
+
+    env.storage_controller.consistency_check()
+
+
+def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
+    """
+    Check a scenario where one of the shards is much slower than others.
+    Without backpressure, this would lead to the slow shard falling behind
+    and eventually causing WAL timeouts.
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+
+    # 256KiB stripes: small enough to get a meaningful data distribution without
+    # writing large quantities of data in this test. The stripe size is given
+    # in number of 8KiB pages.
+ stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.storage_controller.locate(tenant_id) + + # Slow down one of the shards, around ~1MB/s + pageservers[4].http_client().configure_failpoints(("wal-ingest-record-sleep", "5%sleep(1)")) + + def shards_info(): + infos = [] + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + shard_info = pageserver.http_client().timeline_detail(shard["shard_id"], timeline_id) + infos.append(shard_info) + last_record_lsn = shard_info["last_record_lsn"] + current_physical_size = shard_info["current_physical_size"] + log.info( + f"Shard on pageserver {node_id}: lsn={last_record_lsn}, size={current_physical_size}" + ) + return infos + + shards_info() + + workload = Workload( + env, + tenant_id, + timeline_id, + branch_name="main", + endpoint_opts={ + "config_lines": [ + # Tip: set to 100MB to make the test fail + "max_replication_write_lag=1MB", + ], + }, + ) + workload.init() + + endpoint = workload.endpoint() + + # on 2024-03-05, the default config on prod was [15MB, 10GB, null] + res = endpoint.safe_psql_many( + [ + "SHOW max_replication_write_lag", + "SHOW max_replication_flush_lag", + "SHOW max_replication_apply_lag", + ] + ) + log.info(f"backpressure config: {res}") + + last_flush_lsn = None + last_timestamp = None + + def update_write_lsn(): + nonlocal last_flush_lsn + nonlocal last_timestamp + + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag, + received_lsn, + pg_current_wal_flush_lsn() as flush_lsn, + neon.backpressure_throttling_time() as throttling_time + FROM neon.backpressure_lsns(); + """, + dbname="postgres", + )[0] + log.info( + f"received_lsn_lag = {res[0]}, received_lsn = {res[1]}, flush_lsn = {res[2]}, throttling_time = {res[3]}" + ) + + lsn = Lsn(res[2]) + now = time.time() + + if last_timestamp is not None: + delta = now - last_timestamp + delta_bytes = lsn - last_flush_lsn + avg_speed = delta_bytes / delta / 1024 / 1024 + log.info( + f"flush_lsn {lsn}, written {delta_bytes/1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s" + ) + + last_flush_lsn = lsn + last_timestamp = now + + update_write_lsn() + + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.validate() + + update_write_lsn() + shards_info() + + for _write_iter in range(30): + # approximately 1MB of data + workload.write_rows(8000, upload=False) + update_write_lsn() + infos = shards_info() + min_lsn = min(Lsn(info["last_record_lsn"]) for info in infos) + max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) + diff = max_lsn - min_lsn + assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 7a0707b564..a6b0f76c96 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -177,6 +177,7 @@ def test_node_status_after_restart( assert len(nodes) == 2 env.pageservers[1].stop() + env.storage_controller.allowed_errors.extend([".*Could not scan node"]) env.storage_controller.stop() 
env.storage_controller.start() @@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} + env.storage_controller.allowed_errors.append(".*Unauthorized.*") + env.storage_controller.allowed_errors.append(".*Forbidden.*") + # No token with pytest.raises( StorageControllerApiException, @@ -769,3 +773,178 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): assert "pitr_interval" not in readback_ps.tenant_specific_overrides env.storage_controller.consistency_check() + + +class Failure: + pageserver_id: int + + def apply(self, env: NeonEnv): + raise NotImplementedError() + + def clear(self, env: NeonEnv): + raise NotImplementedError() + + +class NodeStop(Failure): + def __init__(self, pageserver_id, immediate): + self.pageserver_id = pageserver_id + self.immediate = immediate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=self.immediate) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.start() + + +class PageserverFailpoint(Failure): + def __init__(self, failpoint, pageserver_id): + self.failpoint = failpoint + self.pageserver_id = pageserver_id + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "return(1)")) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "off")) + + +def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: + tenants = env.storage_controller.tenant_list() + + node_to_tenants: dict[int, list[TenantId]] = {} + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "AttachedSingle" + ): + crnt = node_to_tenants.get(int(node_id), []) + crnt.append(TenantId(t["tenant_shard_id"])) + node_to_tenants[int(node_id)] = crnt + + return node_to_tenants + + +@pytest.mark.parametrize( + "failure", + [ + NodeStop(pageserver_id=1, immediate=False), + NodeStop(pageserver_id=1, immediate=True), + PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), + ], +) +def test_sharding_service_heartbeats( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure +): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Default log allow list permits connection errors, but this test will use error responses on + # the utilization endpoint. + env.storage_controller.allowed_errors.append( + ".*Call to node.*management API.*failed.*failpoint.*" + ) + + # Initially we have two online pageservers + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + assert all([n["availability"] == "Active" for n in nodes]) + + # ... 
then we create two tenants and write some data into them + def create_tenant(tid: TenantId): + env.storage_controller.tenant_create(tid) + + branch_name = "main" + env.neon_cli.create_timeline( + branch_name, + tenant_id=tid, + ) + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + + tenant_ids = [TenantId.generate(), TenantId.generate()] + for tid in tenant_ids: + create_tenant(tid) + + # ... expecting that each tenant will be placed on a different node + def tenants_placed(): + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + + # Check that all the tenants have been attached + assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids) + # Check that each node got one tenant + assert all((len(ts) == 1 for ts in node_to_tenants.values())) + + wait_until(10, 1, tenants_placed) + + # ... then we apply the failure + offline_node_id = failure.pageserver_id + online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop() + env.get_pageserver(offline_node_id).allowed_errors.append( + # In the case of the failpoint failure, the impacted pageserver + # still believes it has the tenant attached since location + # config calls into it will fail due to being marked offline. + ".*Dropped remote consistent LSN updates.*", + ) + + failure.apply(env) + + # ... expecting the heartbeats to mark it offline + def node_offline(): + nodes = env.storage_controller.node_list() + log.info(f"{nodes=}") + target = next(n for n in nodes if n["id"] == offline_node_id) + assert target["availability"] == "Offline" + + # A node is considered offline if the last successful heartbeat + # was more than 10 seconds ago (hardcoded in the storage controller). + wait_until(20, 1, node_offline) + + # .. expecting the tenant on the offline node to be migrated + def tenant_migrated(): + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + assert set(node_to_tenants[online_node_id]) == set(tenant_ids) + + wait_until(10, 1, tenant_migrated) + + # ... then we clear the failure + failure.clear(env) + + # ... expecting the offline node to become active again + def node_online(): + nodes = env.storage_controller.node_list() + target = next(n for n in nodes if n["id"] == offline_node_id) + assert target["availability"] == "Active" + + wait_until(10, 1, node_online) + + time.sleep(5) + + # ... then we create a new tenant + tid = TenantId.generate() + env.storage_controller.tenant_create(tid) + + # ... expecting it to be placed on the node that just came back online + tenants = env.storage_controller.tenant_list() + newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid)) + locations = list(newest_tenant["observed"]["locations"].keys()) + locations = [int(node_id) for node_id in locations] + assert locations == [offline_node_id] + + # ... 
expecting the storage controller to reach a consistent state + def storage_controller_consistent(): + env.storage_controller.consistency_check() + + wait_until(10, 1, storage_controller_consistent) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 52de889084..a164c7f60a 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # We may leave some upload tasks in the queue. They're likely deletes. # For uploads we explicitly wait with `last_flush_lsn_upload` below. # So by ignoring these instead of waiting for empty upload queue @@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach( # From deletion polling f".*NotFound: tenant {env.initial_tenant}.*", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 7cea301a9c..025cc930d7 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from typing import List, Tuple @@ -326,7 +327,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) -@pytest.mark.xfail +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): @@ -349,10 +350,21 @@ def test_single_branch_get_tenant_size_grows( # adjust the gc_horizon accordingly. 
if pg_version == PgVersion.V14: gc_horizon = 0x4A000 + elif pg_version == PgVersion.V15: + gc_horizon = 0x3BA00 + elif pg_version == PgVersion.V16: + gc_horizon = 210000 + else: + raise NotImplementedError(pg_version) - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" + tenant_config = { + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "0s", + "gc_horizon": gc_horizon, + } - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) tenant_id = env.initial_tenant branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] @@ -405,6 +417,7 @@ def test_single_branch_get_tenant_size_grows( current_lsn = after_lsn size_debug_file.write(size_debug) assert size > 0 + log.info(f"size: {size} at lsn {current_lsn}") return (current_lsn, size) with env.endpoints.create_start( @@ -492,24 +505,41 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("DELETE", current_lsn, size)) + size_before_drop = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + )[1] + with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") + # Without setting a PITR interval, dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is too small + # to fall out of gc_horizon. + (current_lsn, size) = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + ) + prev_size = collected_responses[-1][2] + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + + # Set a tiny PITR interval to allow the DROP to impact the synthetic size + # Because synthetic size calculation uses pitr interval when available, + # when our tenant is configured with a tiny pitr interval, dropping a table should + # cause synthetic size to go down immediately + tenant_config["pitr_interval"] = "1ms" + env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) + (current_lsn, size) = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + ) + assert size < size_before_drop + # The size of the tenant should still be as large as before we dropped # the table, because the drop operation can still be undone in the PITR # defined by gc_horizon. - (current_lsn, size) = get_current_consistent_size( - env, endpoint, size_debug_file, http_client, tenant_id, timeline_id - ) - - prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append(("DROP", current_lsn, size)) # Should have gone past gc_horizon, otherwise gc_horizon is too large - assert current_lsn - initdb_lsn > gc_horizon + bytes_written = current_lsn - initdb_lsn + assert bytes_written > gc_horizon # this isn't too many lines to forget for a while. 
observed while # developing these tests that locally the value is a bit more than what we diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 1e13a2f20f..f8701b65d7 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) [d for d in tenants_dir.iterdir()] - neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*") + error_regexes = [".*tenant-config-before-write.*"] + neon_simple_env.pageserver.allowed_errors.extend(error_regexes) + neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 96a5cc491a..0eb1327c9e 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( [ f".*{timeline_id}.*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # This happens when we fail before scheduling background operation. # Timeline is left in stopping state and retry tries to stop it again. ".*Ignoring new state, equal to the existing one: Stopping", @@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ".*failpoint: timeline-delete-before-rm", ".*Ignoring new state, equal to the existing one: Stopping", # this happens, because the stuck timeline is visible to shutdown - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ] ) @@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 205ca18050..628c484fbd 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -20,6 +20,7 @@ from fixtures.neon_fixtures import ( VanillaPostgres, wait_for_last_flush_lsn, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils 
import ( assert_tenant_state, timeline_delete_wait_completed, @@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS +def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int): + def condition(): + assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count + + wait_until(5, 1.0, condition) + + def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): """ Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete @@ -767,10 +775,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # That one that we successfully accessed is now Active expect_activated += 1 assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" - assert ( - pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") - == expect_activated - 1 - ) + wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1) # The ones we didn't touch are still in Attaching assert ( @@ -790,10 +795,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): == n_tenants - expect_activated ) - assert ( - pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") - == expect_activated - 1 - ) + wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1) # When we unblock logical size calculation, all tenants should proceed to active state via # the warmup route. @@ -813,7 +815,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) - assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + wait_for_tenant_startup_completions(pageserver_http, count=n_tenants) # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main # body of the test because it will disrupt tenant counts diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b980d6f090..3b09894ddb 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b980d6f090c676e55fb2c830fb2434f532f635c0 +Subproject commit 3b09894ddb8825b50c963942059eab1a2a0b0a89 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 56f32c0e73..80cef885ad 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 56f32c0e7330d17aaeee8bf211a73995180bd133 +Subproject commit 80cef885add1af6741aa31944c7d2c84d8f9098f diff --git a/vendor/revisions.json b/vendor/revisions.json index 1941c235ee..ae524d70b1 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b", - "postgres-v15": "56f32c0e7330d17aaeee8bf211a73995180bd133", - "postgres-v14": "b980d6f090c676e55fb2c830fb2434f532f635c0" + "postgres-v15": "80cef885add1af6741aa31944c7d2c84d8f9098f", + "postgres-v14": "3b09894ddb8825b50c963942059eab1a2a0b0a89" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8593b752c2..152c452dd4 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -64,6 +64,7 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = 
["raw_value"] } +sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }