diff --git a/Cargo.lock b/Cargo.lock
index 563a998601..e9ebcdc5ac 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1831,6 +1831,8 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "async-stream",
+ "async-trait",
  "byteorder",
  "bytes",
  "chrono",
@@ -1871,6 +1873,7 @@ dependencies = [
  "thiserror",
  "tokio",
  "tokio-postgres",
+ "tokio-util",
  "toml_edit",
  "tracing",
  "url",
@@ -3481,9 +3484,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
 
 [[package]]
 name = "tracing"
-version = "0.1.34"
+version = "0.1.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09"
+checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307"
 dependencies = [
  "cfg-if",
  "log",
@@ -3505,11 +3508,11 @@ dependencies = [
 
 [[package]]
 name = "tracing-core"
-version = "0.1.26"
+version = "0.1.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f"
+checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7"
 dependencies = [
- "lazy_static",
+ "once_cell",
  "valuable",
 ]
 
@@ -3626,6 +3629,7 @@ name = "utils"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "async-trait",
  "bincode",
  "byteorder",
  "bytes",
@@ -3653,6 +3657,7 @@ dependencies = [
  "tempfile",
  "thiserror",
  "tokio",
+ "tokio-rustls",
  "tracing",
  "tracing-subscriber",
  "workspace_hack",
diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md
index 9ee3e40085..e351c972cb 100644
--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -1,26 +1,39 @@
 ## Thread management
 
-Each thread in the system is tracked by the `thread_mgr` module. It
-maintains a registry of threads, and which tenant or timeline they are
-operating on. This is used for safe shutdown of a tenant, or the whole
-system.
+The pageserver uses Tokio for handling concurrency. Everything runs in
+Tokio tasks, although some parts are written in blocking style and use
+spawn_blocking().
+
+Each Tokio task is tracked by the `task_mgr` module. It maintains a
+registry of tasks, and which tenant or timeline they are operating
+on.
 
 ### Handling shutdown
 
-When a tenant or timeline is deleted, we need to shut down all threads
-operating on it, before deleting the data on disk. A thread registered
-in the thread registry can check if it has been requested to shut down,
-by calling `is_shutdown_requested()`. For async operations, there's also
-a `shudown_watcher()` async task that can be used to wake up on shutdown.
+When a tenant or timeline is deleted, we need to shut down all tasks
+operating on it, before deleting the data on disk. There's a function,
+`shutdown_tasks`, to request all tasks of a particular tenant or
+timeline to shut down. It will also wait for them to finish.
+
+A task registered in the task registry can check if it has been
+requested to shut down, by calling `is_shutdown_requested()`. There's
+also a `shutdown_watcher()` Future that can be used with `tokio::select!`
+or similar, to wake up on shutdown.
+
+
 ### Sync vs async
 
-The primary programming model in the page server is synchronous,
-blocking code. However, there are some places where async code is
-used. Be very careful when mixing sync and async code.
-
-Async is primarily used to wait for incoming data on network
-connections. For example, all WAL receivers have a shared thread pool,
-with one async Task for each connection. Once a piece of WAL has been
-received from the network, the thread calls the blocking functions in
+We use async to wait for incoming data on network connections, and to
+perform other long-running operations. For example, each WAL receiver
+connection is handled by a Tokio task. Once a piece of WAL has been
+received from the network, the task calls the blocking functions in
 the Repository to process the WAL.
+
+The core storage code in `layered_repository/` is synchronous, with
+blocking locks and I/O calls. The current model is that we consider
+disk I/Os to be short enough that we perform them while running in a
+Tokio task. If that becomes a problem, we should use `spawn_blocking`
+before entering the synchronous parts of the code, or switch to using
+tokio I/O functions.
+
+Be very careful when mixing sync and async code!
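To illustrate the registry described above, a background task that does periodic work and honors shutdown might look roughly like this. This is only a sketch modelled on the `task_mgr::spawn` call sites later in this patch; the `tenant_id`/`timeline_id` bindings, the chosen `TaskKind`, and the task body are placeholder assumptions.

use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};

// Sketch: spawn a background task tied to one tenant/timeline.
task_mgr::spawn(
    BACKGROUND_RUNTIME.handle(),
    TaskKind::LayerFlushTask, // any kind defined in task_mgr
    Some(tenant_id),
    Some(timeline_id),
    "example background task",
    false,
    async move {
        loop {
            tokio::select! {
                // Resolves once shutdown of this task has been requested,
                // e.g. by shutdown_tasks() for this tenant or timeline.
                _ = task_mgr::shutdown_watcher() => break,
                _ = tokio::time::sleep(std::time::Duration::from_secs(10)) => {
                    // Periodic work goes here. Long blocking sections should
                    // also poll task_mgr::is_shutdown_requested() and bail out.
                }
            }
        }
        Ok(())
    },
);
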
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 28ad658de4..ce55277f29 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,6 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
+async-trait = "0.1"
 anyhow = "1.0"
 bincode = "1.3"
 bytes = "1.0.1"
@@ -16,6 +17,7 @@ serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 thiserror = "1.0"
 tokio = { version = "1.17", features = ["macros"]}
+tokio-rustls = "0.23"
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 nix = "0.23.0"
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index fa7a37adf1..caa7ac6c09 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -14,11 +14,9 @@ pub mod simple_rcu;
 /// append only ordered map implemented with a Vec
 pub mod vec_map;
 
-// Async version of SeqWait. Currently unused.
-// pub mod seqwait_async;
-
 pub mod bin_ser;
 pub mod postgres_backend;
+pub mod postgres_backend_async;
 pub mod pq_proto;
 
 // dealing with connstring parsing and handy access to it's parts
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
new file mode 100644
index 0000000000..383ad3742f
--- /dev/null
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -0,0 +1,485 @@
+//! Server-side asynchronous Postgres connection, as limited as we need.
+//! To use, create PostgresBackend and run() it, passing a Handler
+//! implementation that determines how to process the queries. Currently its API
+//! is rather narrow, but we can extend it when required.
+
+use crate::postgres_backend::AuthType;
+use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
+use anyhow::{bail, Context, Result};
+use bytes::{Bytes, BytesMut};
+use rand::Rng;
+use std::future::Future;
+use std::net::SocketAddr;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::Poll;
+use tracing::{debug, error, trace};
+
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+use tokio_rustls::TlsAcceptor;
+
+#[async_trait::async_trait]
+pub trait Handler {
+    /// Handle a single query.
+    /// postgres_backend will issue ReadyForQuery after calling this (this
+    /// might not be what we want after CopyData streaming, but currently we don't
+    /// care).
+    async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
+
+    /// Called when the startup packet is received; allows processing its params.
+    ///
+    /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new user
+    /// creation in the proxy code. That is quite a hacky, ad-hoc solution; maybe we could allow
+    /// implementations to override the whole init logic.
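// An illustrative sketch, not part of this file or patch: a minimal Handler
// and how it is hooked up to PostgresBackend::run(). Only Handler,
// PostgresBackend, BeMessage and AuthType are real items from this module;
// DummyHandler, serve_connection and the pending() shutdown future are
// assumptions made for the example.

struct DummyHandler;

#[async_trait::async_trait]
impl Handler for DummyHandler {
    async fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<()> {
        // Reject every query; run_message_loop() sends ReadyForQuery afterwards.
        pgb.write_message(&BeMessage::ErrorResponse(&format!("unsupported query: {query}")))?;
        Ok(())
    }
}

async fn serve_connection(socket: tokio::net::TcpStream) -> Result<()> {
    let mut handler = DummyHandler;
    let pgb = PostgresBackend::new(socket, AuthType::Trust, None)?;
    // The second argument is a closure returning a Future that resolves when
    // shutdown is requested; the pageserver passes task_mgr::shutdown_watcher.
    pgb.run(&mut handler, || std::future::pending::<()>()).await
}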
+ fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + Ok(()) + } + + /// Check auth md5 + fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { + bail!("MD5 auth failed") + } + + /// Check auth jwt + fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { + bail!("JWT auth failed") + } +} + +/// PostgresBackend protocol state. +/// XXX: The order of the constructors matters. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub enum ProtoState { + Initialization, + Encrypted, + Authentication, + Established, + Closed, +} + +#[derive(Clone, Copy)] +pub enum ProcessMsgResult { + Continue, + Break, +} + +/// Always-writeable sock_split stream. +/// May not be readable. See [`PostgresBackend::take_stream_in`] +pub enum Stream { + Unencrypted(tokio::net::TcpStream), + Tls(Box>), + Broken, +} + +impl AsyncWrite for Stream { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Broken => unreachable!(), + } + } + fn poll_flush( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), + Self::Tls(stream) => Pin::new(stream).poll_flush(cx), + Self::Broken => unreachable!(), + } + } + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Broken => unreachable!(), + } + } +} +impl AsyncRead for Stream { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Broken => unreachable!(), + } + } +} + +pub struct PostgresBackend { + stream: Stream, + // Output buffer. c.f. BeMessage::write why we are using BytesMut here. + buf_out: BytesMut, + + pub state: ProtoState, + + md5_salt: [u8; 4], + auth_type: AuthType, + + peer_addr: SocketAddr, + pub tls_config: Option>, +} + +pub fn query_from_cstring(query_string: Bytes) -> Vec { + let mut query_string = query_string.to_vec(); + if let Some(ch) = query_string.last() { + if *ch == 0 { + query_string.pop(); + } + } + query_string +} + +// Cast a byte slice to a string slice, dropping null terminator if there's one. +fn cstr_to_str(bytes: &[u8]) -> Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); + std::str::from_utf8(without_null).map_err(|e| e.into()) +} + +impl PostgresBackend { + pub fn new( + socket: tokio::net::TcpStream, + auth_type: AuthType, + tls_config: Option>, + ) -> std::io::Result { + let peer_addr = socket.peer_addr()?; + + Ok(Self { + stream: Stream::Unencrypted(socket), + buf_out: BytesMut::with_capacity(10 * 1024), + state: ProtoState::Initialization, + md5_salt: [0u8; 4], + auth_type, + tls_config, + peer_addr, + }) + } + + pub fn get_peer_addr(&self) -> &SocketAddr { + &self.peer_addr + } + + /// Read full message or return None if connection is closed. 
+ pub async fn read_message(&mut self) -> Result> { + use ProtoState::*; + match self.state { + Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, + Authentication | Established => FeMessage::read_fut(&mut self.stream).await, + Closed => Ok(None), + } + } + + /// Flush output buffer into the socket. + pub async fn flush(&mut self) -> std::io::Result<&mut Self> { + self.stream.write_all(&self.buf_out).await?; + self.buf_out.clear(); + Ok(self) + } + + /// Write message into internal output buffer. + pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> { + BeMessage::write(&mut self.buf_out, message)?; + Ok(self) + } + + // Wrapper for run_message_loop() that shuts down socket when we are done + pub async fn run(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + let ret = self.run_message_loop(handler, shutdown_watcher).await; + let _ = self.stream.shutdown(); + ret + } + + async fn run_message_loop( + &mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + trace!("postgres backend to {:?} started", self.peer_addr); + + tokio::select!( + biased; + + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received during handshake"); + return Ok(()) + }, + + result = async { + while self.state < ProtoState::Established { + if let Some(msg) = self.read_message().await? { + trace!("got message {msg:?} during handshake"); + + match self.process_handshake_message(handler, msg).await? { + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + } else { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + Ok::<(), anyhow::Error>(()) + } => { + // Handshake complete. + result?; + } + ); + + // Authentication completed + let mut query_string = Bytes::new(); + while let Some(msg) = tokio::select!( + biased; + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received in run_message_loop"); + Ok(None) + }, + msg = self.read_message() => { msg }, + )? { + trace!("got message {:?}", msg); + + let result = self.process_message(handler, msg, &mut query_string).await; + self.flush().await?; + match result? 
{ + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => break, + } + } + + trace!("postgres backend to {:?} exited", self.peer_addr); + Ok(()) + } + + async fn start_tls(&mut self) -> anyhow::Result<()> { + if let Stream::Unencrypted(plain_stream) = + std::mem::replace(&mut self.stream, Stream::Broken) + { + let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap()); + let tls_stream = acceptor.accept(plain_stream).await?; + + self.stream = Stream::Tls(Box::new(tls_stream)); + return Ok(()); + }; + bail!("TLS already started"); + } + + async fn process_handshake_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + ) -> Result { + assert!(self.state < ProtoState::Established); + let have_tls = self.tls_config.is_some(); + match msg { + FeMessage::StartupPacket(m) => { + trace!("got startup message {m:?}"); + + match m { + FeStartupPacket::SslRequest => { + debug!("SSL requested"); + + self.write_message(&BeMessage::EncryptionResponse(have_tls))?; + if have_tls { + self.start_tls().await?; + self.state = ProtoState::Encrypted; + } + } + FeStartupPacket::GssEncRequest => { + debug!("GSS requested"); + self.write_message(&BeMessage::EncryptionResponse(false))?; + } + FeStartupPacket::StartupMessage { .. } => { + if have_tls && !matches!(self.state, ProtoState::Encrypted) { + self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; + bail!("client did not connect with TLS"); + } + + // NB: startup() may change self.auth_type -- we are using that in proxy code + // to bypass auth for new users. + handler.startup(self, &m)?; + + match self.auth_type { + AuthType::Trust => { + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? + // The async python driver requires a valid server_version + .write_message(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion("14.1"), + ))? + .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + AuthType::MD5 => { + rand::thread_rng().fill(&mut self.md5_salt); + self.write_message(&BeMessage::AuthenticationMD5Password( + self.md5_salt, + ))?; + self.state = ProtoState::Authentication; + } + AuthType::ZenithJWT => { + self.write_message(&BeMessage::AuthenticationCleartextPassword)?; + self.state = ProtoState::Authentication; + } + } + } + FeStartupPacket::CancelRequest { .. } => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + } + + FeMessage::PasswordMessage(m) => { + trace!("got password message '{:?}'", m); + + assert!(self.state == ProtoState::Authentication); + + match self.auth_type { + AuthType::Trust => unreachable!(), + AuthType::MD5 => { + let (_, md5_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_md5(self, md5_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + AuthType::ZenithJWT => { + let (_, jwt_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_jwt(self, jwt_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + } + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? 
+ .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + + _ => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + Ok(ProcessMsgResult::Continue) + } + + async fn process_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + unnamed_query_string: &mut Bytes, + ) -> Result { + // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth + // TODO: change that to proper top-level match of protocol state with separate message handling for each state + assert!(self.state == ProtoState::Established); + + match msg { + FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { + bail!("protocol violation"); + } + + FeMessage::Query(body) => { + // remove null terminator + let query_string = cstr_to_str(&body)?; + + trace!("got query {:?}", query_string); + // xxx distinguish fatal and recoverable errors? + if let Err(e) = handler.process_query(self, query_string).await { + // ":?" uses the alternate formatting style, which makes anyhow display the + // full cause of the error, not just the top-level context + its trace. + // We don't want to send that in the ErrorResponse though, + // because it's not relevant to the compute node logs. + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + // TODO: untangle convoluted control flow + if e.to_string().contains("failed to run") { + return Ok(ProcessMsgResult::Break); + } + } + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Parse(m) => { + *unnamed_query_string = m.query_string; + self.write_message(&BeMessage::ParseComplete)?; + } + + FeMessage::Describe(_) => { + self.write_message(&BeMessage::ParameterDescription)? + .write_message(&BeMessage::NoData)?; + } + + FeMessage::Bind(_) => { + self.write_message(&BeMessage::BindComplete)?; + } + + FeMessage::Close(_) => { + self.write_message(&BeMessage::CloseComplete)?; + } + + FeMessage::Execute(_) => { + let query_string = cstr_to_str(unnamed_query_string)?; + trace!("got execute {:?}", query_string); + // xxx distinguish fatal and recoverable errors? + if let Err(e) = handler.process_query(self, query_string).await { + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + } + // NOTE there is no ReadyForQuery message. This handler is used + // for basebackup and it uses CopyOut which doesn't require + // ReadyForQuery message and backend just switches back to + // processing mode after sending CopyDone or ErrorResponse. 
+ } + + FeMessage::Sync => { + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Terminate => { + return Ok(ProcessMsgResult::Break); + } + + // We prefer explicit pattern matching to wildcards, because + // this helps us spot the places where new variants are missing + FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { + bail!("unexpected message type: {:?}", msg); + } + } + + Ok(ProcessMsgResult::Continue) + } +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index a531975d60..467b900a13 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -4,9 +4,10 @@ use std::cmp::{Eq, Ordering, PartialOrd}; use std::collections::BinaryHeap; use std::fmt::Debug; use std::mem; -use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Mutex; use std::time::Duration; +use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::time::timeout; /// An error happened while waiting for a number #[derive(Debug, PartialEq, Eq, thiserror::Error)] @@ -141,10 +142,10 @@ where /// /// This call won't complete until someone has called `advance` /// with a number greater than or equal to the one we're waiting for. - pub fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { + pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown), + Ok(Some(mut rx)) => rx.changed().await.map_err(|_| SeqWaitError::Shutdown), Err(e) => Err(e), } } @@ -156,13 +157,18 @@ where /// /// If that hasn't happened after the specified timeout duration, /// [`SeqWaitError::Timeout`] will be returned. - pub fn wait_for_timeout(&self, num: V, timeout_duration: Duration) -> Result<(), SeqWaitError> { + pub async fn wait_for_timeout( + &self, + num: V, + timeout_duration: Duration, + ) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e { - std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout, - std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown, - }), + Ok(Some(mut rx)) => match timeout(timeout_duration, rx.changed()).await { + Ok(Ok(())) => Ok(()), + Ok(Err(_)) => Err(SeqWaitError::Shutdown), + Err(_) => Err(SeqWaitError::Timeout), + }, Err(e) => Err(e), } } @@ -179,7 +185,7 @@ where } // Create a new channel. 
- let (tx, rx) = channel(); + let (tx, rx) = channel(()); internal.waiters.push(Waiter { wake_num: num, wake_channel: tx, @@ -235,7 +241,6 @@ mod tests { use super::*; use std::sync::Arc; use std::thread::sleep; - use std::thread::spawn; use std::time::Duration; impl MonotonicCounter for i32 { @@ -248,25 +253,25 @@ mod tests { } } - #[test] - fn seqwait() { + #[tokio::test] + async fn seqwait() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - spawn(move || { - seq2.wait_for(42).expect("wait_for 42"); + tokio::task::spawn(async move { + seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).expect_err("no 999"); + seq2.wait_for(999).await.expect_err("no 999"); }); - spawn(move || { - seq3.wait_for(42).expect("wait_for 42"); - seq3.wait_for(0).expect("wait_for 0"); + tokio::task::spawn(async move { + seq3.wait_for(42).await.expect("wait_for 42"); + seq3.wait_for(0).await.expect("wait_for 0"); }); sleep(Duration::from_secs(1)); let old = seq.advance(99); assert_eq!(old, 0); - seq.wait_for(100).expect("wait_for 100"); + seq.wait_for(100).await.expect("wait_for 100"); // Calling advance with a smaller value is a no-op assert_eq!(seq.advance(98), 100); @@ -275,16 +280,16 @@ mod tests { seq.shutdown(); } - #[test] - fn seqwait_timeout() { + #[tokio::test] + async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - spawn(move || { + tokio::task::spawn(async move { let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout); + let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_secs(1)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. let old = seq.advance(99); diff --git a/libs/utils/src/seqwait_async.rs b/libs/utils/src/seqwait_async.rs deleted file mode 100644 index f685e2b569..0000000000 --- a/libs/utils/src/seqwait_async.rs +++ /dev/null @@ -1,224 +0,0 @@ -//! -//! Async version of 'seqwait.rs' -//! -//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. -//! - -#![warn(missing_docs)] - -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::mem; -use std::sync::Mutex; -use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; -use tokio::time::timeout; - -/// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] -#[error("SeqWaitError")] -pub enum SeqWaitError { - /// The wait timeout was reached - Timeout, - /// [`SeqWait::shutdown`] was called - Shutdown, -} - -/// Internal components of a `SeqWait` -struct SeqWaitInt -where - T: Ord, -{ - waiters: BTreeMap, Receiver<()>)>, - current: T, - shutdown: bool, -} - -/// A tool for waiting on a sequence number -/// -/// This provides a way to await the arrival of a number. -/// As soon as the number arrives by another caller calling -/// [`advance`], then the waiter will be woken up. -/// -/// This implementation takes a blocking Mutex on both [`wait_for`] -/// and [`advance`], meaning there may be unexpected executor blocking -/// due to thread scheduling unfairness. There are probably better -/// implementations, but we can probably live with this for now. 
-/// -/// [`wait_for`]: SeqWait::wait_for -/// [`advance`]: SeqWait::advance -/// -pub struct SeqWait -where - T: Ord, -{ - internal: Mutex>, -} - -impl SeqWait -where - T: Ord + Debug + Copy, -{ - /// Create a new `SeqWait`, initialized to a particular number - pub fn new(starting_num: T) -> Self { - let internal = SeqWaitInt { - waiters: BTreeMap::new(), - current: starting_num, - shutdown: false, - }; - SeqWait { - internal: Mutex::new(internal), - } - } - - /// Shut down a `SeqWait`, causing all waiters (present and - /// future) to return an error. - pub fn shutdown(&self) { - let waiters = { - // Prevent new waiters; wake all those that exist. - // Wake everyone with an error. - let mut internal = self.internal.lock().unwrap(); - - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) - - // Drop the lock as we exit this scope. - }; - - // When we drop the waiters list, each Receiver will - // be woken with an error. - // This drop doesn't need to be explicit; it's done - // here to make it easier to read the code and understand - // the order of events. - drop(waiters); - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> { - let mut rx = { - let mut internal = self.internal.lock().unwrap(); - if internal.current >= num { - return Ok(()); - } - if internal.shutdown { - return Err(SeqWaitError::Shutdown); - } - - // If we already have a channel for waiting on this number, reuse it. - if let Some((_, rx)) = internal.waiters.get_mut(&num) { - // an Err from changed() means the sender was dropped. - rx.clone() - } else { - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.insert(num, (tx, rx.clone())); - rx - } - // Drop the lock as we exit this scope. - }; - rx.changed().await.map_err(|_| SeqWaitError::Shutdown) - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - /// - /// If that hasn't happened after the specified timeout duration, - /// [`SeqWaitError::Timeout`] will be returned. - pub async fn wait_for_timeout( - &self, - num: T, - timeout_duration: Duration, - ) -> Result<(), SeqWaitError> { - timeout(timeout_duration, self.wait_for(num)) - .await - .unwrap_or(Err(SeqWaitError::Timeout)) - } - - /// Announce a new number has arrived - /// - /// All waiters at this value or below will be woken. - /// - /// `advance` will panic if you send it a lower number than - /// a previous call. - pub fn advance(&self, num: T) { - let wake_these = { - let mut internal = self.internal.lock().unwrap(); - - if internal.current > num { - panic!( - "tried to advance backwards, from {:?} to {:?}", - internal.current, num - ); - } - internal.current = num; - - // split_off will give me all the high-numbered waiters, - // so split and then swap. Everything at or above `num` - // stays. - let mut split = internal.waiters.split_off(&num); - std::mem::swap(&mut split, &mut internal.waiters); - - // `split_at` didn't get the value at `num`; if it's - // there take that too. - if let Some(sleeper) = internal.waiters.remove(&num) { - split.insert(num, sleeper); - } - - split - }; - - for (_wake_num, (tx, _rx)) in wake_these { - // This can fail if there are no receivers. 
- // We don't care; discard the error. - let _ = tx.send(()); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::Arc; - use tokio::time::{sleep, Duration}; - - #[tokio::test] - async fn seqwait() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - let seq3 = Arc::clone(&seq); - tokio::spawn(async move { - seq2.wait_for(42).await.expect("wait_for 42"); - seq2.advance(100); - seq2.wait_for(999).await.expect_err("no 999"); - }); - tokio::spawn(async move { - seq3.wait_for(42).await.expect("wait_for 42"); - seq3.wait_for(0).await.expect("wait_for 0"); - }); - sleep(Duration::from_secs(1)).await; - seq.advance(99); - seq.wait_for(100).await.expect("wait_for 100"); - seq.shutdown(); - } - - #[tokio::test] - async fn seqwait_timeout() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - tokio::spawn(async move { - let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout).await; - assert_eq!(res, Err(SeqWaitError::Timeout)); - }); - sleep(Duration::from_secs(1)).await; - // This will attempt to wake, but nothing will happen - // because the waiter already dropped its Receiver. - seq.advance(99); - } -} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 902765f424..e73c73bd9c 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,8 @@ profiling = ["pprof"] failpoints = ["fail/failpoints"] [dependencies] +async-stream = "0.3" +async-trait = "0.1" chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" @@ -24,6 +26,7 @@ itertools = "0.10.3" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } +tokio-util = { version = "0.7.3", features = ["io", "io-util"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } @@ -43,7 +46,7 @@ pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallcl toml_edit = { version = "0.13", features = ["easy"] } scopeguard = "1.1.0" const_format = "0.2.21" -tracing = "0.1.27" +tracing = "0.1.36" signal-hook = "0.3.10" url = "2" nix = "0.23" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index cd99c3c67d..61facc852d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -81,9 +81,8 @@ where // an old LSN and it doesn't have any WAL of its own yet. We will set // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { - // Backup was requested at a particular LSN. Wait for it to arrive. - info!("waiting for {}", req_lsn); - timeline.wait_lsn(req_lsn)?; + // Backup was requested at a particular LSN. The caller should've + // already checked that it's a valid LSN. // If the requested point is the end of the timeline, we can // provide prev_lsn. 
(get_last_record_rlsn() might return it as diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5a43516728..ec71e5b320 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -4,7 +4,7 @@ use remote_storage::GenericRemoteStorage; use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use clap::{App, Arg}; use daemonize::Daemonize; @@ -12,13 +12,15 @@ use daemonize::Daemonize; use fail::FailScenario; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, - thread_mgr::ThreadKind, - virtual_file, LOG_FILE_NAME, + http, page_cache, page_service, profiling, task_mgr, + task_mgr::TaskKind, + task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, + }, + tenant_mgr, virtual_file, LOG_FILE_NAME, }; use utils::{ auth::JwtAuth, - http::endpoint, logging, postgres_backend::AuthType, project_git_version, @@ -286,7 +288,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); - pageserver::tenant_tasks::init_tenant_task_pool()?; + WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?; // initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -307,35 +309,54 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }) .transpose() .context("Failed to init generic remote storage")?; + let remote_index = { + let _rt_guard = BACKGROUND_RUNTIME.enter(); + tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())? + }; - let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?; - - // Spawn a new thread for the http endpoint + // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. // bind before launching separate thread so the error reported before startup exits - let auth_cloned = auth.clone(); - thread_mgr::spawn( - ThreadKind::HttpEndpointListener, - None, - None, - "http_endpoint_thread", - true, - move || { - let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?; - endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) - }, - )?; - // Spawn a thread to listen for libpq connections. It will spawn further threads + // Create a Service from the router above to handle incoming requests. + { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + + let router = http::make_router(conf, auth.clone(), remote_index, remote_storage)?; + let service = + utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(service) + .with_graceful_shutdown(task_mgr::shutdown_watcher()); + + task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::HttpEndpointListener, + None, + None, + "http endpoint listener", + true, + async { + server.await?; + Ok(()) + }, + ); + } + + // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. 
- thread_mgr::spawn( - ThreadKind::LibpqEndpointListener, + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, None, None, - "libpq endpoint thread", + "libpq endpoint listener", true, - move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type), - )?; + async move { + page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await + }, + ); + // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { Signal::Quit => { info!( @@ -352,7 +373,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() signal.name() ); profiling::exit_profiler(conf, &profiler_guard); - pageserver::shutdown_pageserver(0); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } }) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 59142bd9b2..78f83511cb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -161,16 +161,14 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; @@ -184,9 +182,10 @@ async fn timeline_create_handler(mut request: Request) -> Result Ok(None), // timeline already exists Err(err) => Err(err), } - }) - .await - .map_err(ApiError::from_err)??; + } + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) + .await + .map_err(ApiError::from_err)?; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -426,12 +425,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; - tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_detach", tenant = %tenant_id).entered(); - tenant_mgr::detach_tenant(conf, tenant_id) - }) - .await - .map_err(ApiError::from_err)??; + tenant_mgr::detach_tenant(conf, tenant_id) + .instrument(info_span!("tenant_detach", tenant = %tenant_id)) + .await + .map_err(ApiError::from_err)?; let mut remote_index = state.remote_index.write().await; remote_index.remove_tenant_entry(&tenant_id); @@ -583,7 +578,7 @@ async fn tenant_create_handler(mut request: Request) -> Result, - // Overridden tenant-specific config parameters. // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. @@ -284,7 +270,7 @@ impl Repository { } /// perform one garbage collection iteration, removing old data files from disk. - /// this function is periodically called by gc thread. + /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. /// /// 'timelineid' specifies the timeline to GC, or None for all. @@ -299,14 +285,6 @@ impl Repository { pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let _guard = match self.file_lock.try_read() { - Ok(g) => g, - Err(_) => { - info!("File lock write acquired, shutting down GC"); - return Ok(GcResult::default()); - } - }; - let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -319,18 +297,10 @@ impl Repository { } /// Perform one compaction iteration. - /// This function is periodically called by compactor thread. + /// This function is periodically called by compactor task. 
/// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. pub fn compaction_iteration(&self) -> Result<()> { - let _guard = match self.file_lock.try_read() { - Ok(g) => g, - Err(_) => { - info!("File lock write acquired, shutting down compaction"); - return Ok(()); - } - }; - // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the @@ -624,10 +594,7 @@ impl Repository { .load_layer_map(new_disk_consistent_lsn) .context("failed to load layermap")?; - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), new_timeline_id), - timeline: Arc::clone(&new_timeline), - }); + new_timeline.launch_wal_receiver()?; Ok(new_timeline) } @@ -642,7 +609,6 @@ impl Repository { ) -> Repository { Repository { tenant_id, - file_lock: RwLock::new(()), conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), @@ -846,7 +812,7 @@ impl Repository { // See comments in [`Repository::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { // We were requested to shut down. Stop and return with the progress we // made. break; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index aa9d636739..60abbe33e6 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -5,16 +5,17 @@ use bytes::Bytes; use fail::fail_point; use itertools::Itertools; use once_cell::sync::OnceCell; +use tokio::task::spawn_blocking; use tracing::*; use std::cmp::{max, min, Ordering}; use std::collections::{HashMap, HashSet}; +use std::fs; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; -use std::{fs, thread}; use crate::layered_repository::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, @@ -46,8 +47,9 @@ use utils::{ use crate::repository::GcResult; use crate::repository::{Key, Value}; -use crate::thread_mgr; -use crate::walreceiver::IS_WAL_RECEIVER; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{page_cache, storage_sync}; @@ -56,7 +58,7 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - tenant_id: ZTenantId, + pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, pub layers: RwLock, @@ -110,11 +112,11 @@ pub struct Timeline { /// to avoid deadlock. write_lock: Mutex<()>, - /// Used to ensure that there is only one thread + /// Used to ensure that there is only task performing flushing at a time layer_flush_lock: Mutex<()>, /// Layer removal lock. - /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. 
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], /// and [`Repository::delete_timeline`]. layer_removal_cs: Mutex<()>, @@ -142,10 +144,7 @@ pub struct Timeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, - // TODO task management should be done outside timeline, managed along with other tasks. - #[allow(clippy::type_complexity)] - initial_size_computation_task: - Mutex>, mpsc::Receiver<()>)>>, + initial_size_computation_started: AtomicBool, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -413,23 +412,23 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead + pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver, because that could lead // to a deadlock. ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + "wait_lsn cannot be called in WAL receiver" ); - self.metrics.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; + let _timer = self.metrics.wait_lsn_time_histo.start_timer(); + + self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await + .with_context(|| + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + )?; Ok(()) } @@ -587,7 +586,7 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - initial_size_computation_task: Mutex::new(None), + initial_size_computation_started: AtomicBool::new(false), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -598,6 +597,43 @@ impl Timeline { result } + pub fn launch_wal_receiver(self: &Arc) -> anyhow::Result<()> { + if !is_etcd_client_initialized() { + if cfg!(test) { + info!("not launching WAL receiver because etcd client hasn't been initialized"); + return Ok(()); + } else { + panic!("etcd client not initialized"); + } + } + + info!( + "launching WAL receiver for timeline {} of tenant {}", + self.timeline_id, self.tenant_id + ); + let tenant_conf_guard = self.tenant_conf.read().unwrap(); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); + let walreceiver_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); + drop(tenant_conf_guard); + let self_clone = Arc::clone(self); + let _ = spawn_connection_manager_task( + self.conf.broker_etcd_prefix.clone(), + self_clone, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + )?; + + Ok(()) + } + /// /// Scan the timeline directory to populate the layer map. /// Returns all timeline-related files that were found and loaded. @@ -715,61 +751,34 @@ impl Timeline { fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { let timeline_id = self.timeline_id; - let mut task_guard = match self.initial_size_computation_task.try_lock() { - Ok(guard) => guard, - Err(_) => { - debug!("Skipping timeline logical size init: task lock is taken already"); - return; - } - }; - - if let Some((old_task, task_finish_signal)) = task_guard.take() { - // TODO rust 1.61 would allow to remove `task_finish_signal` entirely and call `old_task.is_finished()` instead - match task_finish_signal.try_recv() { - // task has either signaled successfully that it finished or panicked and dropped the sender part without signalling - Ok(()) | Err(mpsc::TryRecvError::Disconnected) => { - match old_task.join() { - // we're here due to OnceCell::get not returning the value - Ok(Ok(())) => { - error!("Timeline {timeline_id} size init task finished, yet the size was not updated, rescheduling the computation") - } - Ok(Err(task_error)) => { - error!("Error during timeline {timeline_id} size init: {task_error:?}") - } - Err(e) => error!("Timeline {timeline_id} size init task panicked: {e:?}"), - } - } - // task had not yet finished: no signal was sent and the sender channel is not dropped - Err(mpsc::TryRecvError::Empty) => { - // let the task finish - *task_guard = Some((old_task, task_finish_signal)); - return; - } - } - } - - if task_guard.is_none() { - let thread_timeline = Arc::clone(self); - let (finish_sender, finish_receiver) = mpsc::channel(); - - match thread::Builder::new() - .name(format!( - "Timeline {timeline_id} initial logical size calculation" - )) - .spawn(move || { - let _enter = info_span!("initial_logical_size_calculation", timeline = %timeline_id).entered(); - let calculated_size = thread_timeline.calculate_logical_size(init_lsn)?; - match thread_timeline.current_logical_size.initial_logical_size.set(calculated_size) { + // Atomically check if the timeline size calculation had already started. 
+ // If the flag was not already set, this sets it. + if !self + .initial_size_computation_started + .swap(true, AtomicOrdering::SeqCst) + { + // We need to start the computation task. + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "initial size calculation", + false, + async move { + let calculated_size = self_clone.calculate_logical_size(init_lsn)?; + let result = spawn_blocking(move || { + self_clone.current_logical_size.initial_logical_size.set(calculated_size) + }).await?; + match result { Ok(()) => info!("Successfully calculated initial logical size"), Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), } - - finish_sender.send(()).ok(); Ok(()) - }) { - Ok(guard) => *task_guard = Some((guard, finish_receiver)), - Err(e) => error!("Failed to spawn timeline {timeline_id} size init task: {e}"), - } + } + .instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id)) + ); } } @@ -1099,22 +1108,23 @@ impl Timeline { self.last_freeze_at.store(last_lsn); *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running + // Launch a task to flush the frozen layer to disk, unless + // a task was already running. (If the task was running // at the time that we froze the layer, it must've seen the // the layer we just froze before it exited; see comments // in flush_frozen_layers()) if let Ok(guard) = self.layer_flush_lock.try_lock() { drop(guard); let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, Some(self.tenant_id), Some(self.timeline_id), - "layer flush thread", + "layer flush task", false, - move || self_clone.flush_frozen_layers(false), - )?; + async move { self_clone.flush_frozen_layers(false) }, + ); } } } @@ -1123,8 +1133,8 @@ impl Timeline { /// Flush all frozen layers to disk. /// - /// Only one thread at a time can be doing layer-flushing for a - /// given timeline. If 'wait' is true, and another thread is + /// Only one task at a time can be doing layer-flushing for a + /// given timeline. If 'wait' is true, and another task is /// currently doing the flushing, this function will wait for it /// to finish. If 'wait' is false, this function will return /// immediately instead. diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 86bbf25b67..8b9251229e 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,10 +12,10 @@ pub mod profiling; pub mod reltag; pub mod repository; pub mod storage_sync; +pub mod task_mgr; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_tasks; -pub mod thread_mgr; pub mod timelines; pub mod virtual_file; pub mod walingest; @@ -28,7 +28,7 @@ use std::collections::HashMap; use tracing::info; use utils::zid::{ZTenantId, ZTimelineId}; -use crate::thread_mgr::ThreadKind; +use crate::task_mgr::TaskKind; /// Current storage format version /// @@ -52,30 +52,31 @@ pub enum CheckpointConfig { Forced, } -pub fn shutdown_pageserver(exit_code: i32) { - // Shut down the libpq endpoint thread. 
This prevents new connections from +pub async fn shutdown_pageserver(exit_code: i32) { + // Shut down the libpq endpoint task. This prevents new connections from // being accepted. - thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await; - // Shut down any page service threads. - thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); + // Shut down any page service tasks. + task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await; // Shut down all the tenants. This flushes everything to disk and kills - // the checkpoint and GC threads. - tenant_mgr::shutdown_all_tenants(); + // the checkpoint and GC tasks. + tenant_mgr::shutdown_all_tenants().await; // Stop syncing with remote storage. // - // FIXME: Does this wait for the sync thread to finish syncing what's queued up? + // FIXME: Does this wait for the sync tasks to finish syncing what's queued up? // Should it? - thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::StorageSync), None, None).await; // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. - thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None); + // FIXME: We should probably stop accepting commands like attach/detach earlier. + task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await; // There should be nothing left, but let's be sure - thread_mgr::shutdown_threads(None, None, None); + task_mgr::shutdown_tasks(None, None, None).await; info!("Shut down successfully completed"); std::process::exit(exit_code); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 783fcb2412..149144bfe4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -11,17 +11,21 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use futures::{Stream, StreamExt}; use regex::Regex; -use std::io::{self, Read}; +use std::io; use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::Arc; +use tokio_util::io::StreamReader; +use tokio_util::io::SyncIoBridge; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, lsn::Lsn, - postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend}, + postgres_backend::AuthType, + postgres_backend_async::{self, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, simple_rcu::RcuReadGuard, zid::{ZTenantId, ZTimelineId}, @@ -35,9 +39,9 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; +use crate::task_mgr; +use crate::task_mgr::TaskKind; use crate::tenant_mgr; -use crate::thread_mgr; -use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; @@ -201,93 +205,49 @@ impl PagestreamBeMessage { } } -/// Implements Read for the server side of CopyIn -struct CopyInReader<'a> { - pgb: &'a mut PostgresBackend, +fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { + async_stream::try_stream! { + loop { + let msg = tokio::select! { + biased; - /// Overflow buffer for bytes sent in CopyData messages - /// that the reader (caller of read) hasn't asked for yet. - /// TODO use BytesMut? 
- buf: Vec, + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + let msg = format!("pageserver is shutting down"); + let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg)); + Err(anyhow::anyhow!(msg)) + } - /// Bytes before `buf_begin` are considered as dropped. - /// This allows us to implement O(1) pop_front on Vec. - /// The Vec won't grow large because we only add to it - /// when it's empty. - buf_begin: usize, -} + msg = pgb.read_message() => { msg } + }; -impl<'a> CopyInReader<'a> { - // NOTE: pgb should be in copy in state already - fn new(pgb: &'a mut PostgresBackend) -> Self { - Self { - pgb, - buf: Vec::<_>::new(), - buf_begin: 0, - } - } -} - -impl<'a> Drop for CopyInReader<'a> { - fn drop(&mut self) { - // Finalize copy protocol so that self.pgb can be reused - // TODO instead, maybe take ownership of pgb and give it back at the end - let mut buf: Vec = vec![]; - let _ = self.read_to_end(&mut buf); - } -} - -impl<'a> Read for CopyInReader<'a> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - while !thread_mgr::is_shutdown_requested() { - // Return from buffer if nonempty - if self.buf_begin < self.buf.len() { - let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin); - buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]); - self.buf_begin += bytes_to_read; - return Ok(bytes_to_read); - } - - // Delete garbage - self.buf.clear(); - self.buf_begin = 0; - - // Wait for client to send CopyData bytes - match self.pgb.read_message() { + match msg { Ok(Some(message)) => { let copy_data_bytes = match message { FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => return Ok(0), + FeMessage::CopyDone => { break }, FeMessage::Sync => continue, m => { let msg = format!("unexpected message {:?}", m); - self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?; - return Err(io::Error::new(io::ErrorKind::Other, msg)); + pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; + break; } }; - // Return as much as we can, saving the rest in self.buf - let mut reader = copy_data_bytes.reader(); - let bytes_read = reader.read(buf)?; - reader.read_to_end(&mut self.buf)?; - return Ok(bytes_read); + yield copy_data_bytes; } Ok(None) => { let msg = "client closed connection"; - self.pgb.write_message(&BeMessage::ErrorResponse(msg))?; - return Err(io::Error::new(io::ErrorKind::Other, msg)); + pgb.write_message(&BeMessage::ErrorResponse(msg))?; + pgb.flush().await?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; } Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(io::Error::new(io::ErrorKind::Other, e)); - } + Err(io::Error::new(io::ErrorKind::Other, e))?; } - } + }; } - - // Shutting down - let msg = "Importer thread was shut down"; - Err(io::Error::new(io::ErrorKind::Other, msg)) } } @@ -296,61 +256,49 @@ impl<'a> Read for CopyInReader<'a> { /// /// Main loop of the page service. /// -/// Listens for connections, and launches a new handler thread for each. +/// Listens for connections, and launches a new handler task for each. 
/// -pub fn thread_main( +pub async fn libpq_listener_main( conf: &'static PageServerConf, auth: Option>, listener: TcpListener, auth_type: AuthType, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; - let basic_rt = tokio::runtime::Builder::new_current_thread() - .enable_io() - .build()?; - - let tokio_listener = { - let _guard = basic_rt.enter(); - tokio::net::TcpListener::from_std(listener) - }?; + let tokio_listener = tokio::net::TcpListener::from_std(listener)?; // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = basic_rt.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! { - biased; + while let Some(res) = tokio::select! { + biased; - _ = shutdown_watcher => { - // We were requested to shut down. - None - } - - res = tokio_listener.accept() => { - Some(res) - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + None } - }) { + + res = tokio_listener.accept() => { + Some(res) + } + } { match res { Ok((socket, peer_addr)) => { - // Connection established. Spawn a new thread to handle it. + // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - // PageRequestHandler threads are not associated with any particular - // timeline in the thread manager. In practice most connections will + // PageRequestHandler tasks are not associated with any particular + // timeline in the task manager. In practice most connections will // only deal with a particular timeline, but we don't know which one // yet. - if let Err(err) = thread_mgr::spawn( - ThreadKind::PageRequestHandler, + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::PageRequestHandler, None, None, - "serving Page Service thread", + "serving compute connection task", false, - move || page_service_conn_main(conf, local_auth, socket, auth_type), - ) { - // Thread creation failed. Log the error and continue. - error!("could not spawn page service thread: {:?}", err); - } + page_service_conn_main(conf, local_auth, socket, auth_type), + ); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -364,13 +312,13 @@ pub fn thread_main( Ok(()) } -fn page_service_conn_main( +async fn page_service_conn_main( conf: &'static PageServerConf, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on thread exit. + // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); @@ -379,22 +327,17 @@ fn page_service_conn_main( gauge.dec(); } - // We use Tokio to accept the connection, but the rest of the code works with a - // regular socket. Convert. 
- let socket = socket - .into_std() - .context("could not convert tokio::net:TcpStream to std::net::TcpStream")?; - socket - .set_nonblocking(false) - .context("could not put socket to blocking mode")?; - socket .set_nodelay(true) .context("could not set TCP_NODELAY")?; let mut conn_handler = PageServerHandler::new(conf, auth); - let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - match pgbackend.run(&mut conn_handler) { + let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + + let result = pgbackend + .run(&mut conn_handler, task_mgr::shutdown_watcher) + .await; + match result { Ok(()) => { // we've been requested to shut down Ok(()) @@ -435,92 +378,95 @@ impl PageServerHandler { } } - fn handle_pagerequests( + #[instrument(skip(self, pgb))] + async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - timeline_id: ZTimelineId, tenant_id: ZTenantId, + timeline_id: ZTimelineId, ) -> anyhow::Result<()> { - let _enter = - info_span!("pagestream", timeline = %timeline_id, tenant = %tenant_id).entered(); - // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Check that the timeline exists let timeline = get_local_timeline(tenant_id, timeline_id)?; - /* switch client to COPYBOTH */ + // switch client to COPYBOTH pgb.write_message(&BeMessage::CopyBothResponse)?; + pgb.flush().await?; - while !thread_mgr::is_shutdown_requested() { - let msg = pgb.read_message(); + loop { + let msg = tokio::select! { + biased; - let profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - match msg { - Ok(message) => { - if let Some(message) = message { - trace!("query: {:?}", message); - - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - _ => continue, - }; - - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - - let response = match zenith_fe_msg { - PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_rel_exists_request(&timeline, &req) - }), - PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_nblocks_request(&timeline, &req) - }), - PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_page_at_lsn_request(&timeline, &req) - }), - PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME - .with_label_values(&["get_db_size", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_db_size_request(&timeline, &req) - }), - }; - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough - error!("error reading relation or page version: {:?}", e); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; - } else { - break; - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. 
+ info!("shutdown request received in page handler"); + break; } - Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(e); - } + + msg = pgb.read_message() => { msg } + }; + + let copy_data_bytes = match msg? { + Some(FeMessage::CopyData(bytes)) => bytes, + Some(m) => { + bail!("unexpected message: {m:?} during COPY"); } - } - drop(profiling_guard); + None => break, // client disconnected + }; + + trace!("query: {:?}", copy_data_bytes); + + let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let tenant_str = tenant_id.to_string(); + let timeline_str = timeline_id.to_string(); + + let response = match zenith_fe_msg { + PagestreamFeMessage::Exists(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_rel_exists", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_rel_exists_request(&timeline, &req).await + } + PagestreamFeMessage::Nblocks(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_rel_size", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_nblocks_request(&timeline, &req).await + } + PagestreamFeMessage::GetPage(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_page_at_lsn", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_page_at_lsn_request(&timeline, &req).await + } + PagestreamFeMessage::DbSize(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_db_size", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_db_size_request(&timeline, &req).await + } + }; + + let response = response.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough + error!("error reading relation or page version: {:?}", e); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + pgb.flush().await?; } Ok(()) } - fn handle_import_basebackup( + #[instrument(skip(self, pgb))] + async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, tenant_id: ZTenantId, @@ -528,10 +474,7 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, ) -> anyhow::Result<()> { - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let _enter = - info_span!("import basebackup", timeline = %timeline_id, tenant = %tenant_id).entered(); - + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -550,8 +493,24 @@ impl PageServerHandler { // Import basebackup provided via CopyData info!("importing basebackup"); pgb.write_message(&BeMessage::CopyInResponse)?; - let reader = CopyInReader::new(pgb); - import_basebackup_from_tar(&*timeline, reader, base_lsn)?; + pgb.flush().await?; + + // import_basebackup_from_tar() is not async, mainly because the Tar crate + // it uses is not async. 
So we need to jump through some hoops: + // - convert the input from client connection to a synchronous Read + // - use block_in_place() + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?; + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -563,11 +522,14 @@ impl PageServerHandler { info!("flushing layers"); timeline.checkpoint(CheckpointConfig::Flush)?; + timeline.launch_wal_receiver()?; + info!("done"); Ok(()) } - fn handle_import_wal( + #[instrument(skip(self, pgb))] + async fn handle_import_wal( &self, pgb: &mut PostgresBackend, tenant_id: ZTenantId, @@ -575,9 +537,7 @@ impl PageServerHandler { start_lsn: Lsn, end_lsn: Lsn, ) -> anyhow::Result<()> { - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let _enter = - info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered(); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let timeline = repo @@ -591,8 +551,22 @@ impl PageServerHandler { // Import wal provided via CopyData info!("importing wal"); pgb.write_message(&BeMessage::CopyInResponse)?; - let reader = CopyInReader::new(pgb); - import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)?; + pgb.flush().await?; + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| { + import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn) + })?; + info!("wal import complete"); + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } // TODO Does it make sense to overshoot? ensure!(timeline.get_last_record_lsn() >= end_lsn); @@ -619,7 +593,7 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( + async fn wait_or_get_last_lsn( timeline: &Timeline, mut lsn: Lsn, latest: bool, @@ -647,7 +621,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -657,7 +631,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -667,15 +641,15 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, ) -> Result { - let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; @@ -684,14 +658,15 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, ) -> Result { - let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; @@ -700,14 +675,15 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, ) -> Result { - let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; @@ -719,15 +695,15 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, ) -> Result { - let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) - .entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -736,6 +712,11 @@ impl PageServerHandler { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ + + // FIXME: this profiling now happens at different place than it used to. 
The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -743,23 +724,23 @@ impl PageServerHandler { })) } - fn handle_basebackup_request( + #[instrument(skip(self, pgb))] + async fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, + tenant_id: ZTenantId, timeline_id: ZTimelineId, lsn: Option, prev_lsn: Option, - tenant_id: ZTenantId, full_backup: bool, ) -> anyhow::Result<()> { - let span = info_span!("basebackup", timeline = %timeline_id, tenant = %tenant_id, lsn = field::Empty); - let _enter = span.enter(); - info!("starting"); - // check that the timeline exists let timeline = get_local_timeline(tenant_id, timeline_id)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { + // Backup was requested at a particular LSN. Wait for it to arrive. + info!("waiting for {}", lsn); + timeline.wait_lsn(lsn).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -767,18 +748,22 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message(&BeMessage::CopyOutResponse)?; + pgb.flush().await?; /* Send a tarball of the latest layer on the timeline */ - { - let mut writer = CopyDataSink { pgb }; - + let mut writer = CopyDataSink { + pgb, + rt: tokio::runtime::Handle::current(), + }; + tokio::task::block_in_place(|| { let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; - span.record("lsn", &basebackup.lsn.to_string().as_str()); - basebackup.send_tarball()?; - } + tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str()); + basebackup.send_tarball() + })?; pgb.write_message(&BeMessage::CopyDone)?; - info!("done"); + pgb.flush().await?; + info!("basebackup complete"); Ok(()) } @@ -801,7 +786,8 @@ impl PageServerHandler { } } -impl postgres_backend::Handler for PageServerHandler { +#[async_trait::async_trait] +impl postgres_backend_async::Handler for PageServerHandler { fn check_auth_jwt( &mut self, _pgb: &mut PostgresBackend, @@ -831,11 +817,7 @@ impl postgres_backend::Handler for PageServerHandler { Ok(()) } - fn is_shutdown_requested(&self) -> bool { - thread_mgr::is_shutdown_requested() - } - - fn process_query( + async fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, @@ -849,12 +831,13 @@ impl postgres_backend::Handler for PageServerHandler { params.len() == 2, "invalid param number for pagestream command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, timelineid, tenantid)?; + self.handle_pagerequests(pgb, tenant_id, timeline_id) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); @@ -864,10 +847,10 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for basebackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id 
= ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; let lsn = if params.len() == 3 { Some(Lsn::from_str(params[2])?) @@ -876,8 +859,9 @@ impl postgres_backend::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -897,11 +881,11 @@ impl postgres_backend::Handler for PageServerHandler { let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::text_col(b"prev_lsn"), RowDescriptor::text_col(b"last_lsn"), ]))? - .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(end_of_timeline.prev.to_string().as_bytes()), Some(end_of_timeline.last.to_string().as_bytes()), ]))? @@ -917,8 +901,8 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for fullbackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { @@ -932,11 +916,12 @@ impl postgres_backend::Handler for PageServerHandler { None }; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { // Import the `base` section (everything but the wal) of a basebackup. // Assumes the tenant already exists on this pageserver. @@ -952,18 +937,21 @@ impl postgres_backend::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant = ZTenantId::from_str(params[0])?; - let timeline = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - self.check_permission(Some(tenant))?; + self.check_permission(Some(tenant_id))?; - match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + match self + .handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? 
+ pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? } }; } else if query_string.starts_with("import wal ") { @@ -974,24 +962,27 @@ impl postgres_backend::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant = ZTenantId::from_str(params[0])?; - let timeline = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; let start_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - self.check_permission(Some(tenant))?; + self.check_permission(Some(tenant_id))?; - match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + match self + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? } }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("failpoints ") { ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); @@ -1016,7 +1007,7 @@ impl postgres_backend::Handler for PageServerHandler { bail!("Invalid failpoints format"); } } - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); @@ -1024,7 +1015,7 @@ impl postgres_backend::Handler for PageServerHandler { ensure!(params.len() == 1, "invalid param number for config command"); let tenantid = ZTenantId::from_str(params[0])?; let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"compaction_target_size"), @@ -1035,7 +1026,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"image_creation_threshold"), RowDescriptor::int8_col(b"pitr_interval"), ]))? 
- .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(repo.get_checkpoint_distance().to_string().as_bytes()), Some( repo.get_checkpoint_timeout() @@ -1072,10 +1063,10 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let gc_horizon: u64 = caps .get(4) @@ -1084,8 +1075,8 @@ impl postgres_backend::Handler for PageServerHandler { // Use tenant's pitr setting let pitr = repo.get_pitr_interval(); - let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), RowDescriptor::int8_col(b"layers_needed_by_pitr"), @@ -1094,7 +1085,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"layers_removed"), RowDescriptor::int8_col(b"elapsed"), ]))? - .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(result.layers_total.to_string().as_bytes()), Some(result.layers_needed_by_cutoff.to_string().as_bytes()), Some(result.layers_needed_by_pitr.to_string().as_bytes()), @@ -1121,8 +1112,8 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = get_local_timeline(tenant_id, timeline_id)?; timeline.compact()?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&SINGLE_COL_ROWDESC)? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("checkpoint ") { // Run checkpoint immediately on given timeline. @@ -1140,8 +1131,8 @@ impl postgres_backend::Handler for PageServerHandler { // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). timeline.checkpoint(CheckpointConfig::Forced)?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&SINGLE_COL_ROWDESC)? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { // Locate LSN of last transaction with timestamp less or equal than sppecified // TODO lazy static @@ -1158,7 +1149,7 @@ impl postgres_backend::Handler for PageServerHandler { let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"lsn", )]))?; let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? 
{ @@ -1167,14 +1158,12 @@ impl postgres_backend::Handler for PageServerHandler { LsnForTimestamp::Past(_lsn) => "past".into(), LsnForTimestamp::NoData(_lsn) => "nodata".into(), }; - pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; + pgb.write_message(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } - pgb.flush()?; - Ok(()) } } @@ -1194,6 +1183,7 @@ fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result< /// struct CopyDataSink<'a> { pgb: &'a mut PostgresBackend, + rt: tokio::runtime::Handle, } impl<'a> io::Write for CopyDataSink<'a> { @@ -1205,6 +1195,7 @@ impl<'a> io::Write for CopyDataSink<'a> { // FIXME: flush isn't really required, but makes it easier // to view in wireshark self.pgb.write_message(&BeMessage::CopyData(data))?; + self.rt.block_on(self.pgb.flush())?; trace!("CopyData sent for {} bytes!", data.len()); Ok(data.len()) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 57a964cb67..8ebfa6a935 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -37,7 +37,7 @@ //! | access to this storage | //! +------------------------+ //! -//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. +//! First, during startup, the pageserver inits the storage sync task with the async loop, or leaves the loop uninitialised, if configured so. //! The loop inits the storage connection and checks the remote files stored. //! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). //! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can @@ -158,7 +158,6 @@ use once_cell::sync::OnceCell; use remote_storage::GenericRemoteStorage; use tokio::{ fs, - runtime::Runtime, time::{Duration, Instant}, }; use tracing::*; @@ -174,9 +173,10 @@ use crate::{ exponential_backoff, layered_repository::metadata::{metadata_path, TimelineMetadata}, storage_sync::index::RemoteIndex, + task_mgr, + task_mgr::TaskKind, + task_mgr::BACKGROUND_RUNTIME, tenant_mgr::attach_local_tenants, - thread_mgr, - thread_mgr::ThreadKind, }; use crate::{ metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, @@ -264,7 +264,7 @@ impl SyncQueue { .unwrap() .0; - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { return (HashMap::new(), q.len()); } } @@ -574,7 +574,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
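The hunk that follows drops the dedicated sync thread and its private current_thread runtime: the loop is now spawned onto the shared background runtime via `task_mgr::spawn`. A condensed sketch of that call shape, using the `task_mgr` API introduced later in this patch; the loop body here is a placeholder:

```rust
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};

/// Sketch: spawn the storage sync loop as a managed background task.
fn start_storage_sync() {
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::StorageSync,
        None,  // not tied to a single tenant
        None,  // not tied to a single timeline
        "Remote storage sync task",
        false, // a sync error should not shut down the whole pageserver
        async move {
            // The real loop batches queued uploads/downloads and exits when
            // task_mgr::shutdown_watcher() fires (see storage_sync_loop below).
            Ok(())
        },
    );
}
```

Because the task is registered with `TaskKind::StorageSync`, the `shutdown_tasks(Some(TaskKind::StorageSync), None, None)` call in `shutdown_pageserver` is what stops it and waits for it.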
-pub fn spawn_storage_sync_thread( +pub fn spawn_storage_sync_task( conf: &'static PageServerConf, local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet)>, storage: GenericRemoteStorage, @@ -590,11 +590,6 @@ pub fn spawn_storage_sync_thread( None => bail!("Could not get sync queue during the sync loop step, aborting"), }; - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .context("Failed to create storage sync runtime")?; - // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time: // * we need to list every timeline for tenant on S3, that might be a costly operation // * we need to download every timeline for the tenant, to activate it in memory @@ -616,7 +611,7 @@ pub fn spawn_storage_sync_thread( } } - let applicable_index_parts = runtime.block_on(download_index_parts( + let applicable_index_parts = BACKGROUND_RUNTIME.block_on(download_index_parts( conf, &storage, keys_for_index_part_downloads, @@ -625,7 +620,7 @@ pub fn spawn_storage_sync_thread( let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; let mut local_timeline_init_statuses = schedule_first_sync_tasks( - &mut runtime.block_on(remote_index.write()), + &mut BACKGROUND_RUNTIME.block_on(remote_index.write()), sync_queue, timelines_to_sync, ); @@ -634,31 +629,30 @@ pub fn spawn_storage_sync_thread( .extend(empty_tenants.0.into_iter()); let remote_index_clone = remote_index.clone(); - thread_mgr::spawn( - ThreadKind::StorageSync, + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::StorageSync, None, None, - "Remote storage sync thread", + "Remote storage sync task", false, - move || { + async move { storage_sync_loop( - runtime, conf, (storage, remote_index_clone, sync_queue), max_sync_errors, - ); + ) + .await; Ok(()) }, - ) - .context("Failed to spawn remote storage sync thread")?; + ); Ok(SyncStartupData { remote_index, local_timeline_init_statuses, }) } -fn storage_sync_loop( - runtime: Runtime, +async fn storage_sync_loop( conf: &'static PageServerConf, (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, @@ -669,7 +663,7 @@ fn storage_sync_loop( let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { info!("Shutdown requested, stopping"); break; } @@ -683,20 +677,19 @@ fn storage_sync_loop( } // Concurrently perform all the tasks in the batch - let loop_step = runtime.block_on(async { - tokio::select! { - step = process_batches( - conf, - max_sync_errors, - loop_storage, - &index, - batched_tasks, - sync_queue, - ) - .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step), - _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), - } - }); + let loop_step = tokio::select! 
{ + step = process_batches( + conf, + max_sync_errors, + loop_storage, + &index, + batched_tasks, + sync_queue, + ) + .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step) + , + _ = task_mgr::shutdown_watcher() => ControlFlow::Break(()), + }; match loop_step { ControlFlow::Continue(updated_tenants) => { @@ -708,7 +701,7 @@ fn storage_sync_loop( updated_tenants.len() ); let mut timelines_to_attach = TenantTimelineValues::new(); - let index_accessor = runtime.block_on(index.read()); + let index_accessor = index.read().await; for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { Some(tenant_entry) => tenant_entry, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 7070f941f5..a4285e426b 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -153,7 +153,7 @@ pub(super) async fn upload_timeline_layers<'a>( // We have run the upload sync task, but the file we wanted to upload is gone. // This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to // retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and - // run compaction/gc threads, removing redundant files from disk. + // run compaction/gc tasks, removing redundant files from disk. // It's not good to pause GC/compaction because of those and we would rather skip such uploads. // // Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance). diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs new file mode 100644 index 0000000000..2aa803d119 --- /dev/null +++ b/pageserver/src/task_mgr.rs @@ -0,0 +1,463 @@ +//! +//! This module provides centralized handling of tokio tasks in the Page Server. +//! +//! We provide a few basic facilities: +//! - A global registry of tasks that lists what kind of tasks they are, and +//! which tenant or timeline they are working on +//! +//! - The ability to request a task to shut down. +//! +//! +//! # How it works? +//! +//! There is a global hashmap of all the tasks (`TASKS`). Whenever a new +//! task is spawned, a PageServerTask entry is added there, and when a +//! task dies, it removes itself from the hashmap. If you want to kill a +//! task, you can scan the hashmap to find it. +//! +//! # Task shutdown +//! +//! To kill a task, we rely on co-operation from the victim. Each task is +//! expected to periodically call the `is_shutdown_requested()` function, and +//! if it returns true, exit gracefully. In addition to that, when waiting for +//! the network or other long-running operation, you can use +//! `shutdown_watcher()` function to get a Future that will become ready if +//! the current task has been requested to shut down. You can use that with +//! Tokio select!(). +//! +//! +//! TODO: This would be a good place to also handle panics in a somewhat sane way. +//! Depending on what task panics, we might want to kill the whole server, or +//! only a single tenant or timeline. +//! + +// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. +// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. 
+#![allow(clippy::declare_interior_mutable_const)]
+
+use std::collections::HashMap;
+use std::future::Future;
+use std::panic::AssertUnwindSafe;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+
+use futures::FutureExt;
+use tokio::runtime::Runtime;
+use tokio::sync::watch;
+use tokio::task::JoinHandle;
+use tokio::task_local;
+
+use tracing::{debug, error, info, warn};
+
+use once_cell::sync::Lazy;
+
+use utils::zid::{ZTenantId, ZTimelineId};
+
+use crate::shutdown_pageserver;
+
+//
+// There are four runtimes:
+//
+// Compute request runtime
+//  - used to handle connections from compute nodes. Any tasks related to satisfying
+//    GetPage requests, base backups, import, and other such compute node operations
+//    are handled by the Compute request runtime
+//  - page_service.rs
+//  - this includes layer downloads from remote storage, if a layer is needed to
+//    satisfy a GetPage request
+//
+// Management request runtime
+//  - used to handle HTTP API requests
+//
+// WAL receiver runtime:
+//  - used to handle WAL receiver connections.
+//  - and to receive updates from etcd
+//
+// Background runtime
+//  - layer flushing
+//  - garbage collection
+//  - compaction
+//  - remote storage uploads
+//  - initial tenant loading
+//
+// Everything runs in a tokio task. If you spawn new tasks, spawn them using the correct
+// runtime.
+//
+// There might be situations when one task needs to wait for a task running in another
+// Runtime to finish. For example, if a background operation needs a layer from remote
+// storage, it will start to download it. If a background operation needs a remote layer,
+// and the download was already initiated by a GetPage request, the background task
+// will wait for the download - running in the Page server runtime - to finish.
+// Another example: the initial tenant loading tasks are launched in the background ops
+// runtime. If a GetPage request comes in before the load of a tenant has finished, the
+// GetPage request will wait for the tenant load to finish.
+//
+// The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to
+// protect data structures. Let's keep it that way. Synchronous code is easier to debug
+// and analyze, and there's a lot of hairy, low-level, performance critical code there.
+//
+// It's nice to have different runtimes, so that you can quickly eyeball how much CPU
+// time each class of operations is taking, with 'top -H' or similar.
+//
+// It's also good to avoid hogging all threads that would be needed to process
+// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
+// happen, but still.
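To make the shutdown contract described above concrete: a task spawned through this module is expected either to poll `shutdown_watcher()` in a `tokio::select!`, or to call `is_shutdown_requested()` between units of work in blocking-style code. A minimal sketch of both styles, assuming the code runs inside a task spawned via `task_mgr::spawn` (the task-local shutdown channel only exists there); `handle_item` and the mpsc channel are illustrative placeholders:

```rust
use crate::task_mgr;

/// Async style: the select! arm wakes up as soon as shutdown_tasks() flips
/// this task's watch channel.
async fn event_loop(mut rx: tokio::sync::mpsc::Receiver<u64>) -> anyhow::Result<()> {
    loop {
        tokio::select! {
            _ = task_mgr::shutdown_watcher() => break,
            item = rx.recv() => match item {
                Some(item) => handle_item(item),
                None => break, // all senders dropped
            },
        }
    }
    Ok(())
}

/// Blocking style: for synchronous code that still runs inside the Tokio task
/// (e.g. under block_in_place), checking the flag between units of work.
fn blocking_loop(items: &[u64]) {
    for &item in items {
        if task_mgr::is_shutdown_requested() {
            break;
        }
        handle_item(item);
    }
}

fn handle_item(_item: u64) {
    // placeholder for real work
}
```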
+// +pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("compute request worker") + .enable_all() + .build() + .expect("Failed to create compute request runtime") +}); + +pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("mgmt request worker") + .enable_all() + .build() + .expect("Failed to create mgmt request runtime") +}); + +pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("walreceiver worker") + .enable_all() + .build() + .expect("Failed to create walreceiver runtime") +}); + +pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("background op worker") + .enable_all() + .build() + .expect("Failed to create background op runtime") +}); + +pub struct PageserverTaskId(u64); + +/// Each task that we track is associated with a "task ID". It's just an +/// increasing number that we assign. Note that it is different from tokio::task::Id. +static NEXT_TASK_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); + +/// Global registry of tasks +static TASKS: Lazy>>> = + Lazy::new(|| Mutex::new(HashMap::new())); + +task_local! { + // There is a Tokio watch channel for each task, which can be used to signal the + // task that it needs to shut down. This task local variable holds the receiving + // end of the channel. The sender is kept in the global registry, so that anyone + // can send the signal to request task shutdown. + static SHUTDOWN_RX: watch::Receiver; + + // Each task holds reference to its own PageServerTask here. + static CURRENT_TASK: Arc; +} + +/// +/// There are many kinds of tasks in the system. Some are associated with a particular +/// tenant or timeline, while others are global. +/// +/// Note that we don't try to limit how many task of a certain kind can be running +/// at the same time. +/// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum TaskKind { + // libpq listener task. It just accepts connection and spawns a + // PageRequestHandler task for each connection. + LibpqEndpointListener, + + // HTTP endpoint listener. + HttpEndpointListener, + + // Task that handles a single connection. A PageRequestHandler task + // starts detached from any particular tenant or timeline, but it can be + // associated with one later, after receiving a command from the client. + PageRequestHandler, + + // Manages the WAL receiver connection for one timeline. It subscribes to + // events from etcd, decides which safekeeper to connect to. It spawns a + // separate WalReceiverConnection task to handle each connection. + WalReceiverManager, + + // Handles a connection to a safekeeper, to stream WAL to a timeline. + WalReceiverConnection, + + // Garbage collection worker. One per tenant + GarbageCollector, + + // Compaction. One per tenant. + Compaction, + + // Initial logical size calculation + InitialLogicalSizeCalculation, + + // Task that flushes frozen in-memory layers to disk + LayerFlushTask, + + // Task that manages the remote upload queue + StorageSync, + + // task that handles the initial downloading of all tenants + InitialLoad, + + // task that handles attaching a tenant + Attach, +} + +#[derive(Default)] +struct MutableTaskState { + /// Tenant and timeline that this task is associated with. + tenant_id: Option, + timeline_id: Option, + + /// Handle for waiting for the task to exit. 
It can be None, if the + /// the task has already exited. + join_handle: Option>, +} + +struct PageServerTask { + #[allow(dead_code)] // unused currently + task_id: PageserverTaskId, + + kind: TaskKind, + + name: String, + + // To request task shutdown, send 'true' to the channel to notify the task. + shutdown_tx: watch::Sender, + + mutable: Mutex, +} + +/// Launch a new task +/// Note: if shutdown_process_on_error is set to true failure +/// of the task will lead to shutdown of entire process +pub fn spawn( + runtime: &tokio::runtime::Handle, + kind: TaskKind, + tenant_id: Option, + timeline_id: Option, + name: &str, + shutdown_process_on_error: bool, + future: F, +) -> PageserverTaskId +where + F: Future> + Send + 'static, +{ + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed); + let task = Arc::new(PageServerTask { + task_id: PageserverTaskId(task_id), + kind, + name: name.to_string(), + shutdown_tx, + mutable: Mutex::new(MutableTaskState { + tenant_id, + timeline_id, + join_handle: None, + }), + }); + + TASKS.lock().unwrap().insert(task_id, Arc::clone(&task)); + + let mut task_mut = task.mutable.lock().unwrap(); + + let task_name = name.to_string(); + let task_cloned = Arc::clone(&task); + let join_handle = runtime.spawn(task_wrapper( + task_name, + task_id, + task_cloned, + shutdown_rx, + shutdown_process_on_error, + future, + )); + task_mut.join_handle = Some(join_handle); + drop(task_mut); + + // The task is now running. Nothing more to do here + PageserverTaskId(task_id) +} + +/// This wrapper function runs in a newly-spawned task. It initializes the +/// task-local variables and calls the payload function. +async fn task_wrapper( + task_name: String, + task_id: u64, + task: Arc, + shutdown_rx: watch::Receiver, + shutdown_process_on_error: bool, + future: F, +) where + F: Future> + Send + 'static, +{ + debug!("Starting task '{}'", task_name); + + let result = SHUTDOWN_RX + .scope( + shutdown_rx, + CURRENT_TASK.scope(task, { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. + AssertUnwindSafe(future).catch_unwind() + }), + ) + .await; + task_finish(result, task_name, task_id, shutdown_process_on_error).await; +} + +async fn task_finish( + result: std::result::Result< + anyhow::Result<()>, + std::boxed::Box, + >, + task_name: String, + task_id: u64, + shutdown_process_on_error: bool, +) { + // Remove our entry from the global hashmap. 
+ let task = TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); + + let mut shutdown_process = false; + { + let task_mut = task.mutable.lock().unwrap(); + + match result { + Ok(Ok(())) => { + debug!("Task '{}' exited normally", task_name); + } + Ok(Err(err)) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + Err(err) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + } + } + + if shutdown_process { + shutdown_pageserver(1).await; + } +} + +// expected to be called from the task of the given id. +pub fn associate_with(tenant_id: Option, timeline_id: Option) { + CURRENT_TASK.with(|ct| { + let mut task_mut = ct.mutable.lock().unwrap(); + task_mut.tenant_id = tenant_id; + task_mut.timeline_id = timeline_id; + }); +} + +/// Is there a task running that matches the criteria + +/// Signal and wait for tasks to shut down. +/// +/// +/// The arguments are used to select the tasks to kill. Any None arguments are +/// ignored. For example, to shut down all WalReceiver tasks: +/// +/// shutdown_tasks(Some(TaskKind::WalReceiver), None, None) +/// +/// Or to shut down all tasks for given timeline: +/// +/// shutdown_tasks(None, Some(tenantid), Some(timelineid)) +/// +pub async fn shutdown_tasks( + kind: Option, + tenant_id: Option, + timeline_id: Option, +) { + let mut victim_tasks = Vec::new(); + + { + let tasks = TASKS.lock().unwrap(); + for task in tasks.values() { + let task_mut = task.mutable.lock().unwrap(); + if (kind.is_none() || Some(task.kind) == kind) + && (tenant_id.is_none() || task_mut.tenant_id == tenant_id) + && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) + { + let _ = task.shutdown_tx.send_replace(true); + victim_tasks.push(Arc::clone(task)); + } + } + } + + for task in victim_tasks { + let join_handle = { + let mut task_mut = task.mutable.lock().unwrap(); + info!("waiting for {} to shut down", task.name); + let join_handle = task_mut.join_handle.take(); + drop(task_mut); + join_handle + }; + if let Some(join_handle) = join_handle { + let _ = join_handle.await; + } else { + // Possibly one of: + // * The task had not even fully started yet. + // * It was shut down concurrently and already exited + } + } +} + +pub fn current_task_kind() -> Option { + CURRENT_TASK.try_with(|ct| ct.kind).ok() +} + +/// A Future that can be used to check if the current task has been requested to +/// shut down. +pub async fn shutdown_watcher() { + let mut shutdown_rx = SHUTDOWN_RX + .try_with(|rx| rx.clone()) + .expect("shutdown_requested() called in an unexpected task or thread"); + + while !*shutdown_rx.borrow() { + if shutdown_rx.changed().await.is_err() { + break; + } + } +} + +/// Has the current task been requested to shut down? 
+pub fn is_shutdown_requested() -> bool { + if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) { + *shutdown_rx.borrow() + } else { + if !cfg!(test) { + warn!("is_shutdown_requested() called in an unexpected task or thread"); + } + false + } +} diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index baa58f5eb5..db256b0f65 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -5,14 +5,14 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; use crate::layered_repository::ephemeral_file::is_ephemeral_file; use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; -use crate::layered_repository::{Repository, Timeline}; +use crate::layered_repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; +use crate::task_mgr::{self, TaskKind}; use crate::tenant_config::TenantConfOpt; -use crate::thread_mgr::ThreadKind; -use crate::walredo::PostgresRedoManager; -use crate::{thread_mgr, timelines, walreceiver, TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::Context; +use crate::walredo::{PostgresRedoManager, WalRedoManager}; +use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; +use anyhow::{ensure, Context}; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use std::collections::hash_map::{self, Entry}; @@ -21,34 +21,24 @@ use std::ffi::OsStr; use std::fmt; use std::path::{Path, PathBuf}; use std::sync::Arc; -use tokio::sync::mpsc; use tracing::*; -pub use tenants_state::try_send_timeline_update; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::crashsafe_dir; +use utils::zid::{ZTenantId, ZTimelineId}; mod tenants_state { - use anyhow::ensure; use once_cell::sync::Lazy; use std::{ collections::HashMap, sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, }; - use tokio::sync::mpsc; - use tracing::{debug, error}; use utils::zid::ZTenantId; - use crate::tenant_mgr::{LocalTimelineUpdate, Tenant}; + use crate::tenant_mgr::Tenant; static TENANTS: Lazy>> = Lazy::new(|| RwLock::new(HashMap::new())); - /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, - /// so that it can enable/disable corresponding processes. 
- static TIMELINE_UPDATE_SENDER: Lazy< - RwLock>>, - > = Lazy::new(|| RwLock::new(None)); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { TENANTS .read() @@ -60,39 +50,6 @@ mod tenants_state { .write() .expect("Failed to write() tenants lock, it got poisoned") } - - pub(super) fn set_timeline_update_sender( - timeline_updates_sender: mpsc::UnboundedSender, - ) -> anyhow::Result<()> { - let mut sender_guard = TIMELINE_UPDATE_SENDER - .write() - .expect("Failed to write() timeline_update_sender lock, it got poisoned"); - ensure!(sender_guard.is_none(), "Timeline update sender already set"); - *sender_guard = Some(timeline_updates_sender); - Ok(()) - } - - pub fn try_send_timeline_update(update: LocalTimelineUpdate) { - match TIMELINE_UPDATE_SENDER - .read() - .expect("Failed to read() timeline_update_sender lock, it got poisoned") - .as_ref() - { - Some(sender) => { - if let Err(e) = sender.send(update) { - error!("Failed to send timeline update: {}", e); - } - } - None => debug!("Timeline update sender is not enabled, cannot send update {update:?}"), - } - } - - pub(super) fn stop_timeline_update_sender() { - TIMELINE_UPDATE_SENDER - .write() - .expect("Failed to write() timeline_update_sender lock, it got poisoned") - .take(); - } } struct Tenant { @@ -103,9 +60,6 @@ struct Tenant { #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] pub enum TenantState { - // All data for this tenant is complete on local disk, but we haven't loaded the Repository, - // Timeline and Layer structs into memory yet, so it cannot be accessed yet. - //Ready, // This tenant exists on local disk, and the layer map has been loaded into memory. // The local disk might have some newer files that don't exist in cloud storage yet. Active, @@ -139,10 +93,6 @@ pub fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result { let _entered = info_span!("init_tenant_mgr").entered(); - let (timeline_updates_sender, timeline_updates_receiver) = - mpsc::unbounded_channel::(); - tenants_state::set_timeline_update_sender(timeline_updates_sender)?; - walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; let local_tenant_files = local_tenant_timeline_files(conf) .context("Failed to collect local tenant timeline files")?; @@ -156,7 +106,7 @@ pub fn init_tenant_mgr( let SyncStartupData { remote_index, local_timeline_init_statuses, - } = storage_sync::spawn_storage_sync_thread( + } = storage_sync::spawn_storage_sync_task( conf, local_tenant_files, storage, @@ -185,27 +135,6 @@ pub fn init_tenant_mgr( Ok(remote_index) } -pub enum LocalTimelineUpdate { - Detach { - id: ZTenantTimelineId, - // used to signal to the detach caller that walreceiver successfully terminated for specified id - join_confirmation_sender: std::sync::mpsc::Sender<()>, - }, - Attach { - id: ZTenantTimelineId, - timeline: Arc, - }, -} - -impl std::fmt::Debug for LocalTimelineUpdate { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Detach { id, .. } => f.debug_tuple("Detach").field(id).finish(), - Self::Attach { id, .. } => f.debug_tuple("Attach").field(id).finish(), - } - } -} - /// Reads local files to load tenants and their timelines given into pageserver's memory. /// Ignores other timelines that might be present for tenant, but were not passed as a parameter. /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", @@ -274,24 +203,26 @@ fn load_local_repo( /// /// Shut down all tenants. 
This runs as part of pageserver shutdown. /// -pub fn shutdown_all_tenants() { - tenants_state::stop_timeline_update_sender(); - let mut m = tenants_state::write_tenants(); - let mut tenantids = Vec::new(); - for (tenantid, tenant) in m.iter_mut() { - match tenant.state { - TenantState::Active | TenantState::Idle | TenantState::Stopping => { - tenant.state = TenantState::Stopping; - tenantids.push(*tenantid) +pub async fn shutdown_all_tenants() { + let tenantids = { + let mut m = tenants_state::write_tenants(); + let mut tenantids = Vec::new(); + for (tenantid, tenant) in m.iter_mut() { + match tenant.state { + TenantState::Active | TenantState::Idle | TenantState::Stopping => { + tenant.state = TenantState::Stopping; + tenantids.push(*tenantid) + } + TenantState::Broken => {} } - TenantState::Broken => {} } - } - drop(m); + drop(m); + tenantids + }; - thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - // Ok, no background threads running anymore. Flush any remaining data in + // Ok, no background tasks running anymore. Flush any remaining data in // memory to disk. // // We assume that any incoming connections that might request pages from @@ -314,7 +245,40 @@ pub fn shutdown_all_tenants() { } } -pub fn create_tenant_repository( +fn create_repo( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: ZTenantId, + wal_redo_manager: Arc, + remote_index: RemoteIndex, +) -> anyhow::Result> { + let repo_dir = conf.tenant_path(&tenant_id); + ensure!( + !repo_dir.exists(), + "cannot create new tenant repo: '{}' directory already exists", + tenant_id + ); + + // top-level dir may exist if we are creating it through CLI + crashsafe_dir::create_dir_all(&repo_dir) + .with_context(|| format!("could not create directory {}", repo_dir.display()))?; + crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; + info!("created directory structure in {}", repo_dir.display()); + + // Save tenant's config + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + + Ok(Arc::new(Repository::new( + conf, + tenant_conf, + wal_redo_manager, + tenant_id, + remote_index, + conf.remote_storage_config.is_some(), + ))) +} + +pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, @@ -327,17 +291,12 @@ pub fn create_tenant_repository( } Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); - let repo = timelines::create_repo( - conf, - tenant_conf, - tenant_id, - wal_redo_manager, - remote_index, - )?; + let repo = create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; v.insert(Tenant { - state: TenantState::Idle, + state: TenantState::Active, repo, }); + crate::tenant_tasks::start_background_loops(tenant_id); Ok(Some(tenant_id)) } } @@ -360,13 +319,15 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { } pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| format!("Tenant not found for id {tenant_id}"))?; - let old_state = tenant.state; - tenant.state = new_state; - drop(m); + let old_state = { + let mut m = tenants_state::write_tenants(); + let tenant = m + .get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {tenant_id}"))?; + let old_state = tenant.state; + tenant.state = new_state; + 
old_state + }; match (old_state, new_state) { (TenantState::Broken, TenantState::Broken) @@ -389,24 +350,15 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. - // TODO maybe use tokio::sync::watch instead? - crate::tenant_tasks::start_compaction_loop(tenant_id)?; - crate::tenant_tasks::start_gc_loop(tenant_id)?; + crate::tenant_tasks::start_background_loops(tenant_id); } (TenantState::Idle, TenantState::Stopping) => { info!("stopping idle tenant {tenant_id}"); } (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { - info!("stopping tenant {tenant_id} threads due to new state {new_state}"); - thread_mgr::shutdown_threads( - Some(ThreadKind::WalReceiverManager), - Some(tenant_id), - None, - ); + info!("stopping tenant {tenant_id} tasks due to new state {new_state}"); - // Wait until all gc/compaction tasks finish - let repo = get_repository_for_tenant(tenant_id)?; - let _guard = repo.file_lock.write().unwrap(); + // Note: The caller is responsible for waiting for any tasks to finish. } } @@ -422,28 +374,28 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result<()> { +pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the threads to join + // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join // we cannot create new timelines and tenants, and that can take quite some time, // it can even become stuck due to a bug making whole pageserver unavailable for some operations // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests // will synchronize and either fail with the not found error or succeed - let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, timeline_id), - join_confirmation_sender: sender, - }); - debug!("waiting for wal receiver to shutdown"); - let _ = receiver.recv(); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(tenant_id), + Some(timeline_id), + ) + .await; debug!("wal receiver shutdown confirmed"); - debug!("waiting for threads to shutdown"); - thread_mgr::shutdown_threads(None, None, Some(timeline_id)); - debug!("thread shutdown completed"); + + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; + info!("timeline task shutdown completed"); match tenants_state::read_tenants().get(&tenant_id) { Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), @@ -452,36 +404,17 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow Ok(()) } -pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> { +pub async fn detach_tenant( + conf: &'static PageServerConf, + tenant_id: ZTenantId, +) -> anyhow::Result<()> { set_tenant_state(tenant_id, 
TenantState::Stopping)?; - // shutdown the tenant and timeline threads: gc, compaction, page service threads) - thread_mgr::shutdown_threads(None, Some(tenant_id), None); + // shutdown all tenant and timeline tasks: gc, compaction, page service) + task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - let mut walreceiver_join_handles = Vec::new(); - let removed_tenant = { + { let mut tenants_accessor = tenants_state::write_tenants(); - tenants_accessor.remove(&tenant_id) - }; - if let Some(tenant) = removed_tenant { - for (timeline_id, _) in tenant.repo.list_timelines() { - let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, timeline_id), - join_confirmation_sender: sender, - }); - walreceiver_join_handles.push((timeline_id, receiver)); - } - } - - // wait for wal receivers to stop without holding the lock, because walreceiver - // will attempt to change tenant state which is protected by the same global tenants lock. - // TODO do we need a timeout here? how to handle it? - // recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631 - // need to use crossbeam-channel - for (timeline_id, join_handle) in walreceiver_join_handles { - info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); - join_handle.recv().ok(); - info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); + tenants_accessor.remove(&tenant_id); } // If removal fails there will be no way to successfully retry detach, diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 4e9a5fc6ec..9aaafe7f92 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -1,270 +1,130 @@ //! This module contains functions to serve per-tenant background processes, //! 
such as compaction and GC -use std::collections::HashMap; -use std::ops::ControlFlow; use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant_mgr; use crate::tenant_mgr::TenantState; -use crate::thread_mgr::ThreadKind; -use crate::{tenant_mgr, thread_mgr}; -use anyhow::{self, Context}; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use once_cell::sync::OnceCell; -use tokio::sync::mpsc; -use tokio::sync::watch; use tracing::*; use utils::zid::ZTenantId; +pub fn start_background_loops(tenant_id: ZTenantId) { + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::Compaction, + Some(tenant_id), + None, + &format!("compactor for tenant {tenant_id}"), + false, + compaction_loop(tenant_id), + ); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::GarbageCollector, + Some(tenant_id), + None, + &format!("garbage collector for tenant {tenant_id}"), + false, + gc_loop(tenant_id), + ); +} + /// /// Compaction task's main loop /// -async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { - loop { - trace!("waking up"); +async fn compaction_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { + info!("starting compaction loop for {tenant_id}"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + let result = async { + loop { + trace!("waking up"); + + // Run blocking part of the task - // Run blocking part of the task - let period: Result, _> = tokio::task::spawn_blocking(move || { // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - return Ok(ControlFlow::Break(())); + if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { + break Ok(()); } - - // Break if we're not allowed to write to disk - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // This should not fail. If someone started us, it means that the tenant exists. + // And before you remove a tenant, you have to wait until all the associated tasks + // exit. + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; // Run compaction - let compaction_period = repo.get_compaction_period(); - repo.compaction_iteration()?; - Ok(ControlFlow::Continue(compaction_period)) - }) - .await; - - // Decide whether to sleep or break - let sleep_duration = match period { - Ok(Ok(ControlFlow::Continue(period))) => period, - Ok(Ok(ControlFlow::Break(()))) => break, - Ok(Err(e)) => { + let mut sleep_duration = repo.get_compaction_period(); + if let Err(e) = repo.compaction_iteration() { error!("Compaction failed, retrying: {}", e); - Duration::from_secs(2) + sleep_duration = Duration::from_secs(2) } - Err(e) => { - error!("Compaction join error, retrying: {}", e); - Duration::from_secs(2) - } - }; - // Sleep - tokio::select! { - _ = cancel.changed() => { - trace!("received cancellation request"); - break; - }, - _ = tokio::time::sleep(sleep_duration) => {}, + // Sleep + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + trace!("received cancellation request"); + break Ok(()); + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } } } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - trace!( + info!( "compaction loop stopped. 
State is {:?}", - tenant_mgr::get_tenant_state(tenantid) + tenant_mgr::get_tenant_state(tenant_id) ); -} - -static START_GC_LOOP: OnceCell> = OnceCell::new(); -static START_COMPACTION_LOOP: OnceCell> = OnceCell::new(); - -/// Spawn a task that will periodically schedule garbage collection until -/// the tenant becomes inactive. This should be called on tenant -/// activation. -pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> { - START_GC_LOOP - .get() - .context("Failed to get START_GC_LOOP")? - .blocking_send(tenantid) - .context("Failed to send to START_GC_LOOP channel")?; - Ok(()) -} - -/// Spawn a task that will periodically schedule compaction until -/// the tenant becomes inactive. This should be called on tenant -/// activation. -pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> { - START_COMPACTION_LOOP - .get() - .context("failed to get START_COMPACTION_LOOP")? - .blocking_send(tenantid) - .context("failed to send to START_COMPACTION_LOOP")?; - Ok(()) -} - -/// Spawn the TenantTaskManager -/// This needs to be called before start_gc_loop or start_compaction_loop -pub fn init_tenant_task_pool() -> anyhow::Result<()> { - let runtime = tokio::runtime::Builder::new_multi_thread() - .thread_name("tenant-task-worker") - .enable_all() - .on_thread_start(|| { - thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker") - }) - .on_thread_stop(thread_mgr::deregister) - .build()?; - - let (gc_send, mut gc_recv) = mpsc::channel::(100); - START_GC_LOOP - .set(gc_send) - .expect("Failed to set START_GC_LOOP"); - - let (compaction_send, mut compaction_recv) = mpsc::channel::(100); - START_COMPACTION_LOOP - .set(compaction_send) - .expect("Failed to set START_COMPACTION_LOOP"); - - // TODO this is getting repetitive - let mut gc_loops = HashMap::>::new(); - let mut compaction_loops = HashMap::>::new(); - - thread_mgr::spawn( - ThreadKind::TenantTaskManager, - None, - None, - "Tenant task manager main thread", - true, - move || { - runtime.block_on(async move { - let mut futures = FuturesUnordered::new(); - loop { - tokio::select! 
{ - _ = thread_mgr::shutdown_watcher() => { - // Send cancellation to all tasks - for (_, cancel) in gc_loops.drain() { - cancel.send(()).ok(); - } - for (_, cancel) in compaction_loops.drain() { - cancel.send(()).ok(); - } - - // Exit after all tasks finish - while let Some(result) = futures.next().await { - match result { - Ok(()) => { - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - }, - Err(e) => { - TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); - error!("loop join error {}", e) - }, - } - } - break; - }, - tenantid = gc_recv.recv() => { - let tenantid = tenantid.expect("Gc task channel closed unexpectedly"); - - // Spawn new task, request cancellation of the old one if exists - let (cancel_send, cancel_recv) = watch::channel(()); - let handle = tokio::spawn(gc_loop(tenantid, cancel_recv) - .instrument(info_span!("gc loop", tenant = %tenantid))); - if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) { - old_cancel_send.send(()).ok(); - } - - // Update metrics, remember handle - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - futures.push(handle); - }, - tenantid = compaction_recv.recv() => { - let tenantid = tenantid.expect("Compaction task channel closed unexpectedly"); - - // Spawn new task, request cancellation of the old one if exists - let (cancel_send, cancel_recv) = watch::channel(()); - let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv) - .instrument(info_span!("compaction loop", tenant = %tenantid))); - if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) { - old_cancel_send.send(()).ok(); - } - - // Update metrics, remember handle - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - futures.push(handle); - }, - result = futures.next() => { - // Log and count any unhandled panics - match result { - Some(Ok(())) => { - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - }, - Some(Err(e)) => { - TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); - error!("loop join error {}", e) - }, - None => {}, - }; - }, - } - } - }); - Ok(()) - }, - )?; - - Ok(()) + result } /// /// GC task's main loop /// -async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { - loop { - trace!("waking up"); +async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { + info!("starting gc loop for {tenant_id}"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + let result = async { + loop { + trace!("waking up"); - // Run blocking part of the task - let period: Result, _> = tokio::task::spawn_blocking(move || { // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - return Ok(ControlFlow::Break(())); + if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { + break Ok(()); } - - // Break if we're not allowed to write to disk - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // This should not fail. If someone started us, it means that the tenant exists. + // And before you remove a tenant, you have to wait until all the associated tasks + // exit. 
+ let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; // Run gc let gc_period = repo.get_gc_period(); let gc_horizon = repo.get_gc_horizon(); + let mut sleep_duration = gc_period; if gc_horizon > 0 { - repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; + if let Err(e) = repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false) + { + error!("Gc failed, retrying: {}", e); + sleep_duration = Duration::from_secs(2) + } } - Ok(ControlFlow::Continue(gc_period)) - }) - .await; - - // Decide whether to sleep or break - let sleep_duration = match period { - Ok(Ok(ControlFlow::Continue(period))) => period, - Ok(Ok(ControlFlow::Break(()))) => break, - Ok(Err(e)) => { - error!("Gc failed, retrying: {}", e); - Duration::from_secs(2) + // Sleep + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + trace!("received cancellation request"); + break Ok(()); + }, + _ = tokio::time::sleep(sleep_duration) => {}, } - Err(e) => { - error!("Gc join error, retrying: {}", e); - Duration::from_secs(2) - } - }; - - // Sleep - tokio::select! { - _ = cancel.changed() => { - trace!("received cancellation request"); - break; - }, - _ = tokio::time::sleep(sleep_duration) => {}, } } - trace!( + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + info!( "GC loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenantid) + tenant_mgr::get_tenant_state(tenant_id) ); + result } diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs deleted file mode 100644 index cdd38febbc..0000000000 --- a/pageserver/src/thread_mgr.rs +++ /dev/null @@ -1,409 +0,0 @@ -//! -//! This module provides centralized handling of threads in the Page Server. -//! -//! We provide a few basic facilities: -//! - A global registry of threads that lists what kind of threads they are, and -//! which tenant or timeline they are working on -//! -//! - The ability to request a thread to shut down. -//! -//! -//! # How it works? -//! -//! There is a global hashmap of all the threads (`THREADS`). Whenever a new -//! thread is spawned, a PageServerThread entry is added there, and when a -//! thread dies, it removes itself from the hashmap. If you want to kill a -//! thread, you can scan the hashmap to find it. -//! -//! # Thread shutdown -//! -//! To kill a thread, we rely on co-operation from the victim. Each thread is -//! expected to periodically call the `is_shutdown_requested()` function, and -//! if it returns true, exit gracefully. In addition to that, when waiting for -//! the network or other long-running operation, you can use -//! `shutdown_watcher()` function to get a Future that will become ready if -//! the current thread has been requested to shut down. You can use that with -//! Tokio select!(), but note that it relies on thread-local storage, so it -//! will only work with the "current-thread" Tokio runtime! -//! -//! -//! TODO: This would be a good place to also handle panics in a somewhat sane way. -//! Depending on what thread panics, we might want to kill the whole server, or -//! only a single tenant or timeline. -//! 
- -use std::cell::RefCell; -use std::collections::HashMap; -use std::panic; -use std::panic::AssertUnwindSafe; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::{Arc, Mutex}; -use std::thread; -use std::thread::JoinHandle; - -use tokio::sync::watch; - -use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - -use utils::zid::{ZTenantId, ZTimelineId}; - -use crate::shutdown_pageserver; - -/// Each thread that we track is associated with a "thread ID". It's just -/// an increasing number that we assign, not related to any system thread -/// id. -static NEXT_THREAD_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); - -/// Global registry of threads -static THREADS: Lazy>>> = - Lazy::new(|| Mutex::new(HashMap::new())); - -// There is a Tokio watch channel for each thread, which can be used to signal the -// thread that it needs to shut down. This thread local variable holds the receiving -// end of the channel. The sender is kept in the global registry, so that anyone -// can send the signal to request thread shutdown. -thread_local!(static SHUTDOWN_RX: RefCell>> = RefCell::new(None)); - -// Each thread holds reference to its own PageServerThread here. -thread_local!(static CURRENT_THREAD: RefCell>> = RefCell::new(None)); - -/// -/// There are many kinds of threads in the system. Some are associated with a particular -/// tenant or timeline, while others are global. -/// -/// Note that we don't try to limit how may threads of a certain kind can be running -/// at the same time. -/// -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum ThreadKind { - // libpq listener thread. It just accepts connection and spawns a - // PageRequestHandler thread for each connection. - LibpqEndpointListener, - - // HTTP endpoint listener. - HttpEndpointListener, - - // Thread that handles a single connection. A PageRequestHandler thread - // starts detached from any particular tenant or timeline, but it can be - // associated with one later, after receiving a command from the client. - PageRequestHandler, - - // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL. - WalReceiverManager, - - // Thread that schedules new compaction and gc jobs - TenantTaskManager, - - // Worker thread for tenant tasks thread pool - TenantTaskWorker, - - // Thread that flushes frozen in-memory layers to disk - LayerFlushThread, - - // Thread for synchronizing pageserver layer files with the remote storage. - // Shared by all tenants. - StorageSync, -} - -#[derive(Default)] -struct MutableThreadState { - /// Tenant and timeline that this thread is associated with. - tenant_id: Option, - timeline_id: Option, - - /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. OR if this thread is managed externally - /// and was not spawned through thread_mgr.rs::spawn function. - join_handle: Option>, -} - -struct PageServerThread { - thread_id: u64, - - kind: ThreadKind, - - name: String, - - // To request thread shutdown, set the flag, and send a dummy message to the - // channel to notify it. 
- shutdown_requested: AtomicBool, - shutdown_tx: watch::Sender<()>, - - mutable: Mutex, -} - -/// Launch a new thread -/// Note: if shutdown_process_on_error is set to true failure -/// of the thread will lead to shutdown of entire process -pub fn spawn( - kind: ThreadKind, - tenant_id: Option, - timeline_id: Option, - name: &str, - shutdown_process_on_error: bool, - f: F, -) -> std::io::Result -where - F: FnOnce() -> anyhow::Result<()> + Send + 'static, -{ - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - let thread = Arc::new(PageServerThread { - thread_id, - kind, - name: name.to_string(), - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - mutable: Mutex::new(MutableThreadState { - tenant_id, - timeline_id, - join_handle: None, - }), - }); - - THREADS - .lock() - .unwrap() - .insert(thread_id, Arc::clone(&thread)); - - let mut thread_mut = thread.mutable.lock().unwrap(); - - let thread_cloned = Arc::clone(&thread); - let thread_name = name.to_string(); - let join_handle = match thread::Builder::new() - .name(name.to_string()) - .spawn(move || { - thread_wrapper( - thread_name, - thread_id, - thread_cloned, - shutdown_rx, - shutdown_process_on_error, - f, - ) - }) { - Ok(handle) => handle, - Err(err) => { - error!("Failed to spawn thread '{}': {}", name, err); - // Could not spawn the thread. Remove the entry - THREADS.lock().unwrap().remove(&thread_id); - return Err(err); - } - }; - thread_mut.join_handle = Some(join_handle); - drop(thread_mut); - - // The thread is now running. Nothing more to do here - Ok(thread_id) -} - -/// This wrapper function runs in a newly-spawned thread. It initializes the -/// thread-local variables and calls the payload function -fn thread_wrapper( - thread_name: String, - thread_id: u64, - thread: Arc, - shutdown_rx: watch::Receiver<()>, - shutdown_process_on_error: bool, - f: F, -) where - F: FnOnce() -> anyhow::Result<()> + Send + 'static, -{ - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - CURRENT_THREAD.with(|ct| { - *ct.borrow_mut() = Some(thread); - }); - - debug!("Starting thread '{}'", thread_name); - - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - let result = panic::catch_unwind(AssertUnwindSafe(f)); - - // Remove our entry from the global hashmap. 
- let thread = THREADS - .lock() - .unwrap() - .remove(&thread_id) - .expect("no thread in registry"); - - let thread_mut = thread.mutable.lock().unwrap(); - match result { - Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), - Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - shutdown_pageserver(1); - } else { - error!( - "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - } - } - Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - shutdown_pageserver(1); - } else { - error!( - "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - } - } - } -} - -// expected to be called from the thread of the given id. -pub fn associate_with(tenant_id: Option, timeline_id: Option) { - CURRENT_THREAD.with(|ct| { - let borrowed = ct.borrow(); - let mut thread_mut = borrowed.as_ref().unwrap().mutable.lock().unwrap(); - thread_mut.tenant_id = tenant_id; - thread_mut.timeline_id = timeline_id; - }); -} - -/// Is there a thread running that matches the criteria - -/// Signal and wait for threads to shut down. -/// -/// -/// The arguments are used to select the threads to kill. Any None arguments are -/// ignored. For example, to shut down all WalReceiver threads: -/// -/// shutdown_threads(Some(ThreadKind::WalReceiver), None, None) -/// -/// Or to shut down all threads for given timeline: -/// -/// shutdown_threads(None, Some(timelineid), None) -/// -pub fn shutdown_threads( - kind: Option, - tenant_id: Option, - timeline_id: Option, -) { - let mut victim_threads = Vec::new(); - - let threads = THREADS.lock().unwrap(); - for thread in threads.values() { - let thread_mut = thread.mutable.lock().unwrap(); - if (kind.is_none() || Some(thread.kind) == kind) - && (tenant_id.is_none() || thread_mut.tenant_id == tenant_id) - && (timeline_id.is_none() || thread_mut.timeline_id == timeline_id) - { - thread.shutdown_requested.store(true, Ordering::Relaxed); - // FIXME: handle error? - let _ = thread.shutdown_tx.send(()); - victim_threads.push(Arc::clone(thread)); - } - } - drop(threads); - - for thread in victim_threads { - let mut thread_mut = thread.mutable.lock().unwrap(); - info!("waiting for {} to shut down", thread.name); - if let Some(join_handle) = thread_mut.join_handle.take() { - drop(thread_mut); - let _ = join_handle.join(); - } else { - // Possibly one of: - // * The thread had not even fully started yet. - // * It was shut down concurrently and already exited - // * Is managed through `register`/`deregister` fns without providing a join handle - } - } -} - -/// A Future that can be used to check if the current thread has been requested to -/// shut down. -pub async fn shutdown_watcher() { - let _ = SHUTDOWN_RX - .with(|rx| { - rx.borrow() - .as_ref() - .expect("shutdown_requested() called in an unexpected thread") - .clone() - }) - .changed() - .await; -} - -/// Has the current thread been requested to shut down? 
-pub fn is_shutdown_requested() -> bool { - CURRENT_THREAD.with(|ct| { - if let Some(ct) = ct.borrow().as_ref() { - ct.shutdown_requested.load(Ordering::Relaxed) - } else { - if !cfg!(test) { - warn!("is_shutdown_requested() called in an unexpected thread"); - } - false - } - }) -} - -/// Needed to register threads that were not spawned through spawn function. -/// For example tokio blocking threads. This function is expected to be used -/// in tandem with `deregister`. -/// NOTE: threads registered through this function cannot be joined -pub fn register(kind: ThreadKind, name: &str) { - CURRENT_THREAD.with(|ct| { - let mut borrowed = ct.borrow_mut(); - if borrowed.is_some() { - panic!("thread already registered") - }; - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - - let thread = Arc::new(PageServerThread { - thread_id, - kind, - name: name.to_owned(), - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - mutable: Mutex::new(MutableThreadState { - tenant_id: None, - timeline_id: None, - join_handle: None, - }), - }); - - *borrowed = Some(Arc::clone(&thread)); - - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - - THREADS.lock().unwrap().insert(thread_id, thread); - }); -} - -// Expected to be used in tandem with `register`. See the doc for `register` for more details -pub fn deregister() { - CURRENT_THREAD.with(|ct| { - let mut borrowed = ct.borrow_mut(); - let thread = match borrowed.take() { - Some(thread) => thread, - None => panic!("calling deregister on unregistered thread"), - }; - - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = None; - }); - - THREADS.lock().unwrap().remove(&thread.thread_id) - }); -} diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 9356893908..35dec54d5c 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,7 +2,7 @@ //! 
Timeline management code // -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, Context, Result}; use remote_storage::path_with_suffix_extension; use std::{ @@ -14,21 +14,15 @@ use std::{ use tracing::*; use utils::{ - crashsafe_dir, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; +use crate::config::PageServerConf; +use crate::layered_repository::{Repository, Timeline}; use crate::tenant_mgr; use crate::CheckpointConfig; -use crate::{ - config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, -}; use crate::{import_datadir, TEMP_FILE_SUFFIX}; -use crate::{ - layered_repository::{Repository, Timeline}, - walredo::WalRedoManager, -}; #[derive(Debug, Clone, Copy)] pub struct PointInTime { @@ -36,39 +30,6 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn create_repo( - conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, - wal_redo_manager: Arc, - remote_index: RemoteIndex, -) -> Result> { - let repo_dir = conf.tenant_path(&tenant_id); - ensure!( - !repo_dir.exists(), - "cannot create new tenant repo: '{}' directory already exists", - tenant_id - ); - - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; - info!("created directory structure in {}", repo_dir.display()); - - // Save tenant's config - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; - - Ok(Arc::new(Repository::new( - conf, - tenant_conf, - wal_redo_manager, - tenant_id, - remote_index, - conf.remote_storage_config.is_some(), - ))) -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -158,7 +119,7 @@ fn bootstrap_timeline( /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, /// a new unique ID is generated. /// -pub(crate) fn create_timeline( +pub(crate) async fn create_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, new_timeline_id: Option, @@ -187,7 +148,7 @@ pub(crate) fn create_timeline( // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. 
*lsn = lsn.align(); - ancestor_timeline.wait_lsn(*lsn)?; + ancestor_timeline.wait_lsn(*lsn).await?; let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index d6420e1d18..deac299747 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -23,131 +23,61 @@ mod connection_manager; mod walreceiver_connection; +use crate::config::PageServerConf; +use crate::task_mgr::WALRECEIVER_RUNTIME; + use anyhow::{ensure, Context}; use etcd_broker::Client; use itertools::Itertools; -use std::cell::Cell; -use std::collections::{hash_map, HashMap, HashSet}; +use once_cell::sync::OnceCell; use std::future::Future; -use std::num::NonZeroU64; use std::sync::Arc; -use std::thread_local; -use std::time::Duration; -use tokio::{ - select, - sync::{mpsc, watch}, - task::JoinHandle, -}; +use tokio::sync::watch; use tracing::*; use url::Url; -use crate::config::PageServerConf; -use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; -use crate::thread_mgr::{self, ThreadKind}; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +pub use connection_manager::spawn_connection_manager_task; -thread_local! { - // Boolean that is true only for WAL receiver threads - // - // This is used in `wait_lsn` to guard against usage that might lead to a deadlock. - pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); -} +static ETCD_CLIENT: OnceCell = OnceCell::new(); -/// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline. -/// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities. -pub fn init_wal_receiver_main_thread( - conf: &'static PageServerConf, - mut timeline_updates_receiver: mpsc::UnboundedReceiver, -) -> anyhow::Result<()> { +/// +/// Initialize the etcd client. This must be called once at page server startup. +/// +pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> { let etcd_endpoints = conf.broker_endpoints.clone(); ensure!( !etcd_endpoints.is_empty(), "Cannot start wal receiver: etcd endpoints are empty" ); - let broker_prefix = &conf.broker_etcd_prefix; - info!( - "Starting wal receiver main thread, etcd endpoints: {}", - etcd_endpoints.iter().map(Url::to_string).join(", ") - ); - let runtime = tokio::runtime::Builder::new_multi_thread() - .thread_name("wal-receiver-runtime-thread") - .enable_all() - .on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true))) - .build() - .context("Failed to create storage sync runtime")?; - let etcd_client = runtime - .block_on(Client::connect(etcd_endpoints, None)) + let etcd_client = Client::connect(etcd_endpoints.clone(), None) + .await .context("Failed to connect to etcd")?; - thread_mgr::spawn( - ThreadKind::WalReceiverManager, - None, - None, - "WAL receiver manager main thread", - true, - move || { - runtime.block_on(async move { - let mut local_timeline_wal_receivers = HashMap::new(); - loop { - select! { - _ = thread_mgr::shutdown_watcher() => { - info!("Shutdown signal received"); - shutdown_all_wal_connections(&mut local_timeline_wal_receivers).await; - break; - }, - _ = wal_receiver_main_thread_loop_step( - broker_prefix, - &etcd_client, - &mut timeline_updates_receiver, - &mut local_timeline_wal_receivers, - ) => {}, - } - } - }.instrument(info_span!("wal_receiver_main"))); + // FIXME: Should we still allow the pageserver to start, if etcd + // doesn't work? 
It could still serve GetPage requests, with the + // data it has locally and from what it can download from remote + // storage + if ETCD_CLIENT.set(etcd_client).is_err() { + panic!("etcd already initialized"); + } - info!("Wal receiver main thread stopped"); - Ok(()) - }, - ) - .map(|_thread_id| ()) - .context("Failed to spawn wal receiver main thread") + info!( + "Initialized etcd client with endpoints: {}", + etcd_endpoints.iter().map(Url::to_string).join(", ") + ); + Ok(()) } -async fn shutdown_all_wal_connections( - local_timeline_wal_receivers: &mut HashMap>>, -) { - info!("Shutting down all WAL connections"); - let mut broker_join_handles = Vec::new(); - for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { - for (timeline_id, handles) in timelines { - handles.cancellation.send(()).ok(); - broker_join_handles.push(( - ZTenantTimelineId::new(tenant_id, timeline_id), - handles.handle, - )); - } - } +/// +/// Get a handle to the etcd client +/// +pub fn get_etcd_client() -> &'static etcd_broker::Client { + ETCD_CLIENT.get().expect("etcd client not initialized") +} - let mut tenants = HashSet::with_capacity(broker_join_handles.len()); - for (id, broker_join_handle) in broker_join_handles { - tenants.insert(id.tenant_id); - debug!("Waiting for wal broker for timeline {id} to finish"); - if let Err(e) = broker_join_handle.await { - error!("Failed to join on wal broker for timeline {id}: {e}"); - } - } - if let Err(e) = tokio::task::spawn_blocking(move || { - for tenant_id in tenants { - if let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { - error!("Failed to make tenant {tenant_id} idle: {e:?}"); - } - } - }) - .await - { - error!("Failed to await a task to make all tenants idle: {e:?}"); - } +pub fn is_etcd_client_initialized() -> bool { + ETCD_CLIENT.get().is_some() } /// A handle of an asynchronous task. @@ -157,8 +87,7 @@ async fn shutdown_all_wal_connections( /// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission. /// That may lead to certain events not being observed by the listener. #[derive(Debug)] -struct TaskHandle { - handle: JoinHandle>, +pub struct TaskHandle { events_receiver: watch::Receiver>, cancellation: watch::Sender<()>, } @@ -167,7 +96,7 @@ struct TaskHandle { pub enum TaskEvent { Started, NewEvent(E), - End(Result<(), String>), + End, } impl TaskHandle { @@ -184,164 +113,28 @@ impl TaskHandle { let events_sender = Arc::new(events_sender); let sender = Arc::clone(&events_sender); - let handle = tokio::task::spawn(async move { + let _ = WALRECEIVER_RUNTIME.spawn(async move { events_sender.send(TaskEvent::Started).ok(); task(sender, cancellation_receiver).await }); TaskHandle { - handle, events_receiver, cancellation, } } async fn next_task_event(&mut self) -> TaskEvent { - select! { - next_task_event = self.events_receiver.changed() => match next_task_event { - Ok(()) => self.events_receiver.borrow().clone(), - Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await, - }, - task_completion_result = join_on_handle(&mut self.handle) => task_completion_result, + match self.events_receiver.changed().await { + Ok(()) => self.events_receiver.borrow().clone(), + Err(_task_channel_part_dropped) => TaskEvent::End, } } /// Aborts current task, waiting for it to finish. 
- async fn shutdown(self) { + pub async fn shutdown(mut self) { self.cancellation.send(()).ok(); - if let Err(e) = self.handle.await { - error!("Task failed to shut down: {e}") - } + // wait until the sender is dropped + while self.events_receiver.changed().await.is_ok() {} } } - -async fn join_on_handle(handle: &mut JoinHandle>) -> TaskEvent { - match handle.await { - Ok(task_result) => TaskEvent::End(task_result), - Err(e) => { - if e.is_cancelled() { - TaskEvent::End(Ok(())) - } else { - TaskEvent::End(Err(format!("WAL receiver task panicked: {e}"))) - } - } - } -} - -/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. -/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. -/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. -/// -/// Cannot fail, should always try to process the next timeline event even if the other one was not processed properly. -async fn wal_receiver_main_thread_loop_step<'a>( - broker_prefix: &'a str, - etcd_client: &'a Client, - timeline_updates_receiver: &'a mut mpsc::UnboundedReceiver, - local_timeline_wal_receivers: &'a mut HashMap>>, -) { - // Only react on updates from [`tenant_mgr`] on local timeline attach/detach. - match timeline_updates_receiver.recv().await { - Some(update) => { - info!("Processing timeline update: {update:?}"); - match update { - // Timeline got detached, stop all related tasks and remove public timeline data. - LocalTimelineUpdate::Detach { - id, - join_confirmation_sender, - } => { - match local_timeline_wal_receivers.get_mut(&id.tenant_id) { - Some(wal_receivers) => { - if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) { - o.remove().shutdown().await - } - if wal_receivers.is_empty() { - if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Idle).await { - error!("Failed to make tenant idle for id {id}: {e:#}"); - } - } - } - None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"), - }; - if let Err(e) = join_confirmation_sender.send(()) { - warn!("cannot send wal_receiver shutdown confirmation {e}") - } else { - info!("confirm walreceiver shutdown for {id}"); - } - } - // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. 
- LocalTimelineUpdate::Attach { id, timeline } => { - let timeline_connection_managers = local_timeline_wal_receivers - .entry(id.tenant_id) - .or_default(); - - if timeline_connection_managers.is_empty() { - if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Active).await - { - error!("Failed to make tenant active for id {id}: {e:#}"); - return; - } - } - - let vacant_connection_manager_entry = - match timeline_connection_managers.entry(id.timeline_id) { - hash_map::Entry::Occupied(_) => { - debug!("Attepted to readd an existing timeline {id}, ignoring"); - return; - } - hash_map::Entry::Vacant(v) => v, - }; - - let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = - match fetch_tenant_settings(id.tenant_id).await { - Ok(settings) => settings, - Err(e) => { - error!("Failed to fetch tenant settings for id {id}: {e:#}"); - return; - } - }; - - vacant_connection_manager_entry.insert( - connection_manager::spawn_connection_manager_task( - id, - broker_prefix.to_owned(), - etcd_client.clone(), - timeline, - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - ), - ); - } - } - } - None => { - info!("Local timeline update channel closed"); - shutdown_all_wal_connections(local_timeline_wal_receivers).await; - } - } -} - -async fn fetch_tenant_settings( - tenant_id: ZTenantId, -) -> anyhow::Result<(Duration, Duration, NonZeroU64)> { - tokio::task::spawn_blocking(move || { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - Ok::<_, anyhow::Error>(( - repo.get_wal_receiver_connect_timeout(), - repo.get_lagging_wal_timeout(), - repo.get_max_lsn_wal_lag(), - )) - }) - .await - .with_context(|| format!("Failed to join on tenant {tenant_id} settings fetch task"))? -} - -async fn change_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - tokio::task::spawn_blocking(move || { - tenant_mgr::set_tenant_state(tenant_id, new_state) - .with_context(|| format!("Failed to activate tenant {tenant_id}")) - }) - .await - .with_context(|| format!("Failed to spawn activation task for tenant {tenant_id}"))? -} diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 0261203049..1fcb768ddf 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -17,6 +17,9 @@ use std::{ }; use crate::layered_repository::Timeline; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::task_mgr::WALRECEIVER_RUNTIME; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -26,7 +29,10 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use crate::{ + exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, +}; use utils::{ lsn::Lsn, zid::{NodeId, ZTenantTimelineId}, @@ -35,29 +41,38 @@ use utils::{ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; /// Spawns the loop to take care of the timeline's WAL streaming connection. 
-pub(super) fn spawn_connection_manager_task( - id: ZTenantTimelineId, +pub fn spawn_connection_manager_task( broker_loop_prefix: String, - mut client: Client, - local_timeline: Arc, + timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, -) -> TaskHandle<()> { - TaskHandle::spawn(move |_, mut cancellation| { +) -> anyhow::Result<()> { + let mut etcd_client = get_etcd_client().clone(); + + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverManager, + Some(tenant_id), + Some(timeline_id), + &format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id), + false, async move { info!("WAL receiver broker started, connecting to etcd"); let mut walreceiver_state = WalreceiverState::new( - id, - local_timeline, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, ); loop { select! { - _ = cancellation.changed() => { - info!("Broker subscription init cancelled, shutting down"); + _ = task_mgr::shutdown_watcher() => { + info!("WAL receiver shutdown requested, shutting down"); + // Kill current connection, if any if let Some(wal_connection) = walreceiver_state.wal_connection.take() { wal_connection.connection_task.shutdown().await; @@ -67,14 +82,15 @@ pub(super) fn spawn_connection_manager_task( _ = connection_manager_loop_step( &broker_loop_prefix, - &mut client, + &mut etcd_client, &mut walreceiver_state, ) => {}, } } } - .instrument(info_span!("wal_connection_manager", id = %id)) - }) + .instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + ); + Ok(()) } /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. @@ -85,7 +101,10 @@ async fn connection_manager_loop_step( etcd_client: &mut Client, walreceiver_state: &mut WalreceiverState, ) { - let id = walreceiver_state.id; + let id = ZTenantTimelineId { + tenant_id: walreceiver_state.timeline.tenant_id, + timeline_id: walreceiver_state.timeline.timeline_id, + }; // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, // running the entire loop step as much as possible to an end. @@ -98,6 +117,14 @@ async fn connection_manager_loop_step( loop { let time_until_next_retry = walreceiver_state.time_until_next_retry(); + // These things are happening concurrently: + // + // - keep receiving WAL on the current connection + // - if the shared state says we need to change connection, disconnect and return + // - this runs in a separate task and we receive updates via a watch channel + // - change connection if the rules decide so, or if the current connection dies + // - receive updates from broker + // - this might change the current desired connection select! { broker_connection_result = &mut broker_subscription.watcher_handle => { cleanup_broker_connection(broker_connection_result, walreceiver_state); @@ -110,7 +137,8 @@ async fn connection_manager_loop_step( None => None, } } => { - let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard"); + let wal_connection = walreceiver_state.wal_connection.as_mut() + .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { TaskEvent::Started => {}, TaskEvent::NewEvent(status) => { @@ -123,16 +151,14 @@ async fn connection_manager_loop_step( } wal_connection.status = status; }, - TaskEvent::End(end_result) => { - match end_result { - Ok(()) => debug!("WAL receiving task finished"), - Err(e) => warn!("WAL receiving task failed: {e}"), - }; + TaskEvent::End => { + debug!("WAL receiving task finished"); walreceiver_state.drop_old_connection(false).await; }, } }, + // Got a new update from etcd broker_update = broker_subscription.value_updates.recv() => { match broker_update { Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), @@ -241,8 +267,9 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { id: ZTenantTimelineId, + /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. @@ -299,15 +326,18 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( - id: ZTenantTimelineId, - local_timeline: Arc, + timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, ) -> Self { + let id = ZTenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }; Self { id, - local_timeline, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, @@ -323,10 +353,11 @@ impl WalreceiverState { let id = self.id; let connect_timeout = self.wal_connect_timeout; + let timeline = Arc::clone(&self.timeline); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( - id, + timeline, &new_wal_source_connstr, events_sender.as_ref(), cancellation, @@ -520,7 +551,7 @@ impl WalreceiverState { let current_lsn = match existing_wal_connection.status.streaming_lsn { Some(lsn) => lsn, - None => self.local_timeline.get_last_record_lsn(), + None => self.timeline.get_last_record_lsn(), }; let current_commit_lsn = existing_wal_connection .status @@ -1328,7 +1359,7 @@ mod tests { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - local_timeline: harness + timeline: harness .load() .create_empty_timeline(TIMELINE_ID, Lsn(0)) .expect("Failed to create an empty timeline for dummy wal connection manager"), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 4c30481e02..e8fa9f9aca 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -21,11 +21,17 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ - layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest, + layered_repository::{Timeline, WalReceiverInfo}, + task_mgr, + task_mgr::TaskKind, + task_mgr::WALRECEIVER_RUNTIME, + tenant_mgr, + walingest::WalIngest, walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; +use utils::zid::ZTenantTimelineId; +use utils::{lsn::Lsn, 
pq_proto::ReplicationFeedback}; /// Status of the connection. #[derive(Debug, Clone)] @@ -48,7 +54,7 @@ pub struct WalConnectionStatus { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. pub async fn handle_walreceiver_connection( - id: ZTenantTimelineId, + timeline: Arc, wal_source_connstr: &str, events_sender: &watch::Sender>, mut cancellation: watch::Receiver<()>, @@ -83,24 +89,31 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. let mut connection_cancellation = cancellation.clone(); - tokio::spawn( + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverConnection, + Some(timeline.tenant_id), + Some(timeline.timeline_id), + "walreceiver connection", + false, async move { select! { - connection_result = connection => match connection_result{ - Ok(()) => info!("Walreceiver db connection closed"), - Err(connection_error) => { - if connection_error.is_closed() { - info!("Connection closed regularly: {connection_error}") - } else { - warn!("Connection aborted: {connection_error}") - } - } - }, + connection_result = connection => match connection_result{ + Ok(()) => info!("Walreceiver db connection closed"), + Err(connection_error) => { + if connection_error.is_closed() { + info!("Connection closed regularly: {connection_error}") + } else { + warn!("Connection aborted: {connection_error}") + } + } + }, - _ = connection_cancellation.changed() => info!("Connection cancelled"), + _ = connection_cancellation.changed() => info!("Connection cancelled"), } + Ok(()) } - .instrument(info_span!("safekeeper_handle_db")), + .instrument(info_span!("walreceiver connection")), ); // Immediately increment the gauge, then create a job to decrement it on task exit. @@ -117,10 +130,6 @@ pub async fn handle_walreceiver_connection( let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = id; connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); @@ -130,17 +139,10 @@ pub async fn handle_walreceiver_connection( return Ok(()); } - let (repo, timeline) = tokio::task::spawn_blocking(move || { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - let timeline = repo.get_timeline(timeline_id) - .with_context(|| { - format!("local timeline {timeline_id} not found for tenant {tenant_id}") - })?; - Ok::<_, anyhow::Error>((repo, timeline)) - }) - .await - .with_context(|| format!("Failed to spawn blocking task to get repository and timeline for tenant {tenant_id} timeline {timeline_id}"))??; + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; // // Start streaming the WAL, from where we left off previously. @@ -273,11 +275,12 @@ pub async fn handle_walreceiver_connection( } } - let timeline_to_check = Arc::clone(&timeline); - tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) - .await - .with_context(|| format!("Spawned checkpoint check task panicked for timeline {id}"))? 
- .with_context(|| format!("Failed to check checkpoint distance for timeline {id}"))?; + timeline.check_checkpoint_distance().with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; if let Some(last_lsn) = status_update { let remote_index = repo.get_remote_index(); diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index befa4616be..315ec7f306 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_until from fixtures.types import ZTenantId, ZTimelineId @@ -39,9 +40,6 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for t in timelines: client.timeline_delete(tenant, t) - def assert_idle(tenant): - assert get_state(tenant) == "Idle" - # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) @@ -51,18 +49,21 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Stop compute pg.stop() - # Detach all tenants and wait for them to go idle - # TODO they should be already idle since there are no active computes + # Delete all timelines on all tenants for tenant_info in client.tenant_list(): tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) - wait_until(10, 0.2, lambda: assert_idle(tenant_id)) - # Assert that all tasks finish quickly after tenants go idle + # Assert that all tasks finish quickly after tenant is detached + assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + client.tenant_detach(tenant) + client.tenant_detach(env.initial_tenant) + def assert_tasks_finish(): tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended assert tasks_panicked == 0 diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index bfe61b9ced..096b3a5d70 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,9 +47,9 @@ scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } -tokio-util = { version = "0.7", features = ["codec", "io", "tracing"] } +tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } +tracing-core = { version = "0.1", features = ["once_cell", "std", "valuable"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] }