From 47b705cffe0e13182ec41df8da518f310444c8d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 8 Apr 2024 14:59:08 +0200 Subject: [PATCH 1/2] Remove async_trait from CompactionDeltaLayer (#7342) Removes usage of async_trait from the `CompactionDeltaLayer` trait. Split off from #7301 Related earlier work: https://github.com/neondatabase/neon/pull/6305, https://github.com/neondatabase/neon/pull/6464, https://github.com/neondatabase/neon/pull/7303 --- Cargo.lock | 1 - pageserver/compaction/Cargo.toml | 1 - pageserver/compaction/src/helpers.rs | 2 +- pageserver/compaction/src/interface.rs | 7 ++----- pageserver/compaction/src/simulator.rs | 2 -- pageserver/src/tenant/timeline/compaction.rs | 2 -- 6 files changed, 3 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dae406e4ae..67054cf2c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3616,7 +3616,6 @@ dependencies = [ "anyhow", "async-compression", "async-stream", - "async-trait", "byteorder", "bytes", "chrono", diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 47f318db63..0fd1d81845 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -11,7 +11,6 @@ default = [] anyhow.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true chrono = { workspace = true, features = ["serde"] } diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 22a410b4af..9de6363d6e 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -180,7 +180,7 @@ where match top.deref_mut() { LazyLoadLayer::Unloaded(ref mut l) => { let fut = l.load_keys(this.ctx); - this.load_future.set(Some(fut)); + this.load_future.set(Some(Box::pin(fut))); continue; } LazyLoadLayer::Loaded(ref mut entries) => { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 2bb2e749c0..5dc62e506f 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -3,7 +3,6 @@ //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. -use async_trait::async_trait; use futures::Future; use pageserver_api::{key::Key, keyspace::key_range_size}; use std::ops::Range; @@ -141,18 +140,16 @@ pub trait CompactionLayer { fn is_delta(&self) -> bool; } - -#[async_trait] pub trait CompactionDeltaLayer: CompactionLayer { type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> where Self: 'a; /// Return all keys in this delta layer. - async fn load_keys<'a>( + fn load_keys<'a>( &self, ctx: &E::RequestContext, - ) -> anyhow::Result>>; + ) -> impl Future>>> + Send; } pub trait CompactionImageLayer: CompactionLayer {} diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index def7983e75..6c00df3a65 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -2,7 +2,6 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; -use async_trait::async_trait; use futures::StreamExt; use rand::Rng; use tracing::info; @@ -139,7 +138,6 @@ impl interface::CompactionLayer for Arc { } } -#[async_trait] impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ab001bf10d..8075775bbc 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -12,7 +12,6 @@ use super::layer_manager::LayerManager; use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; -use async_trait::async_trait; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; @@ -1122,7 +1121,6 @@ impl CompactionLayer for ResidentDeltaLayer { } } -#[async_trait] impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; From 1081a4d2462d324961604b9114def1efea096f44 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 8 Apr 2024 16:27:08 +0200 Subject: [PATCH 2/2] pageserver: option to run with just one tokio runtime (#7331) This PR is an off-by-default revision v2 of the (since-reverted) PR #6555 / commit `3220f830b7fbb785d6db8a93775f46314f10a99b`. See that PR for details on why running with a single runtime is desirable and why we should be ready. We reverted #6555 because it showed regressions in prodlike cloudbench, see the revert commit message `ad072de4209193fd21314cf7f03f14df4fa55eb1` for more context. This PR makes it an opt-in choice via an env var. The default is to use the 4 separate runtimes that we have today, there shouldn't be any performance change. I tested manually that the env var & added metric works. ``` # undefined env var => no change to before this PR, uses 4 runtimes ./target/debug/neon_local start # defining the env var enables one-runtime mode, value defines that one runtime's configuration NEON_PAGESERVER_USE_ONE_RUNTIME=current_thread ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:1 ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:2 ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:default ./target/debug/neon_local start ``` I want to use this change to do more manualy testing and potentially testing in staging. Future Work ----------- Testing / deployment ergonomics would be better if this were a variable in `pageserver.toml`. It can be done, but, I don't need it right now, so let's stick with the env var. --- control_plane/src/background_process.rs | 14 ++- libs/utils/src/env.rs | 21 ++++ libs/utils/src/lib.rs | 2 + pageserver/src/metrics.rs | 21 ++++ pageserver/src/task_mgr.rs | 149 +++++++++++++++++------- pageserver/src/tenant/tasks.rs | 3 +- 6 files changed, 169 insertions(+), 41 deletions(-) create mode 100644 libs/utils/src/env.rs diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 2fced7d778..94666f2870 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -86,7 +86,10 @@ where .stdout(process_log_file) .stderr(same_file_for_stderr) .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { @@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_PAGESERVER_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs new file mode 100644 index 0000000000..b3e326bfd0 --- /dev/null +++ b/libs/utils/src/env.rs @@ -0,0 +1,21 @@ +//! Wrapper around `std::env::var` for parsing environment variables. + +use std::{fmt::Display, str::FromStr}; + +pub fn var(varname: &str) -> Option +where + V: FromStr, + E: Display, +{ + match std::env::var(varname) { + Ok(s) => Some( + s.parse() + .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .unwrap(), + ), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 04ce0626c8..cd5075613e 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -89,6 +89,8 @@ pub mod yielding_loop; pub mod zstd; +pub mod env; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ab9a2e8509..3160f204e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2100,6 +2100,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { use futures::Future; use pin_project_lite::pin_project; use std::collections::HashMap; +use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; @@ -2669,6 +2670,26 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); } +static TOKIO_EXECUTOR_THREAD_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tokio_executor_thread_configured_count", + "Total number of configued tokio executor threads in the process. + The `setup` label denotes whether we're running with multiple runtimes or a single runtime.", + &["setup"], + ) + .unwrap() +}); + +pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { + static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let _guard = SERIALIZE.lock().unwrap(); + TOKIO_EXECUTOR_THREAD_COUNT.reset(); + TOKIO_EXECUTOR_THREAD_COUNT + .get_metric_with_label_values(&[setup]) + .unwrap() + .set(u64::try_from(num_threads.get()).unwrap()); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 0cc5611a12..9a1e354ecf 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -33,13 +33,14 @@ use std::collections::HashMap; use std::fmt; use std::future::Future; +use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; use pageserver_api::shard::TenantShardId; -use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; @@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; +use utils::env; use utils::id::TimelineId; +use crate::metrics::set_tokio_runtime_setup; + // // There are four runtimes: // @@ -98,52 +102,119 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") - // if you change the number of worker threads please change the constant below - .enable_all() - .build() - .expect("Failed to create background op runtime") -}); - -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { - // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); +pub(crate) static TOKIO_WORKER_THREADS: Lazy = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes gets their worker threads configured to something // else, but that has not been needed in a long time. - std::env::var("TOKIO_WORKER_THREADS") - .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) + NonZeroUsize::new( + std::env::var("TOKIO_WORKER_THREADS") + .map(|s| s.parse::().unwrap()) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), + ) + .expect("the max() ensures that this is not zero") }); +enum TokioRuntimeMode { + SingleThreaded, + MultiThreaded { num_workers: NonZeroUsize }, +} + +impl FromStr for TokioRuntimeMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), + s => match s.strip_prefix("multi_thread:") { + Some("default") => Ok(TokioRuntimeMode::MultiThreaded { + num_workers: *TOKIO_WORKER_THREADS, + }), + Some(suffix) => { + let num_workers = suffix.parse::().map_err(|e| { + format!( + "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", + ) + })?; + Ok(TokioRuntimeMode::MultiThreaded { num_workers }) + } + None => Err(format!("invalid runtime config: {s:?}")), + }, + } + } +} + +static ONE_RUNTIME: Lazy> = Lazy::new(|| { + let thread_name = "pageserver-tokio"; + let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { + // If the env var is not set, leave this static as None. + set_tokio_runtime_setup( + "multiple-runtimes", + NUM_MULTIPLE_RUNTIMES + .checked_mul(*TOKIO_WORKER_THREADS) + .unwrap(), + ); + return None; + }; + Some(match mode { + TokioRuntimeMode::SingleThreaded => { + set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); + tokio::runtime::Builder::new_current_thread() + .thread_name(thread_name) + .enable_all() + .build() + .expect("failed to create one single runtime") + } + TokioRuntimeMode::MultiThreaded { num_workers } => { + set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .enable_all() + .worker_threads(num_workers.get()) + .build() + .expect("failed to create one multi-threaded runtime") + } + }) +}); + +/// Declare a lazy static variable named `$varname` that will resolve +/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` +/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation +/// declares a separate runtime and the lazy static variable `$varname` +/// will resolve to that separate runtime. +/// +/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if +/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime +/// otherwise. +macro_rules! pageserver_runtime { + ($varname:ident, $name:literal) => { + pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| { + if let Some(runtime) = &*ONE_RUNTIME { + return runtime; + } + static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name($name) + .worker_threads(TOKIO_WORKER_THREADS.get()) + .enable_all() + .build() + .expect(std::concat!("Failed to create runtime ", $name)) + }); + &*RUNTIME + }); + }; +} + +pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker"); +pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); +pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); +pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); +// Bump this number when adding a new pageserver_runtime! +// SAFETY: it's obviously correct +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; + #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e4f5f75132..74ed677ffe 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -18,7 +18,7 @@ use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -72,6 +72,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation ); + // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"),