try: startup speedup (#4366)

Startup can take a long time. We suspect the cause is the initial logical size
calculations. The long-term solution is to not block the tokio executors but to
do most of the I/O in `spawn_blocking`.

See: #4025, #4183

Short-term solutions to the above:

- Delay global background tasks until the initial tenant loads complete
  (sketched below)
- Limit how many initial logical size calculations can run at the same
  time to `cores / 2`

This PR is for trying in staging.
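For context on the first bullet: the "init done" signal used in the diff is a plain `tokio::sync::mpsc` channel that nothing ever sends on. Every initial tenant load task holds a clone of the sender, and once the last clone is dropped the channel closes and `recv()` returns `None`. A minimal standalone sketch of that pattern, with illustrative names rather than the PR's actual code:

```rust
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};

#[tokio::main]
async fn main() {
    // No message is ever sent on this channel; only its closure matters.
    let (init_done_tx, init_done_rx) = mpsc::channel::<()>(1);
    let init_done_rx = Arc::new(Mutex::new(init_done_rx));

    // Each simulated tenant load holds a sender clone for its whole duration.
    for i in 0u64..4 {
        let tx = init_done_tx.clone();
        tokio::spawn(async move {
            let _keepalive = tx; // dropped when this task finishes
            tokio::time::sleep(std::time::Duration::from_millis(10 * i)).await;
        });
    }
    // Drop the original so only the in-flight loads keep the channel open.
    drop(init_done_tx);

    // recv() resolves to None once the last sender clone is dropped.
    init_done_rx.lock().await.recv().await;
    println!("all initial loads complete");
}
```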

@@ -335,13 +335,36 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

    // All tenant load operations carry this while they are ongoing; it will be dropped once those
    // operations finish either successfully or in some other manner. At that point the initial
    // load is done, and we can start the global background tasks.
    let (init_done_tx, init_done_rx) = tokio::sync::mpsc::channel::<()>(1);
    let init_done_rx = Arc::new(tokio::sync::Mutex::new(init_done_rx));

    // Scan the local 'tenants/' directory and start loading the tenants
    let init_started_at = std::time::Instant::now();
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        broker_client.clone(),
        remote_storage.clone(),
        init_done_tx,
    ))?;

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx.clone();
        async move {
            let init_done = async move { init_done_rx.lock().await.recv().await };
            init_done.await;
            let elapsed = init_started_at.elapsed();
            tracing::info!(
                elapsed_millis = elapsed.as_millis(),
                "Initial load completed."
            );
        }
    });
    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
    // is still accessible even if background task is not configured as long as remote storage has
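A note on the `Arc<Mutex<Receiver>>` wrapping above: `mpsc::Receiver::recv` takes `&mut self`, yet several background tasks need to await the same signal. That only works because nothing is ever sent on the channel: once it closes, every `recv()` call returns `None` immediately, so each waiter observes the signal in turn as it acquires the lock. A sketch of several waiters sharing one receiver (illustrative names, not from this diff):

```rust
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};

#[tokio::main]
async fn main() {
    // Nothing is ever sent; the channel exists only to be closed.
    let (init_done_tx, init_done_rx) = mpsc::channel::<()>(1);
    let init_done_rx = Arc::new(Mutex::new(init_done_rx));

    let mut handles = Vec::new();
    for name in ["disk eviction", "metrics", "startup logger"] {
        let rx = init_done_rx.clone();
        handles.push(tokio::spawn(async move {
            // After the channel closes, every recv() returns None right away,
            // so each waiter sees the signal as soon as it gets the lock.
            rx.lock().await.recv().await;
            println!("{name}: initial load done, starting");
        }));
    }

    drop(init_done_tx); // simulates the last initial load finishing
    for handle in handles {
        handle.await.unwrap();
    }
}
```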
@@ -353,6 +376,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
            init_done_rx.clone(),
        )?;
    }
@@ -390,6 +414,7 @@ fn start_pageserver(
    );

    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
        let init_done_rx = init_done_rx;
        let metrics_ctx = RequestContext::todo_child(
            TaskKind::MetricsCollection,
            // This task itself shouldn't download anything.
@@ -405,6 +430,14 @@ fn start_pageserver(
"consumption metrics collection",
true,
async move {
// first wait for initial load to complete before first iteration.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let init_done = async move { init_done_rx.lock().await.recv().await };
init_done.await;
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
conf.metric_collection_interval,


@@ -82,6 +82,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
    init_done_rx: Arc<tokio::sync::Mutex<tokio::sync::mpsc::Receiver<()>>>,
) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
@@ -98,6 +99,10 @@ pub fn launch_disk_usage_global_eviction_task(
"disk usage based eviction",
false,
async move {
// wait until initial load is complete, because we cannot evict from loading tenants.
let init_done = async move { init_done_rx.lock().await.recv().await };
init_done.await;
disk_usage_eviction_task(
&state,
task_config,


@@ -895,6 +895,7 @@ impl Tenant {
        tenant_id: TenantId,
        broker_client: storage_broker::BrokerClientChannel,
        remote_storage: Option<GenericRemoteStorage>,
        init_done_tx: Option<tokio::sync::mpsc::Sender<()>>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
        let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
@@ -928,6 +929,9 @@ impl Tenant {
"initial tenant load",
false,
async move {
// keep the sender alive as long as we have the initial load ongoing; it will be
// None for loads spawned after init_tenant_mgr.
let _init_done_tx = init_done_tx;
match tenant_clone.load(&ctx).await {
Ok(()) => {
info!("load finished, activating");


@@ -66,6 +66,7 @@ pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    init_done_tx: tokio::sync::mpsc::Sender<()>,
) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
    let tenants_dir = conf.tenants_path();
@@ -122,6 +123,7 @@ pub async fn init_tenant_mgr(
            &tenant_dir_path,
            broker_client.clone(),
            remote_storage.clone(),
            Some(init_done_tx.clone()),
            &ctx,
        ) {
            Ok(tenant) => {
@@ -157,6 +159,7 @@ pub fn schedule_local_tenant_processing(
    tenant_path: &Path,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    init_done_tx: Option<tokio::sync::mpsc::Sender<()>>,
    ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -210,7 +213,14 @@ pub fn schedule_local_tenant_processing(
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(conf, tenant_id, broker_client, remote_storage, ctx)
+        Tenant::spawn_load(
+            conf,
+            tenant_id,
+            broker_client,
+            remote_storage,
+            init_done_tx,
+            ctx,
+        )
    };
    Ok(tenant)
}
@@ -363,7 +373,7 @@ pub async fn create_tenant(
    // See https://github.com/neondatabase/neon/issues/4233
    let created_tenant =
-        schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, ctx)?;
+        schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
    // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
    // See https://github.com/neondatabase/neon/issues/4233
@@ -509,7 +519,7 @@ pub async fn load_tenant(
        .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
    }

-    let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, ctx)
+    let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
        .with_context(|| {
            format!("Failed to schedule tenant processing in path {tenant_path:?}")
        })?;
@@ -582,7 +592,7 @@ pub async fn attach_tenant(
        .context("check for attach marker file existence")?;
    anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-    let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), ctx)?;
+    let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
    // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
    // See https://github.com/neondatabase/neon/issues/4233


@@ -1910,6 +1910,23 @@ impl Timeline {
        // no cancellation here, because nothing really waits for this to complete compared
        // to spawn_ondemand_logical_size_calculation.
        let cancel = CancellationToken::new();

        /// Ugly, but necessary until `spawn_blocking` is used for the blocking I/O;
        /// otherwise we could lock up all worker threads.
        static GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE: once_cell::sync::Lazy<Arc<tokio::sync::Semaphore>> =
            once_cell::sync::Lazy::new(|| {
                let cores = std::thread::available_parallelism();
                // Rationale for the halving: other blocking work (consumption metrics,
                // per-timeline eviction tasks) will start later, and we still need to
                // accept page reads quickly, so perhaps this is a suitable middle ground.
                let max_blocked_threads = cores.map(|count| count.get() / 2);
                let max_blocked_threads = max_blocked_threads.unwrap_or(1);
                let max_blocked_threads = std::cmp::max(1, max_blocked_threads);
                tracing::info!("using max {max_blocked_threads} threads for initial logical size");
                Arc::new(tokio::sync::Semaphore::new(max_blocked_threads))
            });

        let _permit = GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE
            .clone()
            .acquire_owned()
            .await
            .expect("global semaphore is never closed");

        let calculated_size = match self_clone
            .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
            .await
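The PR description calls moving the blocking I/O into `spawn_blocking` the long-term fix; the semaphore above is the stopgap. Purely as a sketch of what that long-term shape could look like, and not code from this PR (the function names and types below are made up):

```rust
use tokio::task;

// Hypothetical stand-in for the blocking, filesystem-heavy size calculation.
fn calculate_logical_size_blocking(timeline_id: u64) -> anyhow::Result<u64> {
    // ... would read layer files and sum up sizes ...
    Ok(timeline_id * 8192)
}

async fn initial_logical_size(timeline_id: u64) -> anyhow::Result<u64> {
    // spawn_blocking runs the closure on tokio's dedicated blocking thread
    // pool, so executor threads stay free to serve page reads. That pool has
    // its own (large) limit, so a semaphore might still be wanted as a cap.
    task::spawn_blocking(move || calculate_logical_size_blocking(timeline_id))
        .await
        .expect("blocking task panicked")
}
```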