From dae56ef60ca33643b3d80b4d2497fb6902620db0 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 6 Feb 2024 13:15:42 +0100 Subject: [PATCH] Do not suspend compute if there is an active logical replication subscription. (#6570) ## Problem the idea is to keep compute up and running if there are any active logical replication subscriptions. ### Rationale Rationale: - The Write-Ahead Logging (WAL) files, which contain the data changes, will need to be retained on the publisher side until the subscriber is able to connect again and apply these changes. This could potentially lead to increased disk usage on the publisher - and we do not want to disrupt the source - I think it is more pain for our customer to resolve storage issues on the source than to pay for the compute at the target. - Upon resuming the compute resources, the subscriber will start consuming and applying the changes from the retained WAL files. The time taken to catch up will depend on the volume of changes and the configured vCPUs. we can avoid explaining complex situations where we lag behind (in extreme cases we could lag behind hours, days or even months) - I think an important use case for logical replication from a source is a one-time migration or release upgrade. In this case the customer would not mind if we are not suspended for the duration of the migration. We need to document this in the release notes and the documentation in the context of logical replication where Neon is the target (subscriber) ### See internal discussion here https://neondb.slack.com/archives/C04DGM6SMTM/p1706793400746539?thread_ts=1706792628.701279&cid=C04DGM6SMTM --- compute_tools/src/monitor.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index f09bd02664..872a3f7750 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -138,6 +138,34 @@ fn watch_compute_activity(compute: &ComputeNode) { } } // + // Don't suspend compute if there is an active logical replication subscription + // + // `where pid is not null` – to filter out read only computes and subscription on branches + // + let logical_subscriptions_query = + "select count(*) from pg_stat_subscription where pid is not null;"; + match cli.query_one(logical_subscriptions_query, &[]) { + Ok(row) => match row.try_get::<&str, i64>("count") { + Ok(num_subscribers) => { + if num_subscribers > 0 { + compute.update_last_active(Some(Utc::now())); + continue; + } + } + Err(e) => { + warn!("failed to parse `pg_stat_subscription` count: {:?}", e); + continue; + } + }, + Err(e) => { + warn!( + "failed to get list of active logical replication subscriptions: {:?}", + e + ); + continue; + } + } + // // Do not suspend compute if autovacuum is running // let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";