From 49775d28e4ac831418d57adc43d1dba42b1a8443 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Wed, 12 Feb 2025 18:54:21 +0100
Subject: [PATCH] fix(compute): Respect skip_pg_catalog_updates in
 reconfigure() (#10696)

## Problem

We respect `skip_pg_catalog_updates` at the initial start, but ignore at
the follow-up `/configure`. Yet, it's used for storage->cplane->compute
notify requests after migrations, shard split, etc. So every time we get
them, applying the new config takes much longer than it should because
we go through Postgres catalog checks. Cplane sets this flag, when it
does serves notify attach call
https://github.com/neondatabase/cloud/commit/9068c7d7433f943af2bc350e9fd59772867e622c

Related to `inc-403`, for example

## Summary of changes

Look at `skip_pg_catalog_updates` in `compute.reconfigure()`
---
 compute_tools/src/compute.rs                  | 33 +++++-----
 .../regress/test_compute_reconfigure.py       | 62 +++++++++++++++++++
 2 files changed, 79 insertions(+), 16 deletions(-)
 create mode 100644 test_runner/regress/test_compute_reconfigure.py

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index cadc6f84d1..d323ea3dcd 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1400,26 +1400,27 @@ impl ComputeNode {
         let postgresql_conf_path = pgdata_path.join("postgresql.conf");
         config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?;
 
-        let max_concurrent_connections = spec.reconfigure_concurrency;
+        if !spec.skip_pg_catalog_updates {
+            let max_concurrent_connections = spec.reconfigure_concurrency;
+            // Temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are reconfiguring:
+            // creating new extensions, roles, etc.
+            config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
+                self.pg_reload_conf()?;
 
-        // Temporarily reset max_cluster_size in config
-        // to avoid the possibility of hitting the limit, while we are reconfiguring:
-        // creating new extensions, roles, etc.
-        config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
-            self.pg_reload_conf()?;
+                if spec.mode == ComputeMode::Primary {
+                    let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
+                    conf.application_name("apply_config");
+                    let conf = Arc::new(conf);
 
-            if spec.mode == ComputeMode::Primary {
-                let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
-                conf.application_name("apply_config");
-                let conf = Arc::new(conf);
+                    let spec = Arc::new(spec.clone());
 
-                let spec = Arc::new(spec.clone());
+                    self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
+                }
 
-                self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
-            }
-
-            Ok(())
-        })?;
+                Ok(())
+            })?;
+        }
 
         self.pg_reload_conf()?;
 
diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py
new file mode 100644
index 0000000000..6619548811
--- /dev/null
+++ b/test_runner/regress/test_compute_reconfigure.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import wait_until
+
+
+def test_compute_reconfigure(neon_simple_env: NeonEnv):
+    """
+    Test that we can change postgresql.conf settings even if
+    skip_pg_catalog_updates=True is set.
+    """
+    env = neon_simple_env
+
+    TEST_LOG_LINE_PREFIX = "%m [%p] [test_compute_reconfigure]: "
+
+    endpoint = env.endpoints.create_start("main")
+
+    # Check that the log line prefix is not set
+    # or different from TEST_LOG_LINE_PREFIX
+    with endpoint.cursor() as cursor:
+        cursor.execute("SHOW log_line_prefix;")
+        row = cursor.fetchone()
+        assert row is not None
+        assert row[0] != TEST_LOG_LINE_PREFIX
+
+    endpoint.respec_deep(
+        **{
+            "skip_pg_catalog_updates": True,
+            "cluster": {
+                "settings": [
+                    {
+                        "name": "log_line_prefix",
+                        "vartype": "string",
+                        "value": TEST_LOG_LINE_PREFIX,
+                    }
+                ]
+            },
+        }
+    )
+    endpoint.reconfigure()
+
+    # Check that in logs we see that it was actually reconfigured,
+    # not restarted or something else.
+    endpoint.log_contains("INFO request{method=POST uri=/configure")
+
+    # In /configure we only send SIGHUP at the end, so in theory
+    # it doesn't necessarily mean that Postgres already reloaded
+    # the new config; and it may race in some envs.
+    # So we wait until we see the log line that the config was changed.
+    def check_logs():
+        endpoint.log_contains(
+            f'[test_compute_reconfigure]: LOG:  parameter "log_line_prefix" changed to "{TEST_LOG_LINE_PREFIX}"'
+        )
+
+    wait_until(check_logs)
+
+    # Check that the log line prefix is set
+    with endpoint.cursor() as cursor:
+        cursor.execute("SHOW log_line_prefix;")
+        row = cursor.fetchone()
+        assert row is not None
+        assert row[0] == TEST_LOG_LINE_PREFIX