Merge remote-tracking branch 'origin/main' into communicator-rewrite

2026-05-31 20:10:38 +00:00 · 2025-07-05 16:59:51 +03:00
parent f3a6c0d8ff b568189f7b
commit e14bb4be39
141 changed files with 5475 additions and 2033 deletions
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -57,6 +57,8 @@ class EndpointHttpClient(requests.Session):
        self.auth = BearerAuth(jwt)

        self.mount("http://", HTTPAdapter())
+        self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
+        self.offload_url = f"http://localhost:{external_port}/lfc/offload"

    def dbs_and_roles(self):
        res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
@@ -64,33 +66,39 @@ class EndpointHttpClient(requests.Session):
        return res.json()

    def prewarm_lfc_status(self) -> dict[str, str]:
-        res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm")
+        res = self.get(self.prewarm_url)
        res.raise_for_status()
        json: dict[str, str] = res.json()
        return json

    def prewarm_lfc(self, from_endpoint_id: str | None = None):
-        url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
        params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
-        self.post(url, params=params).raise_for_status()
+        self.post(self.prewarm_url, params=params).raise_for_status()
+        self.prewarm_lfc_wait()

+    def prewarm_lfc_wait(self):
        def prewarmed():
            json = self.prewarm_lfc_status()
            status, err = json["status"], json.get("error")
-            assert status == "completed", f"{status}, error {err}"
+            assert status == "completed", f"{status}, {err=}"

        wait_until(prewarmed, timeout=60)

-    def offload_lfc(self):
-        url = f"http://localhost:{self.external_port}/lfc/offload"
-        self.post(url).raise_for_status()
+    def offload_lfc_status(self) -> dict[str, str]:
+        res = self.get(self.offload_url)
+        res.raise_for_status()
+        json: dict[str, str] = res.json()
+        return json

+    def offload_lfc(self):
+        self.post(self.offload_url).raise_for_status()
+        self.offload_lfc_wait()
+
+    def offload_lfc_wait(self):
        def offloaded():
-            res = self.get(url)
-            res.raise_for_status()
-            json = res.json()
+            json = self.offload_lfc_status()
            status, err = json["status"], json.get("error")
-            assert status == "completed", f"{status}, error {err}"
+            assert status == "completed", f"{status}, {err=}"

        wait_until(offloaded)

--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -567,6 +567,8 @@ class NeonLocalCli(AbstractNeonCli):
        basebackup_request_tries: int | None = None,
        timeout: str | None = None,
        env: dict[str, str] | None = None,
+        autoprewarm: bool = False,
+        offload_lfc_interval_seconds: int | None = None,
    ) -> subprocess.CompletedProcess[str]:
        args = [
            "endpoint",
@@ -592,6 +594,10 @@ class NeonLocalCli(AbstractNeonCli):
            args.extend(["--create-test-user"])
        if timeout is not None:
            args.extend(["--start-timeout", str(timeout)])
+        if autoprewarm:
+            args.extend(["--autoprewarm"])
+        if offload_lfc_interval_seconds is not None:
+            args.extend(["--offload-lfc-interval-seconds", str(offload_lfc_interval_seconds)])

        res = self.raw_cli(args, extra_env_vars)
        res.check_returncode()
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -724,15 +724,21 @@ class NeonEnvBuilder:

        shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
        assert not (storcon_db_to_dir / "postgres.log").exists()
+
        # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
-        # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
-        # will currently reject re-attach requests from them because the NodeMetadata isn't identical.
+        # However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage
+        # controller will currently reject re-attach requests from them because the NodeMetadata isn't identical.
        # So, from_repo_dir patches up the the storcon database.
        patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
        assert not patch_script_path.exists()
        patch_script = ""
+
        for ps in self.env.pageservers:
-            patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg}  WHERE node_id = '{ps.id}';"
+            patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg}  WHERE node_id = '{ps.id}';\n"
+
+        for sk in self.env.safekeepers:
+            patch_script += f"UPDATE safekeepers SET http_port={sk.port.http}, port={sk.port.pg} WHERE id = '{sk.id}';\n"
+
        patch_script_path.write_text(patch_script)

        # Update the config with info about tenants and timelines
@@ -1861,6 +1867,7 @@ class PageserverSchedulingPolicy(StrEnum):
    FILLING = "Filling"
    PAUSE = "Pause"
    PAUSE_FOR_RESTART = "PauseForRestart"
+    DELETING = "Deleting"


 class StorageControllerLeadershipStatus(StrEnum):
@@ -2069,14 +2076,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

-    def node_delete(self, node_id):
-        log.info(f"node_delete({node_id})")
+    def node_delete_old(self, node_id):
+        log.info(f"node_delete_old({node_id})")
        self.request(
            "DELETE",
            f"{self.api}/control/v1/node/{node_id}",
            headers=self.headers(TokenScope.ADMIN),
        )

+    def node_delete(self, node_id):
+        log.info(f"node_delete({node_id})")
+        self.request(
+            "PUT",
+            f"{self.api}/control/v1/node/{node_id}/delete",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def cancel_node_delete(self, node_id):
+        log.info(f"cancel_node_delete({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.api}/control/v1/node/{node_id}/delete",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
    def tombstone_delete(self, node_id):
        log.info(f"tombstone_delete({node_id})")
        self.request(
@@ -4350,6 +4373,8 @@ class Endpoint(PgProtocol, LogUtils):
        basebackup_request_tries: int | None = None,
        timeout: str | None = None,
        env: dict[str, str] | None = None,
+        autoprewarm: bool = False,
+        offload_lfc_interval_seconds: int | None = None,
    ) -> Self:
        """
        Start the Postgres instance.
@@ -4374,6 +4399,8 @@ class Endpoint(PgProtocol, LogUtils):
            basebackup_request_tries=basebackup_request_tries,
            timeout=timeout,
            env=env,
+            autoprewarm=autoprewarm,
+            offload_lfc_interval_seconds=offload_lfc_interval_seconds,
        )
        self._running.release(1)
        self.log_config_value("shared_buffers")
@@ -4589,6 +4616,8 @@ class Endpoint(PgProtocol, LogUtils):
        pageserver_id: int | None = None,
        allow_multiple: bool = False,
        basebackup_request_tries: int | None = None,
+        autoprewarm: bool = False,
+        offload_lfc_interval_seconds: int | None = None,
    ) -> Self:
        """
        Create an endpoint, apply config, and start Postgres.
@@ -4609,6 +4638,8 @@ class Endpoint(PgProtocol, LogUtils):
            pageserver_id=pageserver_id,
            allow_multiple=allow_multiple,
            basebackup_request_tries=basebackup_request_tries,
+            autoprewarm=autoprewarm,
+            offload_lfc_interval_seconds=offload_lfc_interval_seconds,
        )

        return self
@@ -4693,6 +4724,8 @@ class EndpointFactory:
        remote_ext_base_url: str | None = None,
        pageserver_id: int | None = None,
        basebackup_request_tries: int | None = None,
+        autoprewarm: bool = False,
+        offload_lfc_interval_seconds: int | None = None,
    ) -> Endpoint:
        ep = Endpoint(
            self.env,
@@ -4714,6 +4747,8 @@ class EndpointFactory:
            remote_ext_base_url=remote_ext_base_url,
            pageserver_id=pageserver_id,
            basebackup_request_tries=basebackup_request_tries,
+            autoprewarm=autoprewarm,
+            offload_lfc_interval_seconds=offload_lfc_interval_seconds,
        )

    def create(
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -111,6 +111,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
    ".*stalling layer flushes for compaction backpressure.*",
    ".*layer roll waiting for flush due to compaction backpressure.*",
    ".*BatchSpanProcessor.*",
+    ".*No broker updates received for a while.*",
    *(
        [
            r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*"
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -416,6 +416,8 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
        # timeline creation (uploads). mask it out here to avoid flakyness.
        del success_result["remote_consistent_lsn_visible"]
        del repeat_result["remote_consistent_lsn_visible"]
+        del success_result["walreceiver_status"]
+        del repeat_result["walreceiver_status"]
        assert repeat_result == success_result
    finally:
        env.pageserver.stop(immediate=True)
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -76,6 +76,7 @@ if TYPE_CHECKING:
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
 #    export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
 #    export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
+#    export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
 #
 #    # Build previous version of binaries and store them somewhere:
 #    rm -rf pg_install target
@@ -102,6 +103,7 @@ if TYPE_CHECKING:
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
 #    export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
 #    export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
+#    export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
 #    export NEON_BIN=target/${BUILD_TYPE}
 #    export POSTGRES_DISTRIB_DIR=pg_install
 #
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,34 +1,38 @@
 import random
 import threading
-import time
-from enum import Enum
+from enum import StrEnum
+from time import sleep
+from typing import Any

 import pytest
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.utils import USE_LFC
+from fixtures.utils import USE_LFC, wait_until
 from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl
+from psycopg2.extensions import cursor as Cursor


-class LfcQueryMethod(Enum):
-    COMPUTE_CTL = False
-    POSTGRES = True
+class PrewarmMethod(StrEnum):
+    POSTGRES = "postgres"
+    COMPUTE_CTL = "compute-ctl"
+    AUTOPREWARM = "autoprewarm"


-PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total"
-OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total"
-QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL
+PREWARM_LABEL = "compute_ctl_lfc_prewarms_total"
+OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
+METHOD_VALUES = [e for e in PrewarmMethod]
+METHOD_IDS = [e.value for e in PrewarmMethod]


-def check_pinned_entries(cur):
+def check_pinned_entries(cur: Cursor):
    # some LFC buffer can be temporary locked by autovacuum or background writer
    for _ in range(10):
        cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
        n_pinned = cur.fetchall()[0][0]
        if n_pinned == 0:
            break
-        time.sleep(1)
+        sleep(1)
    assert n_pinned == 0


@@ -41,21 +45,68 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
    }


+def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
+    if method == PrewarmMethod.AUTOPREWARM:
+        client.offload_lfc_wait()
+    elif method == PrewarmMethod.COMPUTE_CTL:
+        status = client.prewarm_lfc_status()
+        assert status["status"] == "not_prewarmed"
+        assert "error" not in status
+        client.offload_lfc()
+        assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
+        assert prom_parse(client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
+    elif method == PrewarmMethod.POSTGRES:
+        cur.execute("select get_local_cache_state()")
+        return cur.fetchall()[0][0]
+    else:
+        raise AssertionError(f"{method} not in PrewarmMethod")
+
+
+def prewarm_endpoint(
+    method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None
+):
+    if method == PrewarmMethod.AUTOPREWARM:
+        client.prewarm_lfc_wait()
+    elif method == PrewarmMethod.COMPUTE_CTL:
+        client.prewarm_lfc()
+    elif method == PrewarmMethod.POSTGRES:
+        cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
+
+
+def check_prewarmed(
+    method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int]
+):
+    if method == PrewarmMethod.AUTOPREWARM:
+        assert client.prewarm_lfc_status() == desired_status
+        assert prom_parse(client)[PREWARM_LABEL] == 1
+    elif method == PrewarmMethod.COMPUTE_CTL:
+        assert client.prewarm_lfc_status() == desired_status
+        assert prom_parse(client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
+
+
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
-@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
-def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
+@pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS)
+def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
    env = neon_simple_env
    n_records = 1000000
-    endpoint = env.endpoints.create_start(
-        branch_name="main",
-        config_lines=[
-            "autovacuum = off",
-            "shared_buffers=1MB",
-            "neon.max_file_cache_size=1GB",
-            "neon.file_cache_size_limit=1GB",
-            "neon.file_cache_prewarm_limit=1000",
-        ],
-    )
+    cfg = [
+        "autovacuum = off",
+        "shared_buffers=1MB",
+        "neon.max_file_cache_size=1GB",
+        "neon.file_cache_size_limit=1GB",
+        "neon.file_cache_prewarm_limit=1000",
+    ]
+    offload_secs = 2
+
+    if method == PrewarmMethod.AUTOPREWARM:
+        endpoint = env.endpoints.create_start(
+            branch_name="main",
+            config_lines=cfg,
+            autoprewarm=True,
+            offload_lfc_interval_seconds=offload_secs,
+        )
+    else:
+        endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)

    pg_conn = endpoint.connect()
    pg_cur = pg_conn.cursor()
@@ -69,31 +120,21 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
    lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
    log.info(f"Inserted {n_records} rows")

-    http_client = endpoint.http_client()
-    if query is LfcQueryMethod.COMPUTE_CTL:
-        status = http_client.prewarm_lfc_status()
-        assert status["status"] == "not_prewarmed"
-        assert "error" not in status
-        http_client.offload_lfc()
-        assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed"
-        assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
-    else:
-        pg_cur.execute("select get_local_cache_state()")
-        lfc_state = pg_cur.fetchall()[0][0]
+    client = endpoint.http_client()
+    lfc_state = offload_lfc(method, client, pg_cur)

    endpoint.stop()
-    endpoint.start()
+    if method == PrewarmMethod.AUTOPREWARM:
+        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
+    else:
+        endpoint.start()

    pg_conn = endpoint.connect()
    pg_cur = pg_conn.cursor()

    lfc_conn = endpoint.connect(dbname="lfc")
    lfc_cur = lfc_conn.cursor()
-
-    if query is LfcQueryMethod.COMPUTE_CTL:
-        http_client.prewarm_lfc()
-    else:
-        pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
+    prewarm_endpoint(method, client, pg_cur, lfc_state)

    pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
    lfc_used_pages = pg_cur.fetchall()[0][0]
@@ -111,33 +152,32 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
        and prewarm_info[1] > 0
        and prewarm_info[0] == prewarm_info[1] + prewarm_info[2]
    )
-
    lfc_cur.execute("select sum(pk) from t")
    assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2

    check_pinned_entries(pg_cur)
-
    desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
-    if query is LfcQueryMethod.COMPUTE_CTL:
-        assert http_client.prewarm_lfc_status() == desired
-        assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
+    check_prewarmed(method, client, desired)
+
+
+# autoprewarm isn't needed as we prewarm manually
+WORKLOAD_VALUES = METHOD_VALUES[:-1]
+WORKLOAD_IDS = METHOD_IDS[:-1]


@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
-@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
-def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod):
+@pytest.mark.parametrize("method", WORKLOAD_VALUES, ids=WORKLOAD_IDS)
+def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMethod):
    env = neon_simple_env
    n_records = 10000
    n_threads = 4
-    endpoint = env.endpoints.create_start(
-        branch_name="main",
-        config_lines=[
-            "shared_buffers=1MB",
-            "neon.max_file_cache_size=1GB",
-            "neon.file_cache_size_limit=1GB",
-            "neon.file_cache_prewarm_limit=1000000",
-        ],
-    )
+    cfg = [
+        "shared_buffers=1MB",
+        "neon.max_file_cache_size=1GB",
+        "neon.file_cache_size_limit=1GB",
+        "neon.file_cache_prewarm_limit=1000000",
+    ]
+    endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)

    pg_conn = endpoint.connect()
    pg_cur = pg_conn.cursor()
@@ -154,12 +194,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
    log.info(f"Inserted {n_records} rows")

    http_client = endpoint.http_client()
-    if query is LfcQueryMethod.COMPUTE_CTL:
-        http_client.offload_lfc()
-    else:
-        pg_cur.execute("select get_local_cache_state()")
-        lfc_state = pg_cur.fetchall()[0][0]
-
+    lfc_state = offload_lfc(method, http_client, pg_cur)
    running = True
    n_prewarms = 0

@@ -170,8 +205,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
        while running:
            src = random.randint(1, n_records)
            dst = random.randint(1, n_records)
-            lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,))
-            lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,))
+            lfc_cur.execute(f"update accounts set balance=balance-100 where id={src}")
+            lfc_cur.execute(f"update accounts set balance=balance+100 where id={dst}")
            n_transfers += 1
        log.info(f"Number of transfers: {n_transfers}")

@@ -183,13 +218,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
            pg_cur.execute("select pg_reload_conf()")
            pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'")
            pg_cur.execute("select pg_reload_conf()")
-
-            if query is LfcQueryMethod.COMPUTE_CTL:
-                # Same thing as prewarm_lfc(), testing other method
-                http_client.prewarm_lfc(endpoint.endpoint_id)
-            else:
-                pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
-
+            prewarm_endpoint(method, http_client, pg_cur, lfc_state)
            nonlocal n_prewarms
            n_prewarms += 1
        log.info(f"Number of prewarms: {n_prewarms}")
@@ -203,7 +232,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
    prewarm_thread = threading.Thread(target=prewarm)
    prewarm_thread.start()

-    time.sleep(20)
+    def prewarmed():
+        assert n_prewarms > 5
+
+    wait_until(prewarmed)

    running = False
    for t in workload_threads:
@@ -215,5 +247,5 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
    assert total_balance == 0

    check_pinned_entries(pg_cur)
-    if query is LfcQueryMethod.COMPUTE_CTL:
+    if method != PrewarmMethod.POSTGRES:
        assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms}
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -180,7 +180,7 @@ def test_metric_collection(
    httpserver.check()

    # Check that at least one bucket output object is present, and that all
-    # can be decompressed and decoded.
+    # can be decompressed and decoded as NDJSON.
    bucket_dumps = {}
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
    for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root):
@@ -188,7 +188,13 @@ def test_metric_collection(
            file_path = os.path.join(dirpath, file)
            log.info(file_path)
            if file.endswith(".gz"):
-                bucket_dumps[file_path] = json.load(gzip.open(file_path))
+                events = []
+                with gzip.open(file_path, "rt") as f:
+                    for line in f:
+                        line = line.strip()
+                        if line:
+                            events.append(json.loads(line))
+                bucket_dumps[file_path] = {"events": events}

    assert len(bucket_dumps) >= 1
    assert all("events" in data for data in bucket_dumps.values())
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -989,6 +989,102 @@ def test_storage_controller_compute_hook_retry(
    )


+@run_only_on_default_postgres("postgres behavior is not relevant")
+def test_storage_controller_compute_hook_keep_failing(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address: ListenAddress,
+):
+    neon_env_builder.num_pageservers = 4
+    neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False}
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}"
+
+    # Set up CP handler for compute notifications
+    status_by_tenant: dict[TenantId, int] = {}
+
+    def handler(request: Request):
+        notify_request = request.json
+        assert notify_request is not None
+        status = status_by_tenant[TenantId(notify_request["tenant_id"])]
+        log.info(f"Notify request[{status}]: {notify_request}")
+        return Response(status=status)
+
+    httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
+
+    # Run neon environment
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    # Create two tenants:
+    # - The first tenant is banned by CP and contains only one shard
+    # - The second tenant is allowed by CP and contains four shards
+    banned_tenant = TenantId.generate()
+    status_by_tenant[banned_tenant] = 200  # we will ban this tenant later
+    env.create_tenant(banned_tenant, placement_policy='{"Attached": 1}')
+
+    shard_count = 4
+    allowed_tenant = TenantId.generate()
+    status_by_tenant[allowed_tenant] = 200
+    env.create_tenant(allowed_tenant, shard_count=shard_count, placement_policy='{"Attached": 1}')
+
+    # Find the pageserver of the banned tenant
+    banned_tenant_ps = env.get_tenant_pageserver(banned_tenant)
+    assert banned_tenant_ps is not None
+    alive_pageservers = [p for p in env.pageservers if p.id != banned_tenant_ps.id]
+
+    # Stop pageserver and ban tenant to trigger failed reconciliation
+    status_by_tenant[banned_tenant] = 423
+    banned_tenant_ps.stop()
+    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
+    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
+    env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
+    env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
+
+    # Migrate all allowed tenant shards to the first alive pageserver
+    # to trigger storage controller optimizations due to affinity rules
+    for shard_number in range(shard_count):
+        env.storage_controller.tenant_shard_migrate(
+            TenantShardId(allowed_tenant, shard_number, shard_count),
+            alive_pageservers[0].id,
+            config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True),
+        )
+
+    # Make some reconcile_all calls to trigger optimizations
+    # RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
+    RECONCILE_COUNT = 12
+    for i in range(RECONCILE_COUNT):
+        try:
+            n = env.storage_controller.reconcile_all()
+            log.info(f"Reconciliation attempt {i} finished with success: {n}")
+        except StorageControllerApiException as e:
+            assert "Control plane tenant busy" in str(e)
+            log.info(f"Reconciliation attempt {i} finished with failure")
+
+        banned_descr = env.storage_controller.tenant_describe(banned_tenant)
+        assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
+        time.sleep(2)
+
+    # Check that the allowed tenant shards are optimized due to affinity rules
+    locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
+    not_optimized_shard_count = 0
+    for loc in locations:
+        tsi = TenantShardId.parse(loc[0])
+        if tsi.tenant_id != allowed_tenant:
+            continue
+        if loc[1]["mode"] == "AttachedSingle":
+            not_optimized_shard_count += 1
+        log.info(f"Shard {tsi} seen in mode {loc[1]['mode']}")
+
+    assert not_optimized_shard_count < shard_count, "At least one shard should be optimized"
+
+    # Unban the tenant and run reconciliations
+    status_by_tenant[banned_tenant] = 200
+    env.storage_controller.reconcile_all()
+    banned_descr = env.storage_controller.tenant_describe(banned_tenant)
+    assert banned_descr["shards"][0]["is_pending_compute_notification"] is False
+
+
@run_only_on_default_postgres("this test doesn't start an endpoint")
 def test_storage_controller_compute_hook_revert(
    httpserver: HTTPServer,
@@ -2522,7 +2618,7 @@ def test_storage_controller_node_deletion(
        wait_until(assert_shards_migrated)

    log.info(f"Deleting pageserver {victim.id}")
-    env.storage_controller.node_delete(victim.id)
+    env.storage_controller.node_delete_old(victim.id)

    if not while_offline:

@@ -2557,6 +2653,60 @@ def test_storage_controller_node_deletion(
    env.storage_controller.consistency_check()


+def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 3
+    neon_env_builder.num_azs = 3
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_count = 12
+    shard_count_per_tenant = 16
+    tenant_ids = []
+
+    for _ in range(0, tenant_count):
+        tid = TenantId.generate()
+        tenant_ids.append(tid)
+        env.create_tenant(
+            tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
+        )
+
+    # Sanity check: initial creations should not leave the system in an unstable scheduling state
+    assert env.storage_controller.reconcile_all() == 0
+
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 3
+
+    env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)"))
+
+    ps_id_to_delete = env.pageservers[0].id
+
+    env.storage_controller.warm_up_all_secondaries()
+    env.storage_controller.retryable_node_operation(
+        lambda ps_id: env.storage_controller.node_delete(ps_id),
+        ps_id_to_delete,
+        max_attempts=3,
+        backoff=2,
+    )
+
+    env.storage_controller.poll_node_status(
+        ps_id_to_delete,
+        PageserverAvailability.ACTIVE,
+        PageserverSchedulingPolicy.DELETING,
+        max_attempts=6,
+        backoff=2,
+    )
+
+    env.storage_controller.cancel_node_delete(ps_id_to_delete)
+
+    env.storage_controller.poll_node_status(
+        ps_id_to_delete,
+        PageserverAvailability.ACTIVE,
+        PageserverSchedulingPolicy.ACTIVE,
+        max_attempts=6,
+        backoff=2,
+    )
+
+
@pytest.mark.parametrize("shard_count", [None, 2])
 def test_storage_controller_metadata_health(
    neon_env_builder: NeonEnvBuilder,
@@ -3112,7 +3262,7 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
    assert_nodes_count(3)

    ps = env.pageservers[0]
-    env.storage_controller.node_delete(ps.id)
+    env.storage_controller.node_delete_old(ps.id)

    # After deletion, the node count must be reduced
    assert_nodes_count(2)
@@ -3530,18 +3680,21 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
    # some small tests for the scheduling policy querying and returning APIs
    newest_info = target.get_safekeeper(inserted["id"])
    assert newest_info
-    assert newest_info["scheduling_policy"] == "Pause"
-    target.safekeeper_scheduling_policy(inserted["id"], "Active")
-    newest_info = target.get_safekeeper(inserted["id"])
-    assert newest_info
-    assert newest_info["scheduling_policy"] == "Active"
-    # Ensure idempotency
-    target.safekeeper_scheduling_policy(inserted["id"], "Active")
-    newest_info = target.get_safekeeper(inserted["id"])
-    assert newest_info
-    assert newest_info["scheduling_policy"] == "Active"
-    # change back to paused again
+    assert (
+        newest_info["scheduling_policy"] == "Activating"
+        or newest_info["scheduling_policy"] == "Active"
+    )
    target.safekeeper_scheduling_policy(inserted["id"], "Pause")
+    newest_info = target.get_safekeeper(inserted["id"])
+    assert newest_info
+    assert newest_info["scheduling_policy"] == "Pause"
+    # Ensure idempotency
+    target.safekeeper_scheduling_policy(inserted["id"], "Pause")
+    newest_info = target.get_safekeeper(inserted["id"])
+    assert newest_info
+    assert newest_info["scheduling_policy"] == "Pause"
+    # change back to active again
+    target.safekeeper_scheduling_policy(inserted["id"], "Active")

    def storcon_heartbeat():
        assert env.storage_controller.log_contains(
@@ -3554,6 +3707,57 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")


+@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
+def test_safekeeper_activating_to_active(neon_env_builder: NeonEnvBuilder):
+    """
+    Deploy a safekeeper record and check that its scheduling policy reads "Active",
+    both after the deploy call and after the policy is set back to "Activating".
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    storcon = env.storage_controller
+    fake_id = 5
+    assert storcon.get_safekeeper(fake_id) is None
+
+    sks_before = storcon.get_safekeepers()
+    real_sk = env.safekeepers[0]
+    body = {
+        "active": True,
+        "id": fake_id,
+        "created_at": "2023-10-25T09:11:25Z",
+        "updated_at": "2024-08-28T11:32:43Z",
+        "region_id": "aws-eu-central-1",
+        "host": "localhost",
+        "port": real_sk.port.pg,
+        "http_port": real_sk.port.http,
+        "https_port": None,
+        "version": 5957,
+        "availability_zone_id": "eu-central-1a",
+    }
+    storcon.on_safekeeper_deploy(fake_id, body)
+
+    inserted = storcon.get_safekeeper(fake_id)
+    assert inserted is not None
+    assert storcon.get_safekeepers() == sks_before + [inserted]
+    assert eq_safekeeper_records(body, inserted)
+    sk_id = inserted["id"]
+
+    def safekeeper_is_active():
+        newest_info = storcon.get_safekeeper(sk_id)
+        assert newest_info
+        assert newest_info["scheduling_policy"] == "Active"
+
+    wait_until(safekeeper_is_active)
+
+    # Set the policy back to "Activating" by hand and wait for it to read "Active" again
+    storcon.safekeeper_scheduling_policy(sk_id, "Activating")
+    wait_until(safekeeper_is_active)
+
+    # Now decommission it ("Decomissioned" is the value this test passes to the API)
+    storcon.safekeeper_scheduling_policy(sk_id, "Decomissioned")
+
+
 def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
    compared = [dict(a), dict(b)]

--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -324,7 +324,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
            # it is to be in line with the deletion timestamp.. well, almost.
            when = original_ancestor[2][:26]
            when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC)
-            now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC)
+            now = datetime.datetime.now(datetime.UTC)
            assert when_ts < now
            assert len(lineage.get("reparenting_history", [])) == 0
        elif expected_ancestor == timeline_id:
@@ -458,19 +458,20 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots

    env.pageserver.quiesce_tenants()

-    # checking the ancestor after is much faster than waiting for the endpoint not start
+    # checking the ancestor after is much faster than waiting for the endpoint to start
    expected_result = [
-        ("main", env.initial_timeline, None, 24576, 1),
-        ("after", after, env.initial_timeline, 24576, 1),
-        ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1),
-        ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1),
-        ("branch_to_detach", branch_to_detach, None, 16384, 1),
-        ("earlier", earlier, env.initial_timeline, 0, 1),
+        # (branch_name, queried_timeline, expected_ancestor, rows, starts, read_only)
+        ("main", env.initial_timeline, None, 24576, 1, False),
+        ("after", after, env.initial_timeline, 24576, 1, False),
+        ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1, True),
+        ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1, False),
+        ("branch_to_detach", branch_to_detach, None, 16384, 1, False),
+        ("earlier", earlier, env.initial_timeline, 0, 1, False),
    ]

    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

-    for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result:
+    for branch_name, queried_timeline, expected_ancestor, _, _, _ in expected_result:
        details = client.timeline_detail(env.initial_tenant, queried_timeline)
        ancestor_timeline_id = details["ancestor_timeline_id"]
        if expected_ancestor is None:
@@ -508,13 +509,17 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
            assert len(lineage.get("original_ancestor", [])) == 0
            assert len(lineage.get("reparenting_history", [])) == 0

-    for branch_name, queried_timeline, _, rows, starts in expected_result:
-        details = client.timeline_detail(env.initial_tenant, queried_timeline)
-        log.info(f"reading data from branch {branch_name}")
-        # specifying the lsn makes the endpoint read-only and not connect to safekeepers
+    for branch_name, queried_timeline, _, rows, starts, read_only in expected_result:
+        last_record_lsn = None
+        if read_only:
+            # specifying the lsn makes the endpoint read-only and not connect to safekeepers
+            details = client.timeline_detail(env.initial_tenant, queried_timeline)
+            last_record_lsn = Lsn(details["last_record_lsn"])
+
+        log.info(f"reading data from branch {branch_name} at {last_record_lsn}")
        with env.endpoints.create(
            branch_name,
-            lsn=Lsn(details["last_record_lsn"]),
+            lsn=last_record_lsn,
        ) as ep:
            ep.start(safekeeper_generation=1)
            assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
@@ -1884,6 +1889,31 @@ def test_timeline_detach_with_aux_files_with_detach_v1(
    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])


+def test_detach_ancestors_with_no_writes(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Create a chain of branches, detaching every new branch from its ancestor right after it
+    is created, with the only endpoint stopped before the first branch exists.
+    """
+    env = neon_env_builder.init_start()
+
+    # Touch "main" once (create a logical replication slot), then stop the endpoint
+    endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    endpoint.safe_psql(
+        "SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
+    )
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    endpoint.stop()
+
+    # One client serves every detach call; b1 branches off "main", b{i + 1} off b{i}
+    client = env.pageserver.http_client()
+    for i in range(5):
+        ancestor_name = "main" if i == 0 else f"b{i}"
+        tlid = env.create_branch(f"b{i + 1}", ancestor_branch_name=ancestor_name)
+        client.detach_ancestor(tenant_id=env.initial_tenant, timeline_id=tlid)
+
 # TODO:
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2740,3 +2740,85 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
            raise Exception("Uneviction did not happen on source safekeeper yet")

    wait_until(unevicted)
+
+
+def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that the timeline disk usage circuit breaker works as expected. We test that:
+    1. The circuit breaker kicks in when the timeline's disk usage exceeds the configured limit,
+       and it causes writes to hang.
+    2. The hanging writes unblock when the issue resolves (by restarting the safekeeper in the
+       test to simulate a more realistic production troubleshooting scenario).
+    3. We can continue to write as normal after the issue resolves.
+    4. There is no data corruption throughout the test.
+    """
+    # Set up an environment with a single safekeeper that has remote storage enabled
+    neon_env_builder.num_safekeepers = 1
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
+
+    # Set a very small disk usage limit (1KB)
+    neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"]
+
+    env = neon_env_builder.init_start()
+
+    # Create a timeline and endpoint
+    env.create_branch("test_timeline_disk_usage_limit")
+    endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit")
+
+    # Inject a failpoint on the safekeeper to stop WAL backup
+    sk = env.safekeepers[0]
+    with sk.http_client() as http_cli:
+        http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")])
+
+    # Write some data that will exceed the 1KB limit. While the failpoint is active, this operation
+    # will hang as Postgres encounters safekeeper-returned errors and retries.
+    def run_hanging_insert():
+        with closing(endpoint.connect()) as bg_conn:
+            with bg_conn.cursor() as bg_cur:
+                # This should generate more than 1KB of WAL
+                bg_cur.execute("create table t(key int, value text)")
+                bg_cur.execute("insert into t select generate_series(1,2000), 'payload'")
+
+    # Start the inserts in a background thread
+    bg_thread = threading.Thread(target=run_hanging_insert)
+    bg_thread.start()
+
+    # Wait for the error message to appear in the compute log. The check asserts instead of
+    # returning a bool because wait_until() only retries while the callback raises.
+    def error_logged():
+        assert endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None
+
+    wait_until(error_logged)
+    log.info("Found expected error message in compute log, resuming.")
+
+    # Sanity check that the hanging insert is indeed still hanging; if it is not, the circuit
+    # breaker we implemented didn't work as expected.
+    time.sleep(2)
+    assert bg_thread.is_alive(), (
+        "The hanging insert somehow unblocked without resolving the disk usage issue!"
+    )
+
+    log.info("Restarting the safekeeper to resume WAL backup.")
+    # Restart the safekeeper with default options: this both clears the failpoint and removes
+    # the 1KB disk usage limit set above.
+    for sk in env.safekeepers:
+        sk.stop().start(extra_opts=[])
+
+    # The hanging insert will now complete. Join the background thread so that we can
+    # verify that the insert completed successfully.
+    bg_thread.join(timeout=120)
+    assert not bg_thread.is_alive(), "Hanging insert did not complete after safekeeper restart"
+    log.info("Hanging insert unblocked.")
+
+    # Verify we can continue to write as normal
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("insert into t select generate_series(2001,3000), 'payload'")
+
+    # Sanity check data correctness
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("select count(*) from t")
+            # 2000 rows from first insert + 1000 from last insert
+            assert cur.fetchone() == (3000,)
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -13,50 +13,6 @@ if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder


-# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
-# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
-def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
-    # we assert below that the walreceiver is not active before data writes.
-    # with manually created timelines, it is active.
-    # FIXME: remove this test once we remove timelines_onto_safekeepers
-    neon_env_builder.storage_controller_config = {
-        "timelines_onto_safekeepers": False,
-    }
-
-    # Trigger WAL wait timeout faster
-    neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
-    env = neon_env_builder.init_start()
-    env.pageserver.http_client()
-
-    # In this test we force 'Timed out while waiting for WAL record error' while
-    # fetching basebackup and don't want any retries.
-    os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
-
-    tenant_id, timeline_id = env.create_tenant()
-    expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
-    env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
-
-    try:
-        trigger_wait_lsn_timeout(env, tenant_id)
-    except Exception as e:
-        exception_string = str(e)
-        assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
-        assert "WalReceiver status: Not active" in exception_string, (
-            "Walreceiver should not be active before any data writes"
-        )
-
-    insert_test_elements(env, tenant_id, start=0, count=1_000)
-    try:
-        trigger_wait_lsn_timeout(env, tenant_id)
-    except Exception as e:
-        exception_string = str(e)
-        assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
-        assert "WalReceiver status: Not active" not in exception_string, (
-            "Should not be inactive anymore after INSERTs are made"
-        )
-        assert "WalReceiver status" in exception_string, "But still should have some other status"
-
-
 # Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout.
 # Kills one of the safekeepers and ensures that only the active ones are printed in the state.
 def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/sql_regress/expected/neon-subxacts.out
+++ b/test_runner/sql_regress/expected/neon-subxacts.out
@@ -0,0 +1,21 @@
+DO $$
+DECLARE
+i numeric;
+BEGIN
+  create role somebody;
+  FOR i IN 1..1000000 LOOP
+    BEGIN
+	  IF i % 1000 = 0 THEN
+	    alter role somebody password 'welcome';
+	  ELSE
+        PERFORM 1;
+	  END IF;
+    EXCEPTION WHEN OTHERS THEN
+      RAISE WARNING 'error';
+    END;
+    IF I = 1000000 THEN
+      PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
+    END IF;
+  END LOOP;
+END;
+$$;
--- a/test_runner/sql_regress/parallel_schedule
+++ b/test_runner/sql_regress/parallel_schedule
@@ -10,3 +10,4 @@ test: neon-clog
 test: neon-test-utils
 test: neon-vacuum-full
 test: neon-event-triggers
+test: neon-subxacts
--- a/test_runner/sql_regress/sql/neon-subxacts.sql
+++ b/test_runner/sql_regress/sql/neon-subxacts.sql
@@ -0,0 +1,21 @@
+DO $$
+DECLARE
+i numeric;
+BEGIN
+  create role somebody;
+  FOR i IN 1..1000000 LOOP
+    BEGIN
+	  IF i % 1000 = 0 THEN
+	    alter role somebody password 'welcome';
+	  ELSE
+        PERFORM 1;
+	  END IF;
+    EXCEPTION WHEN OTHERS THEN
+      RAISE WARNING 'error';
+    END;
+    IF I = 1000000 THEN
+      PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
+    END IF;
+  END LOOP;
+END;
+$$;