offload_lfc_interval_seconds in ComputeSpec (#12447)

- Add ComputeSpec flag `offload_lfc_interval_seconds` controlling whether LFC should be offloaded to endpoint storage. Default value (None) means "don't offload". - Add glue code around it for `neon_local` and integration tests. - Add `autoprewarm` mode for `test_lfc_prewarm` testing `offload_lfc_interval_seconds` and `autoprewarm` flags in conjunction. - Rename `compute_ctl_lfc_prewarm_requests_total` and `compute_ctl_lfc_offload_requests_total` to `compute_ctl_lfc_prewarms_total` and `compute_ctl_lfc_offloads_total` to reflect we count prewarms and offloads, not `compute_ctl` requests of those. Don't count request in metrics if there is a prewarm/offload already ongoing. https://github.com/neondatabase/cloud/issues/19011 Resolves: https://github.com/neondatabase/cloud/issues/30770
2026-07-06 21:50:37 +00:00 · 2025-07-04 19:49:57 +01:00
parent 3a44774227
commit 7ed4530618
13 changed files with 296 additions and 178 deletions
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -57,6 +57,8 @@ class EndpointHttpClient(requests.Session):
        self.auth = BearerAuth(jwt)

        self.mount("http://", HTTPAdapter())
+        self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm"
+        self.offload_url = f"http://localhost:{external_port}/lfc/offload"

    def dbs_and_roles(self):
        res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth)
@@ -64,33 +66,39 @@ class EndpointHttpClient(requests.Session):
        return res.json()

    def prewarm_lfc_status(self) -> dict[str, str]:
-        res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm")
+        res = self.get(self.prewarm_url)
        res.raise_for_status()
        json: dict[str, str] = res.json()
        return json

    def prewarm_lfc(self, from_endpoint_id: str | None = None):
-        url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
        params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
-        self.post(url, params=params).raise_for_status()
+        self.post(self.prewarm_url, params=params).raise_for_status()
+        self.prewarm_lfc_wait()

+    def prewarm_lfc_wait(self):
        def prewarmed():
            json = self.prewarm_lfc_status()
            status, err = json["status"], json.get("error")
-            assert status == "completed", f"{status}, error {err}"
+            assert status == "completed", f"{status}, {err=}"

        wait_until(prewarmed, timeout=60)

-    def offload_lfc(self):
-        url = f"http://localhost:{self.external_port}/lfc/offload"
-        self.post(url).raise_for_status()
+    def offload_lfc_status(self) -> dict[str, str]:
+        res = self.get(self.offload_url)
+        res.raise_for_status()
+        json: dict[str, str] = res.json()
+        return json

+    def offload_lfc(self):
+        self.post(self.offload_url).raise_for_status()
+        self.offload_lfc_wait()
+
+    def offload_lfc_wait(self):
        def offloaded():
-            res = self.get(url)
-            res.raise_for_status()
-            json = res.json()
+            json = self.offload_lfc_status()
            status, err = json["status"], json.get("error")
-            assert status == "completed", f"{status}, error {err}"
+            assert status == "completed", f"{status}, {err=}"

        wait_until(offloaded)

--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -568,6 +568,8 @@ class NeonLocalCli(AbstractNeonCli):
        timeout: str | None = None,
        env: dict[str, str] | None = None,
        dev: bool = False,
+        autoprewarm: bool = False,
+        offload_lfc_interval_seconds: int | None = None,
    ) -> subprocess.CompletedProcess[str]:
        args = [
            "endpoint",
@@ -593,6 +595,10 @@ class NeonLocalCli(AbstractNeonCli):
            args.extend(["--create-test-user"])
        if timeout is not None:
            args.extend(["--start-timeout", str(timeout)])
+        if autoprewarm:
+            args.extend(["--autoprewarm"])
+        if offload_lfc_interval_seconds is not None:
+            args.extend(["--offload-lfc-interval-seconds", str(offload_lfc_interval_seconds)])
        if dev:
            args.extend(["--dev"])

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4362,6 +4362,8 @@ class Endpoint(PgProtocol, LogUtils):
        basebackup_request_tries: int | None = None,
        timeout: str | None = None,
        env: dict[str, str] | None = None,
+        autoprewarm: bool = False,
+        offload_lfc_interval_seconds: int | None = None,
    ) -> Self:
        """
        Start the Postgres instance.
@@ -4386,6 +4388,8 @@ class Endpoint(PgProtocol, LogUtils):
            basebackup_request_tries=basebackup_request_tries,
            timeout=timeout,
            env=env,
+            autoprewarm=autoprewarm,
+            offload_lfc_interval_seconds=offload_lfc_interval_seconds,
        )
        self._running.release(1)
        self.log_config_value("shared_buffers")
@@ -4601,6 +4605,8 @@ class Endpoint(PgProtocol, LogUtils):
        pageserver_id: int | None = None,
        allow_multiple: bool = False,
        basebackup_request_tries: int | None = None,
+        autoprewarm: bool = False,
+        offload_lfc_interval_seconds: int | None = None,
    ) -> Self:
        """
        Create an endpoint, apply config, and start Postgres.
@@ -4621,6 +4627,8 @@ class Endpoint(PgProtocol, LogUtils):
            pageserver_id=pageserver_id,
            allow_multiple=allow_multiple,
            basebackup_request_tries=basebackup_request_tries,
+            autoprewarm=autoprewarm,
+            offload_lfc_interval_seconds=offload_lfc_interval_seconds,
        )

        return self
@@ -4705,6 +4713,8 @@ class EndpointFactory:
        remote_ext_base_url: str | None = None,
        pageserver_id: int | None = None,
        basebackup_request_tries: int | None = None,
+        autoprewarm: bool = False,
+        offload_lfc_interval_seconds: int | None = None,
    ) -> Endpoint:
        ep = Endpoint(
            self.env,
@@ -4726,6 +4736,8 @@ class EndpointFactory:
            remote_ext_base_url=remote_ext_base_url,
            pageserver_id=pageserver_id,
            basebackup_request_tries=basebackup_request_tries,
+            autoprewarm=autoprewarm,
+            offload_lfc_interval_seconds=offload_lfc_interval_seconds,
        )

    def create(
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,34 +1,38 @@
 import random
 import threading
-import time
-from enum import Enum
+from enum import StrEnum
+from time import sleep
+from typing import Any

 import pytest
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.utils import USE_LFC
+from fixtures.utils import USE_LFC, wait_until
 from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl
+from psycopg2.extensions import cursor as Cursor


-class LfcQueryMethod(Enum):
-    COMPUTE_CTL = False
-    POSTGRES = True
+class PrewarmMethod(StrEnum):
+    POSTGRES = "postgres"
+    COMPUTE_CTL = "compute-ctl"
+    AUTOPREWARM = "autoprewarm"


-PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total"
-OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total"
-QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL
+PREWARM_LABEL = "compute_ctl_lfc_prewarms_total"
+OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
+METHOD_VALUES = [e for e in PrewarmMethod]
+METHOD_IDS = [e.value for e in PrewarmMethod]


-def check_pinned_entries(cur):
+def check_pinned_entries(cur: Cursor):
    # some LFC buffer can be temporary locked by autovacuum or background writer
    for _ in range(10):
        cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
        n_pinned = cur.fetchall()[0][0]
        if n_pinned == 0:
            break
-        time.sleep(1)
+        sleep(1)
    assert n_pinned == 0


@@ -41,21 +45,68 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
    }


+def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
+    if method == PrewarmMethod.AUTOPREWARM:
+        client.offload_lfc_wait()
+    elif method == PrewarmMethod.COMPUTE_CTL:
+        status = client.prewarm_lfc_status()
+        assert status["status"] == "not_prewarmed"
+        assert "error" not in status
+        client.offload_lfc()
+        assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
+        assert prom_parse(client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
+    elif method == PrewarmMethod.POSTGRES:
+        cur.execute("select get_local_cache_state()")
+        return cur.fetchall()[0][0]
+    else:
+        raise AssertionError(f"{method} not in PrewarmMethod")
+
+
+def prewarm_endpoint(
+    method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None
+):
+    if method == PrewarmMethod.AUTOPREWARM:
+        client.prewarm_lfc_wait()
+    elif method == PrewarmMethod.COMPUTE_CTL:
+        client.prewarm_lfc()
+    elif method == PrewarmMethod.POSTGRES:
+        cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
+
+
+def check_prewarmed(
+    method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int]
+):
+    if method == PrewarmMethod.AUTOPREWARM:
+        assert client.prewarm_lfc_status() == desired_status
+        assert prom_parse(client)[PREWARM_LABEL] == 1
+    elif method == PrewarmMethod.COMPUTE_CTL:
+        assert client.prewarm_lfc_status() == desired_status
+        assert prom_parse(client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
+
+
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
-@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
-def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
+@pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS)
+def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
    env = neon_simple_env
    n_records = 1000000
-    endpoint = env.endpoints.create_start(
-        branch_name="main",
-        config_lines=[
-            "autovacuum = off",
-            "shared_buffers=1MB",
-            "neon.max_file_cache_size=1GB",
-            "neon.file_cache_size_limit=1GB",
-            "neon.file_cache_prewarm_limit=1000",
-        ],
-    )
+    cfg = [
+        "autovacuum = off",
+        "shared_buffers=1MB",
+        "neon.max_file_cache_size=1GB",
+        "neon.file_cache_size_limit=1GB",
+        "neon.file_cache_prewarm_limit=1000",
+    ]
+    offload_secs = 2
+
+    if method == PrewarmMethod.AUTOPREWARM:
+        endpoint = env.endpoints.create_start(
+            branch_name="main",
+            config_lines=cfg,
+            autoprewarm=True,
+            offload_lfc_interval_seconds=offload_secs,
+        )
+    else:
+        endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)

    pg_conn = endpoint.connect()
    pg_cur = pg_conn.cursor()
@@ -69,31 +120,21 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
    lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
    log.info(f"Inserted {n_records} rows")

-    http_client = endpoint.http_client()
-    if query is LfcQueryMethod.COMPUTE_CTL:
-        status = http_client.prewarm_lfc_status()
-        assert status["status"] == "not_prewarmed"
-        assert "error" not in status
-        http_client.offload_lfc()
-        assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed"
-        assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
-    else:
-        pg_cur.execute("select get_local_cache_state()")
-        lfc_state = pg_cur.fetchall()[0][0]
+    client = endpoint.http_client()
+    lfc_state = offload_lfc(method, client, pg_cur)

    endpoint.stop()
-    endpoint.start()
+    if method == PrewarmMethod.AUTOPREWARM:
+        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
+    else:
+        endpoint.start()

    pg_conn = endpoint.connect()
    pg_cur = pg_conn.cursor()

    lfc_conn = endpoint.connect(dbname="lfc")
    lfc_cur = lfc_conn.cursor()
-
-    if query is LfcQueryMethod.COMPUTE_CTL:
-        http_client.prewarm_lfc()
-    else:
-        pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
+    prewarm_endpoint(method, client, pg_cur, lfc_state)

    pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
    lfc_used_pages = pg_cur.fetchall()[0][0]
@@ -111,33 +152,32 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
        and prewarm_info[1] > 0
        and prewarm_info[0] == prewarm_info[1] + prewarm_info[2]
    )
-
    lfc_cur.execute("select sum(pk) from t")
    assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2

    check_pinned_entries(pg_cur)
-
    desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
-    if query is LfcQueryMethod.COMPUTE_CTL:
-        assert http_client.prewarm_lfc_status() == desired
-        assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
+    check_prewarmed(method, client, desired)
+
+
+# autoprewarm isn't needed as we prewarm manually
+WORKLOAD_VALUES = METHOD_VALUES[:-1]
+WORKLOAD_IDS = METHOD_IDS[:-1]


@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
-@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"])
-def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod):
+@pytest.mark.parametrize("method", WORKLOAD_VALUES, ids=WORKLOAD_IDS)
+def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMethod):
    env = neon_simple_env
    n_records = 10000
    n_threads = 4
-    endpoint = env.endpoints.create_start(
-        branch_name="main",
-        config_lines=[
-            "shared_buffers=1MB",
-            "neon.max_file_cache_size=1GB",
-            "neon.file_cache_size_limit=1GB",
-            "neon.file_cache_prewarm_limit=1000000",
-        ],
-    )
+    cfg = [
+        "shared_buffers=1MB",
+        "neon.max_file_cache_size=1GB",
+        "neon.file_cache_size_limit=1GB",
+        "neon.file_cache_prewarm_limit=1000000",
+    ]
+    endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)

    pg_conn = endpoint.connect()
    pg_cur = pg_conn.cursor()
@@ -154,12 +194,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
    log.info(f"Inserted {n_records} rows")

    http_client = endpoint.http_client()
-    if query is LfcQueryMethod.COMPUTE_CTL:
-        http_client.offload_lfc()
-    else:
-        pg_cur.execute("select get_local_cache_state()")
-        lfc_state = pg_cur.fetchall()[0][0]
-
+    lfc_state = offload_lfc(method, http_client, pg_cur)
    running = True
    n_prewarms = 0

@@ -170,8 +205,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
        while running:
            src = random.randint(1, n_records)
            dst = random.randint(1, n_records)
-            lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,))
-            lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,))
+            lfc_cur.execute(f"update accounts set balance=balance-100 where id={src}")
+            lfc_cur.execute(f"update accounts set balance=balance+100 where id={dst}")
            n_transfers += 1
        log.info(f"Number of transfers: {n_transfers}")

@@ -183,13 +218,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
            pg_cur.execute("select pg_reload_conf()")
            pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'")
            pg_cur.execute("select pg_reload_conf()")
-
-            if query is LfcQueryMethod.COMPUTE_CTL:
-                # Same thing as prewarm_lfc(), testing other method
-                http_client.prewarm_lfc(endpoint.endpoint_id)
-            else:
-                pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
-
+            prewarm_endpoint(method, http_client, pg_cur, lfc_state)
            nonlocal n_prewarms
            n_prewarms += 1
        log.info(f"Number of prewarms: {n_prewarms}")
@@ -203,7 +232,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
    prewarm_thread = threading.Thread(target=prewarm)
    prewarm_thread.start()

-    time.sleep(20)
+    def prewarmed():
+        assert n_prewarms > 5
+
+    wait_until(prewarmed)

    running = False
    for t in workload_threads:
@@ -215,5 +247,5 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
    assert total_balance == 0

    check_pinned_entries(pg_cur)
-    if query is LfcQueryMethod.COMPUTE_CTL:
+    if method != PrewarmMethod.POSTGRES:
        assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms}