Fix test_lfc_prewarm: reduce number of prewarms, sleep before LFC offloading (#12515)

Fixes:
- Sleep before LFC offloading in `test_lfc_prewarm[autoprewarm]` to ensure the offloaded LFC is the one exported after all writes finish.
- Reduce the number of prewarms and increase the timeout in `test_lfc_prewarm_under_workload`, as debug builds were failing due to timeouts.

Additional changes:
- Remove `check_pinned_entries`: https://github.com/neondatabase/neon/pull/12447#discussion_r2185946210
- Fix LFC error metrics descriptions: https://github.com/neondatabase/neon/pull/12486#discussion_r2190763107
2025-12-22 21:59:59 +00:00 · 2025-07-10 12:11:53 +01:00
parent f4b03ddd7b
commit bdca5b500b
2 changed files with 21 additions and 27 deletions
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "compute_ctl_lfc_prewarm_errors_total",
-        "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option",
+        "Total number of LFC prewarm errors",
    )
    .expect("failed to define a metric")
 });
@@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "compute_ctl_lfc_offload_errors_total",
-        "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option",
+        "Total number of LFC offload errors",
    )
    .expect("failed to define a metric")
 });
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,6 +1,7 @@
 import random
 import threading
 from enum import StrEnum
+from time import sleep
 from typing import Any

 import pytest
@@ -24,18 +25,7 @@ OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
 OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total"
 METHOD_VALUES = [e for e in PrewarmMethod]
 METHOD_IDS = [e.value for e in PrewarmMethod]
-
-
-def check_pinned_entries(cur: Cursor):
-    """
-    Wait till none of LFC buffers are pinned
-    """
-
-    def none_pinned():
-        cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
-        assert cur.fetchall()[0][0] == 0
-
-    wait_until(none_pinned)
+AUTOOFFLOAD_INTERVAL_SECS = 2


 def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
@@ -49,9 +39,18 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:


 def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
+    if method == PrewarmMethod.POSTGRES:
+        cur.execute("select get_local_cache_state()")
+        return cur.fetchall()[0][0]
+
    if method == PrewarmMethod.AUTOPREWARM:
+        # With autoprewarm, we need to be sure the LFC was offloaded after all writes
+        # finish, so we sleep. Otherwise we'll have fewer prewarmed pages than we want
+        sleep(AUTOOFFLOAD_INTERVAL_SECS)
        client.offload_lfc_wait()
-    elif method == PrewarmMethod.COMPUTE_CTL:
+        return
+
+    if method == PrewarmMethod.COMPUTE_CTL:
        status = client.prewarm_lfc_status()
        assert status["status"] == "not_prewarmed"
        assert "error" not in status
@@ -60,11 +59,9 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor)
        parsed = prom_parse(client)
        desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
        assert parsed == desired, f"{parsed=} != {desired=}"
-    elif method == PrewarmMethod.POSTGRES:
-        cur.execute("select get_local_cache_state()")
-        return cur.fetchall()[0][0]
-    else:
-        raise AssertionError(f"{method} not in PrewarmMethod")
+        return
+
+    raise AssertionError(f"{method} not in PrewarmMethod")


 def prewarm_endpoint(
@@ -106,14 +103,13 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
        "neon.file_cache_size_limit=1GB",
        "neon.file_cache_prewarm_limit=1000",
    ]
-    offload_secs = 2

    if method == PrewarmMethod.AUTOPREWARM:
        endpoint = env.endpoints.create_start(
            branch_name="main",
            config_lines=cfg,
            autoprewarm=True,
-            offload_lfc_interval_seconds=offload_secs,
+            offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS,
        )
    else:
        endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
@@ -135,7 +131,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):

    endpoint.stop()
    if method == PrewarmMethod.AUTOPREWARM:
-        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
+        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS)
    else:
        endpoint.start()

@@ -162,7 +158,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
    lfc_cur.execute("select sum(pk) from t")
    assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2

-    check_pinned_entries(pg_cur)
    desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
    check_prewarmed(method, client, desired)

@@ -243,9 +238,9 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
    prewarm_thread.start()

    def prewarmed():
-        assert n_prewarms > 5
+        assert n_prewarms > 3

-    wait_until(prewarmed)
+    wait_until(prewarmed, timeout=40)  # debug builds don't finish in 20s

    running = False
    for t in workload_threads:
@@ -256,7 +251,6 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
    total_balance = lfc_cur.fetchall()[0][0]
    assert total_balance == 0

-    check_pinned_entries(pg_cur)
    if method == PrewarmMethod.POSTGRES:
        return
    desired = {