Fix test_lfc_prewarm: reduce number of prewarms, sleep before LFC offloading (#12515)

Fixes:
- Sleep before LFC offloading in `test_lfc_prewarm[autoprewarm]` to
ensure the offloaded LFC is the one exported after all writes finish.
- Reduce the number of prewarms and increase the timeout in
`test_lfc_prewarm_under_workload`, as debug builds were
timing out.

Additional changes:
- Remove `check_pinned_entries`:
https://github.com/neondatabase/neon/pull/12447#discussion_r2185946210
- Fix LFC error metric descriptions:
https://github.com/neondatabase/neon/pull/12486#discussion_r2190763107
This commit is contained in:
Mikhail
2025-07-10 12:11:53 +01:00
committed by GitHub
parent f4b03ddd7b
commit bdca5b500b
2 changed files with 21 additions and 27 deletions

View File

@@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| { pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!( register_int_counter!(
"compute_ctl_lfc_prewarm_errors_total", "compute_ctl_lfc_prewarm_errors_total",
"Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option", "Total number of LFC prewarm errors",
) )
.expect("failed to define a metric") .expect("failed to define a metric")
}); });
@@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| { pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!( register_int_counter!(
"compute_ctl_lfc_offload_errors_total", "compute_ctl_lfc_offload_errors_total",
"Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option", "Total number of LFC offload errors",
) )
.expect("failed to define a metric") .expect("failed to define a metric")
}); });

View File

@@ -1,6 +1,7 @@
import random import random
import threading import threading
from enum import StrEnum from enum import StrEnum
from time import sleep
from typing import Any from typing import Any
import pytest import pytest
@@ -24,18 +25,7 @@ OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total" OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total"
METHOD_VALUES = [e for e in PrewarmMethod] METHOD_VALUES = [e for e in PrewarmMethod]
METHOD_IDS = [e.value for e in PrewarmMethod] METHOD_IDS = [e.value for e in PrewarmMethod]
AUTOOFFLOAD_INTERVAL_SECS = 2
def check_pinned_entries(cur: Cursor):
"""
Wait till none of LFC buffers are pinned
"""
def none_pinned():
cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
assert cur.fetchall()[0][0] == 0
wait_until(none_pinned)
def prom_parse(client: EndpointHttpClient) -> dict[str, float]: def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
@@ -49,9 +39,18 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any: def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
if method == PrewarmMethod.POSTGRES:
cur.execute("select get_local_cache_state()")
return cur.fetchall()[0][0]
if method == PrewarmMethod.AUTOPREWARM: if method == PrewarmMethod.AUTOPREWARM:
# With autoprewarm, we need to be sure LFC was offloaded after all writes
# finish, so we sleep. Otherwise we'll have less prewarmed pages than we want
sleep(AUTOOFFLOAD_INTERVAL_SECS)
client.offload_lfc_wait() client.offload_lfc_wait()
elif method == PrewarmMethod.COMPUTE_CTL: return
if method == PrewarmMethod.COMPUTE_CTL:
status = client.prewarm_lfc_status() status = client.prewarm_lfc_status()
assert status["status"] == "not_prewarmed" assert status["status"] == "not_prewarmed"
assert "error" not in status assert "error" not in status
@@ -60,11 +59,9 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor)
parsed = prom_parse(client) parsed = prom_parse(client)
desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0} desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
assert parsed == desired, f"{parsed=} != {desired=}" assert parsed == desired, f"{parsed=} != {desired=}"
elif method == PrewarmMethod.POSTGRES: return
cur.execute("select get_local_cache_state()")
return cur.fetchall()[0][0] raise AssertionError(f"{method} not in PrewarmMethod")
else:
raise AssertionError(f"{method} not in PrewarmMethod")
def prewarm_endpoint( def prewarm_endpoint(
@@ -106,14 +103,13 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
"neon.file_cache_size_limit=1GB", "neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=1000", "neon.file_cache_prewarm_limit=1000",
] ]
offload_secs = 2
if method == PrewarmMethod.AUTOPREWARM: if method == PrewarmMethod.AUTOPREWARM:
endpoint = env.endpoints.create_start( endpoint = env.endpoints.create_start(
branch_name="main", branch_name="main",
config_lines=cfg, config_lines=cfg,
autoprewarm=True, autoprewarm=True,
offload_lfc_interval_seconds=offload_secs, offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS,
) )
else: else:
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
@@ -135,7 +131,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
endpoint.stop() endpoint.stop()
if method == PrewarmMethod.AUTOPREWARM: if method == PrewarmMethod.AUTOPREWARM:
endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs) endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS)
else: else:
endpoint.start() endpoint.start()
@@ -162,7 +158,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
lfc_cur.execute("select sum(pk) from t") lfc_cur.execute("select sum(pk) from t")
assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
check_pinned_entries(pg_cur)
desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
check_prewarmed(method, client, desired) check_prewarmed(method, client, desired)
@@ -243,9 +238,9 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
prewarm_thread.start() prewarm_thread.start()
def prewarmed(): def prewarmed():
assert n_prewarms > 5 assert n_prewarms > 3
wait_until(prewarmed) wait_until(prewarmed, timeout=40) # debug builds don't finish in 20s
running = False running = False
for t in workload_threads: for t in workload_threads:
@@ -256,7 +251,6 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
total_balance = lfc_cur.fetchall()[0][0] total_balance = lfc_cur.fetchall()[0][0]
assert total_balance == 0 assert total_balance == 0
check_pinned_entries(pg_cur)
if method == PrewarmMethod.POSTGRES: if method == PrewarmMethod.POSTGRES:
return return
desired = { desired = {