test_runner: use LFC by default (#8613)

## Problem
LFC is enabled in production but not in tests, which increases the risk of errors reaching production that the routine test workflow never catches.
However, enabling LFC for all tests could overload the disks on our CI servers and increase the number of failures, so we start by enabling LFC in a single configuration to evaluate the risk.

## Summary of changes
A new environment variable, `USE_LFC`, is introduced. Unless it is set to `false`, LFC is enabled by default in all tests.
In our CI workflow, LFC is enabled for the PG17 / release / x86-64 combination and disabled for all other combinations.
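The gate itself is a single check in the test fixtures (see the `fixtures/utils.py` hunk below); a minimal sketch of the intended semantics:

```python
import os

# LFC is on unless USE_LFC is explicitly "false"; an unset
# variable therefore also enables LFC.
USE_LFC = os.environ.get("USE_LFC") != "false"
```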

---------

Co-authored-by: Alexey Masterov <alexeymasterov@neon.tech>
Co-authored-by: a-masterov <72613290+a-masterov@users.noreply.github.com>
Alexander Bayandin
2024-11-25 09:01:05 +00:00
committed by GitHub
parent 450be26bbb
commit 6f7aeaa1c5
20 changed files with 158 additions and 49 deletions

View File

@@ -19,8 +19,8 @@ on:
description: 'debug or release'
required: true
type: string
pg-versions:
description: 'a json array of postgres versions to run regression tests on'
test-cfg:
description: 'a json object of postgres versions and lfc states to run regression tests on'
required: true
type: string
@@ -276,14 +276,14 @@ jobs:
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
fail-fast: false
matrix:
pg_version: ${{ fromJson(inputs.pg-versions) }}
matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }}
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
@@ -300,6 +300,7 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
# Temporarily disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
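For reference, the `fromJSON(format(...))` expression above builds the strategy matrix from the `test-cfg` input. Conceptually (a Python sketch of what the Actions expressions compute, not how the runner implements it):

```python
import json

# test-cfg as passed for debug builds (see the workflow hunk below)
test_cfg = '[{"pg_version": "v17", "lfc_state": "without-lfc"}]'

# format('{{"include":{0}}}', inputs.test-cfg) wraps the array in an
# "include" key; fromJSON then parses it into the strategy matrix.
matrix = json.loads('{"include":' + test_cfg + "}")
# -> {'include': [{'pg_version': 'v17', 'lfc_state': 'without-lfc'}]}
```

Each `include` entry becomes one job, exposing `matrix.pg_version` and `matrix.lfc_state` to the steps.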

View File

@@ -253,7 +253,14 @@ jobs:
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
# with-lfc runs only on v17 release; all other combinations run without LFC
test-cfg: |
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
{"pg_version":"v15", "lfc_state": "without-lfc"},
{"pg_version":"v16", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc"}]'
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking

View File

@@ -31,6 +31,7 @@ CREATE TABLE IF NOT EXISTS results (
duration INT NOT NULL,
flaky BOOLEAN NOT NULL,
arch arch DEFAULT 'X64',
lfc BOOLEAN DEFAULT false NOT NULL,
build_type TEXT NOT NULL,
pg_version INT NOT NULL,
run_id BIGINT NOT NULL,
@@ -54,6 +55,7 @@ class Row:
duration: int
flaky: bool
arch: str
lfc: bool
build_type: str
pg_version: int
run_id: int
@@ -132,6 +134,7 @@ def ingest_test_result(
if p["name"].startswith("__")
}
arch = parameters.get("arch", "UNKNOWN").strip("'")
lfc = parameters.get("lfc", "False") == "True"
build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
labels = {label["name"]: label["value"] for label in test["labels"]}
@@ -145,6 +148,7 @@ def ingest_test_result(
duration=test["time"]["duration"],
flaky=test["flaky"] or test["retriesStatusChange"],
arch=arch,
lfc=lfc,
build_type=build_type,
pg_version=pg_version,
run_id=run_id,
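The flag travels from pytest to the results database through an Allure parameter: the conftest hook below records `__lfc`, the comprehension above keeps only `__`-prefixed parameters and strips the prefix, and the string value is converted back to a boolean. A hedged sketch of that round trip (the full comprehension is truncated in the hunk above, so the exact shapes are assumptions):

```python
# Assumed shape of Allure's parameter list for one test; arch values
# arrive quoted, which is why the lookup strips single quotes.
raw = [{"name": "__arch", "value": "'X64'"}, {"name": "__lfc", "value": "True"}]

parameters = {
    p["name"].removeprefix("__"): p["value"]
    for p in raw
    if p["name"].startswith("__")
}
arch = parameters.get("arch", "UNKNOWN").strip("'")  # -> "X64"
lfc = parameters.get("lfc", "False") == "True"       # -> True
```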

View File

@@ -90,10 +90,12 @@ from fixtures.safekeeper.utils import wait_walreceivers_absent
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
COMPONENT_BINARIES,
USE_LFC,
allure_add_grafana_links,
assert_no_errors,
get_dir_size,
print_gc_result,
size_to_bytes,
subprocess_capture,
wait_until,
)
@@ -3742,12 +3744,45 @@ class Endpoint(PgProtocol, LogUtils):
self.pgdata_dir = self.env.repo_dir / path
self.logfile = self.endpoint_path() / "compute.log"
config_lines = config_lines or []
# set small 'max_replication_write_lag' to enable backpressure
# and make tests more stable.
config_lines = ["max_replication_write_lag=15MB"] + config_lines
# Delete file cache if it exists (and we're recreating the endpoint)
if USE_LFC:
if (lfc_path := Path(self.lfc_path())).exists():
lfc_path.unlink()
else:
lfc_path.parent.mkdir(parents=True, exist_ok=True)
for line in config_lines:
if (
line.find("neon.max_file_cache_size") > -1
or line.find("neon.file_cache_size_limit") > -1
):
m = re.search(r"=\s*(\S+)", line)
assert m is not None, f"malformed config line {line}"
size = m.group(1)
assert size_to_bytes(size) >= size_to_bytes(
"1MB"
), "LFC size cannot be set less than 1MB"
# shared_buffers = 512kB to make postgres use LFC intensively
# neon.max_file_cache_size and neon.file_cache_size_limit are
# set to 1MB because a small LFC is better for testing (helps to find more problems)
config_lines = [
"shared_buffers = 512kB",
f"neon.file_cache_path = '{self.lfc_path()}'",
"neon.max_file_cache_size = 1MB",
"neon.file_cache_size_limit = 1MB",
] + config_lines
else:
for line in config_lines:
assert (
line.find("neon.max_file_cache_size") == -1
), "Setting LFC parameters is not allowed when LFC is disabled"
assert (
line.find("neon.file_cache_size_limit") == -1
), "Setting LFC parameters is not allowed when LFC is disabled"
self.config(config_lines)
return self
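With `USE_LFC` on, every endpoint created through the fixtures therefore runs with a 512kB `shared_buffers` and a 1MB LFC. Because the injected defaults are prepended and user-supplied `config_lines` come last (the last occurrence of a setting wins in postgresql.conf), a test can still enlarge the cache, as long as it respects the 1MB floor asserted above. A hypothetical override:

```python
# Hypothetical test: keep LFC enabled but make it larger than the
# 1MB default injected by the fixture.
endpoint = env.endpoints.create_start(
    "main",
    config_lines=[
        "neon.max_file_cache_size = 32MB",
        "neon.file_cache_size_limit = 32MB",
    ],
)
```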
@@ -3781,6 +3816,9 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries=basebackup_request_tries,
)
self._running.release(1)
self.log_config_value("shared_buffers")
self.log_config_value("neon.max_file_cache_size")
self.log_config_value("neon.file_cache_size_limit")
return self
@@ -3806,6 +3844,10 @@ class Endpoint(PgProtocol, LogUtils):
"""Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
return self.endpoint_path() / "postgresql.conf"
def lfc_path(self) -> Path:
"""Path to the lfc file"""
return self.endpoint_path() / "file_cache" / "file.cache"
def config(self, lines: list[str]) -> Self:
"""
Add lines to postgresql.conf.
@@ -3984,16 +4026,46 @@ class Endpoint(PgProtocol, LogUtils):
assert self.pgdata_dir is not None # please mypy
return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
def clear_shared_buffers(self, cursor: Any | None = None):
def clear_buffers(self, cursor: Any | None = None):
"""
Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.'
Might also clear LFC.
If LFC is enabled, it clears the LFC as well by setting neon.file_cache_size_limit to 0
and then restoring the previous value
"""
if cursor is not None:
cursor.execute("select clear_buffer_cache()")
if not USE_LFC:
return
cursor.execute("SHOW neon.file_cache_size_limit")
res = cursor.fetchone()
assert res, "Cannot get neon.file_cache_size_limit"
file_cache_size_limit = res[0]
if file_cache_size_limit == 0:
return
cursor.execute("ALTER SYSTEM SET neon.file_cache_size_limit=0")
cursor.execute("SELECT pg_reload_conf()")
cursor.execute(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'")
cursor.execute("SELECT pg_reload_conf()")
else:
self.safe_psql("select clear_buffer_cache()")
if not USE_LFC:
return
file_cache_size_limit = self.safe_psql_scalar(
"SHOW neon.file_cache_size_limit", log_query=False
)
if file_cache_size_limit == 0:
return
self.safe_psql("ALTER SYSTEM SET neon.file_cache_size_limit=0")
self.safe_psql("SELECT pg_reload_conf()")
self.safe_psql(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'")
self.safe_psql("SELECT pg_reload_conf()")
def log_config_value(self, param):
"""
Logs the current value of the given config parameter
"""
res = self.safe_psql_scalar(f"SHOW {param}", log_query=False)
log.info("%s = %s", param, res)
class EndpointFactory:
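Downstream tests use the renamed `clear_buffers` to force subsequent reads to be served by the pageserver (see the hunks below). A typical, hypothetical flow:

```python
# Hypothetical test flow: write, drop all caches, then read so pages
# must be reconstructed from the pageserver rather than served locally.
endpoint.safe_psql("INSERT INTO t SELECT generate_series(1, 100)")
endpoint.clear_buffers()  # shared buffers, plus the LFC when enabled
assert endpoint.safe_psql("SELECT count(*) FROM t")[0][0] == 100
```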

View File

@@ -116,5 +116,6 @@ def pytest_runtest_makereport(*args, **kwargs):
}.get(os.uname().machine, "UNKNOWN")
arch = os.getenv("RUNNER_ARCH", uname_m)
allure.dynamic.parameter("__arch", arch)
allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false")
yield

View File

@@ -57,6 +57,10 @@ VERSIONS_COMBINATIONS = (
)
# fmt: on
# LFC is enabled for tests by default; set the environment variable USE_LFC to "false" to disable it.
USE_LFC = os.environ.get("USE_LFC") != "false"
def subprocess_capture(
capture_dir: Path,
@@ -653,6 +657,23 @@ def allpairs_versions():
return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids}
def size_to_bytes(hr_size: str) -> int:
"""
Parses a human-readable size from postgresql.conf (e.g. 512kB, 10MB)
and returns the size in bytes
"""
units = {"B": 1, "kB": 1024, "MB": 1024**2, "GB": 1024**3, "TB": 1024**4, "PB": 1024**5}
match = re.search(r"^\'?(\d+)\s*([kMGTP]?B)?\'?$", hr_size)
assert match is not None, f'"{hr_size}" is not a well-formatted human-readable size'
number, unit = match.groups()
if unit:
amp = units[unit]
else:
# a bare number is assumed to mean 8kB blocks (Postgres's default block size)
amp = 8192
return int(number) * amp
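A few examples of what the helper accepts, per the regex and unit table above (the bare-number case assumes 8kB blocks):

```python
assert size_to_bytes("512kB") == 512 * 1024
assert size_to_bytes("'1MB'") == 1024**2  # postgresql.conf quoting is tolerated
assert size_to_bytes("16") == 16 * 8192   # unitless values are taken as 8kB blocks
```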
def skip_on_postgres(version: PgVersion, reason: str):
return pytest.mark.skipif(
PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version,

View File

@@ -193,7 +193,7 @@ class Workload:
def validate(self, pageserver_id: int | None = None):
endpoint = self.endpoint(pageserver_id)
endpoint.clear_shared_buffers()
endpoint.clear_buffers()
result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}")
log.info(f"validate({self.expect_rows}): {result}")

View File

@@ -5,12 +5,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
],
)
endpoint = env.endpoints.create_start("main")
conn = endpoint.connect()
cur = conn.cursor()
@@ -36,7 +31,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
# Clear the cache, so that we exercise reconstructing the pages
# from WAL
endpoint.clear_shared_buffers()
endpoint.clear_buffers()
# Check that the cursor opened earlier still works. If the
# combocids are not restored correctly, it won't.
@@ -65,12 +60,7 @@ def test_combocid_lock(neon_env_builder: NeonEnvBuilder):
def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
],
)
endpoint = env.endpoints.create_start("main")
conn = endpoint.connect()
cur = conn.cursor()
@@ -98,7 +88,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
cur.execute("delete from t")
# Clear the cache, so that we exercise reconstructing the pages
# from WAL
endpoint.clear_shared_buffers()
endpoint.clear_buffers()
# Check that the cursor opened earlier still works. If the
# combocids are not restored correctly, it won't.

View File

@@ -2,10 +2,13 @@ from __future__ import annotations
from pathlib import Path
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import USE_LFC
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
env = neon_simple_env
@@ -16,8 +19,6 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],

View File

@@ -170,7 +170,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
# re-execute the query, it will make GetPage
# requests. This does not clear the last-written LSN cache
# so we still remember the LSNs of the pages.
secondary.clear_shared_buffers(cursor=s_cur)
secondary.clear_buffers(cursor=s_cur)
if pause_apply:
s_cur.execute("SELECT pg_wal_replay_pause()")

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import os
import random
import re
import subprocess
@@ -10,20 +9,24 @@ import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin
from fixtures.utils import USE_LFC
@pytest.mark.timeout(600)
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
"""
Test resizing the Local File Cache
"""
env = neon_simple_env
cache_dir = env.repo_dir / "file_cache"
cache_dir.mkdir(exist_ok=True)
env.create_branch("test_lfc_resize")
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"neon.file_cache_path='file.cache'",
"neon.max_file_cache_size=512MB",
"neon.file_cache_size_limit=512MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
],
)
n_resize = 10
@@ -63,8 +66,8 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
cur.execute("select pg_reload_conf()")
nretries = 10
while True:
lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
lfc_file_size = os.path.getsize(lfc_file_path)
lfc_file_path = endpoint.lfc_path()
lfc_file_size = lfc_file_path.stat().st_size
res = subprocess.run(
["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True
)

View File

@@ -3,11 +3,13 @@ from __future__ import annotations
import time
from pathlib import Path
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import query_scalar
from fixtures.utils import USE_LFC, query_scalar
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
env = neon_simple_env
@@ -18,8 +20,6 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
@@ -72,9 +72,10 @@ WITH (fillfactor='100');
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 10
assert blocks < 12
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
env = neon_simple_env

View File

@@ -6,10 +6,12 @@ import random
import threading
import time
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import query_scalar
from fixtures.utils import USE_LFC, query_scalar
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
@@ -19,8 +21,6 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='64MB'",
"neon.file_cache_size_limit='10MB'",
],

View File

@@ -12,7 +12,7 @@ from fixtures.neon_fixtures import (
logical_replication_sync,
wait_for_last_flush_lsn,
)
from fixtures.utils import wait_until
from fixtures.utils import USE_LFC, wait_until
if TYPE_CHECKING:
from fixtures.neon_fixtures import (
@@ -576,7 +576,15 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van
# We want all data to fit into shared_buffers because later we stop
# safekeeper and insert more; this shouldn't cause page requests as they
# will be stuck.
sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"])
sub = env.endpoints.create(
"subscriber",
config_lines=[
"neon.max_file_cache_size = 32MB",
"neon.file_cache_size_limit = 32MB",
]
if USE_LFC
else [],
)
sub.start()
with vanilla_pg.cursor() as pcur:

View File

@@ -39,7 +39,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder):
oid = cur.fetchall()[0][0]
log.info(f"t2.relfilenode={oid}")
endpoint.clear_shared_buffers(cursor=cur)
endpoint.clear_buffers(cursor=cur)
cur.execute("SELECT x from t1")
assert cur.fetchone() == (1,)

View File

@@ -54,7 +54,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
log.info("Clear buffer cache to ensure no stale pages are brought into the cache")
endpoint.clear_shared_buffers(cursor=c)
endpoint.clear_buffers(cursor=c)
cache_entries = query_scalar(
c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}"

View File

@@ -230,7 +230,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
return offset
# Insert some records on main branch
with env.endpoints.create_start("main") as ep_main:
with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main:
with ep_main.cursor() as cur:
cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)")
lsn = Lsn(0)

View File

@@ -416,7 +416,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn
assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None
ep.clear_shared_buffers()
ep.clear_buffers()
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0
ep.stop()

View File

@@ -63,7 +63,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
# Clear the buffer cache, to force the VM page to be re-fetched from
# the page server
endpoint.clear_shared_buffers(cursor=cur)
endpoint.clear_buffers(cursor=cur)
# Check that an index-only scan doesn't see the deleted row. If the
# clearing of the VM bit was not replayed correctly, this would incorrectly

View File

@@ -2446,7 +2446,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
# generate some data to commit WAL on safekeepers
endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
# clear the buffers
endpoint.clear_shared_buffers()
endpoint.clear_buffers()
# read data to fetch pages from pageserver
endpoint.safe_psql("select sum(i) from t")