From af5c54ed14f34dfee477659af39628c0d7ec3502 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 Sep 2024 23:38:16 +0300 Subject: [PATCH] test: Make test_lfc_resize more robust (#9117) 1. Increase statement_timeout. It defaults to 120 s, which is not quite enough on slow or busy systems with debug build. On my laptop, the index creation takes about 100 s. On buildfarm, we've seen failures, e.g: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9084/10997888708/index.html#suites/821f97908a487f1d7d3a2a4dd1571e99/db1834bddfe8c5b9/ 2. Keep twiddling the LFC size through the whole test. Before, we would do it for the first 10 seconds, but that only covers a small part of the pgbench initialization phase. Change the loop so that the pgbench run time determines how long the test runs, and we keep changing the LFC for the whole time. In the passing, also fix bogus test description, copy-pasted from a completely unrelated test. --- test_runner/regress/test_lfc_resize.py | 51 ++++++++++++++++++-------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index cb0b30d9c6..0f791e9247 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -10,11 +10,11 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin -# -# Test branching, when a transaction is in prepared state -# @pytest.mark.timeout(600) def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): + """ + Test resizing the Local File Cache + """ env = neon_simple_env endpoint = env.endpoints.create_start( "main", @@ -32,27 +32,48 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr]) - thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) + # Initializing the pgbench database can be very slow, especially on debug builds. + connstr = endpoint.connstr(options="-cstatement_timeout=300s") + + thread = threading.Thread(target=run_pgbench, args=(connstr,), daemon=True) thread.start() conn = endpoint.connect() cur = conn.cursor() - for _ in range(n_resize): + # For as long as pgbench is running, twiddle the LFC size once a second. + # Note that we launch this immediately, already while the "pgbench -i" + # initialization step is still running. That's quite a different workload + # than the actual pgbench benchamark run, so this gives us coverage of both. + while thread.is_alive(): size = random.randint(1, 512) cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) - - cur.execute("alter system set neon.file_cache_size_limit='100MB'") - cur.execute("select pg_reload_conf()") - thread.join() - lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" - lfc_file_size = os.path.getsize(lfc_file_path) - res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True) - lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] - log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") - assert lfc_file_size <= 512 * 1024 * 1024 + # At the end, set it at 100 MB, and perform a final check that the disk usage + # of the file is in that ballbark. + # + # We retry the check a few times, because it might take a while for the + # system to react to changing the setting and shrinking the file. + cur.execute("alter system set neon.file_cache_size_limit='100MB'") + cur.execute("select pg_reload_conf()") + nretries = 10 + while True: + lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" + lfc_file_size = os.path.getsize(lfc_file_path) + res = subprocess.run( + ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True + ) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + assert lfc_file_size <= 512 * 1024 * 1024 + + if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0: + break + + nretries = nretries - 1 + time.sleep(1) + assert int(lfc_file_blocks) <= 128 * 1024