From 842c3d8c10e08dcebe76e55bc06d2cac065bc6a6 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 24 Jul 2024 11:26:24 +0100
Subject: [PATCH] tests: simplify code around unstable
 `test_basebackup_with_high_slru_count` (#8477)

## Problem

In `test_basebackup_with_high_slru_count`, the pageserver is sometimes
mysteriously hanging on startup, having been started+stopped earlier in
the test setup while populating template tenant data.

- #7586

We can't see why this is hanging in this particular test. The test does
some weird stuff, though, like attaching a load of broken tenants and
then doing a SIGQUIT kill of a pageserver.

## Summary of changes

- Attach tenants normally instead of doing a failpoint dance to attach
  them as broken
- Shut the pageserver down gracefully during init instead of using
  immediate mode
- Remove the "sequential" variant of the unstable test, as this is going
  away soon anyway
- Log before trying to acquire the lock file, so that if it hangs we have
  a clearer sense of whether that's really where it's hanging. It seems
  like it is, but that code does a non-blocking flock, so it's surprising.
---
 pageserver/src/bin/pageserver.rs              |  1 +
 .../fixtures/pageserver/many_tenants.py       | 35 +++++++------------
 .../pagebench/test_large_slru_basebackup.py   |  4 +--
 3 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index db27a77ec6..7a96c86ded 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -290,6 +290,7 @@ fn start_pageserver(
     // Create and lock PID file. This ensures that there cannot be more than one
     // pageserver process running at the same time.
     let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    info!("Claiming pid file at {lock_file_path:?}...");
     let lock_file =
         utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
     info!("Claimed pid file at {lock_file_path:?}");
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index c437258c6f..3e0ffabf74 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -1,5 +1,4 @@
 import concurrent.futures
-import time
 from typing import Any, Callable, Dict, Tuple
 
 import fixtures.pageserver.remote_storage
@@ -9,9 +8,6 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
-from fixtures.pageserver.utils import (
-    wait_until_tenant_state,
-)
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 
 
@@ -46,38 +42,33 @@ def single_timeline(
     log.info(f"duplicating template tenant {ncopies} times in S3")
     tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
 
+    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
+    # However, on-demand downloads are quite slow ATM.
+    # => do the on-demand downloads in Python.
+    log.info("python-side on-demand download the layer files into local tenant dir")
+    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+        env, tenant_timelines
+    )
+
     log.info("attach duplicated tenants to pageserver")
     # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
     # However, on-demand downloads are quite slow ATM.
     # => do the on-demand downloads in Python.
     assert ps_http.tenant_list() == []
 
-    # make the attach fail after it created enough on-disk state to retry loading
-    # the tenant next startup, but before it can start background loops that would start download
-    ps_http.configure_failpoints(("attach-before-activate", "return"))
-    env.pageserver.allowed_errors.append(
-        ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
-    )
-    def attach_broken(tenant):
+    def attach(tenant):
         env.pageserver.tenant_attach(
             tenant,
             config=template_config.copy(),
             generation=100,
             override_storage_controller_generation=True,
         )
-        time.sleep(0.1)
-        wait_until_tenant_state(ps_http, tenant, "Broken", 10)
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
-        executor.map(attach_broken, tenants)
+        executor.map(attach, tenants)
 
-    env.pageserver.stop(
-        immediate=True
-    )  # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
-    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
-    log.info("python-side on-demand download the layer files into local tenant dir")
-    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
-        env, tenant_timelines
-    )
+    # Benchmarks will start the pageserver explicitly themselves
+    env.pageserver.stop()
 
     return env
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index b41ae60197..3258d4dcfa 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -17,13 +17,11 @@ from performance.pageserver.util import (
 @pytest.mark.parametrize("duration", [30])
 @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
 @pytest.mark.parametrize("n_tenants", [10])
-@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
 @pytest.mark.timeout(1000)
 def test_basebackup_with_high_slru_count(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
-    get_vectored_impl: str,
     n_tenants: int,
     pgbench_scale: int,
     duration: int,
@@ -47,7 +45,7 @@ def test_basebackup_with_high_slru_count(
     max_file_descriptors = 500000
     neon_env_builder.pageserver_config_override = (
         f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
-        f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
+        f"get_vectored_impl='vectored'; validate_vectored_get=false"
     )
     params.update(
         {