tests: simplify code around unstable test_basebackup_with_high_slru_count (#8477)

## Problem

In `test_basebackup_with_high_slru_count`, the pageserver sometimes hangs mysteriously on startup, having been started and stopped earlier in the test setup while populating template tenant data.

- #7586

We can't see why it hangs in this particular test. The test does some weird stuff though, like attaching a load of broken tenants and then doing a SIGQUIT kill of a pageserver.

## Summary of changes

- Attach tenants normally instead of doing a failpoint dance to attach them as broken
- Shut the pageserver down gracefully during init instead of using immediate mode
- Remove the "sequential" variant of the unstable test, as this is going away soon anyway
- Log before trying to acquire the lock file, so that if startup hangs we have a clearer sense of whether that is really where it is hanging. It seems like it is, but that code does a non-blocking flock, so it's surprising.
2025-12-22 21:59:59 +00:00 · 2024-07-24 11:26:24 +01:00
parent c698b7b010
commit 842c3d8c10
3 changed files with 15 additions and 25 deletions
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -290,6 +290,7 @@ fn start_pageserver(
    // Create and lock PID file. This ensures that there cannot be more than one
    // pageserver process running at the same time.
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    info!("Claiming pid file at {lock_file_path:?}...");
    let lock_file =
        utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
    info!("Claimed pid file at {lock_file_path:?}");
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -1,5 +1,4 @@
 import concurrent.futures
-import time
 from typing import Any, Callable, Dict, Tuple

 import fixtures.pageserver.remote_storage
@@ -9,9 +8,6 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
 )
-from fixtures.pageserver.utils import (
-    wait_until_tenant_state,
-)
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind


@@ -46,38 +42,33 @@ def single_timeline(
    log.info(f"duplicating template tenant {ncopies} times in S3")
    tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)

+    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
+    # However, on-demand downloads are quite slow ATM.
+    # => do the on-demand downloads in Python.
+    log.info("python-side on-demand download the layer files into local tenant dir")
+    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+        env, tenant_timelines
+    )
+
    log.info("attach duplicated tenants to pageserver")
    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
    # However, on-demand downloads are quite slow ATM.
    # => do the on-demand downloads in Python.
    assert ps_http.tenant_list() == []
-    # make the attach fail after it created enough on-disk state to retry loading
-    # the tenant next startup, but before it can start background loops that would start download
-    ps_http.configure_failpoints(("attach-before-activate", "return"))
-    env.pageserver.allowed_errors.append(
-        ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
-    )

-    def attach_broken(tenant):
+    def attach(tenant):
        env.pageserver.tenant_attach(
            tenant,
            config=template_config.copy(),
            generation=100,
            override_storage_controller_generation=True,
        )
-        time.sleep(0.1)
-        wait_until_tenant_state(ps_http, tenant, "Broken", 10)

    with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
-        executor.map(attach_broken, tenants)
+        executor.map(attach, tenants)

-    env.pageserver.stop(
-        immediate=True
-    )  # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
-    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
-    log.info("python-side on-demand download the layer files into local tenant dir")
-    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
-        env, tenant_timelines
-    )
+    # Benchmarks will start the pageserver explicitly themselves
+    env.pageserver.stop()

    return env
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -17,13 +17,11 @@ from performance.pageserver.util import (
@pytest.mark.parametrize("duration", [30])
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
@pytest.mark.parametrize("n_tenants", [10])
-@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
@pytest.mark.timeout(1000)
 def test_basebackup_with_high_slru_count(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
-    get_vectored_impl: str,
    n_tenants: int,
    pgbench_scale: int,
    duration: int,
@@ -47,7 +45,7 @@ def test_basebackup_with_high_slru_count(
    max_file_descriptors = 500000
    neon_env_builder.pageserver_config_override = (
        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
-        f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
+        f"get_vectored_impl='vectored'; validate_vectored_get=false"
    )
    params.update(
        {