From 842c3d8c10e08dcebe76e55bc06d2cac065bc6a6 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 24 Jul 2024 11:26:24 +0100
Subject: [PATCH] tests: simplify code around unstable
 `test_basebackup_with_high_slru_count` (#8477)

## Problem

In `test_basebackup_with_high_slru_count`, the pageserver is sometimes
mysteriously hanging on startup, having been started+stopped earlier in
the test setup while populating template tenant data.

- #7586

We can't see why this is hanging in this particular test. The test does
some weird stuff, though, like attaching a load of broken tenants and
then doing a SIGQUIT kill of a pageserver.

## Summary of changes

- Attach tenants normally instead of doing a failpoint dance to attach
  them as broken
- Shut the pageserver down gracefully during init instead of using
  immediate mode
- Remove the "sequential" variant of the unstable test, as this is going
  away soon anyway
- Log before trying to acquire the lock file, so that if it hangs we have
  a clearer sense of whether that's really where it's hanging. It seems
  like it is, but that code does a non-blocking flock, so it's surprising.
---
 pageserver/src/bin/pageserver.rs              |  1 +
 .../fixtures/pageserver/many_tenants.py       | 35 +++++++------------
 .../pagebench/test_large_slru_basebackup.py   |  4 +--
 3 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index db27a77ec6..7a96c86ded 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -290,6 +290,7 @@ fn start_pageserver(
     // Create and lock PID file. This ensures that there cannot be more than one
     // pageserver process running at the same time.
     let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    info!("Claiming pid file at {lock_file_path:?}...");
     let lock_file =
         utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
     info!("Claimed pid file at {lock_file_path:?}");
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index c437258c6f..3e0ffabf74 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -1,5 +1,4 @@
 import concurrent.futures
-import time
 from typing import Any, Callable, Dict, Tuple
 
 import fixtures.pageserver.remote_storage
@@ -9,9 +8,6 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
-from fixtures.pageserver.utils import (
-    wait_until_tenant_state,
-)
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 
 
@@ -46,38 +42,33 @@ def single_timeline(
     log.info(f"duplicating template tenant {ncopies} times in S3")
     tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
 
+    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
+    # However, on-demand downloads are quite slow ATM.
+    # => do the on-demand downloads in Python.
+    log.info("python-side on-demand download the layer files into local tenant dir")
+    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+        env, tenant_timelines
+    )
+
     log.info("attach duplicated tenants to pageserver")
     # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
     # However, on-demand downloads are quite slow ATM.
     # => do the on-demand downloads in Python.
     assert ps_http.tenant_list() == []
 
-    # make the attach fail after it created enough on-disk state to retry loading
-    # the tenant next startup, but before it can start background loops that would start download
-    ps_http.configure_failpoints(("attach-before-activate", "return"))
-    env.pageserver.allowed_errors.append(
-        ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
-    )
-    def attach_broken(tenant):
+    def attach(tenant):
         env.pageserver.tenant_attach(
             tenant,
             config=template_config.copy(),
             generation=100,
             override_storage_controller_generation=True,
         )
-        time.sleep(0.1)
-        wait_until_tenant_state(ps_http, tenant, "Broken", 10)
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
-        executor.map(attach_broken, tenants)
+        executor.map(attach, tenants)
 
-    env.pageserver.stop(
-        immediate=True
-    )  # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
-    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
-    log.info("python-side on-demand download the layer files into local tenant dir")
-    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
-        env, tenant_timelines
-    )
+    # Benchmarks will start the pageserver explicitly themselves
+    env.pageserver.stop()
 
     return env
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index b41ae60197..3258d4dcfa 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -17,13 +17,11 @@ from performance.pageserver.util import (
 @pytest.mark.parametrize("duration", [30])
 @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
 @pytest.mark.parametrize("n_tenants", [10])
-@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
 @pytest.mark.timeout(1000)
 def test_basebackup_with_high_slru_count(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
-    get_vectored_impl: str,
     n_tenants: int,
     pgbench_scale: int,
     duration: int,
@@ -47,7 +45,7 @@ def test_basebackup_with_high_slru_count(
     max_file_descriptors = 500000
     neon_env_builder.pageserver_config_override = (
         f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
-        f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
+        f"get_vectored_impl='vectored'; validate_vectored_get=false"
     )
     params.update(
         {