mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
tests: simplify code around unstable test_basebackup_with_high_slru_count (#8477)
## Problem In `test_basebackup_with_high_slru_count`, the pageserver is sometimes mysteriously hanging on startup, having been started+stopped earlier in the test setup while populating template tenant data. - #7586 We can't see why this is hanging in this particular test. The test does some weird stuff though, like attaching a load of broken tenants and then doing a SIGQUIT kill of a pageserver. ## Summary of changes - Attach tenants normally instead of doing a failpoint dance to attach them as broken - Shut the pageserver down gracefully during init instead of using immediate mode - Remove the "sequential" variant of the unstable test, as this is going away soon anyway - Log before trying to acquire the lock file, so that if it hangs we have a clearer sense of whether that's really where it's hanging. It seems like it is, but that code does a non-blocking flock so it's surprising.
This commit is contained in:
@@ -290,6 +290,7 @@ fn start_pageserver(
|
||||
// Create and lock PID file. This ensures that there cannot be more than one
|
||||
// pageserver process running at the same time.
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
info!("Claiming pid file at {lock_file_path:?}...");
|
||||
let lock_file =
|
||||
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import concurrent.futures
|
||||
import time
|
||||
from typing import Any, Callable, Dict, Tuple
|
||||
|
||||
import fixtures.pageserver.remote_storage
|
||||
@@ -9,9 +8,6 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
)
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||
|
||||
|
||||
@@ -46,38 +42,33 @@ def single_timeline(
|
||||
log.info(f"duplicating template tenant {ncopies} times in S3")
|
||||
tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
|
||||
|
||||
# In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
|
||||
# However, on-demand downloads are quite slow ATM.
|
||||
# => do the on-demand downloads in Python.
|
||||
log.info("python-side on-demand download the layer files into local tenant dir")
|
||||
tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
|
||||
fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
|
||||
env, tenant_timelines
|
||||
)
|
||||
|
||||
log.info("attach duplicated tenants to pageserver")
|
||||
# In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
|
||||
# However, on-demand downloads are quite slow ATM.
|
||||
# => do the on-demand downloads in Python.
|
||||
assert ps_http.tenant_list() == []
|
||||
# make the attach fail after it created enough on-disk state to retry loading
|
||||
# the tenant next startup, but before it can start background loops that would start download
|
||||
ps_http.configure_failpoints(("attach-before-activate", "return"))
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*attach failed, setting tenant state to Broken: attach-before-activate.*"
|
||||
)
|
||||
|
||||
def attach_broken(tenant):
|
||||
def attach(tenant):
|
||||
env.pageserver.tenant_attach(
|
||||
tenant,
|
||||
config=template_config.copy(),
|
||||
generation=100,
|
||||
override_storage_controller_generation=True,
|
||||
)
|
||||
time.sleep(0.1)
|
||||
wait_until_tenant_state(ps_http, tenant, "Broken", 10)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
|
||||
executor.map(attach_broken, tenants)
|
||||
executor.map(attach, tenants)
|
||||
|
||||
env.pageserver.stop(
|
||||
immediate=True
|
||||
) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
|
||||
tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
|
||||
log.info("python-side on-demand download the layer files into local tenant dir")
|
||||
fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
|
||||
env, tenant_timelines
|
||||
)
|
||||
# Benchmarks will start the pageserver explicitly themselves
|
||||
env.pageserver.stop()
|
||||
|
||||
return env
|
||||
|
||||
@@ -17,13 +17,11 @@ from performance.pageserver.util import (
|
||||
@pytest.mark.parametrize("duration", [30])
|
||||
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
|
||||
@pytest.mark.parametrize("n_tenants", [10])
|
||||
@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
|
||||
@pytest.mark.timeout(1000)
|
||||
def test_basebackup_with_high_slru_count(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
pg_bin: PgBin,
|
||||
get_vectored_impl: str,
|
||||
n_tenants: int,
|
||||
pgbench_scale: int,
|
||||
duration: int,
|
||||
@@ -47,7 +45,7 @@ def test_basebackup_with_high_slru_count(
|
||||
max_file_descriptors = 500000
|
||||
neon_env_builder.pageserver_config_override = (
|
||||
f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
|
||||
f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
|
||||
f"get_vectored_impl='vectored'; validate_vectored_get=false"
|
||||
)
|
||||
params.update(
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user