From dfb4160403ed9f50990c4e2d67844a4b533f2a17 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 20 Nov 2022 21:52:22 +0200 Subject: [PATCH] Don't remove timelines directory while pageserver is running. The test removes all the timelines from local disk, to test how pageserver startup works when it's missing. But if the pageserver hasn't finished loading the timeline yet, you can get an error: > 2022-11-20T01:30:41.053207Z INFO load{tenant_id=0f6ba053925a997b99b5eb45f9c548ac}:load_local_timeline{timeline_id=308ada17f4c3d790b631805d2dd51807}: no index file was found on the remote > 2022-11-20T01:30:41.054045Z ERROR load{tenant_id=0f6ba053925a997b99b5eb45f9c548ac}:load_local_timeline{timeline_id=308ada17f4c3d790b631805d2dd51807}: Failed to initialize timeline 0f6ba053925a997b99b5eb45f9c548ac/308ada17f4c3d790b631805d2dd51807: Failed to load layermap for timeline 0f6ba053925a997b99b5eb45f9c548ac/308ada17f4c3d790b631805d2dd51807 > > Caused by: > No such file or directory (os error 2) I saw this in CI, here: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-2785/debug/3505805425/index.html#suites/ec4311502db344eee91f1354e9dc839b/725c7d0ecec1ec4d/ And was able to reproduce it with this: > --- a/pageserver/src/tenant.rs > +++ b/pageserver/src/tenant.rs > @@ -946,6 +946,8 @@ impl Tenant { > None => None, > }; > > + tokio::time::sleep(std::time::Duration::from_secs(2)).await; > + > self.setup_timeline( > timeline_id, > remote_client, Even on 'main', it's pretty sketchy to remove the directory while the pageserver is still running, but it didn't lead to an error because the pageserver finished loading the local layer maps before starting up. Now that the loading is spawned into the background, the directory might get removed before the loading finishes. 
--- test_runner/regress/test_tenants.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 37c9fe951b..65b29c1f38 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -223,9 +223,6 @@ def test_pageserver_with_empty_tenants( client = env.pageserver.http_client() - tenant_without_timelines_dir = env.initial_tenant - shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") - tenant_with_empty_timelines_dir = client.tenant_create() temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir) for temp_timeline in temp_timelines: @@ -245,6 +242,10 @@ def test_pageserver_with_empty_tenants( # Trigger timeline reinitialization after pageserver restart env.postgres.stop_all() env.pageserver.stop() + + tenant_without_timelines_dir = env.initial_tenant + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") + env.pageserver.start() client = env.pageserver.http_client()