Move tenant & timeline dir method to NeonPageserver and use them everywhere (#5262)

## Problem
In many places in test code, paths are built manually from what
NeonEnv.tenant_dir and NeonEnv.timeline_dir could do.

## Summary of changes
1. NeonEnv.tenant_dir and NeonEnv.timeline_dir moved under class
NeonPageserver as the path they use is per-pageserver instance.
2. Used these everywhere to replace manual path building

Closes #5258

---------

Signed-off-by: Rahul Modpur <rmodpur2@gmail.com>
This commit is contained in:
Rahul Modpur
2023-09-15 15:47:18 +05:30
committed by GitHub
parent e400a38fb9
commit e6985bd098
18 changed files with 68 additions and 94 deletions

View File

@@ -847,18 +847,6 @@ class NeonEnv:
"""Get list of safekeeper endpoints suitable for safekeepers GUC"""
return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
def timeline_dir(
self, tenant_id: TenantId, timeline_id: TimelineId, pageserver_id: Optional[int] = None
) -> Path:
"""Get a timeline directory's path based on the repo directory of the test environment"""
return (
self.tenant_dir(tenant_id, pageserver_id=pageserver_id) / "timelines" / str(timeline_id)
)
def tenant_dir(self, tenant_id: TenantId, pageserver_id: Optional[int] = None) -> Path:
"""Get a tenant directory's path based on the repo directory of the test environment"""
return self.get_pageserver(pageserver_id).workdir / "tenants" / str(tenant_id)
def get_pageserver_version(self) -> str:
bin_pageserver = str(self.neon_binpath / "pageserver")
res = subprocess.run(
@@ -1580,6 +1568,21 @@ class NeonPageserver(PgProtocol):
'.*registered custom resource manager "neon".*',
]
def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
"""Get a timeline directory's path based on the repo directory of the test environment"""
if timeline_id is None:
return self.tenant_dir(tenant_id) / "timelines"
return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
def tenant_dir(
self,
tenant_id: Optional[TenantId] = None,
) -> Path:
"""Get a tenant directory's path based on the repo directory of the test environment"""
if tenant_id is None:
return self.workdir / "tenants"
return self.workdir / "tenants" / str(tenant_id)
def start(
self,
overrides: Tuple[str, ...] = (),

View File

@@ -44,7 +44,7 @@ def measure_recovery_time(env: NeonCompare):
# Stop pageserver and remove tenant data
env.env.pageserver.stop()
timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
timeline_dir = env.env.pageserver.timeline_dir(env.tenant, env.timeline)
shutil.rmtree(timeline_dir)
# Start pageserver

View File

@@ -135,7 +135,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
timelines_dir = env.pageserver.timeline_dir(tenant_id)
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
@@ -166,7 +166,7 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
timelines_dir = env.pageserver.timeline_dir(tenant_id)
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

View File

@@ -417,7 +417,7 @@ def poor_mans_du(
largest_layer = 0
smallest_layer = None
for tenant_id, timeline_id in timelines:
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
total = 0
for file in timeline_dir.iterdir():

View File

@@ -271,7 +271,7 @@ def _import(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)

View File

@@ -55,7 +55,7 @@ def test_basic_eviction(
for sk in env.safekeepers:
sk.stop()
timeline_path = env.timeline_dir(tenant_id, timeline_id)
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
@@ -243,7 +243,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
assert by_kind["Image"] > 0
assert by_kind["Delta"] > 0
assert by_kind["InMemory"] == 0
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
log.info("resident layers count before eviction: %s", len(resident_layers))
log.info("evict all layers")
@@ -251,7 +251,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
def ensure_resident_and_remote_size_metrics():
log.info("ensure that all the layers are gone")
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
# we have disabled all background loops, so, this should hold
assert len(resident_layers) == 0

View File

@@ -38,7 +38,7 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
new_temp_layer_files = list(
filter(
lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
[path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
[path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()],
)
)
@@ -84,7 +84,7 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
new_temp_layer_files = list(
filter(
lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
[path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
[path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()],
)
)

View File

@@ -3,7 +3,6 @@
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, DefaultDict, Dict, Tuple
import pytest
@@ -115,7 +114,7 @@ def test_ondemand_download_large_rel(
env.pageserver.stop()
# remove all the layer files
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
layer.unlink()
@@ -237,7 +236,7 @@ def test_ondemand_download_timetravel(
env.pageserver.stop()
# remove all the layer files
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
layer.unlink()
@@ -368,7 +367,7 @@ def test_download_remote_layers_api(
# remove all the layer files
# XXX only delete some of the layer files, to show that it really just downloads all the layers
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer.name}")
layer.unlink()

View File

@@ -6,7 +6,6 @@ import queue
import shutil
import threading
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pytest
@@ -137,7 +136,7 @@ def test_remote_storage_backup_and_restore(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -353,7 +352,7 @@ def test_remote_storage_upload_queue_retries(
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -488,7 +487,7 @@ def test_remote_timeline_client_calls_started_metric(
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -533,7 +532,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
timeline_path = env.timeline_dir(tenant_id, timeline_id)
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
client = env.pageserver.http_client()
@@ -704,7 +703,9 @@ def test_empty_branch_remote_storage_upload_on_restart(
# index upload is now hitting the failpoint, it should block the shutdown
env.pageserver.stop(immediate=True)
local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
local_metadata = (
env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
)
assert local_metadata.is_file()
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

View File

@@ -299,7 +299,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
# tenant is created with defaults, as in without config file
(tenant_id, timeline_id) = env.neon_cli.create_tenant()
config_path = env.pageserver.workdir / "tenants" / str(tenant_id) / "config"
config_path = env.pageserver.tenant_dir(tenant_id) / "config"
assert config_path.exists(), "config file is always initially created"
http_client = env.pageserver.http_client()

View File

@@ -89,7 +89,7 @@ def test_tenant_delete_smoke(
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
tenant_path = env.tenant_dir(tenant_id=tenant_id)
tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():
@@ -269,7 +269,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
tenant_dir = env.tenant_dir(tenant_id)
tenant_dir = env.pageserver.tenant_dir(tenant_id)
# Check local is empty
assert not tenant_dir.exists()
@@ -366,7 +366,7 @@ def test_tenant_delete_is_resumed_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.workdir / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -379,7 +379,7 @@ def test_tenant_delete_is_resumed_on_attach(
wait_tenant_status_404(ps_http, tenant_id, iterations)
# we shouldn've created tenant dir on disk
tenant_path = env.tenant_dir(tenant_id=tenant_id)
tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():

View File

@@ -286,7 +286,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
)
# assert tenant exists on disk
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert env.pageserver.tenant_dir(tenant_id).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -329,7 +329,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
log.info("gc thread returned")
# check that nothing is left on disk for deleted tenant
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert not env.pageserver.tenant_dir(tenant_id).exists()
with pytest.raises(
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
@@ -354,7 +354,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
)
# assert tenant exists on disk
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert env.pageserver.tenant_dir(tenant_id).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -383,7 +383,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
log.info("ignored tenant detached without error")
# check that nothing is left on disk for deleted tenant
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert not env.pageserver.tenant_dir(tenant_id).exists()
# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
@@ -410,7 +410,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
)
# assert tenant exists on disk
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert env.pageserver.tenant_dir(tenant_id).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -427,7 +427,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
log.info("regular tenant detached without error")
# check that nothing is left on disk for deleted tenant
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert not env.pageserver.tenant_dir(tenant_id).exists()
# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
@@ -528,7 +528,7 @@ def test_ignored_tenant_reattach(
pageserver_http = env.pageserver.http_client()
ignored_tenant_id, _ = env.neon_cli.create_tenant()
tenant_dir = env.pageserver.workdir / "tenants" / str(ignored_tenant_id)
tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_ignore.sort()
timelines_before_ignore = [
@@ -619,7 +619,7 @@ def test_ignored_tenant_download_missing_layers(
# ignore the tenant and remove its layers
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
layers_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name.startswith("00000"):
@@ -672,7 +672,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
# ignore the tenant and remove its metadata
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
metadata_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name == "metadata":

View File

@@ -216,7 +216,7 @@ def switch_pg_to_new_pageserver(
endpoint.start()
timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id)
timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
files_before_detach = os.listdir(timeline_to_detach_local_path)
assert (
"metadata" in files_before_detach
@@ -561,7 +561,7 @@ def test_emergency_relocate_with_branches_slow_replay(
# simpler than initializing a new one from scratch, but the effect on the single tenant
# is the same.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.workdir / "tenants" / str(tenant_id))
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
env.pageserver.start()
# This fail point will pause the WAL ingestion on the main branch, after the
@@ -709,7 +709,7 @@ def test_emergency_relocate_with_branches_createdb(
# Kill the pageserver, remove the tenant directory, and restart
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.workdir / "tenants" / str(tenant_id))
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
env.pageserver.start()
# Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original

View File

@@ -27,7 +27,7 @@ from prometheus_client.samples import Sample
def test_tenant_creation_fails(neon_simple_env: NeonEnv):
tenants_dir = Path(neon_simple_env.pageserver.workdir) / "tenants"
tenants_dir = neon_simple_env.pageserver.tenant_dir()
initial_tenants = sorted(
map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
)
@@ -320,13 +320,7 @@ def test_pageserver_with_empty_tenants(
)
files_in_timelines_dir = sum(
1
for _p in Path.iterdir(
Path(env.pageserver.workdir)
/ "tenants"
/ str(tenant_with_empty_timelines)
/ "timelines"
)
1 for _p in Path.iterdir(env.pageserver.timeline_dir(tenant_with_empty_timelines))
)
assert (
files_in_timelines_dir == 0
@@ -337,9 +331,7 @@ def test_pageserver_with_empty_tenants(
env.pageserver.stop()
tenant_without_timelines_dir = env.initial_tenant
shutil.rmtree(
Path(env.pageserver.workdir) / "tenants" / str(tenant_without_timelines_dir) / "timelines"
)
shutil.rmtree(env.pageserver.timeline_dir(tenant_without_timelines_dir))
env.pageserver.start()

View File

@@ -179,9 +179,7 @@ def test_tenants_attached_after_download(
env.pageserver.stop()
timeline_dir = (
Path(env.pageserver.workdir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
local_layer_deleted = False
for path in Path.iterdir(timeline_dir):
if path.name.startswith("00000"):
@@ -259,7 +257,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
env.endpoints.stop_all()
env.pageserver.stop()
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
local_layer_truncated = None
for path in Path.iterdir(timeline_dir):
if path.name.startswith("00000"):

View File

@@ -3,7 +3,6 @@ import os
import queue
import shutil
import threading
from pathlib import Path
import pytest
import requests
@@ -72,13 +71,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
"test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
)
timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(parent_timeline_id)
)
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
with pytest.raises(
PageserverApiException, match="Cannot delete timeline which has child timelines"
@@ -89,13 +82,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
assert exc.value.status_code == 412
timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(leaf_timeline_id)
)
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
assert timeline_path.exists()
# retry deletes when compaction or gc is running in pageserver
@@ -336,7 +323,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
),
)
timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id)
timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
# Check local is empty
assert not timeline_dir.exists()
# Check no delete mark present
@@ -416,7 +403,7 @@ def test_timeline_resurrection_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -467,13 +454,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
"test_timeline_delete_fail_before_local_delete",
)
leaf_timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(leaf_timeline_id)
)
leaf_timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
timeline_info = wait_until_timeline_state(
@@ -921,7 +902,7 @@ def test_timeline_delete_resumed_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -933,7 +914,7 @@ def test_timeline_delete_resumed_on_attach(
# delete should be resumed
wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations)
tenant_path = env.timeline_dir(tenant_id=tenant_id, timeline_id=timeline_id)
tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():

View File

@@ -518,7 +518,7 @@ def test_timeline_size_metrics(
).value
# assert that the physical size metric matches the actual physical size on disk
timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, new_timeline_id)
assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)
# Check that the logical size metric is sane, and matches
@@ -658,7 +658,7 @@ def get_physical_size_values(
)
res.api_current_physical = detail["current_physical_size"]
timeline_path = env.timeline_dir(tenant_id, timeline_id)
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path)
return res

View File

@@ -43,7 +43,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
tenant_id, _ = env.neon_cli.create_tenant()
# assert tenant exists on disk
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert (env.pageserver.tenant_dir(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -101,7 +101,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
pytest.fail(f"could not detach tenant: {last_error}")
# check that nothing is left on disk for deleted tenant
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert not env.pageserver.tenant_dir(tenant_id).exists()
# Pageserver schedules kill+wait of the WAL redo process to the background runtime,
# asynchronously to tenant detach. Cut it some slack to complete kill+wait before