improve overlayfs cleanup code

This commit is contained in:
Christian Schwarz
2024-01-05 12:33:17 +00:00
parent a748d67915
commit 72da46dd5a
3 changed files with 97 additions and 36 deletions

View File

@@ -70,6 +70,7 @@ from fixtures.utils import (
subprocess_capture,
wait_until,
)
from fixtures import overlayfs
"""
This file contains pytest fixtures. A fixture is a test resource that can be
@@ -425,6 +426,7 @@ class NeonEnvBuilder:
pg_version: PgVersion,
test_name: str,
test_output_dir: Path,
test_overlay_dir: Optional[Path] = None,
pageserver_remote_storage: Optional[RemoteStorage] = None,
pageserver_config_override: Optional[str] = None,
num_safekeepers: int = 1,
@@ -469,7 +471,8 @@ class NeonEnvBuilder:
self.initial_timeline = initial_timeline or TimelineId.generate()
self.scrub_on_exit = False
self.test_output_dir = test_output_dir
self.overlay_mounts: List[Tuple[str, Path]] = []
self.test_overlay_dir = test_overlay_dir
self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
assert test_name.startswith(
"test_"
@@ -522,7 +525,6 @@ class NeonEnvBuilder:
repo_dir: Path,
neon_binpath: Optional[Path] = None,
pg_distrib_dir: Optional[Path] = None,
use_overlay: bool = False,
) -> NeonEnv:
"""
A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir.
@@ -550,10 +552,10 @@ class NeonEnvBuilder:
tenants_to_dir = self.repo_dir / ps_dir.name / "tenants"
log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}")
if not use_overlay:
if not self.test_overlay_dir:
shutil.copytree(tenants_from_dir, tenants_to_dir)
else:
self.mount_overlay(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir)
self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir)
for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"):
sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name
@@ -562,12 +564,12 @@ class NeonEnvBuilder:
shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid"))
shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
if not use_overlay:
if not self.test_overlay_dir:
shutil.copytree(
repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
)
else:
self.mount_overlay("local_fs_remote_storage",
self.overlay_mount("local_fs_remote_storage",
repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage")
if (attachments_json := Path(repo_dir / "attachments.json")).exists():
@@ -585,16 +587,16 @@ class NeonEnvBuilder:
return self.env
@property
def overlay_state_dir(self):
return self.test_output_dir / "overlay-state"
def mount_overlay(self, ident: str, srcdir: Path, dstdir: Path):
assert self.overlay_state_dir.parent in dstdir.parents # so the post-cleanup assertion in self.cleanup_overlay is simpler
self.overlay_state_dir.mkdir(exist_ok=True)
def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path):
"""
Mount `srcdir` as an overlayfs mount at `dstdir`.
The overlayfs `upperdir` and `workdir` will be placed in test_overlay_dir.
"""
assert self.test_overlay_dir
assert self.test_output_dir in dstdir.parents # so that teardown & test_overlay_dir fixture work
assert srcdir.is_dir()
dstdir.mkdir(exist_ok=False, parents=False)
ident_state_dir = self.overlay_state_dir / ident
ident_state_dir = self.test_overlay_dir / ident
upper = ident_state_dir / "upper"
work = ident_state_dir / "work"
ident_state_dir.mkdir(exist_ok=False, parents=False) # exists_ok=False also checks uniqueness in self.overlay_mounts
@@ -603,27 +605,28 @@ class NeonEnvBuilder:
cmd = [ "sudo", "mount", "-t", "overlay", "overlay",
"-o", f"lowerdir={srcdir},upperdir={upper},workdir={work}", str(dstdir) ]
log.info(f"Mounting overlayfs srcdir={srcdir} dstdir={dstdir}: {cmd}")
subprocess_capture(self.overlay_state_dir, cmd, check=True, echo_stderr=True, echo_stdout=True)
self.overlay_mounts.append((ident, dstdir))
subprocess_capture(self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True)
self.overlay_mounts_created_by_us.append((ident, dstdir))
def cleanup_overlay(self):
while len(self.overlay_mounts) > 0:
(ident, mountpoint) = self.overlay_mounts.pop()
ident_state_dir = self.overlay_state_dir / ident
def overlay_cleanup_teardown(self):
"""
Unmount the overlayfs mounts created by `self.overlay_mount()`.
Supposed to be called during env teardown.
"""
if not self.test_overlay_dir:
return
while len(self.overlay_mounts_created_by_us) > 0:
(ident, mountpoint) = self.overlay_mounts_created_by_us.pop()
ident_state_dir = self.test_overlay_dir / ident
cmd = [ "sudo", "umount", str(mountpoint) ]
log.info(f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}")
subprocess_capture(self.overlay_state_dir, cmd, check=True, echo_stderr=True, echo_stdout=True)
subprocess_capture(self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True)
log.info(f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}")
cmd = [ "sudo", "rm", "-rf", str(ident_state_dir)]
subprocess_capture(self.overlay_state_dir, cmd, check=True, echo_stderr=True, echo_stdout=True)
self.overlay_state_dir.rmdir() # fails if empty, which acts as an assertion that above cleanup is complete
subprocess_capture(self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True)
# assert all overlayfs mounts in our test directory are gone
for part in psutil.disk_partitions():
if part.fstype == "overlay":
mountpoint = Path(part.mountpoint)
assert not self.test_output_dir in mountpoint.parents
assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir))
def enable_scrub_on_exit(self):
"""
@@ -731,7 +734,10 @@ class NeonEnvBuilder:
sk.stop(immediate=True)
for pageserver in self.env.pageservers:
pageserver.assert_no_metric_errors()
# if the test threw an exception, don't check for errors
# as a failing assertion would cause the cleanup below to fail
if exc_type is not None:
pageserver.assert_no_metric_errors()
pageserver.stop(immediate=True)
@@ -747,7 +753,7 @@ class NeonEnvBuilder:
cleanup_error = e
try:
self.cleanup_overlay()
self.overlay_cleanup_teardown()
except Exception as e:
log.error(f"Error cleaning up overlay state: {e}")
if cleanup_error is not None:
@@ -1074,6 +1080,7 @@ def neon_env_builder(
default_broker: NeonBroker,
run_id: uuid.UUID,
request: FixtureRequest,
test_overlay_dir: Path,
) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
@@ -1104,6 +1111,7 @@ def neon_env_builder(
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
test_name=request.node.name,
test_output_dir=test_output_dir,
test_overlay_dir=test_overlay_dir,
) as builder:
yield builder
@@ -3240,10 +3248,10 @@ class S3Scrubber:
raise
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path:
"""Compute the working directory for an individual test."""
test_name = request.node.name
test_dir = top_output_dir / test_name.replace("/", "-")
test_dir = top_output_dir / (prefix+test_name.replace("/", "-"))
# We rerun flaky tests multiple times, use a separate directory for each run.
if (suffix := getattr(request.node, "execution_count", None)) is not None:
@@ -3254,6 +3262,14 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
assert isinstance(test_dir, Path)
return test_dir
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
return _get_test_dir(request, top_output_dir, "")
def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
return _get_test_dir(request, top_output_dir, "overlay-")
def get_test_snapshot_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
return _get_test_dir(request, top_output_dir, "snapshot")
def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
return get_test_output_dir(request, top_output_dir) / "repo"
@@ -3283,9 +3299,11 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
# this fixture ensures that the directory exists. That works because
# 'autouse' fixtures are run before other fixtures.
@pytest.fixture(scope="function", autouse=True)
def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[Path]:
def test_output_dir(request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path) -> Iterator[Path]:
"""Create the working directory for an individual test."""
_ = test_overlay_dir # request it so it can do cleanups
# one directory per test
test_dir = get_test_output_dir(request, top_output_dir)
log.info(f"test_output_dir is {test_dir}")
@@ -3296,6 +3314,36 @@ def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[P
allure_attach_from_dir(test_dir)
@pytest.fixture(scope="function", autouse=True)
def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]:
"""Create the overlay state directory for an individual test."""
if not os.getenv("NEON_ENV_FROM_REPO_DIR_USE_OVERLAYFS"):
return None
# one directory per test
overlay_dir = get_test_overlay_dir(request, top_output_dir)
log.info(f"test_overlay_dir is {overlay_dir}")
overlay_dir.mkdir(exist_ok=True)
# unmount stale overlayfs mounts which subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir`
for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)):
cmd = [ "sudo", "umount", str(mountpoint) ]
log.info(f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}")
subprocess.run(cmd, capture_output=True, check=True)
# the overlayfs `workdir`` is owned by `root`
cmd = [ "sudo", "rm", "-rf", str(overlay_dir)]
subprocess.run(cmd, capture_output=True, check=True)
overlay_dir.mkdir()
return overlay_dir
# no need to clean up anything: on clean shutdown,
# NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup
# and on unclean shutdown, this function will take care of it
# on the next test run
SKIP_DIRS = frozenset(
(

View File

@@ -0,0 +1,11 @@
from typing import Iterator
import psutil
from pathlib import Path
def iter_mounts_beneath(topdir: Path) -> Iterator[Path]:
for part in psutil.disk_partitions(all=True):
if part.fstype == "overlay":
mountpoint = Path(part.mountpoint)
if topdir in mountpoint.parents:
yield mountpoint

View File

@@ -15,8 +15,9 @@ from fixtures.types import TenantId, TimelineId
@pytest.fixture(scope="function")
@pytest.mark.timeout(1000)
def snapshotting_env(
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path,
) -> Tuple[NeonEnv, TimelineId, List[TenantId]]:
"""
The fixture prepares environment or restores it from a snapshot.
@@ -44,7 +45,7 @@ def snapshotting_env(
}
if snapshot_dir.exists():
env = neon_env_builder.from_repo_dir(snapshot_dir, use_overlay=True)
env = neon_env_builder.from_repo_dir(snapshot_dir)
ps_http = env.pageserver.http_client()
tenants = list({TenantId(t.name) for t in (snapshot_dir.glob("pageserver_*/tenants/*"))})
template_timeline = env.initial_timeline
@@ -78,7 +79,7 @@ def snapshotting_env(
src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines"
assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"
tenants = [template_tenant]
for i in range(0, 20):
for i in range(0, 100):
new_tenant = TenantId.generate()
tenants.append(new_tenant)
log.info("Duplicating tenant #%s: %s", i, new_tenant)
@@ -152,6 +153,7 @@ def test_getpage_throughput(
env.pageserver.connstr(password=None),
"--runtime",
duration,
# "--per-target-rate-limit", "50",
*[f"{tenant}/{template_timeline}" for tenant in tenants],
]
log.info(f"command: {' '.join(cmd)}")