From 72da46dd5a7a9c54464c76e2335c69361d0d1b1a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 5 Jan 2024 12:33:17 +0000 Subject: [PATCH] improve overlayfs cleanup code --- test_runner/fixtures/neon_fixtures.py | 114 +++++++++++++++------ test_runner/fixtures/overlayfs.py | 11 ++ test_runner/performance/test_pageserver.py | 8 +- 3 files changed, 97 insertions(+), 36 deletions(-) create mode 100644 test_runner/fixtures/overlayfs.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d8fa723d88..82ec504706 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -70,6 +70,7 @@ from fixtures.utils import ( subprocess_capture, wait_until, ) +from fixtures import overlayfs """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -425,6 +426,7 @@ class NeonEnvBuilder: pg_version: PgVersion, test_name: str, test_output_dir: Path, + test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, @@ -469,7 +471,8 @@ class NeonEnvBuilder: self.initial_timeline = initial_timeline or TimelineId.generate() self.scrub_on_exit = False self.test_output_dir = test_output_dir - self.overlay_mounts: List[Tuple[str, Path]] = [] + self.test_overlay_dir = test_overlay_dir + self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] assert test_name.startswith( "test_" @@ -522,7 +525,6 @@ class NeonEnvBuilder: repo_dir: Path, neon_binpath: Optional[Path] = None, pg_distrib_dir: Optional[Path] = None, - use_overlay: bool = False, ) -> NeonEnv: """ A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir. @@ -550,10 +552,10 @@ class NeonEnvBuilder: tenants_to_dir = self.repo_dir / ps_dir.name / "tenants" log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}") - if not use_overlay: + if not self.test_overlay_dir: shutil.copytree(tenants_from_dir, tenants_to_dir) else: - self.mount_overlay(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir) + self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir) for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"): sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name @@ -562,12 +564,12 @@ class NeonEnvBuilder: shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid")) shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) - if not use_overlay: + if not self.test_overlay_dir: shutil.copytree( repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" ) else: - self.mount_overlay("local_fs_remote_storage", + self.overlay_mount("local_fs_remote_storage", repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage") if (attachments_json := Path(repo_dir / "attachments.json")).exists(): @@ -585,16 +587,16 @@ class NeonEnvBuilder: return self.env - @property - def overlay_state_dir(self): - return self.test_output_dir / "overlay-state" - - def mount_overlay(self, ident: str, srcdir: Path, dstdir: Path): - assert self.overlay_state_dir.parent in dstdir.parents # so the post-cleanup assertion in self.cleanup_overlay is simpler - self.overlay_state_dir.mkdir(exist_ok=True) + def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): + """ + Mount `srcdir` as an overlayfs mount at `dstdir`. + The overlayfs `upperdir` and `workdir` will be placed in test_overlay_dir. + """ + assert self.test_overlay_dir + assert self.test_output_dir in dstdir.parents # so that teardown & test_overlay_dir fixture work assert srcdir.is_dir() dstdir.mkdir(exist_ok=False, parents=False) - ident_state_dir = self.overlay_state_dir / ident + ident_state_dir = self.test_overlay_dir / ident upper = ident_state_dir / "upper" work = ident_state_dir / "work" ident_state_dir.mkdir(exist_ok=False, parents=False) # exists_ok=False also checks uniqueness in self.overlay_mounts @@ -603,27 +605,28 @@ class NeonEnvBuilder: cmd = [ "sudo", "mount", "-t", "overlay", "overlay", "-o", f"lowerdir={srcdir},upperdir={upper},workdir={work}", str(dstdir) ] log.info(f"Mounting overlayfs srcdir={srcdir} dstdir={dstdir}: {cmd}") - subprocess_capture(self.overlay_state_dir, cmd, check=True, echo_stderr=True, echo_stdout=True) - self.overlay_mounts.append((ident, dstdir)) + subprocess_capture(self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True) + self.overlay_mounts_created_by_us.append((ident, dstdir)) - def cleanup_overlay(self): - while len(self.overlay_mounts) > 0: - (ident, mountpoint) = self.overlay_mounts.pop() - ident_state_dir = self.overlay_state_dir / ident + def overlay_cleanup_teardown(self): + """ + Unmount the overlayfs mounts created by `self.overlay_mount()`. + Supposed to be called during env teardown. + """ + if not self.test_overlay_dir: + return + while len(self.overlay_mounts_created_by_us) > 0: + (ident, mountpoint) = self.overlay_mounts_created_by_us.pop() + ident_state_dir = self.test_overlay_dir / ident cmd = [ "sudo", "umount", str(mountpoint) ] log.info(f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}") - subprocess_capture(self.overlay_state_dir, cmd, check=True, echo_stderr=True, echo_stdout=True) + subprocess_capture(self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True) log.info(f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}") cmd = [ "sudo", "rm", "-rf", str(ident_state_dir)] - subprocess_capture(self.overlay_state_dir, cmd, check=True, echo_stderr=True, echo_stdout=True) - - self.overlay_state_dir.rmdir() # fails if empty, which acts as an assertion that above cleanup is complete + subprocess_capture(self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True) # assert all overlayfs mounts in our test directory are gone - for part in psutil.disk_partitions(): - if part.fstype == "overlay": - mountpoint = Path(part.mountpoint) - assert not self.test_output_dir in mountpoint.parents + assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) def enable_scrub_on_exit(self): """ @@ -731,7 +734,10 @@ class NeonEnvBuilder: sk.stop(immediate=True) for pageserver in self.env.pageservers: - pageserver.assert_no_metric_errors() + # if the test threw an exception, don't check for errors + # as a failing assertion would cause the cleanup below to fail + if exc_type is not None: + pageserver.assert_no_metric_errors() pageserver.stop(immediate=True) @@ -747,7 +753,7 @@ class NeonEnvBuilder: cleanup_error = e try: - self.cleanup_overlay() + self.overlay_cleanup_teardown() except Exception as e: log.error(f"Error cleaning up overlay state: {e}") if cleanup_error is not None: @@ -1074,6 +1080,7 @@ def neon_env_builder( default_broker: NeonBroker, run_id: uuid.UUID, request: FixtureRequest, + test_overlay_dir: Path, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1104,6 +1111,7 @@ def neon_env_builder( preserve_database_files=pytestconfig.getoption("--preserve-database-files"), test_name=request.node.name, test_output_dir=test_output_dir, + test_overlay_dir=test_overlay_dir, ) as builder: yield builder @@ -3240,10 +3248,10 @@ class S3Scrubber: raise -def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: +def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name - test_dir = top_output_dir / test_name.replace("/", "-") + test_dir = top_output_dir / (prefix+test_name.replace("/", "-")) # We rerun flaky tests multiple times, use a separate directory for each run. if (suffix := getattr(request.node, "execution_count", None)) is not None: @@ -3254,6 +3262,14 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: assert isinstance(test_dir, Path) return test_dir +def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return _get_test_dir(request, top_output_dir, "") + +def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return _get_test_dir(request, top_output_dir, "overlay-") + +def get_test_snapshot_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return _get_test_dir(request, top_output_dir, "snapshot") def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return get_test_output_dir(request, top_output_dir) / "repo" @@ -3283,9 +3299,11 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. @pytest.fixture(scope="function", autouse=True) -def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[Path]: +def test_output_dir(request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path) -> Iterator[Path]: """Create the working directory for an individual test.""" + _ = test_overlay_dir # request it so it can do cleanups + # one directory per test test_dir = get_test_output_dir(request, top_output_dir) log.info(f"test_output_dir is {test_dir}") @@ -3296,6 +3314,36 @@ def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[P allure_attach_from_dir(test_dir) +@pytest.fixture(scope="function", autouse=True) +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: + """Create the overlay state directory for an individual test.""" + + if not os.getenv("NEON_ENV_FROM_REPO_DIR_USE_OVERLAYFS"): + return None + + # one directory per test + overlay_dir = get_test_overlay_dir(request, top_output_dir) + log.info(f"test_overlay_dir is {overlay_dir}") + + overlay_dir.mkdir(exist_ok=True) + # unmount stale overlayfs mounts which subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir` + for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)): + cmd = [ "sudo", "umount", str(mountpoint) ] + log.info(f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}") + subprocess.run(cmd, capture_output=True, check=True) + # the overlayfs `workdir`` is owned by `root` + cmd = [ "sudo", "rm", "-rf", str(overlay_dir)] + subprocess.run(cmd, capture_output=True, check=True) + + overlay_dir.mkdir() + + return overlay_dir + + # no need to clean up anything: on clean shutdown, + # NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup + # and on unclean shutdown, this function will take care of it + # on the next test run + SKIP_DIRS = frozenset( ( diff --git a/test_runner/fixtures/overlayfs.py b/test_runner/fixtures/overlayfs.py new file mode 100644 index 0000000000..328a8e9404 --- /dev/null +++ b/test_runner/fixtures/overlayfs.py @@ -0,0 +1,11 @@ +from typing import Iterator +import psutil +from pathlib import Path + + +def iter_mounts_beneath(topdir: Path) -> Iterator[Path]: + for part in psutil.disk_partitions(all=True): + if part.fstype == "overlay": + mountpoint = Path(part.mountpoint) + if topdir in mountpoint.parents: + yield mountpoint diff --git a/test_runner/performance/test_pageserver.py b/test_runner/performance/test_pageserver.py index 8a813fbcef..ef12eb6799 100644 --- a/test_runner/performance/test_pageserver.py +++ b/test_runner/performance/test_pageserver.py @@ -15,8 +15,9 @@ from fixtures.types import TenantId, TimelineId @pytest.fixture(scope="function") +@pytest.mark.timeout(1000) def snapshotting_env( - neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path, ) -> Tuple[NeonEnv, TimelineId, List[TenantId]]: """ The fixture prepares environment or restores it from a snapshot. @@ -44,7 +45,7 @@ def snapshotting_env( } if snapshot_dir.exists(): - env = neon_env_builder.from_repo_dir(snapshot_dir, use_overlay=True) + env = neon_env_builder.from_repo_dir(snapshot_dir) ps_http = env.pageserver.http_client() tenants = list({TenantId(t.name) for t in (snapshot_dir.glob("pageserver_*/tenants/*"))}) template_timeline = env.initial_timeline @@ -78,7 +79,7 @@ def snapshotting_env( src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines" assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory" tenants = [template_tenant] - for i in range(0, 20): + for i in range(0, 100): new_tenant = TenantId.generate() tenants.append(new_tenant) log.info("Duplicating tenant #%s: %s", i, new_tenant) @@ -152,6 +153,7 @@ def test_getpage_throughput( env.pageserver.connstr(password=None), "--runtime", duration, + # "--per-target-rate-limit", "50", *[f"{tenant}/{template_timeline}" for tenant in tenants], ] log.info(f"command: {' '.join(cmd)}")