diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 001d4e23a9..353416a2ed 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -40,6 +40,7 @@ from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal from urllib3.util.retry import Retry +from fixtures import overlayfs from fixtures.broker import NeonBroker from fixtures.log_helper import log from fixtures.pageserver.allowed_errors import ( @@ -424,6 +425,7 @@ class NeonEnvBuilder: pg_version: PgVersion, test_name: str, test_output_dir: Path, + test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, @@ -468,6 +470,8 @@ class NeonEnvBuilder: self.initial_timeline = initial_timeline or TimelineId.generate() self.scrub_on_exit = False self.test_output_dir = test_output_dir + self.test_overlay_dir = test_overlay_dir + self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] assert test_name.startswith( "test_" @@ -547,7 +551,10 @@ class NeonEnvBuilder: tenants_to_dir = self.repo_dir / ps_dir.name / "tenants" log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}") - shutil.copytree(tenants_from_dir, tenants_to_dir) + if self.test_overlay_dir is None: + shutil.copytree(tenants_from_dir, tenants_to_dir) + else: + self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir) for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"): sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name @@ -556,9 +563,16 @@ class NeonEnvBuilder: shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid")) shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) - shutil.copytree( - repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" - ) + if self.test_overlay_dir is None: + shutil.copytree( + repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + ) + else: + self.overlay_mount( + "local_fs_remote_storage", + repo_dir / "local_fs_remote_storage", + self.repo_dir / "local_fs_remote_storage", + ) if (attachments_json := Path(repo_dir / "attachments.json")).exists(): shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) @@ -575,6 +589,69 @@ class NeonEnvBuilder: return self.env + def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): + """ + Mount `srcdir` as an overlayfs mount at `dstdir`. + The overlayfs `upperdir` and `workdir` will be placed in test_overlay_dir. + """ + assert self.test_overlay_dir + assert ( + self.test_output_dir in dstdir.parents + ) # so that teardown & test_overlay_dir fixture work + assert srcdir.is_dir() + dstdir.mkdir(exist_ok=False, parents=False) + ident_state_dir = self.test_overlay_dir / ident + upper = ident_state_dir / "upper" + work = ident_state_dir / "work" + ident_state_dir.mkdir( + exist_ok=False, parents=False + ) # exists_ok=False also checks uniqueness in self.overlay_mounts + upper.mkdir() + work.mkdir() + cmd = [ + "sudo", + "mount", + "-t", + "overlay", + "overlay", + "-o", + f"lowerdir={srcdir},upperdir={upper},workdir={work}", + str(dstdir), + ] + log.info(f"Mounting overlayfs srcdir={srcdir} dstdir={dstdir}: {cmd}") + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + self.overlay_mounts_created_by_us.append((ident, dstdir)) + + def overlay_cleanup_teardown(self): + """ + Unmount the overlayfs mounts created by `self.overlay_mount()`. + Supposed to be called during env teardown. + """ + if self.test_overlay_dir is None: + return + while len(self.overlay_mounts_created_by_us) > 0: + (ident, mountpoint) = self.overlay_mounts_created_by_us.pop() + ident_state_dir = self.test_overlay_dir / ident + cmd = ["sudo", "umount", str(mountpoint)] + log.info( + f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}" + ) + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + log.info( + f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}" + ) + cmd = ["sudo", "rm", "-rf", str(ident_state_dir)] + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + + # assert all overlayfs mounts in our test directory are gone + assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) + def enable_scrub_on_exit(self): """ Call this if you would like the fixture to automatically run @@ -681,7 +758,10 @@ class NeonEnvBuilder: sk.stop(immediate=True) for pageserver in self.env.pageservers: - pageserver.assert_no_metric_errors() + # if the test threw an exception, don't check for errors + # as a failing assertion would cause the cleanup below to fail + if exc_type is not None: + pageserver.assert_no_metric_errors() pageserver.stop(immediate=True) @@ -696,6 +776,13 @@ class NeonEnvBuilder: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e + try: + self.overlay_cleanup_teardown() + except Exception as e: + log.error(f"Error cleaning up overlay state: {e}") + if cleanup_error is not None: + cleanup_error = e + try: self.cleanup_remote_storage() except Exception as e: @@ -1017,6 +1104,7 @@ def neon_env_builder( default_broker: NeonBroker, run_id: uuid.UUID, request: FixtureRequest, + test_overlay_dir: Path, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1047,6 +1135,7 @@ def neon_env_builder( preserve_database_files=pytestconfig.getoption("--preserve-database-files"), test_name=request.node.name, test_output_dir=test_output_dir, + test_overlay_dir=test_overlay_dir, ) as builder: yield builder @@ -3194,10 +3283,10 @@ class S3Scrubber: raise -def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - """Compute the working directory for an individual test.""" +def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: + """Compute the path to a working directory for an individual test.""" test_name = request.node.name - test_dir = top_output_dir / test_name.replace("/", "-") + test_dir = top_output_dir / f"{prefix}{test_name.replace('/', '-')}" # We rerun flaky tests multiple times, use a separate directory for each run. if (suffix := getattr(request.node, "execution_count", None)) is not None: @@ -3209,6 +3298,21 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir +def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + The working directory for a test. + """ + return _get_test_dir(request, top_output_dir, "") + + +def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + Directory that contains `upperdir` and `workdir` for overlayfs mounts + that a test creates. See `NeonEnvBuilder.overlay_mount`. + """ + return _get_test_dir(request, top_output_dir, "overlay-") + + def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return get_test_output_dir(request, top_output_dir) / "repo" @@ -3236,8 +3340,12 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] # scope. So it uses the get_test_output_dir() function to get the path, and # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. +# +# NB: we request the overlay dir fixture so the fixture does its cleanups @pytest.fixture(scope="function", autouse=True) -def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[Path]: +def test_output_dir( + request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path +) -> Iterator[Path]: """Create the working directory for an individual test.""" # one directory per test @@ -3251,6 +3359,43 @@ def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[P allure_attach_from_dir(test_dir) +@pytest.fixture(scope="function") +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: + """ + Idempotently create a test's overlayfs mount state directory. + If the functionality isn't enabled via env var, returns None. + + The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc). + """ + + if os.getenv("NEON_ENV_BUILDER_FROM_REPO_DIR_USE_OVERLAYFS") is None: + return None + + overlay_dir = get_test_overlay_dir(request, top_output_dir) + log.info(f"test_overlay_dir is {overlay_dir}") + + overlay_dir.mkdir(exist_ok=True) + # unmount stale overlayfs mounts which subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir` + for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)): + cmd = ["sudo", "umount", str(mountpoint)] + log.info( + f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}" + ) + subprocess.run(cmd, capture_output=True, check=True) + # the overlayfs `workdir`` is owned by `root`, shutil.rmtree won't work. + cmd = ["sudo", "rm", "-rf", str(overlay_dir)] + subprocess.run(cmd, capture_output=True, check=True) + + overlay_dir.mkdir() + + return overlay_dir + + # no need to clean up anything: on clean shutdown, + # NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup + # and on unclean shutdown, this function will take care of it + # on the next test run + + SKIP_DIRS = frozenset( ( "pg_wal", diff --git a/test_runner/fixtures/overlayfs.py b/test_runner/fixtures/overlayfs.py new file mode 100644 index 0000000000..3e2f661893 --- /dev/null +++ b/test_runner/fixtures/overlayfs.py @@ -0,0 +1,16 @@ +from pathlib import Path +from typing import Iterator + +import psutil + + +def iter_mounts_beneath(topdir: Path) -> Iterator[Path]: + """ + Iterate over the overlayfs mounts beneath the specififed `topdir`. + The `topdir` itself isn't considered. + """ + for part in psutil.disk_partitions(all=True): + if part.fstype == "overlay": + mountpoint = Path(part.mountpoint) + if topdir in mountpoint.parents: + yield mountpoint