diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 969929fd30..45d99f5017 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -625,6 +625,56 @@ class NeonEnvBuilder:
         )
         self.overlay_mounts_created_by_us.append((ident, dstdir))
 
+    def _overlay_umount(self, mountpoint: Path):
+        """Unmount `mountpoint` via `sudo umount`; it must currently be a mount point."""
+        cmd = ["sudo", "umount", str(mountpoint)]
+        assert mountpoint.is_mount()
+        subprocess_capture(
+            self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+        )
+
+    def overlay_unmount_and_move(self, ident: str, dst: Path):
+        """
+        Unmount the overlayfs mount previously registered under `ident` and move its upperdir to `dst`.
+        If `dst` is an empty directory, it gets replaced.
+        Caller is responsible for ensuring the unmount will succeed, i.e., that there aren't any nested mounts.
+
+        Fails an assertion if self.test_overlay_dir is None or the state dirs of `ident` are missing.
+        Raises RuntimeError if no mount is registered for `ident`.
+        """
+        assert self.test_overlay_dir is not None
+        # not mutating state yet, make checks
+        ident_state_dir = self.test_overlay_dir / ident
+        assert ident_state_dir.is_dir()
+        upper = ident_state_dir / "upper"
+        work = ident_state_dir / "work"
+        assert upper.is_dir()
+        assert work.is_dir()
+        assert (
+            self.test_overlay_dir not in dst.parents
+        ), "otherwise workdir cleanup below wouldn't work"
+        # find index, still not mutating state
+        idxmap = {
+            existing_ident: idx
+            for idx, (existing_ident, _) in enumerate(self.overlay_mounts_created_by_us)
+        }
+        idx = idxmap.get(ident)
+        if idx is None:
+            raise RuntimeError(f"cannot find mount for ident {ident}")
+
+        if dst.is_dir():
+            dst.rmdir()  # raises exception if not empty, which is what we want
+
+        _, mountpoint = self.overlay_mounts_created_by_us.pop(idx)
+        self._overlay_umount(mountpoint)
+        upper.rename(dst)
+        # we moved the upperdir, clean up workdir and then its parent ident_state_dir
+        cmd = ["sudo", "rm", "-rf", str(work)]
+        subprocess_capture(
+            self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+        )
+        ident_state_dir.rmdir()  # should be empty since we moved `upper` out
+
     def overlay_cleanup_teardown(self):
         """
         Unmount the overlayfs mounts created by `self.overlay_mount()`.
@@ -635,12 +685,17 @@ class NeonEnvBuilder:
         while len(self.overlay_mounts_created_by_us) > 0:
             (ident, mountpoint) = self.overlay_mounts_created_by_us.pop()
             ident_state_dir = self.test_overlay_dir / ident
-            cmd = [ "sudo", "umount", str(mountpoint) ]
-            log.info(f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}")
-            subprocess_capture(self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True)
-            log.info(f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}")
-            cmd = [ "sudo", "rm", "-rf", str(ident_state_dir)]
-            subprocess_capture(self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True)
+            log.info(
+                f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}"
+            )
+            self._overlay_umount(mountpoint)
+            log.info(
+                f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}"
+            )
+            cmd = ["sudo", "rm", "-rf", str(ident_state_dir)]
+            subprocess_capture(
+                self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+            )
 
         # assert all overlayfs mounts in our test directory are gone
         assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir))
@@ -711,8 +766,15 @@ class NeonEnvBuilder:
         if self.preserve_database_files:
             return
 
+        overlayfs_mounts = {mountpoint for _, mountpoint in self.overlay_mounts_created_by_us}
+
         directories_to_clean: List[Path] = []
         for test_entry in Path(self.repo_dir).glob("**/*"):
+            # skip overlayfs mountpoints and everything beneath them
+            if test_entry in overlayfs_mounts or any(
+                parent in overlayfs_mounts for parent in test_entry.parents
+            ):
+                continue
             if test_entry.is_file():
                 test_file = test_entry
                 if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name):
@@ -761,13 +823,6 @@ class NeonEnvBuilder:
                     log.error(f"Error during remote storage scrub: {e}")
                     cleanup_error = e
 
-            try:
-                self.overlay_cleanup_teardown()
-            except Exception as e:
-                log.error(f"Error cleaning up overlay state: {e}")
-                if cleanup_error is not None:
-                    cleanup_error = e
-
             try:
                 self.cleanup_remote_storage()
             except Exception as e:
@@ -788,6 +843,14 @@ class NeonEnvBuilder:
             for pageserver in self.env.pageservers:
                 pageserver.assert_no_errors()
 
+            try:
+                self.overlay_cleanup_teardown()
+            except Exception as e:
+                log.error(f"Error cleaning up overlay state: {e}")
+                # record this error only if no earlier cleanup step failed, so the first error is kept
+                if cleanup_error is None:
+                    cleanup_error = e
+
 
 class NeonEnv:
     """
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index f3b9d563bb..5b44b96069 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -47,9 +47,22 @@ def single_timeline(
             {TenantId(t.name) for t in (snapshot_dir.path.glob("pageserver_*/tenants/*"))}
         )
         template_timeline = env.initial_timeline
-
-        neon_env_builder.start()
     else:
+        if snapshot_dir.path.exists():
+            shutil.rmtree(snapshot_dir.path)
+
+        if save_snapshot and neon_env_builder.test_overlay_dir is not None:
+            # Make repo_dir an overlayfs mount with lowerdir being the empty snapshot_dir.
+            # When we're done filling up repo_dir, tear everything down, unmount the overlayfs, and use
+            # the upperdir as the snapshot. This is equivalent to docker `FROM scratch`.
+            assert not neon_env_builder.repo_dir.exists()
+            assert neon_env_builder.repo_dir.parent.exists()
+            snapshot_dir.path.mkdir()
+            neon_env_builder.overlay_mount(
+                "create-snapshot-repo-dir", snapshot_dir.path, neon_env_builder.repo_dir
+            )
+            neon_env_builder.config_init_force = "empty-dir-ok"
+
         env = neon_env_builder.init_start()
 
         remote_storage = env.pageserver_remote_storage
@@ -88,7 +101,7 @@ def single_timeline(
                 config=template_config.copy(),
             )
             time.sleep(0.1)
-            wait_until_tenant_state(ps_http, tenant, "Broken", 3)
+            wait_until_tenant_state(ps_http, tenant, "Broken", 10)
 
         work_queue.do(22, tenants, attach_broken)
 
@@ -100,7 +113,29 @@ def single_timeline(
         fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
             env, tenant_timelines
         )
-        env.pageserver.start()
+
+    if save_snapshot:
+        env.stop(immediate=True, ps_assert_metric_no_errors=True)
+        if neon_env_builder.test_overlay_dir is None:
+            log.info("take snapshot using shutil.copytree")
+            shutil.copytree(env.repo_dir, snapshot_dir.path)
+        else:
+            log.info("take snapshot by using overlayfs upperdir")
+            neon_env_builder.overlay_unmount_and_move(
+                "create-snapshot-repo-dir", snapshot_dir.path
+            )
+            log.info("remove empty repo_dir (previously mountpoint) for snapshot overlay_mount")
+            env.repo_dir.rmdir()
+            # TODO from here on, we should be able to reset / goto top where snapshot_dir.is_initialized()
+            log.info("make repo_dir an overlayfs mount of the snapshot we just created")
+            neon_env_builder.overlay_mount(
+                "repo-dir-after-taking-snapshot", snapshot_dir.path, env.repo_dir
+            )
+        snapshot_dir.set_initialized()
+    else:
+        log.info("skip taking snapshot")
+
+    env.start()
 
     log.info(f"wait for tenants to become active")
     for tenant in tenants:
@@ -113,14 +148,5 @@ def single_timeline(
         for layer in info.historic_layers:
             assert not layer.remote
 
-    # take snapshot after download all layers so tenant dir restoration is fast
-    # TODO: use overlayfs to make this step less costly; we'd implement half of docker at that point
-    if save_snapshot:
-        log.info(f"take snapshot")
-        shutil.copytree(env.repo_dir, snapshot_dir.path)
-        snapshot_dir.set_initialized()
-    else:
-        log.info("skip taking snapshot")
-
     log.info("ready")
     return SingleTimeline(env, template_timeline, tenants)