From 5e95860e708ee1cd09f740b964f9a8c369ddd0bc Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 7 Feb 2025 16:27:39 +0000 Subject: [PATCH] tests: wait for manifest persistence in test_timeline_archival_chaos (#10719) ## Problem This test would sometimes fail its assertion that a timeline does not revert to active once archived. That's because it was using the in-memory offload state, not the persistent state, so this was sometimes lost across a pageserver restart. Closes: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - When reading offload status, read from pageserver API _and_ remote storage before considering the timeline offloaded --- test_runner/fixtures/remote_storage.py | 52 +++++++++++++++++--- test_runner/regress/test_timeline_archive.py | 29 ++++++++++- 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index d969971a35..4df2b2df2b 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,18 +282,35 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: + """ + Gets the latest generation key from a list of keys. + + @param index_keys: A list of keys of different generations, which start with `prefix` + """ + + def parse_gen(key: str) -> int: + shortname = key.split("/")[-1] + generation_str = shortname.removeprefix(prefix).removesuffix(suffix) + try: + return int(generation_str, base=16) + except ValueError: + log.info(f"Ignoring non-matching key: {key}") + return -1 + + if len(keys) == 0: + raise IndexError("No keys found") + + return max(keys, key=parse_gen) + def get_latest_index_key(self, index_keys: list[str]) -> str: """ Gets the latest index file key. @param index_keys: A list of index keys of different generations. """ - - def parse_gen(index_key: str) -> int: - parts = index_key.split("index_part.json-") - return int(parts[-1], base=16) if len(parts) == 2 else -1 - - return max(index_keys, key=parse_gen) + key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys) + return key def download_index_part(self, index_key: str) -> IndexPartDump: """ @@ -306,6 +323,29 @@ class S3Storage: log.info(f"index_part.json: {body}") return IndexPartDump.from_json(json.loads(body)) + def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None: + tenant_prefix = self.tenant_path(tenant_id) + + objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[ + "Contents" + ] + keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1] + try: + manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys) + except IndexError: + log.info( + f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet" + ) + return None + + response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key) + body = response["Body"].read().decode("utf-8") + log.info(f"Downloaded manifest {manifest_key}: {body}") + + manifest = json.loads(body) + assert isinstance(manifest, dict) + return manifest + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 306e971657..50f674f539 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -554,8 +554,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): log.info(f"Timeline {state.timeline_id} is still active") shutdown.wait(0.5) elif state.timeline_id in offloaded_ids: - log.info(f"Timeline {state.timeline_id} is now offloaded") - state.offloaded = True + log.info(f"Timeline {state.timeline_id} is now offloaded in memory") + + # Hack: when we see something offloaded in the API, it doesn't guarantee that the offload + # is persistent (it is marked offloaded first, then that is persisted to the tenant manifest). + # So we wait until we see the manifest update before considering it offloaded, that way + # subsequent checks that it doesn't revert to active on a restart will pass reliably. + time.sleep(0.1) + assert isinstance(env.pageserver_remote_storage, S3Storage) + manifest = env.pageserver_remote_storage.download_tenant_manifest( + tenant_id + ) + if manifest is None: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)" + ) + elif str(state.timeline_id) in [ + t["timeline_id"] for t in manifest["offloaded_timelines"] + ]: + log.info( + f"Timeline {state.timeline_id} is now offloaded persistently" + ) + state.offloaded = True + else: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})" + ) + break else: # Timeline is neither offloaded nor active, this is unexpected: the pageserver