From 5e95860e708ee1cd09f740b964f9a8c369ddd0bc Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 7 Feb 2025 16:27:39 +0000
Subject: [PATCH] tests: wait for manifest persistence in
 test_timeline_archival_chaos (#10719)

## Problem

This test would sometimes fail its assertion that a timeline does not
revert to active once archived. That's because it was using the
in-memory offload state, not the persistent state, so this was sometimes
lost across a pageserver restart.

Closes: https://github.com/neondatabase/neon/issues/10389

## Summary of changes

- When reading offload status, read from pageserver API _and_ remote
storage before considering the timeline offloaded
---
 test_runner/fixtures/remote_storage.py       | 52 +++++++++++++++++---
 test_runner/regress/test_timeline_archive.py | 29 ++++++++++-
 2 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index d969971a35..4df2b2df2b 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -282,18 +282,35 @@ class S3Storage:
     def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str:
         return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"
 
+    def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str:
+        """
+        Gets the latest generation key from a list of keys.
+
+        @param index_keys: A list of keys of different generations, which start with `prefix`
+        """
+
+        def parse_gen(key: str) -> int:
+            shortname = key.split("/")[-1]
+            generation_str = shortname.removeprefix(prefix).removesuffix(suffix)
+            try:
+                return int(generation_str, base=16)
+            except ValueError:
+                log.info(f"Ignoring non-matching key: {key}")
+                return -1
+
+        if len(keys) == 0:
+            raise IndexError("No keys found")
+
+        return max(keys, key=parse_gen)
+
     def get_latest_index_key(self, index_keys: list[str]) -> str:
         """
         Gets the latest index file key.
 
         @param index_keys: A list of index keys of different generations.
         """
-
-        def parse_gen(index_key: str) -> int:
-            parts = index_key.split("index_part.json-")
-            return int(parts[-1], base=16) if len(parts) == 2 else -1
-
-        return max(index_keys, key=parse_gen)
+        key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys)
+        return key
 
     def download_index_part(self, index_key: str) -> IndexPartDump:
         """
@@ -306,6 +323,29 @@ class S3Storage:
         log.info(f"index_part.json: {body}")
         return IndexPartDump.from_json(json.loads(body))
 
+    def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None:
+        tenant_prefix = self.tenant_path(tenant_id)
+
+        objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[
+            "Contents"
+        ]
+        keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1]
+        try:
+            manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys)
+        except IndexError:
+            log.info(
+                f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet"
+            )
+            return None
+
+        response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key)
+        body = response["Body"].read().decode("utf-8")
+        log.info(f"Downloaded manifest {manifest_key}: {body}")
+
+        manifest = json.loads(body)
+        assert isinstance(manifest, dict)
+        return manifest
+
     def heatmap_key(self, tenant_id: TenantId) -> str:
         return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
 
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 306e971657..50f674f539 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -554,8 +554,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
                                 log.info(f"Timeline {state.timeline_id} is still active")
                                 shutdown.wait(0.5)
                             elif state.timeline_id in offloaded_ids:
-                                log.info(f"Timeline {state.timeline_id} is now offloaded")
-                                state.offloaded = True
+                                log.info(f"Timeline {state.timeline_id} is now offloaded in memory")
+
+                                # Hack: when we see something offloaded in the API, it doesn't guarantee that the offload
+                                # is persistent (it is marked offloaded first, then that is persisted to the tenant manifest).
+                                # So we wait until we see the manifest update before considering it offloaded, that way
+                                # subsequent checks that it doesn't revert to active on a restart will pass reliably.
+                                time.sleep(0.1)
+                                assert isinstance(env.pageserver_remote_storage, S3Storage)
+                                manifest = env.pageserver_remote_storage.download_tenant_manifest(
+                                    tenant_id
+                                )
+                                if manifest is None:
+                                    log.info(
+                                        f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)"
+                                    )
+                                elif str(state.timeline_id) in [
+                                    t["timeline_id"] for t in manifest["offloaded_timelines"]
+                                ]:
+                                    log.info(
+                                        f"Timeline {state.timeline_id} is now offloaded persistently"
+                                    )
+                                    state.offloaded = True
+                                else:
+                                    log.info(
+                                        f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})"
+                                    )
+
                                 break
                             else:
                                 # Timeline is neither offloaded nor active, this is unexpected: the pageserver