tests: wait for manifest persistence in test_timeline_archival_chaos (#10719)

## Problem

This test would sometimes fail its assertion that a timeline does not
revert to active once archived. That's because it was using the
in-memory offload state, not the persistent state, so this was sometimes
lost across a pageserver restart.

Closes: https://github.com/neondatabase/neon/issues/10389

## Summary of changes

- When reading offload status, read from pageserver API _and_ remote
storage before considering the timeline offloaded
This commit is contained in:
John Spray
2025-02-07 16:27:39 +00:00
committed by GitHub
parent 0abff59e97
commit 5e95860e70
2 changed files with 73 additions and 8 deletions

View File

@@ -282,18 +282,35 @@ class S3Storage:
def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str:
return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"
def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str:
"""
Gets the latest generation key from a list of keys.
@param index_keys: A list of keys of different generations, which start with `prefix`
"""
def parse_gen(key: str) -> int:
shortname = key.split("/")[-1]
generation_str = shortname.removeprefix(prefix).removesuffix(suffix)
try:
return int(generation_str, base=16)
except ValueError:
log.info(f"Ignoring non-matching key: {key}")
return -1
if len(keys) == 0:
raise IndexError("No keys found")
return max(keys, key=parse_gen)
def get_latest_index_key(self, index_keys: list[str]) -> str:
"""
Gets the latest index file key.
@param index_keys: A list of index keys of different generations.
"""
def parse_gen(index_key: str) -> int:
parts = index_key.split("index_part.json-")
return int(parts[-1], base=16) if len(parts) == 2 else -1
return max(index_keys, key=parse_gen)
key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys)
return key
def download_index_part(self, index_key: str) -> IndexPartDump:
"""
@@ -306,6 +323,29 @@ class S3Storage:
log.info(f"index_part.json: {body}")
return IndexPartDump.from_json(json.loads(body))
def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None:
tenant_prefix = self.tenant_path(tenant_id)
objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[
"Contents"
]
keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1]
try:
manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys)
except IndexError:
log.info(
f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet"
)
return None
response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key)
body = response["Body"].read().decode("utf-8")
log.info(f"Downloaded manifest {manifest_key}: {body}")
manifest = json.loads(body)
assert isinstance(manifest, dict)
return manifest
def heatmap_key(self, tenant_id: TenantId) -> str:
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"

View File

@@ -554,8 +554,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
log.info(f"Timeline {state.timeline_id} is still active")
shutdown.wait(0.5)
elif state.timeline_id in offloaded_ids:
log.info(f"Timeline {state.timeline_id} is now offloaded")
state.offloaded = True
log.info(f"Timeline {state.timeline_id} is now offloaded in memory")
# Hack: when we see something offloaded in the API, it doesn't guarantee that the offload
# is persistent (it is marked offloaded first, then that is persisted to the tenant manifest).
# So we wait until we see the manifest update before considering it offloaded, that way
# subsequent checks that it doesn't revert to active on a restart will pass reliably.
time.sleep(0.1)
assert isinstance(env.pageserver_remote_storage, S3Storage)
manifest = env.pageserver_remote_storage.download_tenant_manifest(
tenant_id
)
if manifest is None:
log.info(
f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)"
)
elif str(state.timeline_id) in [
t["timeline_id"] for t in manifest["offloaded_timelines"]
]:
log.info(
f"Timeline {state.timeline_id} is now offloaded persistently"
)
state.offloaded = True
else:
log.info(
f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})"
)
break
else:
# Timeline is neither offloaded nor active, this is unexpected: the pageserver