mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 20:42:54 +00:00
tests: wait for manifest persistence in test_timeline_archival_chaos (#10719)
## Problem This test would sometimes fail its assertion that a timeline does not revert to active once archived. That's because it was using the in-memory offload state, not the persistent state, so this was sometimes lost across a pageserver restart. Closes: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - When reading offload status, read from pageserver API _and_ remote storage before considering the timeline offloaded
This commit is contained in:
@@ -282,18 +282,35 @@ class S3Storage:
|
||||
def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str:
|
||||
return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"
|
||||
|
||||
def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str:
|
||||
"""
|
||||
Gets the latest generation key from a list of keys.
|
||||
|
||||
@param index_keys: A list of keys of different generations, which start with `prefix`
|
||||
"""
|
||||
|
||||
def parse_gen(key: str) -> int:
|
||||
shortname = key.split("/")[-1]
|
||||
generation_str = shortname.removeprefix(prefix).removesuffix(suffix)
|
||||
try:
|
||||
return int(generation_str, base=16)
|
||||
except ValueError:
|
||||
log.info(f"Ignoring non-matching key: {key}")
|
||||
return -1
|
||||
|
||||
if len(keys) == 0:
|
||||
raise IndexError("No keys found")
|
||||
|
||||
return max(keys, key=parse_gen)
|
||||
|
||||
def get_latest_index_key(self, index_keys: list[str]) -> str:
|
||||
"""
|
||||
Gets the latest index file key.
|
||||
|
||||
@param index_keys: A list of index keys of different generations.
|
||||
"""
|
||||
|
||||
def parse_gen(index_key: str) -> int:
|
||||
parts = index_key.split("index_part.json-")
|
||||
return int(parts[-1], base=16) if len(parts) == 2 else -1
|
||||
|
||||
return max(index_keys, key=parse_gen)
|
||||
key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys)
|
||||
return key
|
||||
|
||||
def download_index_part(self, index_key: str) -> IndexPartDump:
|
||||
"""
|
||||
@@ -306,6 +323,29 @@ class S3Storage:
|
||||
log.info(f"index_part.json: {body}")
|
||||
return IndexPartDump.from_json(json.loads(body))
|
||||
|
||||
def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None:
|
||||
tenant_prefix = self.tenant_path(tenant_id)
|
||||
|
||||
objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[
|
||||
"Contents"
|
||||
]
|
||||
keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1]
|
||||
try:
|
||||
manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys)
|
||||
except IndexError:
|
||||
log.info(
|
||||
f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet"
|
||||
)
|
||||
return None
|
||||
|
||||
response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key)
|
||||
body = response["Body"].read().decode("utf-8")
|
||||
log.info(f"Downloaded manifest {manifest_key}: {body}")
|
||||
|
||||
manifest = json.loads(body)
|
||||
assert isinstance(manifest, dict)
|
||||
return manifest
|
||||
|
||||
def heatmap_key(self, tenant_id: TenantId) -> str:
|
||||
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
|
||||
|
||||
|
||||
@@ -554,8 +554,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
|
||||
log.info(f"Timeline {state.timeline_id} is still active")
|
||||
shutdown.wait(0.5)
|
||||
elif state.timeline_id in offloaded_ids:
|
||||
log.info(f"Timeline {state.timeline_id} is now offloaded")
|
||||
state.offloaded = True
|
||||
log.info(f"Timeline {state.timeline_id} is now offloaded in memory")
|
||||
|
||||
# Hack: when we see something offloaded in the API, it doesn't guarantee that the offload
|
||||
# is persistent (it is marked offloaded first, then that is persisted to the tenant manifest).
|
||||
# So we wait until we see the manifest update before considering it offloaded, that way
|
||||
# subsequent checks that it doesn't revert to active on a restart will pass reliably.
|
||||
time.sleep(0.1)
|
||||
assert isinstance(env.pageserver_remote_storage, S3Storage)
|
||||
manifest = env.pageserver_remote_storage.download_tenant_manifest(
|
||||
tenant_id
|
||||
)
|
||||
if manifest is None:
|
||||
log.info(
|
||||
f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)"
|
||||
)
|
||||
elif str(state.timeline_id) in [
|
||||
t["timeline_id"] for t in manifest["offloaded_timelines"]
|
||||
]:
|
||||
log.info(
|
||||
f"Timeline {state.timeline_id} is now offloaded persistently"
|
||||
)
|
||||
state.offloaded = True
|
||||
else:
|
||||
log.info(
|
||||
f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})"
|
||||
)
|
||||
|
||||
break
|
||||
else:
|
||||
# Timeline is neither offloaded nor active, this is unexpected: the pageserver
|
||||
|
||||
Reference in New Issue
Block a user