tests: debugging for test_secondary_downloads failures (#6984)

## Problem

- #6966 
- Existing logs aren't pointing to a cause: it looks like heatmap upload
and download are happening, but for some reason the evicted layer isn't
removed on the secondary location.

## Summary of changes

- Assert evicted layer is gone from heatmap before checking its gone
from local disk: this will give clarity on whether the issue is with the
uploads or downloads.
- On assertion failures, log the contents of heatmap.
This commit is contained in:
John Spray
2024-03-04 09:10:04 +00:00
committed by GitHub
parent fad9be4598
commit 8dc7dc79dd
2 changed files with 40 additions and 11 deletions

View File

@@ -252,6 +252,16 @@ class S3Storage:
log.info(f"deleted {cnt} objects from remote storage")
def tenant_path(self, tenant_id: TenantId) -> str:
return f"{self.prefix_in_bucket}/tenants/{tenant_id}"
def heatmap_key(self, tenant_id: TenantId) -> str:
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
def heatmap_content(self, tenant_id: TenantId):
r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id))
return json.loads(r["Body"].read().decode("utf-8"))
RemoteStorage = Union[LocalFsStorage, S3Storage]

View File

@@ -1,3 +1,4 @@
import json
import random
from pathlib import Path
from typing import Any, Dict, Optional
@@ -10,7 +11,7 @@ from fixtures.pageserver.utils import (
poll_for_remote_storage_iterations,
tenant_delete_wait_completed,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -436,6 +437,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -491,18 +493,35 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# Do evictions on attached pageserver, check secondary follows along
# ==================================================================
log.info("Evicting a layer...")
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
try:
log.info("Evicting a layer...")
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1]
log.info(f"Victim layer: {layer_to_evict.name}")
ps_attached.http_client().evict_layer(
tenant_id, timeline_id, layer_name=layer_to_evict.name
)
log.info("Synchronizing after eviction...")
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
ps_secondary.http_client().tenant_secondary_download(tenant_id)
log.info("Synchronizing after eviction...")
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id)
heatmap_layers = set(
layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"]
)
assert layer_to_evict.name not in heatmap_layers
assert some_other_layer.name in heatmap_layers
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
ps_secondary.http_client().tenant_secondary_download(tenant_id)
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
except:
# On assertion failures, log some details to help with debugging
heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}")
raise
# Scrub the remote storage
# ========================