# s3_scrubber: add tenant-snapshot (#7444)
## Problem

Downloading tenant data for analysis/debug with `aws s3 cp` works well for small tenants, but for larger tenants it is unlikely that one ends up with an index that matches the layer files, because of the time the download takes.

## Summary of changes

- Add a `tenant-snapshot` command to the scrubber, which reads timeline indices and then downloads the layers referenced in each index, even if they have since been deleted. The result is a snapshot of the tenant's remote storage state that should be usable when imported (#7399).
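For orientation, here is a minimal sketch of how the new command is driven from a test, mirroring the fixture plumbing added in the diff below. The test name is hypothetical; all fixture calls (`enable_pageserver_remote_storage`, `s3_storage`, `init_start`, `S3Scrubber.tenant_snapshot`) appear in this change:

```python
from fixtures.neon_fixtures import NeonEnvBuilder, S3Scrubber
from fixtures.remote_storage import s3_storage


def test_snapshot_sketch(neon_env_builder: NeonEnvBuilder):
    # Hypothetical test, sketching the flow of the real test added below.
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start()

    output_path = neon_env_builder.test_output_dir / "snapshot"
    output_path.mkdir()

    # Under the hood this runs the scrubber binary with:
    #   tenant-snapshot --tenant-id <id> --output-path <dir>
    S3Scrubber(neon_env_builder).tenant_snapshot(env.initial_tenant, output_path)
```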
test_runner/fixtures/neon_fixtures.py:

```diff
@@ -2310,20 +2310,24 @@ class NeonPageserver(PgProtocol):
         # The entries in the list are regular expressions.
         self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
 
-    def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
+    def timeline_dir(
+        self,
+        tenant_shard_id: Union[TenantId, TenantShardId],
+        timeline_id: Optional[TimelineId] = None,
+    ) -> Path:
         """Get a timeline directory's path based on the repo directory of the test environment"""
         if timeline_id is None:
-            return self.tenant_dir(tenant_id) / "timelines"
-        return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
+            return self.tenant_dir(tenant_shard_id) / "timelines"
+        return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id)
 
     def tenant_dir(
         self,
-        tenant_id: Optional[TenantId] = None,
+        tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None,
     ) -> Path:
         """Get a tenant directory's path based on the repo directory of the test environment"""
-        if tenant_id is None:
+        if tenant_shard_id is None:
             return self.workdir / "tenants"
-        return self.workdir / "tenants" / str(tenant_id)
+        return self.workdir / "tenants" / str(tenant_shard_id)
 
     def start(
         self,
@@ -2510,8 +2514,10 @@ class NeonPageserver(PgProtocol):
         client = self.http_client()
         return client.tenant_location_conf(tenant_id, config, **kwargs)
 
-    def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]:
-        path = self.tenant_dir(tenant_id) / "config-v1"
+    def read_tenant_location_conf(
+        self, tenant_shard_id: Union[TenantId, TenantShardId]
+    ) -> dict[str, Any]:
+        path = self.tenant_dir(tenant_shard_id) / "config-v1"
         log.info(f"Reading location conf from {path}")
         bytes = open(path, "r").read()
         try:
@@ -3715,7 +3721,7 @@ class S3Scrubber:
             log.warning(f"Scrub environment: {env}")
             log.warning(f"Output at: {output_path}")
 
-            raise RuntimeError("Remote storage scrub failed")
+            raise RuntimeError(f"Scrubber failed while running {args}")
 
         assert stdout is not None
         return stdout
@@ -3730,6 +3736,13 @@ class S3Scrubber:
             log.error(stdout)
             raise
 
+    def tenant_snapshot(self, tenant_id: TenantId, output_path: Path):
+        stdout = self.scrubber_cli(
+            ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)],
+            timeout=30,
+        )
+        log.info(f"tenant-snapshot output: {stdout}")
+
 
 def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path:
     """Compute the path to a working directory for an individual test."""
```
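The widened signatures above accept either a plain `TenantId` or a `TenantShardId`; both stringify into the directory name. A quick sketch of the resulting local paths (not from the patch; the workdir and id values are illustrative):

```python
# Sketch only: illustrates the path construction in tenant_dir/timeline_dir
# above. The workdir and ids are made-up values.
from pathlib import Path

workdir = Path("/repo")                          # illustrative workdir
tenant = "3fa85f6457174562b3fc2c963f66afa6"      # illustrative TenantId
shard = f"{tenant}-0104"                         # TenantShardId string form

print(workdir / "tenants" / tenant)  # /repo/tenants/3fa8...afa6
print(workdir / "tenants" / shard)   # /repo/tenants/3fa8...afa6-0104
```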
test_runner/fixtures/remote_storage.py:

```diff
@@ -252,8 +252,11 @@ class S3Storage:
 
         log.info(f"deleted {cnt} objects from remote storage")
 
+    def tenants_path(self) -> str:
+        return f"{self.prefix_in_bucket}/tenants"
+
     def tenant_path(self, tenant_id: TenantId) -> str:
-        return f"{self.prefix_in_bucket}/tenants/{tenant_id}"
+        return f"{self.tenants_path()}/{tenant_id}"
 
     def heatmap_key(self, tenant_id: TenantId) -> str:
         return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
@@ -262,6 +265,9 @@ class S3Storage:
         r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id))
         return json.loads(r["Body"].read().decode("utf-8"))
 
+    def mock_remote_tenant_path(self, tenant_id: TenantId):
+        assert self.real is False
+
 
 RemoteStorage = Union[LocalFsStorage, S3Storage]
 
```
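A small sketch of the key layout these helpers produce (not from the patch; the prefix and tenant id values are illustrative):

```python
# Sketch only: mirrors the f-strings in tenants_path()/tenant_path() above.
prefix_in_bucket = "pageserver/v1"               # illustrative bucket prefix
tenant_id = "3fa85f6457174562b3fc2c963f66afa6"   # illustrative tenant id

tenants_path = f"{prefix_in_bucket}/tenants"
tenant_path = f"{tenants_path}/{tenant_id}"
print(tenants_path)  # pageserver/v1/tenants
print(tenant_path)   # pageserver/v1/tenants/3fa85f6457174562b3fc2c963f66afa6
```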
test_runner/fixtures/types.py:

```diff
@@ -156,7 +156,11 @@ class TenantShardId:
             raise ValueError(f"Invalid TenantShardId '{input}'")
 
     def __str__(self):
-        return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
+        if self.shard_count > 0:
+            return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
+        else:
+            # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id)
+            return str(self.tenant_id)
 
     def __repr__(self):
         return self.__str__()
```
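With this change, an unsharded tenant's shard id renders as the bare tenant id, matching Rust's `TenantShardId::unsharded`. An illustrative sketch of the two cases (the tenant id value is made up; the positional constructor arguments match the test below):

```python
from fixtures.types import TenantId, TenantShardId

# Sketch only: the hex id is arbitrary.
tid = TenantId("3fa85f6457174562b3fc2c963f66afa6")
print(TenantShardId(tid, 0, 0))  # unsharded -> "3fa85f6457174562b3fc2c963f66afa6"
print(TenantShardId(tid, 1, 4))  # sharded   -> "3fa85f6457174562b3fc2c963f66afa6-0104"
```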
test_runner/regress/test_s3_scrubber.py (new file, 111 lines):

```python
import os
import shutil
from typing import Optional

import pytest
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    S3Scrubber,
)
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.types import TenantShardId
from fixtures.workload import Workload


@pytest.mark.parametrize("shard_count", [None, 4])
def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
    """
    Test the `tenant-snapshot` subcommand, which grabs data from remote storage

    This is only a support/debug tool, but worth testing to ensure the tool does not regress.
    """

    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1

    env = neon_env_builder.init_start()
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    branch = "main"

    # Do some work
    workload = Workload(env, tenant_id, timeline_id, branch)
    workload.init()

    # Multiple write/flush passes to generate multiple layers
    for _n in range(0, 3):
        workload.write_rows(128)

    # Do some more work after a restart, so that we have multiple generations
    for pageserver in env.pageservers:
        pageserver.stop()
        pageserver.start()

    for _n in range(0, 3):
        workload.write_rows(128)

    # If we're doing multiple shards, split: this is important to exercise
    # the scrubber's ability to understand the references from child shards to parent shard's layers
    if shard_count is not None:
        tenant_shard_ids = env.storage_controller.tenant_shard_split(
            tenant_id, shard_count=shard_count
        )

        # Write after shard split: this will result in shards containing a mixture of owned
        # and parent layers in their index.
        workload.write_rows(128)
    else:
        tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)]

    output_path = neon_env_builder.test_output_dir / "snapshot"
    os.makedirs(output_path)

    scrubber = S3Scrubber(neon_env_builder)
    scrubber.tenant_snapshot(tenant_id, output_path)

    assert len(os.listdir(output_path)) > 0

    workload.stop()

    # Stop pageservers
    for pageserver in env.pageservers:
        pageserver.stop()

    # Drop all shards' local storage
    for tenant_shard_id in tenant_shard_ids:
        pageserver = env.get_tenant_pageserver(tenant_shard_id)
        shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id))

    # Replace remote storage contents with the snapshot we downloaded
    assert isinstance(env.pageserver_remote_storage, S3Storage)

    remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id)

    # Delete current remote storage contents
    bucket = env.pageserver_remote_storage.bucket_name
    remote_client = env.pageserver_remote_storage.client
    deleted = 0
    for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[
        "Contents"
    ]:
        key = object["Key"]
        remote_client.delete_object(Key=key, Bucket=bucket)
        deleted += 1
    assert deleted > 0

    # Upload from snapshot
    for root, _dirs, files in os.walk(output_path):
        for file in files:
            full_local_path = os.path.join(root, file)
            full_remote_path = (
                env.pageserver_remote_storage.tenants_path()
                + "/"
                + full_local_path.removeprefix(f"{output_path}/")
            )
            remote_client.upload_file(full_local_path, bucket, full_remote_path)

    for pageserver in env.pageservers:
        pageserver.start()

    # Check we can read everything
    workload.validate()
```