mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 17:40:37 +00:00
Merge remote-tracking branch 'upstream/main' into jcsp/storcon-split-refine
This commit is contained in:
@@ -96,5 +96,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_evictions_total",
|
||||
"pageserver_evictions_with_low_residence_duration_total",
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
# "pageserver_directory_entries_count", -- only used if above a certain threshold
|
||||
# "pageserver_broken_tenants_count" -- used only for broken
|
||||
)
|
||||
|
||||
@@ -899,7 +899,7 @@ class NeonEnvBuilder:
|
||||
|
||||
if self.scrub_on_exit:
|
||||
try:
|
||||
S3Scrubber(self.test_output_dir, self).scan_metadata()
|
||||
S3Scrubber(self).scan_metadata()
|
||||
except Exception as e:
|
||||
log.error(f"Error during remote storage scrub: {e}")
|
||||
cleanup_error = e
|
||||
@@ -3659,9 +3659,9 @@ class SafekeeperHttpClient(requests.Session):
|
||||
|
||||
|
||||
class S3Scrubber:
|
||||
def __init__(self, log_dir: Path, env: NeonEnvBuilder):
|
||||
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
|
||||
self.env = env
|
||||
self.log_dir = log_dir
|
||||
self.log_dir = log_dir or env.test_output_dir
|
||||
|
||||
def scrubber_cli(self, args: list[str], timeout) -> str:
|
||||
assert isinstance(self.env.pageserver_remote_storage, S3Storage)
|
||||
@@ -3682,7 +3682,7 @@ class S3Scrubber:
|
||||
args = base_args + args
|
||||
|
||||
(output_path, stdout, status_code) = subprocess_capture(
|
||||
self.log_dir,
|
||||
self.env.test_output_dir,
|
||||
args,
|
||||
echo_stderr=True,
|
||||
echo_stdout=True,
|
||||
@@ -3967,27 +3967,24 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
|
||||
|
||||
# pg is the existing and running compute node, that we want to compare with a basebackup
|
||||
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
|
||||
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||
|
||||
# Get the timeline ID. We need it for the 'basebackup' command
|
||||
timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
|
||||
|
||||
# many tests already checkpoint, but do it just in case
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CHECKPOINT")
|
||||
|
||||
# wait for pageserver to catch up
|
||||
wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
|
||||
# stop postgres to ensure that files won't change
|
||||
endpoint.stop()
|
||||
|
||||
# Read the shutdown checkpoint's LSN
|
||||
pg_controldata_path = os.path.join(pg_bin.pg_bin_path, "pg_controldata")
|
||||
cmd = f"{pg_controldata_path} -D {endpoint.pgdata_dir}"
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
|
||||
checkpoint_lsn = re.findall(
|
||||
"Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
|
||||
)[0]
|
||||
log.debug(f"last checkpoint at {checkpoint_lsn}")
|
||||
|
||||
# Take a basebackup from pageserver
|
||||
restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
|
||||
restored_dir_path.mkdir(exist_ok=True)
|
||||
|
||||
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||
psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
|
||||
|
||||
pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
|
||||
@@ -3995,7 +3992,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
|
||||
{psql_path} \
|
||||
--no-psqlrc \
|
||||
postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \
|
||||
-c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \
|
||||
-c 'basebackup {endpoint.tenant_id} {timeline_id}' \
|
||||
| tar -x -C {restored_dir_path}
|
||||
"""
|
||||
|
||||
|
||||
@@ -893,37 +893,14 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
|
||||
# in its heatmap
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
# Configure the secondary pageserver to have a phony small disk size
|
||||
ps_secondary.stop()
|
||||
total_size, _, _ = env.timelines_du(ps_secondary)
|
||||
blocksize = 512
|
||||
total_blocks = (total_size + (blocksize - 1)) // blocksize
|
||||
evict_bytes = total_size // 3
|
||||
|
||||
min_avail_bytes = total_size // 3
|
||||
|
||||
env.pageserver_start_with_disk_usage_eviction(
|
||||
ps_secondary,
|
||||
period="1s",
|
||||
max_usage_pct=100,
|
||||
min_avail_bytes=min_avail_bytes,
|
||||
mock_behavior={
|
||||
"type": "Success",
|
||||
"blocksize": blocksize,
|
||||
"total_blocks": total_blocks,
|
||||
# Only count layer files towards used bytes in the mock_statvfs.
|
||||
# This avoids accounting for metadata files & tenant conf in the tests.
|
||||
"name_filter": ".*__.*",
|
||||
},
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
def relieved_log_message():
|
||||
assert ps_secondary.log_contains(".*disk usage pressure relieved")
|
||||
|
||||
wait_until(10, 1, relieved_log_message)
|
||||
response = ps_secondary.http_client().disk_usage_eviction_run({"evict_bytes": evict_bytes})
|
||||
log.info(f"{response}")
|
||||
|
||||
post_eviction_total_size, _, _ = env.timelines_du(ps_secondary)
|
||||
|
||||
assert (
|
||||
total_size - post_eviction_total_size >= min_avail_bytes
|
||||
), "we requested at least min_avail_bytes worth of free space"
|
||||
total_size - post_eviction_total_size >= evict_bytes
|
||||
), "we requested at least evict_bytes worth of free space"
|
||||
|
||||
@@ -265,9 +265,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Having written a mixture of generation-aware and legacy index_part.json,
|
||||
# ensure the scrubber handles the situation as expected.
|
||||
metadata_summary = S3Scrubber(
|
||||
neon_env_builder.test_output_dir, neon_env_builder
|
||||
).scan_metadata()
|
||||
metadata_summary = S3Scrubber(neon_env_builder).scan_metadata()
|
||||
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
|
||||
assert metadata_summary["timeline_count"] == 1
|
||||
assert metadata_summary["timeline_shard_count"] == 1
|
||||
|
||||
@@ -498,7 +498,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
# Scrub the remote storage
|
||||
# ========================
|
||||
# This confirms that the scrubber isn't upset by the presence of the heatmap
|
||||
S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()
|
||||
S3Scrubber(neon_env_builder).scan_metadata()
|
||||
|
||||
# Detach secondary and delete tenant
|
||||
# ===================================
|
||||
|
||||
@@ -9,6 +9,7 @@ from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
S3Scrubber,
|
||||
last_flush_lsn_upload,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
@@ -19,12 +20,13 @@ from fixtures.pageserver.utils import (
|
||||
assert_prefix_not_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
tenant_delete_wait_completed,
|
||||
wait_for_upload,
|
||||
wait_tenant_status_404,
|
||||
wait_until_tenant_active,
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import run_pg_bench_small, wait_until
|
||||
from requests.exceptions import ReadTimeout
|
||||
|
||||
@@ -669,3 +671,39 @@ def test_tenant_delete_races_timeline_creation(
|
||||
|
||||
# Zero tenants remain (we deleted the default tenant)
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
|
||||
|
||||
|
||||
def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Validate that creating and then deleting the tenant both survives the scrubber,
|
||||
and that one can run the scrubber without problems.
|
||||
"""
|
||||
|
||||
remote_storage_kind = RemoteStorageKind.MOCK_S3
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
scrubber = S3Scrubber(neon_env_builder)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
# create a tenant separate from the main tenant so that we have one remaining
|
||||
# after we deleted it, as the scrubber treats empty buckets as an error.
|
||||
(tenant_id, timeline_id) = env.neon_cli.create_tenant()
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
run_pg_bench_small(pg_bin, endpoint.connstr())
|
||||
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
|
||||
env.stop()
|
||||
|
||||
result = scrubber.scan_metadata()
|
||||
assert result["with_warnings"] == []
|
||||
|
||||
env.start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
|
||||
env.stop()
|
||||
|
||||
scrubber.scan_metadata()
|
||||
assert result["with_warnings"] == []
|
||||
|
||||
Reference in New Issue
Block a user