Merge branch 'problame/2024-02-walredo-work/prespawn/split-code' into problame/2024-02-walredo-work/prespawn/broken-tenants-no-walredo

Christian Schwarz
2024-02-02 18:36:15 +00:00
5 changed files with 43 additions and 84 deletions

View File

@@ -4388,10 +4388,6 @@ impl Timeline {
        guard.finish_gc_timeline(&gc_layers);

        if result.layers_removed != 0 {
            fail_point!("after-timeline-gc-removed-layers");
        }
        #[cfg(feature = "testing")]
        {
            result.doomed_layers = gc_layers;

View File

@@ -831,3 +831,16 @@ class PageserverHttpClient(requests.Session):
        self.put(
            f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
        ).raise_for_status()

    def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int:
        detail = self.timeline_detail(
            tenant_id,
            timeline_id,
            include_non_incremental_logical_size=True,
            force_await_initial_logical_size=True,
        )
        current_logical_size = detail["current_logical_size"]
        non_incremental = detail["current_logical_size_non_incremental"]
        assert current_logical_size == non_incremental
        assert isinstance(current_logical_size, int)
        return current_logical_size
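For illustration, a test would call the new helper roughly like the sketch below; `env` and `timeline_id` stand in for the usual test fixtures and are not part of this diff, only `timeline_wait_logical_size` itself is.

# Sketch of a call site, assuming a NeonEnv-style fixture named `env` and an existing timeline_id.
client = env.pageserver.http_client()
size = client.timeline_wait_logical_size(env.initial_tenant, timeline_id)
# The request blocks server-side (force_await_initial_logical_size=True) until the
# incremental size matches the non-incremental one, so no client-side retry loop is needed.
assert size > 0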

View File

@@ -155,6 +155,15 @@ class EvictionEnv:
        mock_behavior,
        eviction_order: EvictionOrder,
    ):
        """
        Starts the pageserver with a mocked statvfs. The startup is problematic
        because initial logical size calculations (which need layers on disk)
        duel with the disk-usage-based eviction task (which evicts those layers).
        Returns after initial logical sizes are complete, but the phase of the
        disk usage eviction task is unknown; it might need to run one more
        iteration before assertions can be made.
        """
        disk_usage_config = {
            "period": period,
            "max_usage_pct": max_usage_pct,
@@ -183,9 +192,15 @@ class EvictionEnv:
            ),
        )

        # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
        for tenant_id, timeline_id in self.timelines:
            pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
            pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)

        def statvfs_called():
            assert pageserver.log_contains(".*running mocked statvfs.*")

        # we most likely have already completed multiple runs
        wait_until(10, 1, statvfs_called)
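The `wait_until(retries, interval, func)` calls used throughout these tests come from the shared test fixtures; the exact signature and behavior are not shown in this diff, but a minimal sketch of such a retry helper, under that assumption, looks like this: it keeps calling a closure that asserts until the closure stops raising.

import time
from typing import Callable, Optional, TypeVar

T = TypeVar("T")


def wait_until(number_of_iterations: int, interval: float, func: Callable[[], T]) -> T:
    """Retry func until it stops raising, sleeping `interval` seconds between attempts."""
    last_exception: Optional[Exception] = None
    for _ in range(number_of_iterations):
        try:
            return func()
        except Exception as e:  # typically an AssertionError raised by the closure
            last_exception = e
            time.sleep(interval)
    raise Exception("timed out waiting for condition") from last_exception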
@@ -789,9 +804,11 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
    wait_until(10, 1, relieved_log_message)

    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
    assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"

    def less_than_max_usage_pct():
        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
        assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"

    wait_until(2, 2, less_than_max_usage_pct)
def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
@@ -831,11 +848,13 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
    wait_until(10, 1, relieved_log_message)

    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
    assert (
        total_size - post_eviction_total_size >= min_avail_bytes
    ), f"we requested at least {min_avail_bytes} worth of free space"

    def more_than_min_avail_bytes_freed():
        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
        assert (
            total_size - post_eviction_total_size >= min_avail_bytes
        ), "we requested at least min_avail_bytes worth of free space"

    wait_until(2, 2, more_than_min_avail_bytes_freed)
def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):

View File

@@ -1,47 +0,0 @@
import subprocess

import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin

# Test gc_cutoff
#
# This test sets fail point at the end of GC, and checks that pageserver
# normally restarts after it. Also, there should be GC ERRORs in the log,
# but the fixture checks the log for any unexpected ERRORs after every
# test anyway, so it doesn't need any special attention here.
@pytest.mark.timeout(600)
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "gc_period": "10 s",
            "gc_horizon": f"{1024 ** 2}",
            "checkpoint_distance": f"{1024 ** 2}",
            "compaction_period": "5 s",
            # set PITR interval to be small, so we can do GC
            "pitr_interval": "1 s",
            "compaction_threshold": "3",
            "image_creation_threshold": "2",
        }
    )
    pageserver_http = env.pageserver.http_client()

    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
    tenant_id = env.initial_tenant
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    connstr = endpoint.connstr(options="-csynchronous_commit=off")
    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
    # Because this test does a rapid series of restarts of the same node, it's possible that
    # we are restarted again before we can clean up deletion lists from the previous generation,
    # resulting in a subsequent startup logging a warning.
    env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")

    for _ in range(5):
        with pytest.raises(subprocess.SubprocessError):
            pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
        env.pageserver.stop()
        env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})

View File

@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
    VanillaPostgres,
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
@@ -40,7 +40,7 @@ def test_timeline_size(neon_simple_env: NeonEnv):
    new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")
    client = env.pageserver.http_client()
    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)

    endpoint_main = env.endpoints.create_start("test_timeline_size")
    log.info("postgres is running on 'test_timeline_size' branch")
@@ -73,7 +73,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")
    client = env.pageserver.http_client()
    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)

    timeline_details = client.timeline_detail(
        env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
    )
@@ -153,7 +153,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
    client = env.pageserver.http_client()
    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)

    endpoint_main = env.endpoints.create(
        "test_timeline_size_quota_on_startup",
@@ -219,7 +219,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
    client = env.pageserver.http_client()
    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota")
    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)

    endpoint_main = env.endpoints.create(
        "test_timeline_size_quota",
@@ -715,28 +715,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
    # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS


# Timeline logical size initialization is an asynchronous background task that runs once,
# try a few times to ensure it's activated properly
def wait_for_timeline_size_init(
    client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
):
    for i in range(10):
        timeline_details = client.timeline_detail(
            tenant, timeline, include_non_incremental_logical_size=True
        )
        current_logical_size = timeline_details["current_logical_size"]
        non_incremental = timeline_details["current_logical_size_non_incremental"]
        if current_logical_size == non_incremental:
            return
        log.info(
            f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}"
        )
        time.sleep(1)

    raise Exception(
        f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
    )
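With this client-side polling helper removed, the call sites above rely on the server-side wait added to PageserverHttpClient earlier in this commit. As an illustrative before/after (variable names mirror the removed helper; nothing new is introduced):

# Before: poll timeline_detail until the two size fields agree.
wait_for_timeline_size_init(client, tenant=tenant, timeline=timeline)

# After: a single request that blocks server-side until initial logical size
# calculation has finished (force_await_initial_logical_size=True).
client.timeline_wait_logical_size(tenant, timeline)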
def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
"""
Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete