diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d6b0075306..f1ddf4f705 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4388,10 +4388,6 @@ impl Timeline {
 
         guard.finish_gc_timeline(&gc_layers);
 
-        if result.layers_removed != 0 {
-            fail_point!("after-timeline-gc-removed-layers");
-        }
-
         #[cfg(feature = "testing")]
         {
             result.doomed_layers = gc_layers;
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 1a8765d830..92e5027a9f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -831,3 +831,16 @@ class PageserverHttpClient(requests.Session):
         self.put(
             f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
         ).raise_for_status()
+
+    def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int:
+        detail = self.timeline_detail(
+            tenant_id,
+            timeline_id,
+            include_non_incremental_logical_size=True,
+            force_await_initial_logical_size=True,
+        )
+        current_logical_size = detail["current_logical_size"]
+        non_incremental = detail["current_logical_size_non_incremental"]
+        assert current_logical_size == non_incremental
+        assert isinstance(current_logical_size, int)
+        return current_logical_size
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 6a4f0edbea..dcbf8a5025 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -155,6 +155,15 @@ class EvictionEnv:
         mock_behavior,
         eviction_order: EvictionOrder,
     ):
+        """
+        Starts the pageserver with the mocked statvfs setup. The startup is
+        tricky because initial logical size calculations (which need layers)
+        can race with the disk usage based eviction task (which evicts them).
+
+        Returns after initial logical sizes are complete, but the phase of the
+        disk usage eviction task is unknown; it might need to run one more
+        iteration before assertions can be made.
+ """ disk_usage_config = { "period": period, "max_usage_pct": max_usage_pct, @@ -183,9 +192,15 @@ class EvictionEnv: ), ) + # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction + for tenant_id, timeline_id in self.timelines: + pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client() + pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id) + def statvfs_called(): assert pageserver.log_contains(".*running mocked statvfs.*") + # we most likely have already completed multiple runs wait_until(10, 1, statvfs_called) @@ -789,9 +804,11 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): wait_until(10, 1, relieved_log_message) - post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + def less_than_max_usage_pct(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage" + wait_until(2, 2, less_than_max_usage_pct) def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): @@ -831,11 +848,13 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): wait_until(10, 1, relieved_log_message) - post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + def more_than_min_avail_bytes_freed(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert ( + total_size - post_eviction_total_size >= min_avail_bytes + ), f"we requested at least {min_avail_bytes} worth of free space" - assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), "we requested at least min_avail_bytes worth of free space" + wait_until(2, 2, more_than_min_avail_bytes_freed) def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py deleted file mode 100644 index 284a8c3563..0000000000 --- a/test_runner/regress/test_gc_cutoff.py +++ /dev/null @@ -1,47 +0,0 @@ -import subprocess - -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin - - -# Test gc_cutoff -# -# This test sets fail point at the end of GC, and checks that pageserver -# normally restarts after it. Also, there should be GC ERRORs in the log, -# but the fixture checks the log for any unexpected ERRORs after every -# test anyway, so it doesn't need any special attention here. 
-@pytest.mark.timeout(600)
-def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
-            "gc_period": "10 s",
-            "gc_horizon": f"{1024 ** 2}",
-            "checkpoint_distance": f"{1024 ** 2}",
-            "compaction_period": "5 s",
-            # set PITR interval to be small, so we can do GC
-            "pitr_interval": "1 s",
-            "compaction_threshold": "3",
-            "image_creation_threshold": "2",
-        }
-    )
-
-    pageserver_http = env.pageserver.http_client()
-
-    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
-    tenant_id = env.initial_tenant
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
-    connstr = endpoint.connstr(options="-csynchronous_commit=off")
-    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
-
-    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
-
-    # Because this test does a rapid series of restarts of the same node, it's possible that
-    # we are restarted again before we can clean up deletion lists form the previous generation,
-    # resulting in a subsequent startup logging a warning.
-    env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")
-
-    for _ in range(5):
-        with pytest.raises(subprocess.SubprocessError):
-            pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
-        env.pageserver.stop()
-        env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 4c5cb32caa..303aabb58d 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
@@ -40,7 +40,7 @@ def test_timeline_size(neon_simple_env: NeonEnv):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")
 
     client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create_start("test_timeline_size")
     log.info("postgres is running on 'test_timeline_size' branch")
@@ -73,7 +73,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")
 
     client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
     timeline_details = client.timeline_detail(
         env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
     )
@@ -153,7 +153,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create(
         "test_timeline_size_quota_on_startup",
@@ -219,7 +219,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota")
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create(
         "test_timeline_size_quota",
@@ -715,28 +715,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
 
 
-# Timeline logical size initialization is an asynchronous background task that runs once,
-# try a few times to ensure it's activated properly
-def wait_for_timeline_size_init(
-    client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
-):
-    for i in range(10):
-        timeline_details = client.timeline_detail(
-            tenant, timeline, include_non_incremental_logical_size=True
-        )
-        current_logical_size = timeline_details["current_logical_size"]
-        non_incremental = timeline_details["current_logical_size_non_incremental"]
-        if current_logical_size == non_incremental:
-            return
-        log.info(
-            f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}"
-        )
-        time.sleep(1)
-    raise Exception(
-        f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
-    )
-
-
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     """
     Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
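Taken together, the changes above replace one-shot assertions with polling: a test first blocks on the initial logical size calculation via the new `timeline_wait_logical_size` helper, then retries its disk-usage assertion with `wait_until` because the eviction task runs in the background. Below is a minimal sketch of that pattern; the function name and parameters are illustrative, and the `fixtures.utils` import path for `wait_until` is assumed.

```python
# Illustrative sketch only; mirrors the pattern introduced in this diff.
# Assumption: `wait_until` is importable from fixtures.utils, and `env` /
# `pageserver_http` behave like the fixtures used in test_disk_usage_eviction.py.
from fixtures.utils import wait_until


def example_post_eviction_check(env, pageserver_http, tenant_id, timeline_id, total_size):
    # Block until the initial logical size calculation has finished, so it
    # cannot race with the disk usage based eviction task (this is what the
    # new PageserverHttpClient.timeline_wait_logical_size helper is for).
    pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)

    # Eviction happens asynchronously, so poll the on-disk size instead of
    # asserting once: retry up to 2 times with a 2 second interval, matching
    # the wait_until(2, 2, ...) calls added above.
    def less_than_max_usage_pct():
        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
        assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"

    wait_until(2, 2, less_than_max_usage_pct)
```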