Evict WAL files from disk (#8022)

Fixes https://github.com/neondatabase/neon/issues/6337

Add safekeeper support to switch between `Present` and
`Offloaded(flush_lsn)` states. Offloading is disabled by default,
but can be controlled using new command-line arguments:

```
      --enable-offload
          Enable automatic switching to offloaded state
      --delete-offloaded-wal
          Delete local WAL files after offloading. When disabled, they will be left on disk
      --control-file-save-interval <CONTROL_FILE_SAVE_INTERVAL>
          Pending updates to control file will be automatically saved after this interval [default: 300s]
```
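
For illustration, these flags might be declared with clap's derive API roughly as follows. This is a sketch only: the struct and field names, and the `humantime`-based duration parsing, are assumptions, not the actual safekeeper argument definitions.

```rust
use std::time::Duration;

use clap::Parser;

/// Hypothetical argument struct; only the flag names, help strings, and the
/// 300s default are taken from the real help output above.
#[derive(Parser)]
struct Args {
    /// Enable automatic switching to offloaded state.
    #[arg(long)]
    enable_offload: bool,

    /// Delete local WAL files after offloading. When disabled, they will be
    /// left on disk.
    #[arg(long)]
    delete_offloaded_wal: bool,

    /// Pending updates to control file will be automatically saved after
    /// this interval.
    #[arg(long, value_parser = humantime::parse_duration, default_value = "300s")]
    control_file_save_interval: Duration,
}
```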

The manager watches state updates and detects when there is no activity on
the timeline and the partial backup in remote storage is up to date. When
all conditions are met, the state can be switched to offloaded.
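
As a sketch of that decision (illustrative only; the struct and function below are invented for this example, not the manager's real API):

```rust
/// Invented input struct: a snapshot of what the manager knows about a timeline.
struct EvictionCheck {
    /// Guards currently handed out to tasks that need on-disk WAL.
    active_guards: usize,
    /// LSN up to which WAL has been flushed locally.
    flush_lsn: u64,
    /// LSN up to which the partial backup in remote storage is complete.
    partial_backup_lsn: u64,
}

/// A timeline can be offloaded only when nothing is using resident WAL and
/// remote storage already holds everything up to the flush LSN.
fn can_switch_to_offloaded(c: &EvictionCheck) -> bool {
    c.active_guards == 0 && c.partial_backup_lsn >= c.flush_lsn
}
```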

In `timeline.rs` there is a `StateSK` enum to support switching between
states. When offloaded, the code can access only the control file structure
and cannot use `SafeKeeper` to accept new WAL.
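
A simplified sketch of that enum (the real definition in `timeline.rs` carries more context; the payload types here are placeholders):

```rust
struct SafeKeeper; // placeholder for the real WAL-accepting state machine
struct ControlFile; // placeholder for the persisted control file state

enum StateSK {
    /// WAL is resident on disk and the timeline can accept new WAL.
    Present(SafeKeeper),
    /// Only the control file remains; flush_lsn is remembered so status
    /// queries still work without reading WAL from disk.
    Offloaded { control_file: ControlFile, flush_lsn: u64 },
}

impl StateSK {
    /// Accepting new WAL is possible only in the `Present` state.
    fn safekeeper_mut(&mut self) -> Option<&mut SafeKeeper> {
        match self {
            StateSK::Present(sk) => Some(sk),
            StateSK::Offloaded { .. } => None,
        }
    }
}
```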

`FullAccessTimeline` is now renamed to `WalResidentTimeline`. This
struct contains a guard to notify the manager about active tasks requiring
on-disk WAL access. All guards are issued by the manager, and all requests
are sent over a channel using `ManagerCtl`. When the manager receives a
request to issue a guard, it unevicts the timeline if it is currently evicted.
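
The guard protocol could be sketched like this (the names mirror the description above, but the exact types and signatures are assumptions):

```rust
use tokio::sync::{mpsc, oneshot};

/// A request to the manager: "issue me a guard for resident-WAL access".
struct GuardRequest {
    reply: oneshot::Sender<ResidenceGuard>,
}

/// Held by tasks that touch on-disk WAL; dropping it tells the manager the
/// task is done, so the timeline becomes evictable again.
struct ResidenceGuard {
    done: mpsc::UnboundedSender<()>,
}

impl Drop for ResidenceGuard {
    fn drop(&mut self) {
        let _ = self.done.send(());
    }
}

/// Task side: ask the manager (via the `ManagerCtl`-style channel) for a
/// guard; the manager unevicts the timeline first if necessary.
async fn acquire_guard(ctl: &mpsc::Sender<GuardRequest>) -> ResidenceGuard {
    let (reply, rx) = oneshot::channel();
    ctl.send(GuardRequest { reply }).await.expect("manager task is gone");
    rx.await.expect("manager dropped the guard request")
}
```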

Fixed a bug in partial WAL backup: it previously used `term` instead of
`last_log_term`.

After this commit is merged, the next step is to roll the change out, as
described in issue #6338.
Author: Arthur Petukhovsky
Date: 2024-06-26 18:58:56 +01:00
Committed by: GitHub
Parent: dd3adc3693
Commit: 76fc3d4aa1
25 changed files with 1673 additions and 480 deletions


@@ -3916,6 +3916,8 @@ class Safekeeper(LogUtils):

```python
    def assert_no_errors(self):
        assert not self.log_contains("manager task finished prematurely")
        assert not self.log_contains("error while acquiring WalResidentTimeline guard")
        assert not self.log_contains("timeout while acquiring WalResidentTimeline guard")

    def append_logical_message(
        self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any]
```


@@ -1,4 +1,5 @@

```python
import filecmp
import logging
import os
import random
import shutil
```

@@ -2178,3 +2179,102 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder):

```python
    do_something()
    do_something()


# Test creates 5 endpoints and tries to wake them up randomly. All timeouts are
# configured to be very short, so that we expect that:
# - pageserver will update remote_consistent_lsn very often
# - safekeepers will upload partial WAL segments very often
# - safekeeper will try to evict and unevict timelines
#
# Test checks that there are no critical errors while doing this. Also it checks
# that every safekeeper has at least one successful eviction.
@pytest.mark.parametrize("delete_offloaded_wal", [False, True])
@pytest.mark.parametrize("restart_chance", [0.0, 0.2])
def test_s3_eviction(
    neon_env_builder: NeonEnvBuilder, delete_offloaded_wal: bool, restart_chance: float
):
    neon_env_builder.num_safekeepers = 3
    neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "checkpoint_timeout": "100ms",
        }
    )

    extra_opts = [
        "--enable-offload",
        "--partial-backup-timeout",
        "50ms",
        "--control-file-save-interval",
        "1s",
    ]
    if delete_offloaded_wal:
        extra_opts.append("--delete-offloaded-wal")

    for sk in env.safekeepers:
        sk.stop().start(extra_opts=extra_opts)

    n_timelines = 5

    branch_names = [f"branch{tlin}" for tlin in range(n_timelines)]
    timelines = []
    ps_client = env.pageservers[0].http_client()

    # start postgres on each timeline
    endpoints: list[Endpoint] = []
    for branch_name in branch_names:
        timeline_id = env.neon_cli.create_branch(branch_name)
        timelines.append(timeline_id)

        endpoints.append(env.endpoints.create_start(branch_name))
        endpoints[-1].safe_psql("CREATE TABLE t(i int)")
        endpoints[-1].safe_psql("INSERT INTO t VALUES (0)")

        lsn = endpoints[-1].safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]
        log.info(f"{branch_name}: LSN={lsn}")

        endpoints[-1].stop()

        # update remote_consistent_lsn on pageserver
        ps_client.timeline_checkpoint(env.initial_tenant, timelines[-1], wait_until_uploaded=True)

    check_values = [0] * n_timelines

    n_iters = 20
    for _ in range(n_iters):
        if log.isEnabledFor(logging.DEBUG):
            for j in range(n_timelines):
                detail = ps_client.timeline_detail(env.initial_tenant, timelines[j])
                log.debug(
                    f'{branch_names[j]}: RCL={detail["remote_consistent_lsn"]}, LRL={detail["last_record_lsn"]}'
                )

        i = random.randint(0, n_timelines - 1)
        log.info(f"Starting endpoint {i}")
        endpoints[i].start()

        check_values[i] += 1
        res = endpoints[i].safe_psql("UPDATE t SET i = i + 1 RETURNING i")
        assert res[0][0] == check_values[i]

        lsn = endpoints[i].safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]
        log.info(f"{branch_names[i]}: LSN={lsn}")

        endpoints[i].stop()

        # update remote_consistent_lsn on pageserver
        ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True)

        # restarting random safekeepers
        for sk in env.safekeepers:
            if random.random() < restart_chance:
                sk.stop().start(extra_opts=extra_opts)

        time.sleep(0.5)

    # require at least one successful eviction in at least one safekeeper
    # TODO: require eviction in each safekeeper after https://github.com/neondatabase/neon/issues/8148 is fixed
    assert any(
        sk.log_contains("successfully evicted timeline")
        and sk.log_contains("successfully restored evicted timeline")
        for sk in env.safekeepers
    )
```


@@ -200,9 +200,8 @@ async def run_restarts_under_load(
# assert that at least one transaction has completed in every worker
stats.check_progress()
# testing #6530, temporary here
# TODO: remove afer partial backup is enabled by default
victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"])
# testing #6530
victim.start(extra_opts=["--partial-backup-timeout=2s"])
log.info("Iterations are finished, exiting coroutines...")
stats.running = False