Add FullAccessTimeline guard in safekeepers (#7887)

This is a preparation for
https://github.com/neondatabase/neon/issues/6337.

The idea is to add FullAccessTimeline, which will act as a guard for
tasks requiring access to WAL files. Eviction will be blocked on these
tasks and WAL won't be deleted from disk as long as there is at least one
active FullAccessTimeline.

To get FullAccessTimeline, tasks call `tli.full_access_guard().await?`.
After eviction is implemented, this function will be responsible for
downloading any missing WAL files and waiting until the download finishes.

This commit also contains other small refactorings:
- Separate `get_tenant_dir` and `get_timeline_dir` functions for
building a local path. This is useful for looking at usages and finding
tasks requiring access to local filesystem.
- `timeline_manager` is now responsible for spawning all background
tasks
- The WAL removal task is now spawned immediately after the horizon is updated
This commit is contained in:
Arthur Petukhovsky
2024-05-31 14:19:45 +01:00
committed by GitHub
parent 5a394fde56
commit 16b2e74037
23 changed files with 726 additions and 576 deletions

View File

@@ -72,6 +72,18 @@ class Lsn:
def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn":
    """
    Return the LSN at which the segment containing this LSN begins.

    The result is this LSN rounded down to the nearest multiple of `seg_sz`.
    """
    offset_in_segment = self.lsn_int % seg_sz
    return Lsn(self.lsn_int - offset_in_segment)
def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int:
    """
    Return the number of the segment containing this LSN.

    Segments are counted from zero in units of `seg_sz` bytes.
    """
    segment_number, _offset_in_segment = divmod(self.lsn_int, seg_sz)
    return segment_number
def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str:
    """
    Return the name of the WAL segment file that contains this LSN.

    The name is three 8-digit upper-case hex fields, TTTTTTTTXXXXXXXXYYYYYYYY:
    - TTTTTTTT is the timeline ID, which is always 1 here.
    - XXXXXXXX is the "xlog id": the segment number divided by the number of
      segments per xlog id.
    - YYYYYYYY is the segment number within that xlog id.

    One xlog id covers 2**32 bytes of WAL, so the number of segments per
    xlog id depends on `seg_sz`. For 16 MiB segments it is 256, which makes
    the result identical to splitting segno as `segno >> 8` / `segno & 0xFF`.
    """
    segno = self.segno(seg_sz)
    # Derive the split from the segment size instead of hard-coding 256, so
    # that non-default segment sizes also produce correct file names.
    segments_per_xlog_id = 0x100000000 // seg_sz
    xlog_id, segment_in_xlog_id = divmod(segno, segments_per_xlog_id)
    return f"00000001{xlog_id:08X}{segment_in_xlog_id:08X}"
@dataclass(frozen=True)
class Key:

View File

@@ -973,6 +973,9 @@ class NeonEnvBuilder:
for pageserver in self.env.pageservers:
pageserver.assert_no_errors()
for safekeeper in self.env.safekeepers:
safekeeper.assert_no_errors()
self.env.storage_controller.assert_no_errors()
try:
@@ -3813,6 +3816,9 @@ class Safekeeper(LogUtils):
self.running = False
return self
def assert_no_errors(self):
    """
    Assert that `log_contains` finds no "manager task finished prematurely" message.
    """
    premature_exit = self.log_contains("manager task finished prematurely")
    assert not premature_exit
def append_logical_message(
self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any]
) -> Dict[str, Any]:
@@ -3898,6 +3904,15 @@ class Safekeeper(LogUtils):
"""
cli = self.http_client()
target_segment_file = lsn.segment_name()
def are_segments_removed():
    # Polling predicate: raises AssertionError while any listed segment
    # still sorts before the target segment file name.
    segments = self.list_segments(tenant_id, timeline_id)
    log.info(
        f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}"
    )
    # Names are compared as strings, exactly as returned by list_segments.
    stale_segments = [s for s in segments if s < target_segment_file]
    assert not stale_segments
def are_lsns_advanced():
stat = cli.timeline_status(tenant_id, timeline_id)
log.info(
@@ -3909,6 +3924,7 @@ class Safekeeper(LogUtils):
# pageserver to this safekeeper
wait_until(30, 1, are_lsns_advanced)
cli.checkpoint(tenant_id, timeline_id)
wait_until(30, 1, are_segments_removed)
def wait_until_paused(self, failpoint: str):
msg = f"at failpoint {failpoint}"