storcon: reproduce shard split issue (#11290)

## Problem

Issue https://github.com/neondatabase/neon/issues/11254 describes a case
where a restart during a shard split can result in a bad end state in the
database.

## Summary of changes

- Add a reproducer for the issue
- Tighten an existing safety check around updated row counts in
complete_shard_split
This commit is contained in:
John Spray
2025-03-21 08:48:56 +00:00
committed by GitHub
parent 0d99609870
commit 76088c16d2
4 changed files with 155 additions and 3 deletions

View File

@@ -1725,6 +1725,8 @@ class LogUtils:
log.warning(f"Skipping log check: {logfile} does not exist")
return None
log.info(f"Checking log {logfile} for pattern '{pattern}'")
contains_re = re.compile(pattern)
# XXX: Our rust logging machinery buffers the messages, so if you
@@ -2618,10 +2620,13 @@ class NeonProxiedStorageController(NeonStorageController):
self.running = False
return self
def instance_log_path(self, instance_id: int) -> Path:
    """Return the path of the log file for storage controller instance `instance_id`.

    The path is `<repo_dir>/storage_controller_<instance_id>/storage_controller.log`,
    where `repo_dir` is read from `self.env`.
    """
    instance_dir = self.env.repo_dir / f"storage_controller_{instance_id}"
    return instance_dir / "storage_controller.log"
# Run the assert_no_errors helper once per entry in self.instances, passing
# that instance's storage controller log path, the label "storage_controller"
# and self.allowed_errors.
# NOTE(review): the first two arguments of the call below look like the
# removed/added pair of this diff hunk (the inline path expression replaced by
# self.instance_log_path(instance_id)); presumably only one of them exists in
# the real file -- confirm before relying on this text.
def assert_no_errors(self):
for instance_id in self.instances.keys():
assert_no_errors(
self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log",
self.instance_log_path(instance_id),
"storage_controller",
self.allowed_errors,
)
@@ -2629,7 +2634,14 @@ class NeonProxiedStorageController(NeonStorageController):
# Search the logs of the storage controller instances for `pattern`.
#
# For each entry in self.instances, the instance's log path is wrapped in a
# LogUtils object and LogUtils.log_contains(pattern, offset) is called on it.
# The first result that is not None is returned as-is; if no instance log
# yields a result, None is returned.
#
# NOTE(review): the `raise NotImplementedError()` line looks like the line this
# diff hunk removes, with the loop after it being its replacement -- confirm
# against the real file.
# NOTE(review): the same `offset` cursor is passed to every instance's log;
# confirm that a cursor obtained from one log is meaningful for another.
def log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> tuple[str, LogCursor] | None:
raise NotImplementedError()
for instance_id in self.instances.keys():
log_path = self.instance_log_path(instance_id)
checker = LogUtils(log_path)
found = checker.log_contains(pattern, offset)
if found is not None:
return found
return None
@dataclass