ingest: rate-limited warning if WAL commit timestamps lag by > wait_lsn_timeout (#8839)

refs https://github.com/neondatabase/cloud/issues/13750

The logging in this commit will make it easier to detect lagging ingest.

We're trusting compute timestamps --- ideally we'd use SK timestamps
instead.
But trusting the compute timestamp is ok for now.
This commit is contained in:
Christian Schwarz
2024-08-29 13:06:00 +02:00
committed by GitHub
parent cfa45ff5ee
commit c2f8fdccd7
8 changed files with 111 additions and 10 deletions

View File

@@ -173,6 +173,11 @@ def test_backward_compatibility(
try:
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
# check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning
ingest_lag_log_line = (
".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
)
env.pageserver.allowed_errors.append(ingest_lag_log_line)
neon_env_builder.start()
check_neon_works(
@@ -181,6 +186,9 @@ def test_backward_compatibility(
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
)
env.pageserver.assert_log_contains(ingest_lag_log_line)
except Exception:
if breaking_changes_allowed:
pytest.xfail(

View File

@@ -62,6 +62,12 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
elements_to_insert = 1_000_000
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
# we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout
# => after we run into a timeout and reconnect to a different SK, more time than wait_lsn_timeout has passed
# ==> we log this error
env.pageserver.allowed_errors.append(
".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
)
insert_test_elements(env, tenant_id, start=0, count=elements_to_insert)