mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
safekeeper: fix endpoint restart immediately after xlog switch.
Check that the truncation point is not from the future by comparing it with write_record_lsn rather than write_lsn, and explain that an xlog switch changes their normal order. Ref: https://github.com/neondatabase/neon/issues/8911
This commit is contained in:
@@ -938,8 +938,9 @@ where
|
||||
}
|
||||
|
||||
trace!(
|
||||
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
|
||||
"processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
|
||||
msg.wal_data.len(),
|
||||
msg.h.begin_lsn,
|
||||
msg.h.end_lsn,
|
||||
msg.h.commit_lsn,
|
||||
msg.h.truncate_lsn,
|
||||
|
||||
@@ -98,7 +98,19 @@ pub struct PhysicalStorage {
|
||||
/// Can also be ahead of record_lsn if we happen to be in the middle of a WAL record.
|
||||
write_lsn: Lsn,
|
||||
|
||||
/// The LSN of the last WAL record written to disk. Still can be not fully flushed.
|
||||
/// The LSN of the last WAL record written to disk. Still can be not fully
|
||||
/// flushed.
|
||||
///
|
||||
/// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after an xlog
|
||||
/// switch is ingested the reverse is true because we don't bump write_lsn up to
|
||||
/// the next segment: the WAL stream from the compute doesn't have the gap and,
|
||||
/// for simplicity / as a sanity check, we disallow any non-sequential
|
||||
/// writes, so we write the zeros as is.
|
||||
///
|
||||
/// A similar effect is in theory possible due to LSN alignment: if a record
|
||||
/// ends at *2, the decoder will report the end LSN as *8 even though we haven't
|
||||
/// written these zeros yet. In practice the compute likely never sends
|
||||
/// non-aligned chunks of data.
|
||||
write_record_lsn: Lsn,
|
||||
|
||||
/// The LSN of the last WAL record flushed to disk.
|
||||
@@ -440,11 +452,12 @@ impl Storage for PhysicalStorage {
|
||||
.with_label_values(&["truncate_wal"])
|
||||
.start_timer();
|
||||
|
||||
// Streaming must not create a hole, so truncate cannot be called on non-written lsn
|
||||
if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
|
||||
// Streaming must not create a hole, so truncate cannot be called on
|
||||
// non-written lsn.
|
||||
if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn {
|
||||
bail!(
|
||||
"truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
|
||||
self.write_lsn,
|
||||
"truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}",
|
||||
self.write_record_lsn,
|
||||
end_pos
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1057,6 +1057,24 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
|
||||
endpoint.start()
|
||||
|
||||
|
||||
# Try restarting endpoint immediately after xlog switch.
|
||||
# https://github.com/neondatabase/neon/issues/8911
|
||||
def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
endpoint.safe_psql("create table t (i int)")
|
||||
|
||||
endpoint.safe_psql("SELECT pg_switch_wal()")
|
||||
|
||||
# We want an immediate shutdown so that the endpoint restarts on the xlog switch
|
||||
# record, so prevent the shutdown checkpoint.
|
||||
endpoint.stop(mode="immediate")
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint.safe_psql("SELECT 'works'")
|
||||
|
||||
|
||||
# Context manager which logs passed time on exit.
|
||||
class DurationLogger:
|
||||
def __init__(self, desc):
|
||||
|
||||
Reference in New Issue
Block a user