pageserver: tighten up code around SLRU dir key handling (#10082)
## Problem

Changes in #9786 were functionally complete but missed some edge cases that made testing less robust than it should have been:

- `is_key_disposable` didn't consider SLRU dir keys disposable
- Timeline `init_empty` was always creating SLRU dir keys on all shards

As a result, when we had a bug (https://github.com/neondatabase/neon/pull/10080), it wasn't apparent in tests: one would only encounter the issue on a long-lived timeline with enough compaction to drop the initially created empty SLRU dir keys, _and_ some CLog truncation going on.

Closes: https://github.com/neondatabase/cloud/issues/21516

## Summary of changes

- Update `is_key_global` and `init_empty` to handle SLRU dir keys properly. The only functional impact is that we avoid writing some spurious keys on shards > 0, but this makes testing much more robust.
- Make `test_clog_truncate` explicitly use a sharded tenant.

The net result is that if one reverts #10080, the tests fail (i.e. this PR is a reproducer for the issue).
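For context before the diffs, here is a toy model of the shard-routing rules in play. Everything below (`KeyClass`, the hash routing, the method bodies) is an illustrative simplification, not the real `pageserver_api` types; it only shows why a key that is neither global nor owned by the local shard is disposable, and hence why an SLRU dir key written on shards > 0 silently disappears at compaction.

```rust
// Toy model of shard-aware key routing; names and layout are illustrative,
// not the real pageserver_api types.
#[derive(Clone, Copy)]
enum KeyClass {
    SlruDir,                // e.g. the CLog directory key
    RelBlock { hash: u32 }, // ordinary relation block, distributed by hash
    Metadata,               // stored on every shard
}

struct ShardIdentity {
    number: u32,
    count: u32,
}

impl ShardIdentity {
    /// True if the key must be present on every shard.
    fn is_key_global(&self, key: KeyClass) -> bool {
        match key {
            // SLRU dir keys live on shard 0 only (the fix adds this arm).
            KeyClass::SlruDir => false,
            KeyClass::RelBlock { .. } => false,
            KeyClass::Metadata => true,
        }
    }

    /// True if this shard owns the key.
    fn is_key_local(&self, key: KeyClass) -> bool {
        match key {
            KeyClass::SlruDir => self.number == 0,
            KeyClass::RelBlock { hash } => hash % self.count == self.number,
            KeyClass::Metadata => true,
        }
    }

    /// A key that is neither global nor local may be dropped by compaction.
    fn is_key_disposable(&self, key: KeyClass) -> bool {
        !self.is_key_global(key) && !self.is_key_local(key)
    }
}

fn main() {
    let shard1 = ShardIdentity { number: 1, count: 2 };
    // On shard 1 the SLRU dir key is disposable: writing it at init_empty
    // is wasted work, and relying on it later (CLog truncation) is a bug.
    assert!(shard1.is_key_disposable(KeyClass::SlruDir));
    assert!(!shard1.is_key_disposable(KeyClass::Metadata));
    assert!(!shard1.is_key_disposable(KeyClass::RelBlock { hash: 3 })); // 3 % 2 == 1
    let shard0 = ShardIdentity { number: 0, count: 2 };
    assert!(!shard0.is_key_disposable(KeyClass::SlruDir));
}
```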
```diff
@@ -565,6 +565,10 @@ impl Key {
             && self.field5 == 0
             && self.field6 == u32::MAX
     }
+
+    pub fn is_slru_dir_key(&self) -> bool {
+        slru_dir_kind(self).is_some()
+    }
 }
 
 #[inline(always)]
```
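The new predicate is a thin wrapper over the existing `slru_dir_kind` helper. A hedged sketch of the round-trip property it relies on; `slru_dir_to_key`, `Key::MIN`, and the module paths are assumed from the surrounding codebase and may differ:

```rust
use pageserver_api::key::{slru_dir_to_key, Key};
use pageserver_api::reltag::SlruKind;

// Round-trip check (sketch): a key built by slru_dir_to_key must satisfy
// the new predicate, and a key outside the SLRU keyspace must not.
#[test]
fn slru_dir_key_roundtrip() {
    for kind in [
        SlruKind::Clog,
        SlruKind::MultiXactMembers,
        SlruKind::MultiXactOffsets,
    ] {
        assert!(slru_dir_to_key(kind).is_slru_dir_key());
    }
    assert!(!Key::MIN.is_slru_dir_key());
}
```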
```diff
@@ -173,7 +173,11 @@ impl ShardIdentity
 
     /// Return true if the key should be stored on all shards, not just one.
     pub fn is_key_global(&self, key: &Key) -> bool {
-        if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
+        if key.is_slru_block_key()
+            || key.is_slru_segment_size_key()
+            || key.is_aux_file_key()
+            || key.is_slru_dir_key()
+        {
             // Special keys that are only stored on shard 0
             false
         } else if key.is_rel_block_key() {
```
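With the extra `|| key.is_slru_dir_key()` arm, SLRU dir keys take the same shard-0-only branch as SLRU block, segment-size, and aux-file keys. A hedged assertion sketch, assuming `ShardIdentity::unsharded()` and `slru_block_to_key` keep their current names and signatures:

```rust
use pageserver_api::key::{slru_block_to_key, slru_dir_to_key};
use pageserver_api::reltag::SlruKind;
use pageserver_api::shard::ShardIdentity;

#[test]
fn slru_dir_keys_route_like_block_keys() {
    let shard = ShardIdentity::unsharded();
    // Neither flavor is global: both belong to shard 0 only.
    assert!(!shard.is_key_global(&slru_dir_to_key(SlruKind::Clog)));
    assert!(!shard.is_key_global(&slru_block_to_key(SlruKind::Clog, 0, 0)));
}
```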
```diff
@@ -1319,18 +1319,23 @@ impl<'a> DatadirModification<'a> {
 
         let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
         let empty_dir = Value::Image(buf);
-        self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
-        self.put(
-            slru_dir_to_key(SlruKind::MultiXactMembers),
-            empty_dir.clone(),
-        );
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
-        self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
+
+        // Initialize SLRUs on shard 0 only: creating these on other shards would be
+        // harmless but they'd just be dropped on later compaction.
+        if self.tline.tenant_shard_id.is_shard_zero() {
+            self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
+            self.pending_directory_entries
+                .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
+            self.put(
+                slru_dir_to_key(SlruKind::MultiXactMembers),
+                empty_dir.clone(),
+            );
+            self.pending_directory_entries
+                .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
+            self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
+            self.pending_directory_entries
+                .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
+        }
 
         Ok(())
     }
```
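Why gating `init_empty` on shard 0 matters for test coverage: with the old code, every shard starts out with an SLRU dir key, so a shard-oblivious read of that key works until compaction drops the disposable copy. A toy reconstruction of that timeline, with stand-in types (nothing here is the real pageserver API):

```rust
use std::collections::HashMap;

#[derive(Hash, PartialEq, Eq, Clone, Copy)]
struct Key(u8); // stand-in for the real 18-byte Key
const CLOG_DIR: Key = Key(1);

struct ShardStore {
    is_shard_zero: bool,
    kv: HashMap<Key, Vec<u8>>,
}

impl ShardStore {
    fn init_empty(&mut self, gate_on_shard_zero: bool) {
        if !gate_on_shard_zero || self.is_shard_zero {
            self.kv.insert(CLOG_DIR, b"empty dir".to_vec());
        }
    }
    /// Image-layer compaction drops keys the shard doesn't own.
    fn compact(&mut self) {
        if !self.is_shard_zero {
            self.kv.remove(&CLOG_DIR); // disposable on shards > 0
        }
    }
    /// What a (buggy) shard-oblivious CLog truncation replay would do.
    fn replay_clog_truncate(&self) -> Result<(), &'static str> {
        self.kv.get(&CLOG_DIR).map(|_| ()).ok_or("missing CLog dir key")
    }
}

fn main() {
    let mut shard1 = ShardStore { is_shard_zero: false, kv: HashMap::new() };
    shard1.init_empty(false); // old behavior: dir key written everywhere
    assert!(shard1.replay_clog_truncate().is_ok()); // bug masked
    shard1.compact();
    assert!(shard1.replay_clog_truncate().is_err()); // bug visible only now

    // New behavior: shard 1 never writes the key, so a test hits the
    // missing-key path immediately instead of only after compaction.
    let mut shard1_new = ShardStore { is_shard_zero: false, kv: HashMap::new() };
    shard1_new.init_empty(true);
    assert!(shard1_new.replay_clog_truncate().is_err());
}
```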
```diff
@@ -1,18 +1,19 @@
 from __future__ import annotations
 
 import os
-import time
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
-from fixtures.utils import query_scalar
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.utils import query_scalar, wait_until
 
 
 #
 # Test compute node start after clog truncation
 #
-def test_clog_truncate(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+def test_clog_truncate(neon_env_builder: NeonEnvBuilder):
+    # Use a multi-sharded tenant because WAL ingest logic is shard-dependent, and
+    # this test is one of the very few that exercises a CLogTruncate WAL record.
+    env = neon_env_builder.init_start(initial_tenant_shard_count=2)
 
     # set aggressive autovacuum to make sure that truncation will happen
     config = [
```
```diff
@@ -31,6 +32,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
 
     # Consume many xids to advance clog
+    log.info("Consuming xids...")
     with endpoint.cursor() as cur:
         cur.execute("select test_consume_xids(1000*1000*10);")
         log.info("xids consumed")
```
```diff
@@ -47,11 +49,17 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     pg_xact_0000_path = os.path.join(endpoint.pg_xact_dir_path(), "0000")
     log.info(f"pg_xact_0000_path = {pg_xact_0000_path}")
 
-    while os.path.isfile(pg_xact_0000_path):
-        log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}")
-        time.sleep(5)
+    def assert_file_removed():
+        exists = os.path.isfile(pg_xact_0000_path)
+        if exists:
+            log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}")
+        assert not exists
+
+    log.info("Waiting for truncation...")
+    wait_until(assert_file_removed)
 
     # checkpoint to advance latest lsn
+    log.info("Checkpointing...")
     with endpoint.cursor() as cur:
         cur.execute("CHECKPOINT;")
         lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()")
```
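The test-side change swaps an unbounded `time.sleep` loop for the `wait_until` fixture, which retries an asserting callback and fails the test on timeout instead of hanging forever. The same pattern, sketched in Rust for consistency with the blocks above (the timeout and interval values are arbitrary, and the real fixture's defaults may differ):

```rust
use std::thread::sleep;
use std::time::{Duration, Instant};

/// Poll `check` until it returns Ok, retrying on Err until the deadline.
/// Mirrors the shape of the Python `wait_until` fixture used by the test.
fn wait_until<T>(
    timeout: Duration,
    interval: Duration,
    mut check: impl FnMut() -> Result<T, String>,
) -> T {
    let deadline = Instant::now() + timeout;
    loop {
        match check() {
            Ok(v) => return v,
            // Bounded: a condition that never holds fails loudly.
            Err(e) if Instant::now() >= deadline => {
                panic!("timed out waiting for condition: {e}")
            }
            Err(_) => sleep(interval),
        }
    }
}

fn main() {
    let start = Instant::now();
    // Example condition: becomes true after ~50ms.
    let v = wait_until(Duration::from_secs(5), Duration::from_millis(10), || {
        if start.elapsed() >= Duration::from_millis(50) {
            Ok(start.elapsed())
        } else {
            Err("not yet".to_string())
        }
    });
    println!("condition met after {v:?}");
}
```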