Timeline::repartition: enforce no concurrent callers & lsn to not move backwards (#6862)

This PR enforces aspects of `Timeline::repartition` that were already
true at runtime:

- it's not called concurrently, so, bail out if it is anyway (see
  comment why it's not called concurrently)
- the `lsn` should never be moving backwards over the lifetime of a
  Timeline object, because last_record_lsn() can only move forwards
  over the lifetime of a Timeline object

The switch to tokio::sync::Mutex blows up the size of the `partitioning`
field from 40 bytes to 72 bytes on Linux x86_64.
That would be concerning if it was a hot field, but, `partitioning` is
only accessed every 20s by one task, so, there won't be excessive cache
pain on it.
(It still sucks that it's now >1 cache line, but I need the Send-able
MutexGuard in the next PR)

part of https://github.com/neondatabase/neon/issues/6861
This commit is contained in:
Christian Schwarz
2024-02-26 11:22:15 +01:00
committed by GitHub
parent 5273c94c59
commit ceedc3ef73

View File

@@ -292,7 +292,7 @@ pub struct Timeline {
pub initdb_lsn: Lsn,
/// When did we last calculate the partitioning?
partitioning: Mutex<(KeyPartitioning, Lsn)>,
partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
@@ -1640,7 +1640,7 @@ impl Timeline {
// initial logical size is 0.
LogicalSize::empty_initial()
},
partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
repartition_threshold: 0,
last_received_wal: Mutex::new(None),
@@ -3354,30 +3354,34 @@ impl Timeline {
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
{
let partitioning_guard = self.partitioning.lock().unwrap();
let distance = lsn.0 - partitioning_guard.1 .0;
if partitioning_guard.1 != Lsn(0)
&& distance <= self.repartition_threshold
&& !flags.contains(CompactFlags::ForceRepartition)
{
debug!(
distance,
threshold = self.repartition_threshold,
"no repartitioning needed"
);
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
}
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
// and hence before the compaction task starts.
anyhow::bail!("repartition() called concurrently, this should not happen");
};
if lsn < partitioning_guard.1 {
anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
}
let distance = lsn.0 - partitioning_guard.1 .0;
if partitioning_guard.1 != Lsn(0)
&& distance <= self.repartition_threshold
&& !flags.contains(CompactFlags::ForceRepartition)
{
debug!(
distance,
threshold = self.repartition_threshold,
"no repartitioning needed"
);
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
}
let keyspace = self.collect_keyspace(lsn, ctx).await?;
let partitioning = keyspace.partition(partition_size);
let mut partitioning_guard = self.partitioning.lock().unwrap();
if lsn > partitioning_guard.1 {
*partitioning_guard = (partitioning, lsn);
} else {
warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless");
}
*partitioning_guard = (partitioning, lsn);
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
}