refactor(compaction): remove CompactionError::AlreadyRunning variant, use ::Other instead (#12512)

The only call stack that can emit the `::AlreadyRunning` variant is
```
-> iteration_inner
	-> iteration
		-> compaction_iteration
			-> compaction_loop
				-> start_background_loops
```

And on that call stack, the only differentiated handling of it is its
invocations of
`log_compaction_error -> CompactionError::is_cancel()`, which returns
`true` for
`::AlreadyRunning`.

I think the condition of `AlreadyRunning` is severe; it really shouldn't
happen.
So, this PR starts treating it as something that is to be logged at
`ERROR` / `WARN`
level, depending on the `degrate_to_warning` argument to
`log_compaction_error`.

refs
- https://databricks.atlassian.net/browse/LKB-182
This commit is contained in:
Christian Schwarz
2025-07-08 18:45:34 +02:00
committed by GitHub
parent 29d73e1404
commit f9b05a42d7
5 changed files with 5 additions and 14 deletions

View File

@@ -2503,7 +2503,6 @@ async fn timeline_checkpoint_handler(
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
CompactionError::Other(e) => ApiError::InternalServerError(e),
CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)),
}
)?;
}
@@ -3697,7 +3696,7 @@ async fn tenant_evaluate_feature_flag(
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
// and we don't need to worry about it for now.
let properties = tenant.feature_resolver.collect_properties();
if as_type.as_deref() == Some("boolean") {

View File

@@ -3334,7 +3334,6 @@ impl TenantShard {
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
}
CompactionError::AlreadyRunning(_) => {}
}
}

View File

@@ -303,7 +303,6 @@ pub(crate) fn log_compaction_error(
let level = match err {
e if e.is_cancel() => return,
ShuttingDown => return,
AlreadyRunning(_) => Level::ERROR,
CollectKeySpaceError(_) => Level::ERROR,
_ if task_cancelled => Level::INFO,
Other(err) => {

View File

@@ -2067,9 +2067,6 @@ impl Timeline {
Err(CompactionError::ShuttingDown) => {
// Covered by the `Err(e) if e.is_cancel()` branch.
}
Err(CompactionError::AlreadyRunning(_)) => {
// Covered by the `Err(e) if e.is_cancel()` branch.
}
Err(CompactionError::Other(_)) => {
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
}
@@ -6018,8 +6015,6 @@ pub(crate) enum CompactionError {
CollectKeySpaceError(#[from] CollectKeySpaceError),
#[error(transparent)]
Other(anyhow::Error),
#[error("Compaction already running: {0}")]
AlreadyRunning(&'static str),
}
impl CompactionError {
@@ -6028,7 +6023,6 @@ impl CompactionError {
matches!(
self,
Self::ShuttingDown
| Self::AlreadyRunning(_)
| Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
| Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
PageReconstructError::Cancelled

View File

@@ -572,7 +572,7 @@ impl GcCompactionQueue {
match res {
Ok(res) => Ok(res),
Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
Err(_) => {
Err(CompactionError::CollectKeySpaceError(_) | CompactionError::Other(_)) => {
// There are some cases where traditional gc might collect some layer
// files causing gc-compaction cannot read the full history of the key.
// This needs to be resolved in the long-term by improving the compaction
@@ -591,9 +591,9 @@ impl GcCompactionQueue {
timeline: &Arc<Timeline>,
) -> Result<CompactionOutcome, CompactionError> {
let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else {
return Err(CompactionError::AlreadyRunning(
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.",
));
return Err(CompactionError::Other(anyhow::anyhow!(
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue."
)));
};
let has_pending_tasks;
let mut yield_for_l0 = false;