mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 00:50:36 +00:00
refactor(compaction): remove CompactionError::AlreadyRunning variant, use ::Other instead (#12512)
The only call stack that can emit the `::AlreadyRunning` variant is ``` -> iteration_inner -> iteration -> compaction_iteration -> compaction_loop -> start_background_loops ``` And on that call stack, the only differentiated handling of it is its invocations of `log_compaction_error -> CompactionError::is_cancel()`, which returns `true` for `::AlreadyRunning`. I think the condition of `AlreadyRunning` is severe; it really shouldn't happen. So, this PR starts treating it as something that is to be logged at `ERROR` / `WARN` level, depending on the `degrate_to_warning` argument to `log_compaction_error`. refs - https://databricks.atlassian.net/browse/LKB-182
This commit is contained in:
committed by
GitHub
parent
29d73e1404
commit
f9b05a42d7
@@ -2503,7 +2503,6 @@ async fn timeline_checkpoint_handler(
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e),
|
||||
CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
}
|
||||
)?;
|
||||
}
|
||||
@@ -3697,7 +3696,7 @@ async fn tenant_evaluate_feature_flag(
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
|
||||
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
|
||||
// and we don't need to worry about it for now.
|
||||
let properties = tenant.feature_resolver.collect_properties();
|
||||
if as_type.as_deref() == Some("boolean") {
|
||||
|
||||
@@ -3334,7 +3334,6 @@ impl TenantShard {
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::AlreadyRunning(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -303,7 +303,6 @@ pub(crate) fn log_compaction_error(
|
||||
let level = match err {
|
||||
e if e.is_cancel() => return,
|
||||
ShuttingDown => return,
|
||||
AlreadyRunning(_) => Level::ERROR,
|
||||
CollectKeySpaceError(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
|
||||
@@ -2067,9 +2067,6 @@ impl Timeline {
|
||||
Err(CompactionError::ShuttingDown) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::AlreadyRunning(_)) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::Other(_)) => {
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
@@ -6018,8 +6015,6 @@ pub(crate) enum CompactionError {
|
||||
CollectKeySpaceError(#[from] CollectKeySpaceError),
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
#[error("Compaction already running: {0}")]
|
||||
AlreadyRunning(&'static str),
|
||||
}
|
||||
|
||||
impl CompactionError {
|
||||
@@ -6028,7 +6023,6 @@ impl CompactionError {
|
||||
matches!(
|
||||
self,
|
||||
Self::ShuttingDown
|
||||
| Self::AlreadyRunning(_)
|
||||
| Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
|
||||
| Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::Cancelled
|
||||
|
||||
@@ -572,7 +572,7 @@ impl GcCompactionQueue {
|
||||
match res {
|
||||
Ok(res) => Ok(res),
|
||||
Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
|
||||
Err(_) => {
|
||||
Err(CompactionError::CollectKeySpaceError(_) | CompactionError::Other(_)) => {
|
||||
// There are some cases where traditional gc might collect some layer
|
||||
// files causing gc-compaction cannot read the full history of the key.
|
||||
// This needs to be resolved in the long-term by improving the compaction
|
||||
@@ -591,9 +591,9 @@ impl GcCompactionQueue {
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else {
|
||||
return Err(CompactionError::AlreadyRunning(
|
||||
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.",
|
||||
));
|
||||
return Err(CompactionError::Other(anyhow::anyhow!(
|
||||
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue."
|
||||
)));
|
||||
};
|
||||
let has_pending_tasks;
|
||||
let mut yield_for_l0 = false;
|
||||
|
||||
Reference in New Issue
Block a user