mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
storage_scrubber: fixes to garbage commands (#9409)
## Problem

While running `find-garbage` and `purge-garbage`, I encountered two things that needed updating:

- Console API may omit `user_id` since org accounts were added
- When we cut over to using GenericRemoteStorage, the object listings we do during purge did not get proper retry handling, so they could easily fail on usual S3 errors and make the whole process drop out.

...and one bug:

- We had a `.unwrap` which expects that after finding an object in a tenant path, a listing in that path will always return objects. This is not true, because a pageserver might be deleting the path at the same time as we scan it.

## Summary of changes

- When listing objects during purge, use `backoff::retry`
- Make `user_id` an `Option`
- Handle the case where a tenant's objects go away during `find-garbage`.
This commit is contained in:
@@ -138,7 +138,7 @@ pub struct ProjectData {
|
||||
pub name: String,
|
||||
pub region_id: String,
|
||||
pub platform_id: String,
|
||||
pub user_id: String,
|
||||
pub user_id: Option<String>,
|
||||
pub pageserver_id: Option<u64>,
|
||||
#[serde(deserialize_with = "from_nullable_id")]
|
||||
pub tenant: TenantId,
|
||||
|
||||
@@ -16,13 +16,13 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantId;
|
||||
use utils::{backoff, id::TenantId};
|
||||
|
||||
use crate::{
|
||||
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
|
||||
init_remote, list_objects_with_retries,
|
||||
metadata_stream::{stream_tenant_timelines, stream_tenants},
|
||||
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
|
||||
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES,
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -250,13 +250,16 @@ async fn find_garbage_inner(
|
||||
&target.tenant_root(&tenant_shard_id),
|
||||
)
|
||||
.await?;
|
||||
let object = tenant_objects.keys.first().unwrap();
|
||||
if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
|
||||
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
|
||||
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
|
||||
continue;
|
||||
if let Some(object) = tenant_objects.keys.first() {
|
||||
if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
|
||||
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
|
||||
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
|
||||
continue;
|
||||
} else {
|
||||
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
|
||||
}
|
||||
} else {
|
||||
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
|
||||
tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran");
|
||||
}
|
||||
} else {
|
||||
// A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
|
||||
@@ -406,14 +409,17 @@ pub async fn get_tenant_objects(
|
||||
// TODO: apply extra validation based on object modification time. Don't purge
|
||||
// tenants where any timeline's index_part.json has been touched recently.
|
||||
|
||||
let list = s3_client
|
||||
.list(
|
||||
Some(&tenant_root),
|
||||
ListingMode::NoDelimiter,
|
||||
None,
|
||||
&CancellationToken::new(),
|
||||
)
|
||||
.await?;
|
||||
let cancel = CancellationToken::new();
|
||||
let list = backoff::retry(
|
||||
|| s3_client.list(Some(&tenant_root), ListingMode::NoDelimiter, None, &cancel),
|
||||
|_| false,
|
||||
3,
|
||||
MAX_RETRIES as u32,
|
||||
"get_tenant_objects",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.expect("dummy cancellation token")?;
|
||||
Ok(list.keys)
|
||||
}
|
||||
|
||||
@@ -424,14 +430,25 @@ pub async fn get_timeline_objects(
|
||||
tracing::debug!("Listing objects in timeline {ttid}");
|
||||
let timeline_root = super::remote_timeline_path_id(&ttid);
|
||||
|
||||
let list = s3_client
|
||||
.list(
|
||||
Some(&timeline_root),
|
||||
ListingMode::NoDelimiter,
|
||||
None,
|
||||
&CancellationToken::new(),
|
||||
)
|
||||
.await?;
|
||||
let cancel = CancellationToken::new();
|
||||
let list = backoff::retry(
|
||||
|| {
|
||||
s3_client.list(
|
||||
Some(&timeline_root),
|
||||
ListingMode::NoDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
MAX_RETRIES as u32,
|
||||
"get_timeline_objects",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.expect("dummy cancellation token")?;
|
||||
|
||||
Ok(list.keys)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user