s3_scrubber: implement scan-metadata for safekeepers.

It works by listing a postgres table that holds a memory dump of safekeeper
state. The S3 contents of each timeline are then checked against
timeline_start_lsn and backup_lsn. If an inconsistency is found, the timeline
(branch) is checked against the control plane before complaining: it might
have been deleted between taking the dump and the S3 check.
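
A minimal sketch of the core check, for orientation only (not part of the commit): it reuses the helpers the new module imports (Lsn::segment_number, XLogFileName), assumes the default 16 MB WAL segment size, and the expected_segments wrapper itself is purely illustrative.

```
use postgres_ffi::{XLogFileName, PG_TLI};
use utils::lsn::Lsn;

// Default WAL segment size; the new scrubber module assumes this as well.
const WAL_SEGSIZE: usize = 16 * 1024 * 1024;

// WAL segment file names that must exist in S3 for a timeline whose WAL
// starts at `timeline_start_lsn` and is backed up through `backup_lsn`.
fn expected_segments(timeline_start_lsn: Lsn, backup_lsn: Lsn) -> Vec<String> {
    let first = timeline_start_lsn.segment_number(WAL_SEGSIZE);
    let last = backup_lsn.segment_number(WAL_SEGSIZE);
    // Segments in [first, last) are expected; a missing one is an
    // inconsistency unless the control plane says the branch was deleted.
    (first..last)
        .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE))
        .collect()
}
```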
Arseny Sher authored on 2024-04-03 15:46:54 +03:00 (committed by Arseny Sher)
parent 010f0a310a
commit 3da54e6d90
9 changed files with 363 additions and 37 deletions

Cargo.lock (generated)

@@ -3184,6 +3184,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -3520,6 +3530,12 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -5095,8 +5111,11 @@ dependencies = [
"hex",
"histogram",
"itertools",
"native-tls",
"pageserver",
"pageserver_api",
"postgres-native-tls",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest",
@@ -5105,6 +5124,7 @@ dependencies = [
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-rustls 0.25.0",
"tokio-stream",
"tracing",
@@ -6507,6 +6527,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",


@@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
urlencoding = "2.1"


@@ -22,7 +22,11 @@ serde_with.workspace = true
workspace_hack.workspace = true
utils.workspace = true
async-stream.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres_ffi.workspace = true
tokio-stream.workspace = true
tokio-postgres.workspace = true
futures-util.workspace = true
itertools.workspace = true
camino.workspace = true


@@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted.
#### `scan-metadata`
Walk objects in a pageserver S3 bucket, and report statistics on the contents.
Walk objects in a pageserver or safekeeper S3 bucket, report statistics on the contents, and check consistency.
Errors are logged to stderr and summary to stdout.
For pageserver:
```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
Timelines: 31106
With errors: 3
@@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
```
For safekeepers, dump_db_connstr and dump_db_table must be specified; they
should point to a table with a debug dump, which is used to list timelines
and find their backup and start LSNs.
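As an illustration only (not part of this diff), a safekeeper run might mirror the pageserver example above; the connection string and table name below are placeholders:
```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind safekeeper --dump-db-connstr 'postgresql://user:password@dump-host/dump-db' --dump-db-table safekeeper_debug_dump
```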
## Cleaning up running pageservers
If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.


@@ -4,7 +4,8 @@ pub mod checks;
pub mod cloud_admin_api;
pub mod garbage;
pub mod metadata_stream;
pub mod scan_metadata;
pub mod scan_pageserver_metadata;
pub mod scan_safekeeper_metadata;
pub mod tenant_snapshot;
use std::env;
@@ -141,12 +142,17 @@ impl RootTarget {
pub fn tenants_root(&self) -> S3Target {
match self {
Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
Self::Safekeeper(root) => root.with_sub_segment("wal"),
Self::Safekeeper(root) => root.clone(),
}
}
pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
self.tenants_root().with_sub_segment(&tenant_id.to_string())
match self {
Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()),
Self::Safekeeper(_) => self
.tenants_root()
.with_sub_segment(&tenant_id.tenant_id.to_string()),
}
}
pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target {
@@ -337,9 +343,7 @@ fn init_remote(
}),
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config
.prefix_in_bucket
.unwrap_or("safekeeper/v1".to_string()),
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal".to_string()),
delimiter,
}),
};


@@ -1,9 +1,13 @@
use anyhow::bail;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use s3_scrubber::scan_metadata::scan_metadata;
use s3_scrubber::scan_pageserver_metadata::scan_metadata;
use s3_scrubber::tenant_snapshot::SnapshotDownloader;
use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
use s3_scrubber::{
init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
NodeKind, TraversingDepth,
};
use clap::{Parser, Subcommand};
use utils::id::TenantId;
@@ -35,11 +39,20 @@ enum Command {
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
mode: PurgeMode,
},
#[command(verbatim_doc_comment)]
ScanMetadata {
#[arg(short, long)]
node_kind: NodeKind,
#[arg(short, long, default_value_t = false)]
json: bool,
#[arg(long = "tenant-id", num_args = 0..)]
tenant_ids: Vec<TenantShardId>,
#[arg(long, default_value = None)]
/// For safekeeper node_kind only, points to db with debug dump
dump_db_connstr: Option<String>,
/// For safekeeper node_kind only, table in the db with debug dump
#[arg(long, default_value = None)]
dump_db_table: Option<String>,
},
TenantSnapshot {
#[arg(long = "tenant-id")]
@@ -72,33 +85,75 @@ async fn main() -> anyhow::Result<()> {
));
match cli.command {
Command::ScanMetadata { json, tenant_ids } => {
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
Command::ScanMetadata {
json,
tenant_ids,
node_kind,
dump_db_connstr,
dump_db_table,
} => {
if let NodeKind::Safekeeper = node_kind {
let dump_db_connstr =
dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?;
let dump_db_table =
dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?;
let summary = scan_safekeeper_metadata(
bucket_config.clone(),
tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(),
dump_db_connstr,
dump_db_table,
)
.await?;
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
if summary.is_fatal() {
bail!("Fatal scrub errors detected");
}
if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
bail!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
);
}
Ok(())
} else {
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
}
}
}
}


@@ -0,0 +1,234 @@
use std::{collections::HashSet, str::FromStr};
use aws_sdk_s3::Client;
use futures::stream::{StreamExt, TryStreamExt};
use pageserver_api::shard::TenantShardId;
use postgres_ffi::{XLogFileName, PG_TLI};
use serde::Serialize;
use tokio_postgres::types::PgLsn;
use tracing::{error, info, trace};
use utils::{
id::{TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
};
use crate::{
cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
};
/// Generally we should ask the safekeepers, but so far the default 16MB is used everywhere.
const WAL_SEGSIZE: usize = 16 * 1024 * 1024;
#[derive(Serialize)]
pub struct MetadataSummary {
timeline_count: usize,
with_errors: HashSet<TenantTimelineId>,
deleted_count: usize,
}
impl MetadataSummary {
fn new() -> Self {
Self {
timeline_count: 0,
with_errors: HashSet::new(),
deleted_count: 0,
}
}
pub fn summary_string(&self) -> String {
format!(
"timeline_count: {}, with_errors: {}",
self.timeline_count,
self.with_errors.len()
)
}
pub fn is_empty(&self) -> bool {
self.timeline_count == 0
}
pub fn is_fatal(&self) -> bool {
!self.with_errors.is_empty()
}
}
/// Scan the safekeeper metadata in an S3 bucket, reporting errors and
/// statistics.
///
/// It works by listing timelines along with their timeline_start_lsn and
/// backup_lsn from the debug dump in dump_db_table and verifying their s3
/// contents. If some WAL segments are missing, the control plane is queried
/// before complaining, to check whether the project was deleted in the
/// meantime.
pub async fn scan_safekeeper_metadata(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantId>,
dump_db_connstr: String,
dump_db_table: String,
) -> anyhow::Result<MetadataSummary> {
info!(
"checking bucket {}, region {}, dump_db_table {}",
bucket_config.bucket, bucket_config.region, dump_db_table
);
// Use the native TLS implementation (Neon requires TLS)
let tls_connector =
postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
let tenant_filter_clause = if !tenant_ids.is_empty() {
format!(
"and tenant_id in ({})",
tenant_ids
.iter()
.map(|t| format!("'{}'", t))
.collect::<Vec<_>>()
.join(", ")
)
} else {
"".to_owned()
};
let query = format!(
"select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;",
dump_db_table, tenant_filter_clause,
);
info!("query is {}", query);
let timelines = client.query(&query, &[]).await?;
info!("loaded {} timelines", timelines.len());
let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?;
let console_config = ConsoleConfig::from_env()?;
let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| {
let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id");
let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse timeline_id");
let timeline_start_lsn_pg: PgLsn = row.get(2);
let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg));
let backup_lsn_pg: PgLsn = row.get(3);
let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
check_timeline(
&s3_client,
&target,
&cloud_admin_api_client,
ttid,
timeline_start_lsn,
backup_lsn,
)
});
// Run multiple check_timeline's concurrently.
const CONCURRENCY: usize = 32;
let mut timelines = checks.try_buffered(CONCURRENCY);
let mut summary = MetadataSummary::new();
while let Some(r) = timelines.next().await {
let res = r?;
summary.timeline_count += 1;
if !res.is_ok {
summary.with_errors.insert(res.ttid);
}
if res.is_deleted {
summary.deleted_count += 1;
}
}
Ok(summary)
}
struct TimelineCheckResult {
ttid: TenantTimelineId,
is_ok: bool,
is_deleted: bool, // timeline is deleted in cplane
}
/// List s3 and check that it has all expected WAL for the ttid. Consistency
/// errors are logged to stderr; returns a TimelineCheckResult recording whether
/// the timeline is consistent and whether it is deleted in the control plane,
/// or Err if the check itself failed.
async fn check_timeline(
s3_client: &Client,
root: &RootTarget,
api_client: &CloudAdminApiClient,
ttid: TenantTimelineId,
timeline_start_lsn: Lsn,
backup_lsn: Lsn,
) -> anyhow::Result<TimelineCheckResult> {
trace!(
"checking ttid {}, should contain WAL [{}-{}]",
ttid,
timeline_start_lsn,
backup_lsn
);
// calculate expected segfiles
let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
let mut expected_segfiles: HashSet<String> = HashSet::from_iter(
(expected_first_segno..expected_last_segno)
.map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)),
);
let expected_files_num = expected_segfiles.len();
trace!("expecting {} files", expected_segfiles.len(),);
// now list s3 and check if it misses something
let ttshid =
TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id);
let mut timeline_dir_target = root.timeline_root(&ttshid);
// stream_listing yields only common_prefixes if delimiter is not empty, but
// we need files, so unset it.
timeline_dir_target.delimiter = String::new();
let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
while let Some(obj) = stream.next().await {
let obj = obj?;
let key = obj.key();
let seg_name = key
.strip_prefix(&timeline_dir_target.prefix_in_bucket)
.expect("failed to extract segment name");
expected_segfiles.remove(seg_name);
}
if !expected_segfiles.is_empty() {
// Before complaining, check the cplane; the timeline may already be deleted.
let bdata = api_client.find_timeline_branch(ttid.timeline_id).await?;
let deleted = match bdata {
Some(bdata) => bdata.deleted,
None => {
// note: should be careful with selecting proper cplane address
info!("ttid {} not found, assuming it is deleted", ttid);
true
}
};
if deleted {
// ok, branch is deleted
return Ok(TimelineCheckResult {
ttid,
is_ok: true,
is_deleted: true,
});
}
error!(
"ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}",
ttid,
expected_segfiles.len(),
expected_files_num,
timeline_start_lsn,
backup_lsn,
);
return Ok(TimelineCheckResult {
ttid,
is_ok: false,
is_deleted: false,
});
}
Ok(TimelineCheckResult {
ttid,
is_ok: true,
is_deleted: false,
})
}


@@ -3734,7 +3734,9 @@ class S3Scrubber:
return stdout
def scan_metadata(self) -> Any:
stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
stdout = self.scrubber_cli(
["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
)
try:
return json.loads(stdout)