diff --git a/Cargo.lock b/Cargo.lock index de548bb2de..f2f06210cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3184,6 +3184,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -3520,6 +3530,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -5095,8 +5111,11 @@ dependencies = [ "hex", "histogram", "itertools", + "native-tls", "pageserver", "pageserver_api", + "postgres-native-tls", + "postgres_ffi", "rand 0.8.5", "remote_storage", "reqwest", @@ -5105,6 +5124,7 @@ dependencies = [ "serde_with", "thiserror", "tokio", + "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", "tracing", @@ -6507,6 +6527,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", diff --git a/Cargo.toml b/Cargo.toml index 92dcc254d4..32a0bc23e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.20.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" urlencoding = "2.1" diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 0ee9112010..37124e6caf 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,7 +22,11 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true +native-tls.workspace = true +postgres-native-tls.workspace = true +postgres_ffi.workspace = true tokio-stream.workspace = true +tokio-postgres.workspace = true futures-util.workspace = true itertools.workspace = true camino.workspace = true diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index 2f21b9f191..c1deab8852 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted. #### `scan-metadata` -Walk objects in a pageserver S3 bucket, and report statistics on the contents. +Walk objects in a pageserver or safekeeper S3 bucket, report statistics on the contents, and check consistency. +Errors are logged to stderr and a summary to stdout.
+For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata +env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 @@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053 ``` +For safekeepers, dump_db_connstr and dump_db_table must be +specified; they should point to a table with a debug dump, which will be used +to list timelines and find their backup and start LSNs. + ## Cleaning up running pageservers If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers. diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 90d58a3bc2..43be258150 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -4,7 +4,8 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; -pub mod scan_metadata; +pub mod scan_pageserver_metadata; +pub mod scan_safekeeper_metadata; pub mod tenant_snapshot; use std::env; @@ -141,12 +142,17 @@ impl RootTarget { pub fn tenants_root(&self) -> S3Target { match self { Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), - Self::Safekeeper(root) => root.with_sub_segment("wal"), + Self::Safekeeper(root) => root.clone(), } } pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { - self.tenants_root().with_sub_segment(&tenant_id.to_string()) + match self { + Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()), + Self::Safekeeper(_) => self + .tenants_root() + .with_sub_segment(&tenant_id.tenant_id.to_string()), + } } pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { @@ -337,9 +343,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("safekeeper/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal".to_string()), delimiter, }), }; diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 88ba9bfa61..e49c280b99 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,9 +1,13 @@ +use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::scan_metadata::scan_metadata; +use s3_scrubber::scan_pageserver_metadata::scan_metadata; use s3_scrubber::tenant_snapshot::SnapshotDownloader; -use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; +use s3_scrubber::{ + init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, + NodeKind, TraversingDepth, +}; use clap::{Parser, Subcommand}; use utils::id::TenantId; @@ -35,11 +39,20 @@ enum Command { #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, }, + #[command(verbatim_doc_comment)]
ScanMetadata { + #[arg(short, long)] + node_kind: NodeKind, #[arg(short, long, default_value_t = false)] json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec<TenantShardId>, + #[arg(long, default_value = None)] + /// For safekeeper node_kind only, points to the db with the debug dump + dump_db_connstr: Option<String>, + /// For safekeeper node_kind only, table in the db with the debug dump + #[arg(long, default_value = None)] + dump_db_table: Option<String>, }, TenantSnapshot { #[arg(long = "tenant-id")] @@ -72,33 +85,75 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json, tenant_ids } => { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) + Command::ScanMetadata { + json, + tenant_ids, + node_kind, + dump_db_connstr, + dump_db_table, + } => { + if let NodeKind::Safekeeper = node_kind { + let dump_db_connstr = + dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; + let dump_db_table = + dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; + + let summary = scan_safekeeper_metadata( + bucket_config.clone(), + tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), + dump_db_connstr, + dump_db_table, + ) + .await?; + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); + if summary.is_fatal() { + bail!("Fatal scrub errors detected"); + } + if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + bail!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + Ok(()) + } else { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } } diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_pageserver_metadata.rs similarity index 100% rename from
s3_scrubber/src/scan_metadata.rs rename to s3_scrubber/src/scan_pageserver_metadata.rs diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs new file mode 100644 index 0000000000..f56bc165db --- /dev/null +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -0,0 +1,234 @@ +use std::{collections::HashSet, str::FromStr}; + +use aws_sdk_s3::Client; +use futures::stream::{StreamExt, TryStreamExt}; +use pageserver_api::shard::TenantShardId; +use postgres_ffi::{XLogFileName, PG_TLI}; +use serde::Serialize; +use tokio_postgres::types::PgLsn; +use tracing::{error, info, trace}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use crate::{ + cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; + +/// Generally we should ask the safekeepers, but so far the default of 16MB is used everywhere. +const WAL_SEGSIZE: usize = 16 * 1024 * 1024; + +#[derive(Serialize)] +pub struct MetadataSummary { + timeline_count: usize, + with_errors: HashSet<TenantTimelineId>, + deleted_count: usize, +} + +impl MetadataSummary { + fn new() -> Self { + Self { + timeline_count: 0, + with_errors: HashSet::new(), + deleted_count: 0, + } + } + + pub fn summary_string(&self) -> String { + format!( + "timeline_count: {}, with_errors: {}", + self.timeline_count, + self.with_errors.len() + ) + } + + pub fn is_empty(&self) -> bool { + self.timeline_count == 0 + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } } + +/// Scan the safekeeper metadata in an S3 bucket, reporting errors and +/// statistics. +/// +/// It works by listing timelines, along with their timeline_start_lsn and backup_lsn, +/// from the debug dump in dump_db_table and verifying their S3 contents. If some WAL +/// segments are missing, the control plane is queried before complaining, to check +/// whether the project was deleted in the meantime. +pub async fn scan_safekeeper_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec<TenantId>, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result<MetadataSummary> { + info!( + "checking bucket {}, region {}, dump_db_table {}", + bucket_config.bucket, bucket_config.region, dump_db_table + ); + // Use the native TLS implementation (Neon requires TLS) + let tls_connector = + postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own.
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::<Vec<_>>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", + dump_db_table, tenant_filter_clause, + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + info!("loaded {} timelines", timelines.len()); + + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let console_config = ConsoleConfig::from_env()?; + let cloud_admin_api_client = CloudAdminApiClient::new(console_config); + + let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { + let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); + let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse timeline_id"); + let timeline_start_lsn_pg: PgLsn = row.get(2); + let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); + let backup_lsn_pg: PgLsn = row.get(3); + let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + check_timeline( + &s3_client, + &target, + &cloud_admin_api_client, + ttid, + timeline_start_lsn, + backup_lsn, + ) + }); + // Run multiple check_timeline calls concurrently. + const CONCURRENCY: usize = 32; + let mut timelines = checks.try_buffered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + while let Some(r) = timelines.next().await { + let res = r?; + summary.timeline_count += 1; + if !res.is_ok { + summary.with_errors.insert(res.ttid); + } + if res.is_deleted { + summary.deleted_count += 1; + } + } + + Ok(summary) +} + +struct TimelineCheckResult { + ttid: TenantTimelineId, + is_ok: bool, + is_deleted: bool, // timeline is deleted in cplane +} + +/// List S3 and check that it has all expected WAL for the ttid. Consistency +/// errors are logged to stderr; the returned result records whether the timeline +/// is consistent and whether it was deleted; Err means the check itself failed. +async fn check_timeline( + s3_client: &Client, + root: &RootTarget, + api_client: &CloudAdminApiClient, + ttid: TenantTimelineId, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +) -> anyhow::Result<TimelineCheckResult> { + trace!( + "checking ttid {}, should contain WAL [{}-{}]", + ttid, + timeline_start_lsn, + backup_lsn + ); + // calculate expected segfiles + let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); + let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE); + let mut expected_segfiles: HashSet<String> = HashSet::from_iter( + (expected_first_segno..expected_last_segno) + .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), + ); + let expected_files_num = expected_segfiles.len(); + trace!("expecting {} files", expected_segfiles.len(),); + + // now list s3 and check if it misses something + let ttshid = + TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id); + let mut timeline_dir_target = root.timeline_root(&ttshid); + // stream_listing yields only common_prefixes if delimiter is not empty, but + // we need files, so unset it.
+ timeline_dir_target.delimiter = String::new(); + + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let obj = obj?; + let key = obj.key(); + + let seg_name = key + .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .expect("failed to extract segment name"); + expected_segfiles.remove(seg_name); + } + if !expected_segfiles.is_empty() { + // Before complaining check cplane, probably timeline is already deleted. + let bdata = api_client.find_timeline_branch(ttid.timeline_id).await?; + let deleted = match bdata { + Some(bdata) => bdata.deleted, + None => { + // note: should be careful with selecting proper cplane address + info!("ttid {} not found, assuming it is deleted", ttid); + true + } + }; + if deleted { + // ok, branch is deleted + return Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: true, + }); + } + error!( + "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}", + ttid, + expected_segfiles.len(), + expected_files_num, + timeline_start_lsn, + backup_lsn, + ); + return Ok(TimelineCheckResult { + ttid, + is_ok: false, + is_deleted: false, + }); + } + Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: false, + }) +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index abe2718a49..fa83ebdccb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3734,7 +3734,9 @@ class S3Scrubber: return stdout def scan_metadata(self) -> Any: - stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30) + stdout = self.scrubber_cli( + ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 + ) try: return json.loads(stdout)
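Usage note for the safekeeper mode added above: the README hunk describes it but, unlike the pageserver mode, gives no example invocation. A minimal sketch of what such a command might look like follows; it assumes clap's default kebab-case renaming for the new `dump_db_connstr`/`dump_db_table` options, and the connection string and table name are placeholders, not values from this change.

```
# Illustrative only: flag spellings assume clap's default kebab-case renaming,
# and the dump database connection string and table name are placeholders.
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket \
  CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] \
  cargo run --release -- scan-metadata --node-kind safekeeper \
  --dump-db-connstr 'postgresql://user:password@dump-db-host:5432/dump_db' \
  --dump-db-table safekeeper_debug_dump
```

Whatever table is passed must expose at least tenant_id, timeline_id, timeline_start_lsn, backup_lsn, and is_cancelled, since the query built in scan_safekeeper_metadata groups on the first two and filters on the last.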