diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts
index 3ea815b907..b2bacb89ca 100644
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -6,6 +6,7 @@ zenith-us-stage-ps-2 console_region_id=27
 zenith-us-stage-sk-1 console_region_id=27
 zenith-us-stage-sk-2 console_region_id=27
 zenith-us-stage-sk-4 console_region_id=27
+zenith-us-stage-sk-5 console_region_id=27

 [storage:children]
 pageservers
diff --git a/Cargo.lock b/Cargo.lock
index 1949babd98..c56b9519fe 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1594,6 +1594,7 @@ dependencies = [
  "clap 3.0.14",
  "comfy-table",
  "control_plane",
+ "git-version",
  "pageserver",
  "postgres",
  "postgres_ffi",
@@ -1785,6 +1786,7 @@ dependencies = [
  "daemonize",
  "fail",
  "futures",
+ "git-version",
  "hex",
  "hex-literal",
  "humantime",
@@ -2177,6 +2179,7 @@ dependencies = [
  "bytes",
  "clap 3.0.14",
  "futures",
+ "git-version",
  "hashbrown",
  "hex",
  "hmac 0.12.1",
@@ -2629,6 +2632,7 @@ dependencies = [
  "daemonize",
  "etcd_broker",
  "fs2",
+ "git-version",
  "hex",
  "humantime",
  "hyper",
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 27114b8202..334e0a9e05 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -136,13 +136,20 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
                 xact.execute(query.as_str(), &[])?;
             }
         } else {
-            info!("role name {}", &name);
+            info!("role name: '{}'", &name);
             let mut query: String = format!("CREATE ROLE {} ", name.quote());
-            info!("role create query {}", &query);
+            info!("role create query: '{}'", &query);
             info_print!(" -> create");
             query.push_str(&role.to_pg_options());
             xact.execute(query.as_str(), &[])?;
+
+            let grant_query = format!(
+                "grant pg_read_all_data, pg_write_all_data to {}",
+                name.quote()
+            );
+            xact.execute(grant_query.as_str(), &[])?;
+            info!("role grant query: '{}'", &grant_query);
         }
         info_print!("\n");
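For context: a minimal sketch of the GRANT statement the new `handle_roles` branch issues after `CREATE ROLE` (illustrative, not part of the patch; `quote()` is assumed to yield a double-quoted identifier, as in the surrounding code):

```rust
// Illustrative sketch: the GRANT statement built for a newly created role.
// `quoted_name` stands in for `name.quote()` in the code above.
fn grant_query(quoted_name: &str) -> String {
    format!("grant pg_read_all_data, pg_write_all_data to {quoted_name}")
}

fn main() {
    // New roles get blanket read/write access to existing objects.
    assert_eq!(
        grant_query("\"alice\""),
        "grant pg_read_all_data, pg_write_all_data to \"alice\""
    );
}
```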
diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs
index 01cc0cf162..1b27f99ccf 100644
--- a/libs/etcd_broker/src/lib.rs
+++ b/libs/etcd_broker/src/lib.rs
@@ -51,7 +51,7 @@ pub struct SkTimelineInfo {
     #[serde(default)]
     pub peer_horizon_lsn: Option<Lsn>,
     #[serde(default)]
-    pub wal_stream_connection_string: Option<String>,
+    pub safekeeper_connection_string: Option<String>,
 }

 #[derive(Debug, thiserror::Error)]
@@ -217,16 +217,22 @@ pub async fn subscribe_to_safekeeper_timeline_updates(
                 break;
             }

-            let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>> =
-                HashMap::new();
+            let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>> = HashMap::new();
+            // Keep track of the etcd versions so that the timeline data updates from etcd are applied in the right order.
+            // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
+            // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
+            let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
+
             let events = resp.events();
             debug!("Processing {} events", events.len());

             for event in events {
                 if EventType::Put == event.event_type() {
-                    if let Some(kv) = event.kv() {
-                        match parse_etcd_key_value(subscription_kind, &regex, kv) {
+                    if let Some(new_etcd_kv) = event.kv() {
+                        let new_kv_version = new_etcd_kv.version();
+
+                        match parse_etcd_key_value(subscription_kind, &regex, new_etcd_kv) {
                             Ok(Some((zttid, timeline))) => {
                                 match timeline_updates
                                     .entry(zttid)
@@ -234,12 +240,15 @@ pub async fn subscribe_to_safekeeper_timeline_updates(
                                     .entry(timeline.safekeeper_id)
                                 {
                                     hash_map::Entry::Occupied(mut o) => {
-                                        if o.get().flush_lsn < timeline.info.flush_lsn {
+                                        let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
+                                        if old_etcd_kv_version < new_kv_version {
                                             o.insert(timeline.info);
+                                            timeline_etcd_versions.insert(zttid, new_kv_version);
                                         }
                                     }
                                     hash_map::Entry::Vacant(v) => {
                                         v.insert(timeline.info);
+                                        timeline_etcd_versions.insert(zttid, new_kv_version);
                                     }
                                 }
                             }
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 923fbe4d5a..28d9a13dbf 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 #![allow(deref_nullptr)]

 use serde::{Deserialize, Serialize};
+use utils::lsn::Lsn;

 include!(concat!(env!("OUT_DIR"), "/bindings.rs"));

@@ -37,3 +38,21 @@ pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) ->
     let diff = id1.wrapping_sub(id2) as i32;
     diff < 0
 }
+
+// Check if the page is not yet initialized (port of the Postgres PageIsNew() macro)
+pub fn page_is_new(pg: &[u8]) -> bool {
+    pg[14] == 0 && pg[15] == 0 // pd_upper == 0
+}
+
+// Extract the LSN from the page header
+pub fn page_get_lsn(pg: &[u8]) -> Lsn {
+    Lsn(
+        ((u32::from_le_bytes(pg[0..4].try_into().unwrap()) as u64) << 32)
+            | u32::from_le_bytes(pg[4..8].try_into().unwrap()) as u64,
+    )
+}
+
+pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
+    pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
+    pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
+}
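A small round-trip sketch of the new page-header helpers (illustrative, not part of the patch; assumes an 8 KiB zeroed buffer for BLCKSZ and the workspace `postgres_ffi` and `utils` crates):

```rust
// Round-trip sketch for the new page-header helpers.
use postgres_ffi::{page_get_lsn, page_is_new, page_set_lsn};
use utils::lsn::Lsn;

fn main() {
    let mut page = vec![0u8; 8192]; // BLCKSZ-sized, all-zeros page
    assert!(page_is_new(&page)); // pd_upper == 0 on a zeroed page

    // The LSN is stored as two little-endian u32 halves at bytes 0..8.
    page_set_lsn(&mut page, Lsn(0x1_0000_0010));
    assert_eq!(page_get_lsn(&page), Lsn(0x1_0000_0010));
}
```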
diff --git a/libs/utils/build.rs b/libs/utils/build.rs
deleted file mode 100644
index ee3346ae66..0000000000
--- a/libs/utils/build.rs
+++ /dev/null
@@ -1,3 +0,0 @@
-fn main() {
-    println!("cargo:rerun-if-env-changed=GIT_VERSION");
-}
diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs
index 3bc8993c26..8e3d357397 100644
--- a/libs/utils/src/http/request.rs
+++ b/libs/utils/src/http/request.rs
@@ -1,7 +1,7 @@
 use std::str::FromStr;

 use super::error::ApiError;
-use hyper::{Body, Request};
+use hyper::{body::HttpBody, Body, Request};
 use routerify::ext::RequestExt;

 pub fn get_request_param<'a>(
@@ -31,3 +31,10 @@ pub fn parse_request_param(
         ))),
     }
 }
+
+pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
+    match request.body_mut().data().await {
+        Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())),
+        None => Ok(()),
+    }
+}
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index de266efe64..4810909712 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -54,31 +54,44 @@ pub mod nonblock;
 // Default signal handling
 pub mod signals;

-// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
-//
-// we have several cases:
-// * building locally from git repo
-// * building in CI from git repo
-// * building in docker (either in CI or locally)
-//
-// One thing to note is that .git is not available in docker (and it is bad to include it there).
-// So everything becides docker build is covered by git_version crate.
-// For docker use environment variable to pass git version, which is then retrieved by buildscript (build.rs).
-// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
-// Git version received from environment variable used as a fallback in git_version invokation.
-// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
-// So the build script will be run only when GIT_VERSION envvar has changed.
-//
-// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
-// Caching and workspaces complicates that. In case `utils` is not
-// recompiled due to caching then version may become outdated.
-// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro,
-// so if we changed the index state git_version will pick that up and rerun the macro.
-//
-// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
-use git_version::git_version;
-pub const GIT_VERSION: &str = git_version!(
-    prefix = "git:",
-    fallback = concat!("git-env:", env!("GIT_VERSION")),
-    args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
-);
+/// This is a shortcut to embed the git sha into binaries and avoid copying the same build script to all packages.
+///
+/// We have several cases:
+/// * building locally from a git repo
+/// * building in CI from a git repo
+/// * building in docker (either in CI or locally)
+///
+/// One thing to note is that .git is not available in docker (and it is bad to include it there).
+/// So everything besides the docker build is covered by the git_version crate, and docker uses a `GIT_VERSION` argument to pass the required value.
+/// The build script takes the variable from the build process env and puts it into the rustc env; we can then retrieve it here with the env! macro.
+/// The git version received from the environment variable is used as a fallback in the git_version invocation.
+/// And to avoid rerunning the build script on every recompilation, we use the rerun-if-env-changed option,
+/// so the build script runs only when the GIT_VERSION envvar has changed.
+///
+/// Why not use a build script to get the git commit sha directly, without a proc macro from a different crate?
+/// Caching and workspaces complicate that. In case `utils` is not
+/// recompiled due to caching, the version may become outdated.
+/// The git_version crate handles that case by introducing a dependency on .git internals via the include_bytes! macro,
+/// so if we change the index state, git_version will pick that up and rerun the macro.
+///
+/// Note that git_version uses the `git:` prefix, and a git version taken from the env uses `git-env:`.
+///
+/// #############################################################################################
+/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
+/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
+/// Yet, it seems to ignore the GIT_VERSION env variable passed to the Docker build, even with a build.rs that contains
+/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
+/// The problem needs further investigation; then a regular `const` declaration could replace this macro.
+#[macro_export]
+macro_rules! project_git_version {
+    ($const_identifier:ident) => {
+        const $const_identifier: &str = git_version::git_version!(
+            prefix = "git:",
+            fallback = concat!(
+                "git-env:",
+                env!("GIT_VERSION", "Missing GIT_VERSION envvar")
+            ),
+            args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
+        );
+    };
+}
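A minimal usage sketch for the new `project_git_version!` macro (illustrative; assumes a `git-version` dependency in the binary's Cargo.toml and, outside a git checkout, a `GIT_VERSION` env var at compile time):

```rust
// Minimal usage sketch for the new macro.
use utils::project_git_version;

// Expands to: const GIT_VERSION: &str = git_version::git_version!(...);
project_git_version!(GIT_VERSION);

fn main() {
    // Prints e.g. "version: git:<full-sha>" or "version: git-env:<sha>".
    println!("version: {GIT_VERSION}");
}
```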
diff --git a/neon_local/Cargo.toml b/neon_local/Cargo.toml
index 78d339789f..8ebd7d5c17 100644
--- a/neon_local/Cargo.toml
+++ b/neon_local/Cargo.toml
@@ -9,6 +9,7 @@ anyhow = "1.0"
 serde_json = "1"
 comfy-table = "5.0.1"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+git-version = "0.3.5"

 # FIXME: 'pageserver' is needed for BranchInfo. Refactor
 pageserver = { path = "../pageserver" }
diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs
index 8b54054080..6538cdefc4 100644
--- a/neon_local/src/main.rs
+++ b/neon_local/src/main.rs
@@ -20,8 +20,8 @@ use utils::{
     auth::{Claims, Scope},
     lsn::Lsn,
     postgres_backend::AuthType,
+    project_git_version,
     zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
-    GIT_VERSION,
 };

 use pageserver::timelines::TimelineInfo;
@@ -30,6 +30,7 @@ use pageserver::timelines::TimelineInfo;
 const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1);
 const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
+project_git_version!(GIT_VERSION);

 fn default_conf() -> String {
     format!(
@@ -540,6 +541,29 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                 "tenant {} successfully created on the pageserver",
                 new_tenant_id
             );
+
+            // Create an initial timeline for the new tenant
+            let new_timeline_id = parse_timeline_id(create_match)?;
+            let timeline = pageserver
+                .timeline_create(new_tenant_id, new_timeline_id, None, None)?
+                .context(format!(
+                    "Failed to create initial timeline for tenant {new_tenant_id}"
+                ))?;
+            let new_timeline_id = timeline.timeline_id;
+            let last_record_lsn = timeline
+                .local
+                .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))?
+                .last_record_lsn;
+
+            env.register_branch_mapping(
+                DEFAULT_BRANCH_NAME.to_string(),
+                new_tenant_id,
+                new_timeline_id,
+            )?;
+
+            println!(
+                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
+            );
         }
         Some(("config", create_match)) => {
             let tenant_id = get_tenant_id(create_match, env)?;
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index fe692f4304..f4f8ca1fe6 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -52,6 +52,7 @@ nix = "0.23"
 once_cell = "1.8.0"
 crossbeam-utils = "0.8.5"
 fail = "0.5.0"
+git-version = "0.3.5"

 # 'experimental' is needed for the `zstd::bulk::Decompressor::upper_bound` function.
zstd = { version = "0.11.1", features = ["experimental"] } diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index af73ef6bdb..87390a1b06 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -7,7 +7,9 @@ use pageserver::layered_repository::dump_layerfile_from_path; use pageserver::page_cache; use pageserver::virtual_file; use std::path::PathBuf; -use utils::GIT_VERSION; +use utils::project_git_version; + +project_git_version!(GIT_VERSION); fn main() -> Result<()> { let arg_matches = App::new("Zenith dump_layerfile utility") diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9cb7e6f13d..190e38e341 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -20,17 +20,18 @@ use utils::{ http::endpoint, logging, postgres_backend::AuthType, + project_git_version, shutdown::exit_now, signals::{self, Signal}, tcp_listener, zid::{ZTenantId, ZTimelineId}, - GIT_VERSION, }; +project_git_version!(GIT_VERSION); + fn version() -> String { format!( - "{} profiling:{} failpoints:{}", - GIT_VERSION, + "{GIT_VERSION} profiling:{} failpoints:{}", cfg!(feature = "profiling"), fail::has_failpoints() ) @@ -217,7 +218,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // Initialize logger let log_file = logging::init(LOG_FILE_NAME, daemonize)?; - info!("version: {}", GIT_VERSION); + info!("version: {GIT_VERSION}"); // TODO: Check that it looks like a valid repository before going further diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index fae5e5c2e3..983fdb8647 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -6,7 +6,9 @@ use clap::{App, Arg}; use pageserver::layered_repository::metadata::TimelineMetadata; use std::path::PathBuf; use std::str::FromStr; -use utils::{lsn::Lsn, GIT_VERSION}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); fn main() -> Result<()> { let arg_matches = App::new("Zenith update metadata utility") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index caf2d0a9b3..38881adbf0 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -24,7 +24,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt}; pub const ZSTD_MAX_SAMPLES: usize = 1024; pub const ZSTD_MIN_SAMPLES: usize = 8; // magic requirement of zstd pub const ZSTD_MAX_SAMPLE_BYTES: usize = 10 * 1024 * 1024; // max memory size for holding samples -pub const ZSTD_MAX_DICTIONARY_SIZE: usize = 8 * 1024; +pub const ZSTD_MAX_DICTIONARY_SIZE: usize = 8 * 1024 - 4; // make dictionary + BLOB length fit in first page pub const ZSTD_COMPRESSION_LEVEL: i32 = 0; // default compression level pub mod defaults { diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 01c2b961eb..c7536cc959 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -74,6 +74,7 @@ pub mod metadata; mod par_fsync; mod storage_layer; +use crate::pgdatadir_mapping::LsnForTimestamp; use delta_layer::{DeltaLayer, DeltaLayerWriter}; use ephemeral_file::is_ephemeral_file; use filename::{DeltaFileName, ImageFileName}; @@ -81,6 +82,7 @@ use image_layer::{ImageLayer, ImageLayerWriter}; use inmemory_layer::InMemoryLayer; use layer_map::LayerMap; use layer_map::SearchResult; +use postgres_ffi::xlog_utils::to_pg_timestamp; use storage_layer::{Layer, 
ValueReconstructResult, ValueReconstructState}; // re-export this function so that page_cache.rs can use it. @@ -89,7 +91,7 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme // Metrics collected on operations on the storage repository. lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( - "pageserver_storage_time", + "pageserver_storage_operations_seconds", "Time spent on storage operations", &["operation", "tenant_id", "timeline_id"] ) @@ -99,8 +101,8 @@ lazy_static! { // Metrics collected on operations on the storage repository. lazy_static! { static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( - "pageserver_getpage_reconstruct_time", - "Time spent on storage operations", + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); @@ -108,13 +110,13 @@ lazy_static! { lazy_static! { static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( - "materialize_page_cache_hits", + "pageserver_materialized_cache_hits_total", "Number of cache hits from materialized page cache", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( - "wait_lsn_time", + "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", &["tenant_id", "timeline_id"] ) @@ -134,12 +136,12 @@ lazy_static! { // or in testing they estimate how much we would upload if we did. lazy_static! { static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( - "pageserver_num_persistent_files_created", + "pageserver_created_persistent_files_total", "Number of files created that are meant to be uploaded to cloud storage", ) .expect("failed to define a metric"); static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( - "pageserver_persistent_bytes_written", + "pageserver_written_persistent_bytes_total", "Total bytes written that are meant to be uploaded to cloud storage", ) .expect("failed to define a metric"); @@ -1512,7 +1514,7 @@ impl LayeredTimeline { .ensure_loaded() .with_context(|| { format!( - "Ancestor timeline is not is not loaded. Timeline id: {} Ancestor id {:?}", + "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}", self.timeline_id, self.get_ancestor_timeline_id(), ) @@ -2118,11 +2120,54 @@ impl LayeredTimeline { let cutoff = gc_info.cutoff; let pitr = gc_info.pitr; + // Calculate pitr cutoff point. + // If we cannot determine a cutoff LSN, be conservative and don't GC anything. + let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn(); + + if let Ok(timeline) = + tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) + { + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { + let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); + + match timeline.find_lsn_for_timestamp(pitr_timestamp)? { + LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + LsnForTimestamp::Future(lsn) => { + debug!("future({})", lsn); + pitr_cutoff_lsn = cutoff; + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + } + } + debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } + } else if cfg!(test) { + // We don't have local timeline in mocked cargo tests. 
+ // So, just ignore pitr_interval setting in this case. + pitr_cutoff_lsn = cutoff; + } + + let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn); + + // Nothing to GC. Return early. + if *self.get_latest_gc_cutoff_lsn() >= new_gc_cutoff { + info!( + "Nothing to GC for timeline {}. cutoff_lsn {}", + self.timeline_id, new_gc_cutoff + ); + result.elapsed = now.elapsed()?; + return Ok(result); + } + let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered(); // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. // See branch_timeline() for details. - *self.latest_gc_cutoff_lsn.write().unwrap() = cutoff; + *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; info!("GC starting"); @@ -2162,30 +2207,18 @@ impl LayeredTimeline { result.layers_needed_by_cutoff += 1; continue 'outer; } - // 2. It is newer than PiTR interval? - // We use modification time of layer file to estimate update time. - // This estimation is not quite precise but maintaining LSN->timestamp map seems to be overkill. - // It is not expected that users will need high precision here. And this estimation - // is conservative: modification time of file is always newer than actual time of version - // creation. So it is safe for users. - // TODO A possible "bloat" issue still persists here. - // If modification time changes because of layer upload/download, we will keep these files - // longer than necessary. - // https://github.com/neondatabase/neon/issues/1554 - // - if let Ok(metadata) = fs::metadata(&l.filename()) { - let last_modified = metadata.modified()?; - if now.duration_since(last_modified)? < pitr { - debug!( - "keeping {} because it's modification time {:?} is newer than PITR {:?}", - l.filename().display(), - last_modified, - pitr - ); - result.layers_needed_by_pitr += 1; - continue 'outer; - } + + // 2. It is newer than PiTR cutoff point? + if l.get_lsn_range().end > pitr_cutoff_lsn { + debug!( + "keeping {} because it's newer than pitr_cutoff_lsn {}", + l.filename().display(), + pitr_cutoff_lsn + ); + result.layers_needed_by_pitr += 1; + continue 'outer; } + // 3. Is it needed by a child branch? // NOTE With that wee would keep data that // might be referenced by child branches forever. diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 6fa870bcfe..cf4f33d4c8 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -318,6 +318,9 @@ impl Layer for DeltaLayer { return false; } let entry_lsn = DeltaKey::extract_lsn_from_buf(key); + if entry_lsn < lsn_range.start { + return false; + } blob_refs.push((entry_lsn, blob_ref)); !blob_ref.will_init() @@ -532,7 +535,7 @@ impl DeltaLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - fn load(&self) -> Result> { + fn load(&self) -> Result>{ loop { // Quick exit if already loaded let inner = self.inner.read().unwrap(); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 83985069ec..fdce0e5c5f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -45,7 +45,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; lazy_static! 
{ static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( - "pageserver_live_connections_count", + "pageserver_live_connections", "Number of live network connections", &["pageserver_connection_kind"] ) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index da3dedfc84..28d6bf2621 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -19,7 +19,6 @@ use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::{Arc, RwLockReadGuard}; -use std::time::Duration; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, @@ -326,7 +325,7 @@ const TIME_BUCKETS: &[f64] = &[ lazy_static! { static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!( - "pageserver_smgr_query_time", + "pageserver_smgr_query_seconds", "Time spent on smgr query handling", &["smgr_query_type", "tenant_id", "timeline_id"], TIME_BUCKETS.into() @@ -796,7 +795,9 @@ impl postgres_backend::Handler for PageServerHandler { .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?; let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - let result = repo.gc_iteration(Some(timelineid), gc_horizon, Duration::ZERO, true)?; + // Use tenant's pitr setting + let pitr = repo.get_pitr_interval(); + let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index b8c6f7fdab..7755e67c8d 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -208,12 +208,12 @@ lazy_static! { ) .expect("failed to register pageserver remote storage remaining sync items int gauge"); static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!( - "pageserver_remote_storage_fatal_task_failures", + "pageserver_remote_storage_fatal_task_failures_total", "Number of critically failed tasks" ) .expect("failed to register pageserver remote storage remaining sync items int gauge"); static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( - "pageserver_remote_storage_image_sync_time", + "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. \ Grouped by `operation_kind` (upload|download) and `status` (success|failure)", &["operation_kind", "status"], diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 4ce245a74f..37d70372b5 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -34,7 +34,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ lazy_static! { static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!( - "pageserver_io_time", + "pageserver_io_operations_seconds", "Time spent in IO operations", &["operation", "tenant_id", "timeline_id"], STORAGE_IO_TIME_BUCKETS.into() @@ -43,8 +43,8 @@ lazy_static! { } lazy_static! 
{ static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!( - "pageserver_io_size", - "Amount of bytes", + "pageserver_io_operations_bytes_total", + "Total amount of bytes read/written in IO operations", &["operation", "tenant_id", "timeline_id"] ) .expect("failed to define a metric"); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index fbdb328d2c..5223125ce6 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -24,6 +24,7 @@ use anyhow::Context; use postgres_ffi::nonrelfile_utils::clogpage_precedes; use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::{page_is_new, page_set_lsn}; use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; @@ -304,8 +305,14 @@ impl<'a, R: Repository> WalIngest<'a, R> { image.resize(image.len() + blk.hole_length as usize, 0u8); image.unsplit(tail); } - image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); - image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. + // + if !page_is_new(&image) { + page_set_lsn(&mut image, lsn) + } assert_eq!(image.len(), pg_constants::BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 777718b311..e556c24548 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -106,16 +106,16 @@ impl crate::walredo::WalRedoManager for DummyRedoManager { // each tenant. lazy_static! { static ref WAL_REDO_TIME: Histogram = - register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo") + register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo") .expect("failed to define a metric"); static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!( - "pageserver_wal_redo_wait_time", + "pageserver_wal_redo_wait_seconds", "Time spent waiting for access to the WAL redo process" ) .expect("failed to define a metric"); static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!( - "pageserver_wal_records_replayed", - "Number of WAL records replayed" + "pageserver_replayed_wal_records_total", + "Number of WAL records replayed in WAL redo process" ) .unwrap(); } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 43880d645a..4e45698e3e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -33,6 +33,7 @@ tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" +git-version = "0.3.5" utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } diff --git a/proxy/src/auth_backend/console.rs b/proxy/src/auth_backend/console.rs index 55a0889af4..41a822701f 100644 --- a/proxy/src/auth_backend/console.rs +++ b/proxy/src/auth_backend/console.rs @@ -117,7 +117,7 @@ async fn get_auth_info( let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?; url.query_pairs_mut() - .append_pair("cluster", cluster) + .append_pair("project", cluster) .append_pair("role", user); // TODO: use a proper logger @@ -141,7 +141,7 @@ async fn wake_compute( cluster: &str, ) -> Result<(String, u16), ConsoleAuthError> { let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?; - 
url.query_pairs_mut().append_pair("cluster", cluster); + url.query_pairs_mut().append_pair("project", cluster); // TODO: use a proper logger println!("cplane request: {}", url); diff --git a/proxy/src/main.rs b/proxy/src/main.rs index fc2a368b85..f46e19e5d6 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -25,7 +25,9 @@ use config::ProxyConfig; use futures::FutureExt; use std::{future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; -use utils::GIT_VERSION; +use utils::project_git_version; + +project_git_version!(GIT_VERSION); /// Flattens `Result>` into `Result`. async fn flatten_err( @@ -124,7 +126,7 @@ async fn main() -> anyhow::Result<()> { auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, })); - println!("Version: {}", GIT_VERSION); + println!("Version: {GIT_VERSION}"); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 5e1ceee02e..417cf58cd5 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -29,6 +29,7 @@ hex = "0.4.3" const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-util = { version = "0.7", features = ["io"] } +git-version = "0.3.5" postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d0df7093ff..6955d2aa5c 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -17,16 +17,19 @@ use url::{ParseError, Url}; use safekeeper::control_file::{self}; use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; use safekeeper::remove_wal; +use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use safekeeper::{broker, callmemaybe}; use safekeeper::{http, s3_offload}; use utils::{ - http::endpoint, logging, shutdown::exit_now, signals, tcp_listener, zid::ZNodeId, GIT_VERSION, + http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, + zid::ZNodeId, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; const ID_FILE_NAME: &str = "safekeeper.id"; +project_git_version!(GIT_VERSION); fn main() -> Result<()> { metrics::set_common_metrics_prefix("safekeeper"); @@ -193,7 +196,7 @@ fn main() -> Result<()> { fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = logging::init("safekeeper.log", conf.daemonize)?; - info!("version: {}", GIT_VERSION); + info!("version: {GIT_VERSION}"); // Prevent running multiple safekeepers on the same directory let lock_file_path = conf.workdir.join(LOCK_FILE_NAME); @@ -249,6 +252,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; + let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); + GlobalTimelines::set_callmemaybe_tx(callmemaybe_tx); let conf_ = conf.clone(); threads.push( @@ -277,13 +282,12 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b ); } - let (tx, rx) = mpsc::unbounded_channel(); let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) .spawn(|| { // thread code - let thread_result = wal_service::thread_main(conf_cloned, pg_listener, tx); + let thread_result = 
wal_service::thread_main(conf_cloned, pg_listener); if let Err(e) = thread_result { info!("safekeeper thread terminated: {}", e); } @@ -297,7 +301,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b .name("callmemaybe thread".into()) .spawn(|| { // thread code - let thread_result = callmemaybe::thread_main(conf_cloned, rx); + let thread_result = callmemaybe::thread_main(conf_cloned, callmemaybe_rx); if let Err(e) = thread_result { error!("callmemaybe thread terminated: {}", e); } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index c9ae1a8d98..d9c60c9db0 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -60,7 +60,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // lock is held. for zttid in GlobalTimelines::get_active_timelines() { if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { - let sk_info = tli.get_public_info()?; + let sk_info = tli.get_public_info(&conf)?; let put_opts = PutOptions::new().with_lease(lease.id()); client .put( diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 7d86523b0e..9af78661f9 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -21,9 +21,6 @@ use utils::{ zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; -use crate::callmemaybe::CallmeEvent; -use tokio::sync::mpsc::UnboundedSender; - /// Safekeeper handler of postgres commands pub struct SafekeeperPostgresHandler { pub conf: SafeKeeperConf, @@ -33,8 +30,6 @@ pub struct SafekeeperPostgresHandler { pub ztimelineid: Option, pub timeline: Option>, pageserver_connstr: Option, - //sender to communicate with callmemaybe thread - pub tx: UnboundedSender, } /// Parsed Postgres command. @@ -140,7 +135,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { } impl SafekeeperPostgresHandler { - pub fn new(conf: SafeKeeperConf, tx: UnboundedSender) -> Self { + pub fn new(conf: SafeKeeperConf) -> Self { SafekeeperPostgresHandler { conf, appname: None, @@ -148,7 +143,6 @@ impl SafekeeperPostgresHandler { ztimelineid: None, timeline: None, pageserver_connstr: None, - tx, } } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index e731db5617..62fbd2ff2f 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -3,19 +3,20 @@ use hyper::{Body, Request, Response, StatusCode}; use serde::Serialize; use serde::Serializer; +use std::collections::HashMap; use std::fmt::Display; use std::sync::Arc; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; -use crate::timeline::GlobalTimelines; +use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; use crate::SafeKeeperConf; use utils::{ http::{ endpoint, error::ApiError, json::{json_request, json_response}, - request::parse_request_param, + request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, lsn::Lsn, @@ -130,6 +131,44 @@ async fn timeline_create_handler(mut request: Request) -> Result, +) -> Result, ApiError> { + let zttid = ZTenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + ensure_no_body(&mut request).await?; + json_response( + StatusCode::OK, + GlobalTimelines::delete_force(get_conf(&request), &zttid).map_err(ApiError::from_err)?, + ) +} + +/// Deactivates all timelines for the tenant and removes its data directory. +/// See `timeline_delete_force_handler`. 
+async fn tenant_delete_force_handler( + mut request: Request, +) -> Result, ApiError> { + let tenant_id = parse_request_param(&request, "tenant_id")?; + ensure_no_body(&mut request).await?; + json_response( + StatusCode::OK, + GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id) + .map_err(ApiError::from_err)? + .iter() + .map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp)) + .collect::>(), + ) +} + /// Used only in tests to hand craft required data. async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { let zttid = ZTenantTimelineId::new( @@ -155,6 +194,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder timeline_status_handler, ) .post("/v1/timeline", timeline_create_handler) + .delete( + "/v1/tenant/:tenant_id/timeline/:timeline_id", + timeline_delete_force_handler, + ) + .delete("/v1/tenant/:tenant_id", tenant_delete_force_handler) // for tests .post( "/v1/record_safekeeper_info/:tenant_id/:timeline_id", diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index c848de9e71..09b2e68a49 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use std::time::Duration; use url::Url; -use utils::zid::{ZNodeId, ZTenantTimelineId}; +use utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId}; pub mod broker; pub mod callmemaybe; @@ -31,7 +31,7 @@ pub mod defaults { pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); - pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(1); + pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10); } #[derive(Debug, Clone)] @@ -57,9 +57,12 @@ pub struct SafeKeeperConf { } impl SafeKeeperConf { + pub fn tenant_dir(&self, tenant_id: &ZTenantId) -> PathBuf { + self.workdir.join(tenant_id.to_string()) + } + pub fn timeline_dir(&self, zttid: &ZTenantTimelineId) -> PathBuf { - self.workdir - .join(zttid.tenant_id.to_string()) + self.tenant_dir(&zttid.tenant_id) .join(zttid.timeline_id.to_string()) } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 3ad99ab0df..0ef335c9ed 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -5,7 +5,6 @@ use anyhow::{anyhow, bail, Result}; use bytes::BytesMut; -use tokio::sync::mpsc::UnboundedSender; use tracing::*; use crate::timeline::Timeline; @@ -28,8 +27,6 @@ use utils::{ sock_split::ReadStream, }; -use crate::callmemaybe::CallmeEvent; - pub struct ReceiveWalConn<'pg> { /// Postgres connection pg_backend: &'pg mut PostgresBackend, @@ -91,10 +88,9 @@ impl<'pg> ReceiveWalConn<'pg> { // Register the connection and defer unregister. 
spg.timeline .get() - .on_compute_connect(self.pageserver_connstr.as_ref(), &spg.tx)?; + .on_compute_connect(self.pageserver_connstr.as_ref())?; let _guard = ComputeConnectionGuard { timeline: Arc::clone(spg.timeline.get()), - callmemaybe_tx: spg.tx.clone(), }; let mut next_msg = Some(next_msg); @@ -194,13 +190,10 @@ impl ProposerPollStream { struct ComputeConnectionGuard { timeline: Arc, - callmemaybe_tx: UnboundedSender, } impl Drop for ComputeConnectionGuard { fn drop(&mut self) { - self.timeline - .on_compute_disconnect(&self.callmemaybe_tx) - .unwrap(); + self.timeline.on_compute_disconnect().unwrap(); } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 960f70d154..d52dd6ea57 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -264,13 +264,13 @@ impl ReplicationConn { } else { let pageserver_connstr = pageserver_connstr.expect("there should be a pageserver connection string since this is not a wal_proposer_recovery"); let zttid = spg.timeline.get().zttid; - let tx_clone = spg.tx.clone(); + let tx_clone = spg.timeline.get().callmemaybe_tx.clone(); let subscription_key = SubscriptionStateKey::new( zttid.tenant_id, zttid.timeline_id, pageserver_connstr.clone(), ); - spg.tx + tx_clone .send(CallmeEvent::Pause(subscription_key)) .unwrap_or_else(|e| { error!("failed to send Pause request to callmemaybe thread {}", e); @@ -315,7 +315,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().check_deactivate(replica_id, &spg.tx)? { + if spg.timeline.get().check_deactivate(replica_id)? { // Shut down, timeline is suspended. // TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 8b1072a54b..84ad53d72d 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -7,6 +7,8 @@ use etcd_broker::SkTimelineInfo; use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; +use serde::Serialize; + use std::cmp::{max, min}; use std::collections::HashMap; use std::fs::{self}; @@ -19,7 +21,7 @@ use tracing::*; use utils::{ lsn::Lsn, pq_proto::ZenithFeedback, - zid::{ZNodeId, ZTenantTimelineId}, + zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, }; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; @@ -89,7 +91,6 @@ struct SharedState { active: bool, num_computes: u32, pageserver_connstr: Option, - listen_pg_addr: String, last_removed_segno: XLogSegNo, } @@ -112,7 +113,6 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, - listen_pg_addr: conf.listen_pg_addr.clone(), last_removed_segno: 0, }) } @@ -132,7 +132,6 @@ impl SharedState { active: false, num_computes: 0, pageserver_connstr: None, - listen_pg_addr: conf.listen_pg_addr.clone(), last_removed_segno: 0, }) } @@ -278,15 +277,21 @@ impl SharedState { /// Database instance (tenant) pub struct Timeline { pub zttid: ZTenantTimelineId, + pub callmemaybe_tx: UnboundedSender, mutex: Mutex, /// conditional variable used to notify wal senders cond: Condvar, } impl Timeline { - fn new(zttid: ZTenantTimelineId, shared_state: SharedState) -> Timeline { + fn new( + zttid: ZTenantTimelineId, + callmemaybe_tx: UnboundedSender, + shared_state: SharedState, + ) -> Timeline { Timeline { zttid, + callmemaybe_tx, mutex: Mutex::new(shared_state), cond: Condvar::new(), } @@ -295,34 +300,27 @@ impl Timeline { /// Register compute connection, starting timeline-related 
activity if it is /// not running yet. /// Can fail only if channel to a static thread got closed, which is not normal at all. - pub fn on_compute_connect( - &self, - pageserver_connstr: Option<&String>, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { + pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); shared_state.num_computes += 1; // FIXME: currently we always adopt latest pageserver connstr, but we // should have kind of generations assigned by compute to distinguish // the latest one or even pass it through consensus to reliably deliver // to all safekeepers. - shared_state.activate(&self.zttid, pageserver_connstr, callmemaybe_tx)?; + shared_state.activate(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; Ok(()) } /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. /// Can fail only if channel to a static thread got closed, which is not normal at all. - pub fn on_compute_disconnect( - &self, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { + pub fn on_compute_disconnect(&self) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); shared_state.num_computes -= 1; // If there is no pageserver, can suspend right away; otherwise let // walsender do that. if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() { - shared_state.deactivate(&self.zttid, callmemaybe_tx)?; + shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; } Ok(()) } @@ -330,11 +328,7 @@ impl Timeline { /// Deactivate tenant if there is no computes and pageserver is caughtup, /// assuming the pageserver status is in replica_id. /// Returns true if deactivated. - pub fn check_deactivate( - &self, - replica_id: usize, - callmemaybe_tx: &UnboundedSender, - ) -> Result { + pub fn check_deactivate(&self, replica_id: usize) -> Result { let mut shared_state = self.mutex.lock().unwrap(); if !shared_state.active { // already suspended @@ -346,13 +340,27 @@ impl Timeline { (replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn); if deactivate { - shared_state.deactivate(&self.zttid, callmemaybe_tx)?; + shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; return Ok(true); } } Ok(false) } + /// Deactivates the timeline, assuming it is being deleted. + /// Returns whether the timeline was already active. + /// + /// The callmemaybe thread is stopped by the deactivation message. We assume all other threads + /// will stop by themselves eventually (possibly with errors, but no panics). There should be no + /// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but + /// we're deleting the timeline anyway. + pub fn deactivate_for_delete(&self) -> Result { + let mut shared_state = self.mutex.lock().unwrap(); + let was_active = shared_state.active; + shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + Ok(was_active) + } + fn is_active(&self) -> bool { let shared_state = self.mutex.lock().unwrap(); shared_state.active @@ -421,7 +429,7 @@ impl Timeline { } /// Prepare public safekeeper info for reporting. 
- pub fn get_public_info(&self) -> anyhow::Result { + pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result { let shared_state = self.mutex.lock().unwrap(); Ok(SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), @@ -435,18 +443,7 @@ impl Timeline { shared_state.sk.inmem.remote_consistent_lsn, )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), - wal_stream_connection_string: shared_state - .pageserver_connstr - .as_deref() - .map(|pageserver_connstr| { - wal_stream_connection_string( - self.zttid, - &shared_state.listen_pg_addr, - pageserver_connstr, - ) - }) - .transpose() - .context("Failed to get the pageserver callmemaybe connstr")?, + safekeeper_connection_string: Some(conf.listen_pg_addr.clone()), }) } @@ -504,29 +501,6 @@ impl Timeline { } } -// pageserver connstr is needed to be able to distinguish between different pageservers -// it is required to correctly manage callmemaybe subscriptions when more than one pageserver is involved -// TODO it is better to use some sort of a unique id instead of connection string, see https://github.com/zenithdb/zenith/issues/1105 -fn wal_stream_connection_string( - ZTenantTimelineId { - tenant_id, - timeline_id, - }: ZTenantTimelineId, - listen_pg_addr_str: &str, - pageserver_connstr: &str, -) -> anyhow::Result { - let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); - let me_conf = me_connstr - .parse::() - .with_context(|| { - format!("Failed to parse pageserver connection string '{me_connstr}' as a postgres one") - })?; - let (host, port) = utils::connstring::connection_host_port(&me_conf); - Ok(format!( - "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id} pageserver_connstr={pageserver_connstr}'", - )) -} - // Utilities needed by various Connection-like objects pub trait TimelineTools { fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; @@ -545,22 +519,41 @@ impl TimelineTools for Option> { } } +struct GlobalTimelinesState { + timelines: HashMap>, + callmemaybe_tx: Option>, +} + lazy_static! { - pub static ref TIMELINES: Mutex>> = - Mutex::new(HashMap::new()); + static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { + timelines: HashMap::new(), + callmemaybe_tx: None + }); +} + +#[derive(Clone, Copy, Serialize)] +pub struct TimelineDeleteForceResult { + pub dir_existed: bool, + pub was_active: bool, } /// A zero-sized struct used to manage access to the global timelines map. 
 pub struct GlobalTimelines;

 impl GlobalTimelines {
+    pub fn set_callmemaybe_tx(callmemaybe_tx: UnboundedSender<CallmeEvent>) {
+        let mut state = TIMELINES_STATE.lock().unwrap();
+        assert!(state.callmemaybe_tx.is_none());
+        state.callmemaybe_tx = Some(callmemaybe_tx);
+    }
+
     fn create_internal(
-        mut timelines: MutexGuard<HashMap<ZTenantTimelineId, Arc<Timeline>>>,
+        mut state: MutexGuard<GlobalTimelinesState>,
         conf: &SafeKeeperConf,
         zttid: ZTenantTimelineId,
         peer_ids: Vec<ZNodeId>,
     ) -> Result<Arc<Timeline>> {
-        match timelines.get(&zttid) {
+        match state.timelines.get(&zttid) {
             Some(_) => bail!("timeline {} already exists", zttid),
             None => {
                 // TODO: check directory existence
@@ -569,8 +562,12 @@ impl GlobalTimelines {
                 let shared_state = SharedState::create(conf, &zttid, peer_ids)
                     .context("failed to create shared state")?;

-                let new_tli = Arc::new(Timeline::new(zttid, shared_state));
-                timelines.insert(zttid, Arc::clone(&new_tli));
+                let new_tli = Arc::new(Timeline::new(
+                    zttid,
+                    state.callmemaybe_tx.as_ref().unwrap().clone(),
+                    shared_state,
+                ));
+                state.timelines.insert(zttid, Arc::clone(&new_tli));
                 Ok(new_tli)
             }
         }
@@ -581,20 +578,20 @@
         zttid: ZTenantTimelineId,
         peer_ids: Vec<ZNodeId>,
     ) -> Result<Arc<Timeline>> {
-        let timelines = TIMELINES.lock().unwrap();
-        GlobalTimelines::create_internal(timelines, conf, zttid, peer_ids)
+        let state = TIMELINES_STATE.lock().unwrap();
+        GlobalTimelines::create_internal(state, conf, zttid, peer_ids)
     }

-    /// Get a timeline with control file loaded from the global TIMELINES map.
+    /// Get a timeline with control file loaded from the global TIMELINES_STATE.timelines map.
     /// If control file doesn't exist and create=false, bails out.
     pub fn get(
         conf: &SafeKeeperConf,
         zttid: ZTenantTimelineId,
         create: bool,
     ) -> Result<Arc<Timeline>> {
-        let mut timelines = TIMELINES.lock().unwrap();
+        let mut state = TIMELINES_STATE.lock().unwrap();

-        match timelines.get(&zttid) {
+        match state.timelines.get(&zttid) {
             Some(result) => Ok(Arc::clone(result)),
             None => {
                 let shared_state =
@@ -610,20 +607,19 @@
                             .contains("No such file or directory")
                             && create
                         {
-                            return GlobalTimelines::create_internal(
-                                timelines,
-                                conf,
-                                zttid,
-                                vec![],
-                            );
+                            return GlobalTimelines::create_internal(state, conf, zttid, vec![]);
                         } else {
                             return Err(error);
                         }
                     }
                 };

-                let new_tli = Arc::new(Timeline::new(zttid, shared_state));
-                timelines.insert(zttid, Arc::clone(&new_tli));
+                let new_tli = Arc::new(Timeline::new(
+                    zttid,
+                    state.callmemaybe_tx.as_ref().unwrap().clone(),
+                    shared_state,
+                ));
+                state.timelines.insert(zttid, Arc::clone(&new_tli));
                 Ok(new_tli)
             }
         }
@@ -631,11 +627,86 @@ impl GlobalTimelines {
     /// Get ZTenantTimelineIDs of all active timelines.
     pub fn get_active_timelines() -> Vec<ZTenantTimelineId> {
-        let timelines = TIMELINES.lock().unwrap();
-        timelines
+        let state = TIMELINES_STATE.lock().unwrap();
+        state
+            .timelines
             .iter()
             .filter(|&(_, tli)| tli.is_active())
             .map(|(zttid, _)| *zttid)
             .collect()
     }
+
+    fn delete_force_internal(
+        conf: &SafeKeeperConf,
+        zttid: &ZTenantTimelineId,
+        was_active: bool,
+    ) -> Result<TimelineDeleteForceResult> {
+        match std::fs::remove_dir_all(conf.timeline_dir(zttid)) {
+            Ok(_) => Ok(TimelineDeleteForceResult {
+                dir_existed: true,
+                was_active,
+            }),
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(TimelineDeleteForceResult {
+                dir_existed: false,
+                was_active,
+            }),
+            Err(e) => Err(e.into()),
+        }
+    }
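For context, a sketch of the JSON each delete result serializes to (illustrative; assumes `serde` with the `derive` feature and `serde_json`, mirroring the `Serialize` derive added above):

```rust
// Illustrative sketch: the response body produced for one deleted timeline.
use serde::Serialize;

#[derive(Clone, Copy, Serialize)]
struct TimelineDeleteForceResult {
    dir_existed: bool,
    was_active: bool,
}

fn main() {
    let resp = TimelineDeleteForceResult {
        dir_existed: true,
        was_active: false,
    };
    // Prints: {"dir_existed":true,"was_active":false}
    println!("{}", serde_json::to_string(&resp).unwrap());
}
```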
+    /// Deactivates and deletes the timeline, see `Timeline::deactivate_for_delete()`, then deletes
+    /// the corresponding data directory.
+    /// We assume all timeline threads do not care about `GlobalTimelines` not containing the timeline
+    /// anymore, and they will eventually terminate without panics.
+    ///
+    /// There are multiple ways the timeline may be accidentally "re-created" (so we end up with two
+    /// `Timeline` objects in memory):
+    /// a) a compute node connects after this method is called, or
+    /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or
+    /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted.
+    /// TODO: ensure all of the above never happens.
+    pub fn delete_force(
+        conf: &SafeKeeperConf,
+        zttid: &ZTenantTimelineId,
+    ) -> Result<TimelineDeleteForceResult> {
+        info!("deleting timeline {}", zttid);
+        let was_active = match TIMELINES_STATE.lock().unwrap().timelines.remove(zttid) {
+            None => false,
+            Some(tli) => tli.deactivate_for_delete()?,
+        };
+        GlobalTimelines::delete_force_internal(conf, zttid, was_active)
+    }
+
+    /// Deactivates and deletes all timelines for the tenant, see `delete_force()`.
+    /// Returns a map of all timelines which the tenant had; `true` if a timeline was active.
+    pub fn delete_force_all_for_tenant(
+        conf: &SafeKeeperConf,
+        tenant_id: &ZTenantId,
+    ) -> Result<HashMap<ZTenantTimelineId, TimelineDeleteForceResult>> {
+        info!("deleting all timelines for tenant {}", tenant_id);
+        let mut state = TIMELINES_STATE.lock().unwrap();
+        let mut deleted = HashMap::new();
+        for (zttid, tli) in &state.timelines {
+            if zttid.tenant_id == *tenant_id {
+                deleted.insert(
+                    *zttid,
+                    GlobalTimelines::delete_force_internal(
+                        conf,
+                        zttid,
+                        tli.deactivate_for_delete()?,
+                    )?,
+                );
+            }
+        }
+        // TODO: test that the exact subset of timelines is removed.
+        state
+            .timelines
+            .retain(|zttid, _| !deleted.contains_key(zttid));
+        match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) {
+            Ok(_) => (),
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
+            e => e?,
+        };
+        Ok(deleted)
+    }
 }
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index 468ac28526..5980160788 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -8,29 +8,22 @@ use std::net::{TcpListener, TcpStream};
 use std::thread;
 use tracing::*;

-use crate::callmemaybe::CallmeEvent;
 use crate::handler::SafekeeperPostgresHandler;
 use crate::SafeKeeperConf;
-use tokio::sync::mpsc::UnboundedSender;
 use utils::postgres_backend::{AuthType, PostgresBackend};

 /// Accept incoming TCP connections and spawn them into a background thread.
-pub fn thread_main(
-    conf: SafeKeeperConf,
-    listener: TcpListener,
-    tx: UnboundedSender<CallmeEvent>,
-) -> Result<()> {
+pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
     loop {
         match listener.accept() {
             Ok((socket, peer_addr)) => {
                 debug!("accepted connection from {}", peer_addr);
                 let conf = conf.clone();
-                let tx_clone = tx.clone();
                 let _ = thread::Builder::new()
                     .name("WAL service thread".into())
                     .spawn(move || {
-                        if let Err(err) = handle_socket(socket, conf, tx_clone) {
+                        if let Err(err) = handle_socket(socket, conf) {
                             error!("connection handler exited: {}", err);
                         }
                     })
@@ -51,16 +44,12 @@ fn get_tid() -> u64 {

 /// This is run by `thread_main` above, inside a background thread.
/// -fn handle_socket( - socket: TcpStream, - conf: SafeKeeperConf, - tx: UnboundedSender, -) -> Result<()> { +fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> { let _enter = info_span!("", tid = ?get_tid()).entered(); socket.set_nodelay(true)?; - let mut conn_handler = SafekeeperPostgresHandler::new(conf, tx); + let mut conn_handler = SafekeeperPostgresHandler::new(conf); let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?; // libpq replication protocol between safekeeper and replicas/pagers pgbackend.run(&mut conn_handler)?; diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index d6b073492d..c07b9d6dd1 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -21,7 +21,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. - tenant = env.zenith_cli.create_tenant( + tenant, _ = env.zenith_cli.create_tenant( conf={ 'gc_period': '10 m', 'gc_horizon': '1048576', @@ -35,7 +35,6 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: pscur.execute("failpoints flush-frozen=sleep(10000)") - env.zenith_cli.create_timeline(f'main', tenant_id=tenant) pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() branch0_cur.execute("SHOW zenith.zenith_timeline") diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 4e2be352f4..fc84af5283 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -19,6 +19,8 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # # See https://github.com/zenithdb/zenith/issues/1068 zenith_env_builder.num_safekeepers = 1 + # Disable pitr, because here we want to test branch creation after GC + zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = zenith_env_builder.init_start() # Branch at the point where only 100 rows were inserted diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index e4e4aa9f4a..519a6dda1c 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -1,7 +1,7 @@ import asyncio import random -from fixtures.zenith_fixtures import ZenithEnv, Postgres +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres from fixtures.log_helper import log # Test configuration @@ -50,9 +50,12 @@ async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): # # (repro for https://github.com/zenithdb/zenith/issues/1047) # -def test_gc_aggressive(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_gc_aggressive", "empty") +def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder): + + # Disable pitr, because here we want to test branch creation after GC + zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch("test_gc_aggressive", "main") pg = env.postgres.create_start('test_gc_aggressive') log.info('postgres is running on test_gc_aggressive branch') diff --git 
a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py
index e7400cff96..cf7fe09b1e 100644
--- a/test_runner/batch_others/test_old_request_lsn.py
+++ b/test_runner/batch_others/test_old_request_lsn.py
@@ -1,5 +1,7 @@
-from fixtures.zenith_fixtures import ZenithEnv
+from fixtures.zenith_fixtures import ZenithEnvBuilder
 from fixtures.log_helper import log
+from fixtures.utils import print_gc_result
+import psycopg2.extras
 #
@@ -12,9 +14,11 @@ from fixtures.log_helper import log
 # just a hint that the page hasn't been modified since that LSN, and the page
 # server should return the latest page version regardless of the LSN.
 #
-def test_old_request_lsn(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
-    env.zenith_cli.create_branch("test_old_request_lsn", "empty")
+def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder):
+    # Disable pitr, because here we want to test branch creation after GC
+    zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
+    env = zenith_env_builder.init_start()
+    env.zenith_cli.create_branch("test_old_request_lsn", "main")
     pg = env.postgres.create_start('test_old_request_lsn')
     log.info('postgres is running on test_old_request_lsn branch')
@@ -26,7 +30,7 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
     timeline = cur.fetchone()[0]
 
     psconn = env.pageserver.connect()
-    pscur = psconn.cursor()
+    pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor)
 
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
     # shared_buffers.
@@ -53,6 +57,9 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
     # garbage collections so that the page server will remove old page versions.
     for i in range(10):
         pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
+        row = pscur.fetchone()
+        print_gc_result(row)
+
         for j in range(100):
             cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')
diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py
new file mode 100644
index 0000000000..ee19bddfe8
--- /dev/null
+++ b/test_runner/batch_others/test_pitr_gc.py
@@ -0,0 +1,76 @@
+from contextlib import closing
+
+import psycopg2.extras
+from fixtures.log_helper import log
+from fixtures.utils import print_gc_result
+from fixtures.zenith_fixtures import ZenithEnvBuilder
+
+
+#
+# Check pitr_interval GC behavior.
+# Insert some data, run GC and create a branch in the past.
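+# The branch at the old LSN must still work, because pitr_interval makes GC retain that history.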
+#
+def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder):
+
+    zenith_env_builder.num_safekeepers = 1
+    # Set pitr interval such that we need to keep the data
+    zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}"
+
+    env = zenith_env_builder.init_start()
+    pgmain = env.postgres.create_start('main')
+    log.info("postgres is running on 'main' branch")
+
+    main_pg_conn = pgmain.connect()
+    main_cur = main_pg_conn.cursor()
+
+    main_cur.execute("SHOW zenith.zenith_timeline")
+    timeline = main_cur.fetchone()[0]
+
+    # Create table
+    main_cur.execute('CREATE TABLE foo (t text)')
+
+    for i in range(10000):
+        main_cur.execute('''
+            INSERT INTO foo
+                SELECT 'long string to consume some space';
+        ''')
+
+        if i == 99:
+            # keep some early LSN to test branch creation after GC
+            main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()')
+            res = main_cur.fetchone()
+            lsn_a = res[0]
+            xid_a = res[1]
+            log.info(f'LSN after 100 rows: {lsn_a} xid {xid_a}')
+
+    main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()')
+    res = main_cur.fetchone()
+    debug_lsn = res[0]
+    debug_xid = res[1]
+    log.info(f'LSN after 10000 rows: {debug_lsn} xid {debug_xid}')
+
+    # run GC
+    with closing(env.pageserver.connect()) as psconn:
+        with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
+            pscur.execute(f"compact {env.initial_tenant.hex} {timeline}")
+            # Perform aggressive GC. The data should still be kept because of the PITR setting.
+            pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
+            row = pscur.fetchone()
+            print_gc_result(row)
+
+    # Branch at the point where only 100 rows were inserted.
+    # It must have been preserved by the PITR setting.
+    env.zenith_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a)
+
+    pg_hundred = env.postgres.create_start('test_pitr_gc_hundred')
+
+    # On the 'hundred' branch, we should see only 100 rows
+    hundred_pg_conn = pg_hundred.connect()
+    hundred_cur = hundred_pg_conn.cursor()
+    hundred_cur.execute('SELECT count(*) FROM foo')
+    assert hundred_cur.fetchone() == (100, )
+
+    # All the rows are visible on the main branch
+    main_cur.execute('SELECT count(*) FROM foo')
+    assert main_cur.fetchone() == (10000, )
diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py
index b85a541f10..d627d8a6ee 100644
--- a/test_runner/batch_others/test_tenant_conf.py
+++ b/test_runner/batch_others/test_tenant_conf.py
@@ -16,7 +16,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}'''
     env = zenith_env_builder.init_start()
 
     """Test per tenant configuration"""
-    tenant = env.zenith_cli.create_tenant(conf={
+    tenant, _ = env.zenith_cli.create_tenant(conf={
         'checkpoint_distance': '20000',
         'gc_period': '30sec',
     })
diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py
index 7e71c0a157..279b3a0a25 100644
--- a/test_runner/batch_others/test_tenant_relocation.py
+++ b/test_runner/batch_others/test_tenant_relocation.py
@@ -95,6 +95,10 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve
     log.info('load thread stopped')
 
 
+@pytest.mark.skip(
+    reason=
+    "needs to replace the callmemaybe call with a better way to migrate timelines between pageservers"
+)
 @pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
 def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
                            port_distributor: PortDistributor,
@@ -107,7 +111,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
     # create folder for remote storage mock
     remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage'
 
-    tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
+    tenant, _ = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
     log.info("tenant to relocate %s", tenant)
 
     # attach does not download ancestor branches (should it?), just use root branch for now
diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py
index 682af8de49..1b593cfee3 100644
--- a/test_runner/batch_others/test_tenants.py
+++ b/test_runner/batch_others/test_tenants.py
@@ -12,8 +12,8 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep
     env = zenith_env_builder.init_start()
 
     """Tests tenants with and without wal acceptors"""
-    tenant_1 = env.zenith_cli.create_tenant()
-    tenant_2 = env.zenith_cli.create_tenant()
+    tenant_1, _ = env.zenith_cli.create_tenant()
+    tenant_2, _ = env.zenith_cli.create_tenant()
 
     env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}',
                                    tenant_id=tenant_1)
diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py
index 702c27a79b..67c9d6070e 100644
--- a/test_runner/batch_others/test_wal_acceptor.py
+++ b/test_runner/batch_others/test_wal_acceptor.py
@@ -850,3 +850,118 @@ def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder):
 
     # there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
     assert wal_size_after_checkpoint < 16 * 2.5
+
+
+def test_delete_force(zenith_env_builder: ZenithEnvBuilder):
+    zenith_env_builder.num_safekeepers = 1
+    env = zenith_env_builder.init_start()
+
+    # Create two tenants: one will be deleted, the other should be preserved.
+    tenant_id = env.initial_tenant.hex
+    timeline_id_1 = env.zenith_cli.create_branch('br1').hex  # Active, delete explicitly
+    timeline_id_2 = env.zenith_cli.create_branch('br2').hex  # Inactive, delete explicitly
+    timeline_id_3 = env.zenith_cli.create_branch('br3').hex  # Active, delete with the tenant
+    timeline_id_4 = env.zenith_cli.create_branch('br4').hex  # Inactive, delete with the tenant
+
+    tenant_id_other_uuid, timeline_id_other_uuid = env.zenith_cli.create_tenant()
+    tenant_id_other = tenant_id_other_uuid.hex
+    timeline_id_other = timeline_id_other_uuid.hex
+
+    # Populate branches
+    pg_1 = env.postgres.create_start('br1')
+    pg_2 = env.postgres.create_start('br2')
+    pg_3 = env.postgres.create_start('br3')
+    pg_4 = env.postgres.create_start('br4')
+    pg_other = env.postgres.create_start('main', tenant_id=uuid.UUID(hex=tenant_id_other))
+    for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]:
+        with closing(pg.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute('CREATE TABLE t(key int primary key)')
+    sk = env.safekeepers[0]
+    sk_data_dir = Path(sk.data_dir())
+    sk_http = sk.http_client()
+    assert (sk_data_dir / tenant_id / timeline_id_1).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
+    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
+
+    # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state.
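+    # After the restart, br2 and br4 exist only on disk, so deletion is exercised for both active and inactive timelines.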
+    pg_2.stop_and_destroy()
+    pg_4.stop_and_destroy()
+    sk.stop()
+    sk.start()
+
+    # Ensure connections to the Safekeeper are re-established
+    for pg in [pg_1, pg_3, pg_other]:
+        with closing(pg.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute('INSERT INTO t (key) VALUES (1)')
+
+    # Remove initial tenant's br1 (active)
+    assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == {
+        "dir_existed": True,
+        "was_active": True,
+    }
+    assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
+    assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
+    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
+
+    # Ensure repeated deletion succeeds
+    assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == {
+        "dir_existed": False, "was_active": False
+    }
+    assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
+    assert (sk_data_dir / tenant_id / timeline_id_2).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
+    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
+
+    # Remove initial tenant's br2 (inactive)
+    assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == {
+        "dir_existed": True,
+        "was_active": False,
+    }
+    assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
+    assert not (sk_data_dir / tenant_id / timeline_id_2).exists()
+    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
+    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
+
+    # Removing a non-existent branch should also succeed
+    assert sk_http.timeline_delete_force(tenant_id, '00' * 16) == {
+        "dir_existed": False,
+        "was_active": False,
+    }
+    assert not (sk_data_dir / tenant_id / timeline_id_1).exists()
+    assert not (sk_data_dir / tenant_id / timeline_id_2).exists()
+    assert (sk_data_dir / tenant_id / timeline_id_3).is_dir()
+    assert (sk_data_dir / tenant_id / timeline_id_4).is_dir()
+    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
+
+    # Remove the initial tenant fully (only br3 is still active)
+    response = sk_http.tenant_delete_force(tenant_id)
+    assert response == {
+        timeline_id_3: {
+            "dir_existed": True,
+            "was_active": True,
+        }
+    }
+    assert not (sk_data_dir / tenant_id).exists()
+    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
+
+    # Remove the initial tenant again.
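+    # The second call must be a no-op: it returns an empty map and leaves the other tenant untouched.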
+    response = sk_http.tenant_delete_force(tenant_id)
+    assert response == {}
+    assert not (sk_data_dir / tenant_id).exists()
+    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
+
+    # Ensure the other tenant still works
+    sk_http.timeline_status(tenant_id_other, timeline_id_other)
+    with closing(pg_other.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute('INSERT INTO t (key) VALUES (123)')
diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py
index 091d9ac8ba..bff17fa679 100644
--- a/test_runner/batch_others/test_zenith_cli.py
+++ b/test_runner/batch_others/test_zenith_cli.py
@@ -1,7 +1,7 @@
 import uuid
 import requests
 
-from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
+from fixtures.zenith_fixtures import DEFAULT_BRANCH_NAME, ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
 from typing import cast
@@ -64,13 +64,13 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
     helper_compare_tenant_list(pageserver_http_client, env)
 
     # Create new tenant
-    tenant1 = env.zenith_cli.create_tenant()
+    tenant1, _ = env.zenith_cli.create_tenant()
 
     # check tenant1 appeared
     helper_compare_tenant_list(pageserver_http_client, env)
 
     # Create new tenant
-    tenant2 = env.zenith_cli.create_tenant()
+    tenant2, _ = env.zenith_cli.create_tenant()
 
     # check tenant2 appeared
     helper_compare_tenant_list(pageserver_http_client, env)
@@ -83,6 +83,16 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
     assert tenant2.hex in tenants
 
 
+def test_cli_tenant_create(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    tenant_id, _ = env.zenith_cli.create_tenant()
+    timelines = env.zenith_cli.list_timelines(tenant_id)
+
+    # an initial timeline should be created upon tenant creation
+    assert len(timelines) == 1
+    assert timelines[0][0] == DEFAULT_BRANCH_NAME
+
+
 def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder):
     # Start with single sk
     zenith_env_builder.num_safekeepers = 1
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index d70f57aa52..d572901ed1 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -106,9 +106,9 @@ class ZenithCompare(PgCompare):
                                  report=MetricReport.LOWER_IS_BETTER)
 
         total_files = self.zenbenchmark.get_int_counter_value(
-            self.env.pageserver, "pageserver_num_persistent_files_created")
+            self.env.pageserver, "pageserver_created_persistent_files_total")
         total_bytes = self.zenbenchmark.get_int_counter_value(
-            self.env.pageserver, "pageserver_persistent_bytes_written")
+            self.env.pageserver, "pageserver_written_persistent_bytes_total")
 
         self.zenbenchmark.record("data_uploaded",
                                  total_bytes / (1024 * 1024),
                                  "MB",
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 98af511036..7b95e729d9 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -75,7 +75,8 @@ def lsn_from_hex(lsn_hex: str) -> int:
 def print_gc_result(row):
     log.info("GC duration {elapsed} ms".format_map(row))
     log.info(
-        " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
+        " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr},"
+        " needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
         .format_map(row))
diff --git 
a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 3bb7c606d3..357db4c16d 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -831,20 +831,25 @@ class ZenithCli: def create_tenant(self, tenant_id: Optional[uuid.UUID] = None, - conf: Optional[Dict[str, str]] = None) -> uuid.UUID: + timeline_id: Optional[uuid.UUID] = None, + conf: Optional[Dict[str, str]] = None) -> Tuple[uuid.UUID, uuid.UUID]: """ Creates a new tenant, returns its id and its initial timeline's id. """ if tenant_id is None: tenant_id = uuid.uuid4() + if timeline_id is None: + timeline_id = uuid.uuid4() if conf is None: - res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex]) + res = self.raw_cli([ + 'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex + ]) else: - res = self.raw_cli( - ['tenant', 'create', '--tenant-id', tenant_id.hex] + - sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) + res = self.raw_cli([ + 'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex + ] + sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) res.check_returncode() - return tenant_id + return tenant_id, timeline_id def config_tenant(self, tenant_id: uuid.UUID, conf: Dict[str, str]): """ @@ -1795,6 +1800,21 @@ class SafekeeperHttpClient(requests.Session): json=body) res.raise_for_status() + def timeline_delete_force(self, tenant_id: str, timeline_id: str) -> Dict[Any, Any]: + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_delete_force(self, tenant_id: str) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def get_metrics(self) -> SafekeeperMetrics: request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 4e73bedcc0..3b57ac73cc 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -18,7 +18,6 @@ from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare def test_bulk_insert(zenith_with_baseline: PgCompare): env = zenith_with_baseline - # Get the timeline ID of our branch. 
We need it for the 'do_gc' command with closing(env.pg.connect()) as conn: with conn.cursor() as cur: cur.execute("create table huge (i int, j int);") diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index f0729d3a07..0e16d3e749 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -30,7 +30,7 @@ def test_bulk_tenant_create( for i in range(tenants_count): start = timeit.default_timer() - tenant = env.zenith_cli.create_tenant() + tenant, _ = env.zenith_cli.create_tenant() env.zenith_cli.create_timeline( f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index ba9eabcd97..205388bd90 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -8,7 +8,6 @@ from fixtures.log_helper import log import psycopg2.extras import random import time -from fixtures.utils import print_gc_result # This is a clear-box test that demonstrates the worst case scenario for the diff --git a/vendor/postgres b/vendor/postgres index 9a9459a7f9..5a595fdcce 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 9a9459a7f9cbcaa0e35ff1f2f34c419238fdec7e +Subproject commit 5a595fdcceca7f9a43398a8ed062896642cfa1c7
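A usage note on the API changes above: create_tenant now returns both the tenant id and the id of the initial timeline, and the safekeeper exposes idempotent HTTP DELETE endpoints for timelines and tenants. Below is a minimal sketch of driving them end to end. The function name sketch_safekeeper_delete is hypothetical and not part of this patch; everything else reuses fixtures from the diff above, and the expected responses mirror the assertions in test_delete_force.

from contextlib import closing

from fixtures.zenith_fixtures import ZenithEnvBuilder


def sketch_safekeeper_delete(zenith_env_builder: ZenithEnvBuilder):
    zenith_env_builder.num_safekeepers = 1
    env = zenith_env_builder.init_start()

    # The initial timeline is created together with the tenant,
    # so both ids come back from a single call.
    tenant_id, timeline_id = env.zenith_cli.create_tenant()

    # Start a compute so the safekeeper materializes the timeline on disk
    # and considers it active.
    pg = env.postgres.create_start('main', tenant_id=tenant_id)
    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute('CREATE TABLE t(key int primary key)')

    sk_http = env.safekeepers[0].http_client()

    # The first deletion reports the state it found; repeating it is a no-op.
    assert sk_http.timeline_delete_force(tenant_id.hex, timeline_id.hex) == {
        "dir_existed": True,
        "was_active": True,
    }
    assert sk_http.timeline_delete_force(tenant_id.hex, timeline_id.hex) == {
        "dir_existed": False,
        "was_active": False,
    }

    # Tenant-level deletion behaves the same way: the second call
    # returns an empty map.
    sk_http.tenant_delete_force(tenant_id.hex)
    assert sk_http.tenant_delete_force(tenant_id.hex) == {}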