From d3f83eda52a1f4e372f9149ffc8b824ef3478a25 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 00:07:14 +0300 Subject: [PATCH 01/33] Use regular agent for triggering e2e tests --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1387514cc2..bf9de7d857 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -412,7 +412,7 @@ jobs: trigger-e2e-tests: runs-on: dev container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init needs: [ build-neon ] steps: From c9e7c2f014a2d6ce269ccb7943a22d778378e512 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 7 Sep 2022 17:03:20 +0300 Subject: [PATCH 02/33] Ensure all temporary and empty directories and files are cleansed on pageserver startup --- libs/remote_storage/src/lib.rs | 7 + libs/remote_storage/src/local_fs.rs | 5 +- pageserver/src/http/routes.rs | 5 +- pageserver/src/layered_repository.rs | 105 ++-- .../src/layered_repository/delta_layer.rs | 7 +- .../src/layered_repository/image_layer.rs | 4 +- pageserver/src/lib.rs | 79 +++ pageserver/src/storage_sync.rs | 314 +++--------- pageserver/src/storage_sync/download.rs | 5 +- pageserver/src/tenant_mgr.rs | 480 +++++++++++++----- pageserver/src/tenant_tasks.rs | 10 - pageserver/src/timelines.rs | 21 +- pageserver/src/walredo.rs | 25 +- test_runner/regress/test_broken_timeline.py | 45 +- 14 files changed, 639 insertions(+), 473 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e89f60de7e..6b3fd29a0e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -344,6 +344,8 @@ impl Debug for S3Config { } } +/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension, 
+/// or if there's no extension, creates one and puts a suffix there. pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { let new_extension = match original_path .as_ref() @@ -468,6 +470,11 @@ mod tests { &path_with_suffix_extension(&p, ".temp").to_string_lossy(), "/foo/bar.baz..temp" ); + let p = PathBuf::from("/foo/bar/dir/"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar/dir..temp" + ); } #[test] diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 2561c0ca24..3ffbf3cb39 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -21,6 +21,8 @@ use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId} use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; + /// Convert a Path in the remote storage into a RemoteObjectId fn remote_object_id_from_path(path: &Path) -> anyhow::Result { Ok(RemoteObjectId( @@ -143,7 +145,8 @@ impl RemoteStorage for LocalFs { // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. 
This was really hit when pageserver shutdown // cancelled the upload and partial file was left on the fs - let temp_file_path = path_with_suffix_extension(&target_file_path, "temp"); + let temp_file_path = + path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX); let mut destination = io::BufWriter::new( fs::OpenOptions::new() .write(true) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a31c2fd2a5..59142bd9b2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -470,7 +470,7 @@ async fn tenant_list_handler(request: Request) -> Result, A let response_data = tokio::task::spawn_blocking(move || { let _enter = info_span!("tenant_list").entered(); - crate::tenant_mgr::list_tenants(&remote_index) + crate::tenant_mgr::list_tenant_info(&remote_index) }) .await .map_err(ApiError::from_err)?; @@ -640,7 +640,8 @@ async fn tenant_config_handler(mut request: Request) -> Result Result { + let _guard = match self.file_lock.try_read() { + Ok(g) => g, + Err(_) => { + info!("File lock write acquired, shutting down GC"); + return Ok(GcResult::default()); + } + }; + let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -315,6 +323,14 @@ impl Repository { /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. pub fn compaction_iteration(&self) -> Result<()> { + let _guard = match self.file_lock.try_read() { + Ok(g) => g, + Err(_) => { + info!("File lock write acquired, shutting down compaction"); + return Ok(()); + } + }; + // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. 
We don't want to block everything else while the @@ -401,10 +417,10 @@ impl Repository { pub fn init_attach_timelines( &self, - timelines: Vec<(ZTimelineId, TimelineMetadata)>, + timelines: HashMap, ) -> anyhow::Result<()> { let sorted_timelines = if timelines.len() == 1 { - timelines + timelines.into_iter().collect() } else if !timelines.is_empty() { tree_sort_timelines(timelines)? } else { @@ -442,7 +458,7 @@ impl Repository { /// perform a topological sort, so that the parent of each timeline comes /// before the children. fn tree_sort_timelines( - timelines: Vec<(ZTimelineId, TimelineMetadata)>, + timelines: HashMap, ) -> Result> { let mut result = Vec::with_capacity(timelines.len()); @@ -567,13 +583,8 @@ impl Repository { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag) } - pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) -> Result<()> { - let mut tenant_conf = self.tenant_conf.write().unwrap(); - - tenant_conf.update(&new_tenant_conf); - - Repository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?; - Ok(()) + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) { + self.tenant_conf.write().unwrap().update(&new_tenant_conf); } fn initialize_new_timeline( @@ -648,32 +659,37 @@ impl Repository { tenant_id: ZTenantId, ) -> anyhow::Result { let target_config_path = TenantConf::path(conf, tenant_id); + let target_config_display = target_config_path.display(); - info!("load tenantconf from {}", target_config_path.display()); + info!("loading tenantconf from {target_config_display}"); // FIXME If the config file is not found, assume that we're attaching // a detached tenant and config is passed via attach command. 
// https://github.com/neondatabase/neon/issues/1555 if !target_config_path.exists() { - info!( - "tenant config not found in {}", - target_config_path.display() - ); - return Ok(Default::default()); + info!("tenant config not found in {target_config_display}"); + return Ok(TenantConfOpt::default()); } // load and parse file - let config = fs::read_to_string(target_config_path)?; + let config = fs::read_to_string(&target_config_path).with_context(|| { + format!("Failed to load config from path '{target_config_display}'") + })?; - let toml = config.parse::()?; + let toml = config.parse::().with_context(|| { + format!("Failed to parse config from file '{target_config_display}' as toml file") + })?; - let mut tenant_conf: TenantConfOpt = Default::default(); + let mut tenant_conf = TenantConfOpt::default(); for (key, item) in toml.iter() { match key { "tenant_config" => { - tenant_conf = PageServerConf::parse_toml_tenant_conf(item)?; + tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| { + format!("Failed to parse config from file '{target_config_display}' as pageserver config") + })?; } - _ => bail!("unrecognized pageserver option '{}'", key), + _ => bail!("config file {target_config_display} has unrecognized pageserver option '{key}'"), + } } @@ -888,26 +904,6 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { Ok(()) } -pub fn load_metadata( - conf: &'static PageServerConf, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, -) -> anyhow::Result { - let metadata_path = metadata_path(conf, timeline_id, tenant_id); - let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { - format!( - "Failed to read metadata bytes from path {}", - metadata_path.display() - ) - })?; - TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { - format!( - "Failed to parse metadata bytes from path {}", - metadata_path.display() - ) - }) -} - #[cfg(test)] pub mod repo_harness { use bytes::{Bytes, BytesMut}; @@ -925,6 
+921,7 @@ pub mod repo_harness { walredo::{WalRedoError, WalRedoManager}, }; + use super::metadata::metadata_path; use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; @@ -1030,7 +1027,7 @@ pub mod repo_harness { false, ); // populate repo with locally available timelines - let mut timelines_to_load = Vec::new(); + let mut timelines_to_load = HashMap::new(); for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines dir") { @@ -1042,7 +1039,7 @@ pub mod repo_harness { .to_string_lossy() .parse()?; let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; - timelines_to_load.push((timeline_id, timeline_metadata)); + timelines_to_load.insert(timeline_id, timeline_metadata); } repo.init_attach_timelines(timelines_to_load)?; @@ -1054,6 +1051,26 @@ pub mod repo_harness { } } + fn load_metadata( + conf: &'static PageServerConf, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + ) -> anyhow::Result { + let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { + format!( + "Failed to read metadata bytes from path {}", + metadata_path.display() + ) + })?; + TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { + format!( + "Failed to parse metadata bytes from path {}", + metadata_path.display() + ) + }) + } + // Mock WAL redo manager that doesn't do much pub struct TestRedoManager; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index ce5cb57745..af02f84bc0 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -34,7 +34,7 @@ use crate::layered_repository::storage_layer::{ use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; -use crate::walrecord; +use 
crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use rand::{distributions::Alphanumeric, Rng}; @@ -447,11 +447,12 @@ impl DeltaLayer { .collect(); conf.timeline_path(&timelineid, &tenantid).join(format!( - "{}-XXX__{:016X}-{:016X}.{}.temp", + "{}-XXX__{:016X}-{:016X}.{}.{}", key_start, u64::from(lsn_range.start), u64::from(lsn_range.end), - rand_string + rand_string, + TEMP_FILE_SUFFIX, )) } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index bb24553afd..4fe771bb3f 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -30,7 +30,7 @@ use crate::layered_repository::storage_layer::{ use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; -use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; +use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; @@ -255,7 +255,7 @@ impl ImageLayer { .collect(); conf.timeline_path(&timelineid, &tenantid) - .join(format!("{}.{}.temp", fname, rand_string)) + .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } /// diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 4731179e22..86bbf25b67 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -23,7 +23,10 @@ pub mod walreceiver; pub mod walrecord; pub mod walredo; +use std::collections::HashMap; + use tracing::info; +use utils::zid::{ZTenantId, ZTimelineId}; use crate::thread_mgr::ThreadKind; @@ -100,6 +103,50 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds } } +/// A newtype to store arbitrary data grouped by tenant and timeline ids. 
+/// One could use [`utils::zid::ZTenantTimelineId`] for grouping, but that would +/// not include the cases where a certain tenant has zero timelines. +/// This is sometimes important: a tenant could be registered during initial load from FS, +/// even if he has no timelines on disk. +#[derive(Debug)] +pub struct TenantTimelineValues(HashMap>); + +impl TenantTimelineValues { + fn new() -> Self { + Self(HashMap::new()) + } + + fn with_capacity(capacity: usize) -> Self { + Self(HashMap::with_capacity(capacity)) + } + + /// A convenience method to map certain values and omit some of them, if needed. + /// Tenants that won't have any timeline entries due to the filtering, will still be preserved + /// in the structure. + fn filter_map(self, map: F) -> TenantTimelineValues + where + F: Fn(T) -> Option, + { + let capacity = self.0.len(); + self.0.into_iter().fold( + TenantTimelineValues::::with_capacity(capacity), + |mut new_values, (tenant_id, old_values)| { + let new_timeline_values = new_values.0.entry(tenant_id).or_default(); + for (timeline_id, old_value) in old_values { + if let Some(new_value) = map(old_value) { + new_timeline_values.insert(timeline_id, new_value); + } + } + new_values + }, + ) + } +} + +/// A suffix to be used during file sync from the remote storage, +/// to ensure that we do not leave corrupted files that pretend to be layers. 
+const TEMP_FILE_SUFFIX: &str = "___temp"; + #[cfg(test)] mod backoff_defaults_tests { use super::*; @@ -130,3 +177,35 @@ mod backoff_defaults_tests { ); } } + +#[cfg(test)] +mod tests { + use crate::layered_repository::repo_harness::TIMELINE_ID; + + use super::*; + + #[test] + fn tenant_timeline_value_mapping() { + let first_tenant = ZTenantId::generate(); + let second_tenant = ZTenantId::generate(); + assert_ne!(first_tenant, second_tenant); + + let mut initial = TenantTimelineValues::new(); + initial + .0 + .entry(first_tenant) + .or_default() + .insert(TIMELINE_ID, "test_value"); + let _ = initial.0.entry(second_tenant).or_default(); + assert_eq!(initial.0.len(), 2, "Should have entries for both tenants"); + + let filtered = initial.filter_map(|_| None::<&str>).0; + assert_eq!( + filtered.len(), + 2, + "Should have entries for both tenants even after filtering away all entries" + ); + assert!(filtered.contains_key(&first_tenant)); + assert!(filtered.contains_key(&second_tenant)); + } +} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 42fd6b8ea8..57a964cb67 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -145,7 +145,6 @@ mod upload; use std::{ collections::{hash_map, HashMap, HashSet, VecDeque}, - ffi::OsStr, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, @@ -170,244 +169,56 @@ use self::{ index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; -use crate::metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}; use crate::{ config::PageServerConf, exponential_backoff, - layered_repository::{ - ephemeral_file::is_ephemeral_file, - metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, - }, - storage_sync::{self, index::RemoteIndex}, - tenant_mgr::attach_downloaded_tenants, + layered_repository::metadata::{metadata_path, TimelineMetadata}, + storage_sync::index::RemoteIndex, + 
tenant_mgr::attach_local_tenants, thread_mgr, thread_mgr::ThreadKind, }; +use crate::{ + metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, + TenantTimelineValues, +}; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; -pub use self::download::TEMP_DOWNLOAD_EXTENSION; static SYNC_QUEUE: OnceCell = OnceCell::new(); /// A timeline status to share with pageserver's sync counterpart, /// after comparing local and remote timeline state. -#[derive(Clone, Copy, Debug)] +#[derive(Clone)] pub enum LocalTimelineInitStatus { /// The timeline has every remote layer present locally. /// There could be some layers requiring uploading, /// but this does not block the timeline from any user interaction. - LocallyComplete, + LocallyComplete(TimelineMetadata), /// A timeline has some files remotely, that are not present locally and need downloading. /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, /// so the data needs to be downloaded first before the timeline can be used. NeedsSync, } -type LocalTimelineInitStatuses = HashMap>; +impl std::fmt::Debug for LocalTimelineInitStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::LocallyComplete(_) => write!(f, "LocallyComplete"), + Self::NeedsSync => write!(f, "NeedsSync"), + } + } +} /// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. /// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, /// to simplify the received code. 
pub struct SyncStartupData { pub remote_index: RemoteIndex, - pub local_timeline_init_statuses: LocalTimelineInitStatuses, -} - -/// Based on the config, initiates the remote storage connection and starts a separate thread -/// that ensures that pageserver and the remote storage are in sync with each other. -/// If no external configuration connection given, no thread or storage initialization is done. -/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. -pub fn start_local_timeline_sync( - config: &'static PageServerConf, - storage: Option, -) -> anyhow::Result { - let local_timeline_files = local_tenant_timeline_files(config) - .context("Failed to collect local tenant timeline files")?; - - match storage.zip(config.remote_storage_config.as_ref()) { - Some((storage, storage_config)) => storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - storage, - storage_config.max_concurrent_syncs, - storage_config.max_sync_errors, - ) - .context("Failed to spawn the storage sync thread"), - None => { - info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); - for ( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - _, - ) in local_timeline_files - { - local_timeline_init_statuses - .entry(tenant_id) - .or_default() - .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); - } - Ok(SyncStartupData { - local_timeline_init_statuses, - remote_index: RemoteIndex::default(), - }) - } - } -} - -fn local_tenant_timeline_files( - config: &'static PageServerConf, -) -> anyhow::Result)>> { - let mut local_tenant_timeline_files = HashMap::new(); - let tenants_dir = config.tenants_path(); - for tenants_dir_entry in std::fs::read_dir(&tenants_dir) - .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? 
- { - match &tenants_dir_entry { - Ok(tenants_dir_entry) => { - match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) { - Ok(collected_files) => { - local_tenant_timeline_files.extend(collected_files.into_iter()) - } - Err(e) => error!( - "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", - tenants_dir.display(), - tenants_dir_entry, - e - ), - } - } - Err(e) => error!( - "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", - tenants_dir_entry, - tenants_dir.display(), - e - ), - } - } - - Ok(local_tenant_timeline_files) -} - -fn collect_timelines_for_tenant( - config: &'static PageServerConf, - tenant_path: &Path, -) -> anyhow::Result)>> { - let mut timelines = HashMap::new(); - let tenant_id = tenant_path - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse tenant id out of the tenant dir name")?; - let timelines_dir = config.timelines_path(&tenant_id); - - for timelines_dir_entry in std::fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines dir entry for tenant {}", - tenant_id - ) - })? 
{ - match timelines_dir_entry { - Ok(timelines_dir_entry) => { - let timeline_path = timelines_dir_entry.path(); - match collect_timeline_files(&timeline_path) { - Ok((timeline_id, metadata, timeline_files)) => { - timelines.insert( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - (metadata, timeline_files), - ); - } - Err(e) => error!( - "Failed to process timeline dir contents at '{}', reason: {:?}", - timeline_path.display(), - e - ), - } - } - Err(e) => error!( - "Failed to list timelines for entry tenant {}, reason: {:?}", - tenant_id, e - ), - } - } - - Ok(timelines) -} - -// discover timeline files and extract timeline metadata -// NOTE: ephemeral files are excluded from the list -fn collect_timeline_files( - timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { - let mut timeline_files = HashSet::new(); - let mut timeline_metadata_path = None; - - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse timeline id out of the timeline dir name")?; - let timeline_dir_entries = - std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; - for entry in timeline_dir_entries { - let entry_path = entry.context("Failed to list timeline dir entry")?.path(); - if entry_path.is_file() { - if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { - timeline_metadata_path = Some(entry_path); - } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { - debug!("skipping ephemeral file {}", entry_path.display()); - continue; - } else if entry_path.extension().and_then(OsStr::to_str) - == Some(TEMP_DOWNLOAD_EXTENSION) - { - info!("removing temp download file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { - format!( - "failed to remove temp download file at {}", - entry_path.display() - ) - })?; - } else if 
entry_path.extension().and_then(OsStr::to_str) == Some("temp") { - info!("removing temp layer file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { - format!( - "failed to remove temp layer file at {}", - entry_path.display() - ) - })?; - } else { - timeline_files.insert(entry_path); - } - } - } - - // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed - // then attach is lost. There would be no retries for that, - // initial collect will fail because there is no metadata. - // We either need to start download if we see empty dir after restart or attach caller should - // be aware of that and retry attach if awaits_download for timeline switched from true to false - // but timelinne didn't appear locally. - // Check what happens with remote index in that case. - let timeline_metadata_path = match timeline_metadata_path { - Some(path) => path, - None => bail!("No metadata file found in the timeline directory"), - }; - let metadata = TimelineMetadata::from_bytes( - &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, - ) - .context("Failed to parse timeline metadata file bytes")?; - - Ok((timeline_id, metadata, timeline_files)) + pub local_timeline_init_statuses: TenantTimelineValues, } /// Global queue of sync tasks. @@ -763,9 +574,9 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
-pub(super) fn spawn_storage_sync_thread( +pub fn spawn_storage_sync_thread( conf: &'static PageServerConf, - local_timeline_files: HashMap)>, + local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet)>, storage: GenericRemoteStorage, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, @@ -784,19 +595,43 @@ pub(super) fn spawn_storage_sync_thread( .build() .context("Failed to create storage sync runtime")?; + // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time: + // * we need to list every timeline for tenant on S3, that might be a costly operation + // * we need to download every timeline for the tenant, to activate it in memory + // + // When on-demand download gets merged, we're able to do this fast by storing timeline metadata only. + let mut empty_tenants = TenantTimelineValues::::new(); + let mut keys_for_index_part_downloads = HashSet::new(); + let mut timelines_to_sync = HashMap::new(); + + for (tenant_id, timeline_data) in local_timeline_files.0 { + if timeline_data.is_empty() { + let _ = empty_tenants.0.entry(tenant_id).or_default(); + } else { + for (timeline_id, timeline_data) in timeline_data { + let id = ZTenantTimelineId::new(tenant_id, timeline_id); + keys_for_index_part_downloads.insert(id); + timelines_to_sync.insert(id, timeline_data); + } + } + } + let applicable_index_parts = runtime.block_on(download_index_parts( conf, &storage, - local_timeline_files.keys().copied().collect(), + keys_for_index_part_downloads, )); let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; - let local_timeline_init_statuses = schedule_first_sync_tasks( + let mut local_timeline_init_statuses = schedule_first_sync_tasks( &mut runtime.block_on(remote_index.write()), sync_queue, - local_timeline_files, + timelines_to_sync, ); + local_timeline_init_statuses + .0 + .extend(empty_tenants.0.into_iter()); let remote_index_clone = remote_index.clone(); 
thread_mgr::spawn( @@ -872,10 +707,7 @@ fn storage_sync_loop( "Sync loop step completed, {} new tenant state update(s)", updated_tenants.len() ); - let mut timelines_to_attach: HashMap< - ZTenantId, - Vec<(ZTimelineId, TimelineMetadata)>, - > = HashMap::new(); + let mut timelines_to_attach = TenantTimelineValues::new(); let index_accessor = runtime.block_on(index.read()); for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { @@ -901,7 +733,7 @@ fn storage_sync_loop( // and register them all at once in a repository for download // to be submitted in a single operation to repository // so it can apply them at once to internal timeline map. - timelines_to_attach.insert( + timelines_to_attach.0.insert( tenant_id, tenant_entry .iter() @@ -912,7 +744,9 @@ fn storage_sync_loop( } drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - attach_downloaded_tenants(conf, &index, timelines_to_attach); + if let Err(e) = attach_local_tenants(conf, &index, timelines_to_attach) { + error!("Failed to attach new timelines: {e:?}"); + }; } } ControlFlow::Break(()) => { @@ -1443,11 +1277,10 @@ fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, local_timeline_files: HashMap)>, -) -> LocalTimelineInitStatuses { - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); +) -> TenantTimelineValues { + let mut local_timeline_init_statuses = TenantTimelineValues::new(); - let mut new_sync_tasks = - VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len())); + let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len()); for (sync_id, (local_metadata, local_files)) in local_timeline_files { match index.timeline_entry_mut(&sync_id) { @@ -1459,18 +1292,27 @@ fn schedule_first_sync_tasks( local_files, remote_timeline, ); - let was_there = local_timeline_init_statuses 
+ match local_timeline_init_statuses + .0 .entry(sync_id.tenant_id) .or_default() - .insert(sync_id.timeline_id, timeline_status); - - if was_there.is_some() { - // defensive check - warn!( - "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", - sync_id.timeline_id - ); + .entry(sync_id.timeline_id) + { + hash_map::Entry::Occupied(mut o) => { + { + // defensive check + warn!( + "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", + sync_id.timeline_id + ); + } + o.insert(timeline_status); + } + hash_map::Entry::Vacant(v) => { + v.insert(timeline_status); + } } + remote_timeline.awaits_download = awaits_download; } None => { @@ -1481,15 +1323,16 @@ fn schedule_first_sync_tasks( SyncTask::upload(LayersUpload { layers_to_upload: local_files, uploaded_layers: HashSet::new(), - metadata: Some(local_metadata), + metadata: Some(local_metadata.clone()), }), )); local_timeline_init_statuses + .0 .entry(sync_id.tenant_id) .or_default() .insert( sync_id.timeline_id, - LocalTimelineInitStatus::LocallyComplete, + LocalTimelineInitStatus::LocallyComplete(local_metadata), ); } } @@ -1523,7 +1366,10 @@ fn compare_local_and_remote_timeline( // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { - (LocalTimelineInitStatus::LocallyComplete, false) + ( + LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()), + false, + ) }; let layers_to_upload = local_files diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index b0beb4219a..91ee557b79 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -18,6 +18,7 @@ use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, + TEMP_FILE_SUFFIX, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -26,8 +27,6 @@ 
use super::{ LayersDownload, SyncData, SyncQueue, }; -pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; - // We collect timelines remotely available for each tenant // in case we failed to gather all index parts (due to an error) // Poisoned variant is returned. @@ -251,7 +250,7 @@ pub(super) async fn download_timeline_layers<'a>( // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = - path_with_suffix_extension(&layer_destination_path, TEMP_DOWNLOAD_EXTENSION); + path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX); let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 041bd50737..baa58f5eb5 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,24 +3,26 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::metadata::TimelineMetadata; -use crate::layered_repository::{load_metadata, Repository, Timeline}; +use crate::layered_repository::ephemeral_file::is_ephemeral_file; +use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; +use crate::layered_repository::{Repository, Timeline}; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr::ThreadKind; use crate::walredo::PostgresRedoManager; -use crate::{thread_mgr, timelines, walreceiver}; +use crate::{thread_mgr, timelines, walreceiver, TenantTimelineValues, TEMP_FILE_SUFFIX}; use anyhow::Context; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; +use std::collections::hash_map::{self, Entry}; +use 
std::collections::{HashMap, HashSet}; +use std::ffi::OsStr; use std::fmt; +use std::path::{Path, PathBuf}; use std::sync::Arc; use tokio::sync::mpsc; use tracing::*; -use utils::lsn::Lsn; pub use tenants_state::try_send_timeline_update; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -136,34 +138,49 @@ pub fn init_tenant_mgr( conf: &'static PageServerConf, remote_storage: Option, ) -> anyhow::Result { + let _entered = info_span!("init_tenant_mgr").entered(); let (timeline_updates_sender, timeline_updates_receiver) = mpsc::unbounded_channel::(); tenants_state::set_timeline_update_sender(timeline_updates_sender)?; walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; - let SyncStartupData { - remote_index, - local_timeline_init_statuses, - } = storage_sync::start_local_timeline_sync(conf, remote_storage) - .context("Failed to set up local files sync with external storage")?; + let local_tenant_files = local_tenant_timeline_files(conf) + .context("Failed to collect local tenant timeline files")?; - for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { - if let Err(err) = - init_local_repository(conf, tenant_id, local_timeline_init_statuses, &remote_index) - { - // Report the error, but continue with the startup for other tenants. An error - // loading a tenant is serious, but it's better to complete the startup and - // serve other tenants, than fail completely. 
- error!("Failed to initialize local tenant {tenant_id}: {:?}", err); + let (remote_index, tenants_to_attach) = if let Some(storage) = remote_storage { + let storage_config = conf + .remote_storage_config + .as_ref() + .expect("remote storage without config"); - if let Err(err) = set_tenant_state(tenant_id, TenantState::Broken) { - error!( - "Failed to set tenant state to broken {tenant_id}: {:?}", - err - ); - } - } - } + let SyncStartupData { + remote_index, + local_timeline_init_statuses, + } = storage_sync::spawn_storage_sync_thread( + conf, + local_tenant_files, + storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + .context("Failed to spawn the storage sync thread")?; + + ( + remote_index, + local_timeline_init_statuses.filter_map(|init_status| match init_status { + LocalTimelineInitStatus::LocallyComplete(metadata) => Some(metadata), + LocalTimelineInitStatus::NeedsSync => None, + }), + ) + } else { + info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); + ( + RemoteIndex::default(), + local_tenant_files.filter_map(|(metadata, _)| Some(metadata)), + ) + }; + + attach_local_tenants(conf, &remote_index, tenants_to_attach)?; Ok(remote_index) } @@ -189,35 +206,69 @@ impl std::fmt::Debug for LocalTimelineUpdate { } } -/// Updates tenants' repositories, changing their timelines state in memory. -pub fn attach_downloaded_tenants( +/// Reads local files to load tenants and their timelines given into pageserver's memory. +/// Ignores other timelines that might be present for tenant, but were not passed as a parameter. +/// Attempts to load as many entities as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", +/// and the load continues.
+pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, - sync_status_updates: HashMap>, -) { - if sync_status_updates.is_empty() { - debug!("No sync status updates to apply"); - return; - } - for (tenant_id, downloaded_timelines) in sync_status_updates { - info!( - "Registering downlloaded timelines for {tenant_id} {} timelines", - downloaded_timelines.len() - ); - debug!("Downloaded timelines: {downloaded_timelines:?}"); + tenants_to_attach: TenantTimelineValues, +) -> anyhow::Result<()> { + let _entered = info_span!("attach_local_tenants").entered(); + let number_of_tenants = tenants_to_attach.0.len(); - let repo = match load_local_repo(conf, tenant_id, remote_index) { - Ok(repo) => repo, - Err(e) => { - error!("Failed to load repo for tenant {tenant_id} Error: {e:?}"); - continue; + for (tenant_id, local_timelines) in tenants_to_attach.0 { + info!( + "Attaching {} timelines for {tenant_id}", + local_timelines.len() + ); + debug!("Timelines to attach: {local_timelines:?}"); + + let repository = load_local_repo(conf, tenant_id, remote_index) + .context("Failed to load repository for tenant")?; + + let repo = Arc::clone(&repository); + { + match tenants_state::write_tenants().entry(tenant_id) { + hash_map::Entry::Occupied(_) => { + anyhow::bail!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + } + hash_map::Entry::Vacant(v) => { + v.insert(Tenant { + state: TenantState::Idle, + repo, + }); + } } - }; - match repo.init_attach_timelines(downloaded_timelines) { - Ok(()) => info!("successfully loaded local timelines for tenant {tenant_id}"), - Err(e) => error!("Failed to load local timelines for tenant {tenant_id}: {e:?}"), } + // XXX: current timeline init enables walreceiver that looks for tenant in the state, so insert the tenant entry before + repository + .init_attach_timelines(local_timelines) + .context("Failed to attach timelines for tenant")?; } + + info!("Processed {number_of_tenants} local 
tenants during attach"); + Ok(()) +} + +fn load_local_repo( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + remote_index: &RemoteIndex, +) -> anyhow::Result> { + let repository = Repository::new( + conf, + TenantConfOpt::default(), + Arc::new(PostgresRedoManager::new(conf, tenant_id)), + tenant_id, + remote_index.clone(), + conf.remote_storage_config.is_some(), + ); + let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; + repository.update_tenant_config(tenant_conf); + + Ok(Arc::new(repository)) } /// @@ -293,13 +344,14 @@ pub fn create_tenant_repository( } pub fn update_tenant_config( + conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - let repo = get_repository_for_tenant(tenant_id)?; + get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - repo.update_tenant_config(tenant_conf)?; + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; Ok(()) } @@ -392,7 +444,7 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow debug!("waiting for threads to shutdown"); thread_mgr::shutdown_threads(None, None, Some(timeline_id)); debug!("thread shutdown completed"); - match tenants_state::write_tenants().get_mut(&tenant_id) { + match tenants_state::read_tenants().get(&tenant_id) { Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), } @@ -428,12 +480,10 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any // need to use crossbeam-channel for (timeline_id, join_handle) in walreceiver_join_handles { info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); - join_handle.recv().context("failed to join walreceiver")?; + join_handle.recv().ok(); info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); } - tenants_state::write_tenants().remove(&tenant_id); - // 
If removal fails there will be no way to successfully retry detach, // because the tenant no longer exists in the in-memory map. And it needs to be removed from it // before we remove files, because it contains references to repository @@ -443,7 +493,7 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any let local_tenant_directory = conf.tenant_path(&tenant_id); std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( - "Failed to remove local timeline directory '{}'", + "Failed to remove local tenant directory '{}'", local_tenant_directory.display() ) })?; @@ -454,7 +504,7 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any /// /// Get list of tenants, for the mgmt API /// -pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { +pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { tenants_state::read_tenants() .iter() .map(|(id, tenant)| { @@ -478,98 +528,248 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { .collect() } -/// Check if a given timeline is "broken" \[1\]. -/// The function returns an error if the timeline is "broken". -/// -/// \[1\]: it's not clear now how should we classify a timeline as broken. -/// A timeline is categorized as broken when any of following conditions is true: -/// - failed to load the timeline's metadata -/// - the timeline's disk consistent LSN is zero -fn check_broken_timeline( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, -) -> anyhow::Result { - let metadata = - load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?; +/// Attempts to collect information about all tenant and timelines, existing on the local FS. +/// If finds any, deletes all temporary files and directories, created before. Also removes empty directories, +/// that may appear due to such removals. 
+/// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities. +fn local_tenant_timeline_files( + config: &'static PageServerConf, +) -> anyhow::Result)>> { + let _entered = info_span!("local_tenant_timeline_files").entered(); - // A timeline with zero disk consistent LSN can happen when the page server - // failed to checkpoint the timeline import data when creating that timeline. - if metadata.disk_consistent_lsn() == Lsn::INVALID { - anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN."); + let mut local_tenant_timeline_files = TenantTimelineValues::new(); + let tenants_dir = config.tenants_path(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + let tenant_dir_path = tenants_dir_entry.path(); + if is_temporary(&tenant_dir_path) { + info!( + "Found temporary tenant directory, removing: {}", + tenant_dir_path.display() + ); + if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { + error!( + "Failed to remove temporary directory '{}': {:?}", + tenant_dir_path.display(), + e + ); + } + } else { + match collect_timelines_for_tenant(config, &tenant_dir_path) { + Ok((tenant_id, collected_files)) => { + if collected_files.is_empty() { + match remove_if_empty(&tenant_dir_path) { + Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()), + Ok(false) => { + // insert empty timeline entry: it has some non-temporary files inside that we cannot remove + // so make obvious for HTTP API callers, that something exists there and try to load the tenant + let _ = local_tenant_timeline_files.0.entry(tenant_id).or_default(); + }, + Err(e) => error!("Failed to remove empty tenant directory: {e:?}"), + } + } else { + local_tenant_timeline_files.0.entry(tenant_id).or_default().extend(collected_files.into_iter()) + } + }, + 
Err(e) => error!( + "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", + tenants_dir.display(), + tenants_dir_entry, + e + ), + } + } + } + Err(e) => error!( + "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } } - Ok(metadata) + info!( + "Collected files for {} tenants", + local_tenant_timeline_files.0.len() + ); + Ok(local_tenant_timeline_files) } -/// Note: all timelines are attached at once if and only if all of them are locally complete -fn init_local_repository( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - local_timeline_init_statuses: HashMap, - remote_index: &RemoteIndex, -) -> anyhow::Result<(), anyhow::Error> { - let mut timelines_to_attach = Vec::new(); - for (timeline_id, init_status) in local_timeline_init_statuses { - match init_status { - LocalTimelineInitStatus::LocallyComplete => { - debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - let metadata = check_broken_timeline(conf, tenant_id, timeline_id) - .context("found broken timeline")?; - timelines_to_attach.push((timeline_id, metadata)); +fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result { + let directory_is_empty = tenant_dir_path + .read_dir() + .with_context(|| { + format!( + "Failed to read directory '{}' contents", + tenant_dir_path.display() + ) + })? 
+ .next() + .is_none(); + + if directory_is_empty { + std::fs::remove_dir_all(&tenant_dir_path).with_context(|| { + format!( + "Failed to remove empty directory '{}'", + tenant_dir_path.display(), + ) + })?; + + Ok(true) + } else { + Ok(false) + } +} + +fn is_temporary(path: &Path) -> bool { + match path.file_name() { + Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX), + None => false, + } +} + +#[allow(clippy::type_complexity)] +fn collect_timelines_for_tenant( + config: &'static PageServerConf, + tenant_path: &Path, +) -> anyhow::Result<( + ZTenantId, + HashMap)>, +)> { + let tenant_id = tenant_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse tenant id out of the tenant dir name")?; + let timelines_dir = config.timelines_path(&tenant_id); + + let mut tenant_timelines = HashMap::new(); + for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))? 
+ { + match timelines_dir_entry { + Ok(timelines_dir_entry) => { + let timeline_dir = timelines_dir_entry.path(); + if is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() + ); + if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else { + match collect_timeline_files(&timeline_dir) { + Ok((timeline_id, metadata, timeline_files)) => { + tenant_timelines.insert(timeline_id, (metadata, timeline_files)); + } + Err(e) => { + error!( + "Failed to process timeline dir contents at '{}', reason: {:?}", + timeline_dir.display(), + e + ); + match remove_if_empty(&timeline_dir) { + Ok(true) => info!( + "Removed empty timeline directory {}", + timeline_dir.display() + ), + Ok(false) => (), + Err(e) => { + error!("Failed to remove empty timeline directory: {e:?}") + } + } + } + } + } } - LocalTimelineInitStatus::NeedsSync => { - debug!( - "timeline {tenant_id} for tenant {timeline_id} needs sync, \ - so skipped for adding into repository until sync is finished" - ); - return Ok(()); + Err(e) => { + error!("Failed to list timelines for entry tenant {tenant_id}, reason: {e:?}") } } } - // initialize local tenant - let repo = load_local_repo(conf, tenant_id, remote_index) - .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; - - // Lets fail here loudly to be on the safe side. - // XXX: It may be a better api to actually distinguish between repository startup - // and processing of newly downloaded timelines. - repo.init_attach_timelines(timelines_to_attach) - .with_context(|| format!("Failed to init local timelines for tenant {tenant_id}"))?; - Ok(()) -} - -// Sets up wal redo manager and repository for tenant. Reduces code duplication. -// Used during pageserver startup, or when new tenant is attached to pageserver. 
-fn load_local_repo( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - remote_index: &RemoteIndex, -) -> anyhow::Result> { - let mut m = tenants_state::write_tenants(); - let tenant = m.entry(tenant_id).or_insert_with(|| { - // Set up a WAL redo manager, for applying WAL records. - let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); - - // Set up an object repository, for actual data storage. - let repo: Arc = Arc::new(Repository::new( - conf, - TenantConfOpt::default(), - Arc::new(walredo_mgr), - tenant_id, - remote_index.clone(), - conf.remote_storage_config.is_some(), - )); - Tenant { - state: TenantState::Idle, - repo, + if tenant_timelines.is_empty() { + match remove_if_empty(&timelines_dir) { + Ok(true) => info!( + "Removed empty tenant timelines directory {}", + timelines_dir.display() + ), + Ok(false) => (), + Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"), } - }); + } - // Restore tenant config - let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; - tenant.repo.update_tenant_config(tenant_conf)?; - - Ok(Arc::clone(&tenant.repo)) + Ok((tenant_id, tenant_timelines)) +} + +// discover timeline files and extract timeline metadata +// NOTE: ephemeral files are excluded from the list +fn collect_timeline_files( + timeline_dir: &Path, +) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { + let mut timeline_files = HashSet::new(); + let mut timeline_metadata_path = None; + + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse timeline id out of the timeline dir name")?; + let timeline_dir_entries = + std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + for entry in timeline_dir_entries { + let entry_path = entry.context("Failed to list timeline dir entry")?.path(); + if entry_path.is_file() { + if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { + 
timeline_metadata_path = Some(entry_path); + } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { + debug!("skipping ephemeral file {}", entry_path.display()); + continue; + } else if is_temporary(&entry_path) { + info!("removing temp timeline file at {}", entry_path.display()); + std::fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp download file at {}", + entry_path.display() + ) + })?; + } else { + timeline_files.insert(entry_path); + } + } + } + + // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed + // then attach is lost. There would be no retries for that, + // initial collect will fail because there is no metadata. + // We either need to start download if we see empty dir after restart or attach caller should + // be aware of that and retry attach if awaits_download for timeline switched from true to false + // but timeline didn't appear locally. + // Check what happens with remote index in that case.
+ let timeline_metadata_path = match timeline_metadata_path { + Some(path) => path, + None => anyhow::bail!("No metadata file found in the timeline directory"), + }; + let metadata = TimelineMetadata::from_bytes( + &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + ) + .context("Failed to parse timeline metadata file bytes")?; + + anyhow::ensure!( + metadata.ancestor_timeline().is_some() || !timeline_files.is_empty(), + "Timeline has no ancestor and no layer files" + ); + + Ok((timeline_id, metadata, timeline_files)) } diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 11be13b80c..4e9a5fc6ec 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -34,11 +34,6 @@ async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { // Break if we're not allowed to write to disk let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - // TODO do this inside repo.compaction_iteration instead. - let _guard = match repo.file_lock.try_read() { - Ok(g) => g, - Err(_) => return Ok(ControlFlow::Break(())), - }; // Run compaction let compaction_period = repo.get_compaction_period(); @@ -233,11 +228,6 @@ async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { // Break if we're not allowed to write to disk let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - // TODO do this inside repo.gc_iteration instead. 
- let _guard = match repo.file_lock.try_read() { - Ok(g) => g, - Err(_) => return Ok(ControlFlow::Break(())), - }; // Run gc let gc_period = repo.get_gc_period(); diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 936699c2ec..9356893908 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -3,6 +3,7 @@ // use anyhow::{bail, ensure, Context, Result}; +use remote_storage::path_with_suffix_extension; use std::{ fs, @@ -18,12 +19,12 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::import_datadir; use crate::tenant_mgr; use crate::CheckpointConfig; use crate::{ config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; +use crate::{import_datadir, TEMP_FILE_SUFFIX}; use crate::{ layered_repository::{Repository, Timeline}, walredo::WalRedoManager, @@ -105,13 +106,17 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // fn bootstrap_timeline( conf: &'static PageServerConf, - tenantid: ZTenantId, - tli: ZTimelineId, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, repo: &Repository, ) -> Result> { - let initdb_path = conf - .tenant_path(&tenantid) - .join(format!("tmp-timeline-{}", tli)); + // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` + // temporary directory for basebackup files for the given timeline. + let initdb_path = path_with_suffix_extension( + conf.timelines_path(&tenant_id) + .join(format!("basebackup-{timeline_id}")), + TEMP_FILE_SUFFIX, + ); // Init temporarily repo to get bootstrap data run_initdb(conf, &initdb_path)?; @@ -123,7 +128,7 @@ fn bootstrap_timeline( // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. 
- let timeline = repo.create_empty_timeline(tli, lsn)?; + let timeline = repo.create_empty_timeline(timeline_id, lsn)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -134,7 +139,7 @@ fn bootstrap_timeline( info!( "created root timeline {} timeline.lsn {}", - tli, + timeline_id, timeline.get_last_record_lsn() ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 4e49fd9373..dd946659bb 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,6 +21,7 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; +use remote_storage::path_with_suffix_extension; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -37,7 +38,6 @@ use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; -use crate::config::PageServerConf; use crate::metrics::{ WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, }; @@ -45,6 +45,7 @@ use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; +use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, @@ -569,20 +570,24 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenantid: &ZTenantId) -> Result { + fn launch(conf: &PageServerConf, tenant_id: &ZTenantId) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. 
- let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir"); + let datadir = path_with_suffix_extension( + conf.tenant_path(tenant_id).join("wal-redo-datadir"), + TEMP_FILE_SUFFIX, + ); // Create empty data directory for wal-redo postgres, deleting old one first. if datadir.exists() { - info!("directory {:?} exists, removing", &datadir); - if let Err(e) = fs::remove_dir_all(&datadir) { - error!("could not remove old wal-redo-datadir: {:#}", e); - } + info!( + "old temporary datadir {} exists, removing", + datadir.display() + ); + fs::remove_dir_all(&datadir)?; } - info!("running initdb in {:?}", datadir.display()); + info!("running initdb in {}", datadir.display()); let initdb = Command::new(conf.pg_bin_dir().join("initdb")) .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") @@ -591,7 +596,7 @@ impl PostgresRedoProcess { .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .close_fds() .output() - .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?; + .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; if !initdb.status.success() { return Err(Error::new( @@ -645,7 +650,7 @@ impl PostgresRedoProcess { })?; info!( - "launched WAL redo postgres process on {:?}", + "launched WAL redo postgres process on {}", datadir.display() ); diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 4aba2494e9..1d083b3ef9 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -32,33 +32,34 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # Leave the first timeline alone, but corrupt the others in different ways (tenant0, timeline0, pg0) = tenant_timelines[0] + log.info(f"Timeline {tenant0}/{timeline0} is left intact") - # Corrupt metadata file on timeline 1 (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1) - 
print(f"overwriting metadata file at {metadata_path}") + metadata_path = f"{env.repo_dir}/tenants/{tenant1}/timelines/{timeline1}/metadata" f = open(metadata_path, "w") f.write("overwritten with garbage!") f.close() + log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") - # Missing layer files file on timeline 2. (This would actually work - # if we had Cloud Storage enabled in this test.) (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2) + timeline_path = f"{env.repo_dir}/tenants/{tenant2}/timelines/{timeline2}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Remove it os.remove(f"{timeline_path}/{filename}") + log.info( + f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)" + ) - # Corrupt layer files file on timeline 3 (tenant3, timeline3, pg3) = tenant_timelines[3] - timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3) + timeline_path = f"{env.repo_dir}/tenants/{tenant3}/timelines/{timeline3}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. 
Corrupt it f = open(f"{timeline_path}/{filename}", "w") f.write("overwritten with garbage!") f.close() + log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled") env.pageserver.start() @@ -69,20 +70,28 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # But all others are broken # First timeline would not get loaded into pageserver due to corrupt metadata file - (_tenant, _timeline, pg) = tenant_timelines[1] with pytest.raises( Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}" ) as err: - pg.start() + pg1.start() + log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") + + # Second timeline has no ancestors, only the metadata file and no layer files + # We don't have the remote storage enabled, which means timeline is in an incorrect state, + # it's not loaded at all + with pytest.raises( + Exception, match=f"Could not get timeline {timeline2} in tenant {tenant2}" + ) as err: + pg2.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline - for n in range(2, 4): - (_tenant, _timeline, pg) = tenant_timelines[n] + for n in range(3, 4): + (bad_tenant, bad_timeline, pg) = tenant_timelines[n] with pytest.raises(Exception, match="extracting base backup failed") as err: pg.start() log.info( - f"compute startup failed lazily for timeline with corrupt layers, during basebackup preparation: {err}" + f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}" ) @@ -107,6 +116,8 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): tenant_id, _ = env.neon_cli.create_tenant() + old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + # Introduce failpoint when creating a new timeline 
env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") with pytest.raises(Exception, match="before-checkpoint-new-timeline"): @@ -116,6 +127,8 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env.neon_cli.pageserver_stop(immediate=True) env.neon_cli.pageserver_start() - # Check that tenant with "broken" timeline is not loaded. - with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id}"): - env.neon_cli.list_timelines(tenant_id) + # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. + new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + assert ( + new_tenant_timelines == old_tenant_timelines + ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" From 31ec3b790686615448ee2d00e5b4b9b5ce143b74 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 5 Sep 2022 10:13:36 +0300 Subject: [PATCH 03/33] Use the toolchain file to define current rustc version used --- .dockerignore | 1 + README.md | 19 +++++++++++++++---- rust-toolchain.toml | 7 +++++++ 3 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 rust-toolchain.toml diff --git a/.dockerignore b/.dockerignore index 9f8a22d598..4bc8e5fa13 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ * +!rust-toolchain.toml !Cargo.toml !Cargo.lock !Makefile diff --git a/README.md b/README.md index 57d0a144cb..eb13b111f5 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,17 @@ brew install libpq brew link --force libpq ``` +#### Rustc version + +The project uses [rust toolchain file](./rust-toolchain.toml) to define the version it's built with in CI for testing and local builds. + +This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file. 
+ +rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. + +non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. +Never rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. + #### Building on Linux 1. Build neon and patched postgres @@ -78,9 +89,9 @@ brew link --force libpq git clone --recursive https://github.com/neondatabase/neon.git cd neon -# The preferred and default is to make a debug build. This will create a +# The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`nproc`" +# build, utilize "BUILD_TYPE=release make -j`nproc`" make -j`nproc` ``` @@ -94,9 +105,9 @@ make -j`nproc` git clone --recursive https://github.com/neondatabase/neon.git cd neon -# The preferred and default is to make a debug build. This will create a +# The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" make -j`sysctl -n hw.logicalcpu` ``` diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000000..ee699464c6 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,7 @@ +[toolchain] +channel = "1.60" +profile = "default" +# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
+# https://rust-lang.github.io/rustup/concepts/profiles.html +# but we also need `llvm-tools-preview` for coverage data merges on CI +components = ["llvm-tools-preview", "rustfmt", "clippy"] From 923f642549c9b3b96cb53b959f34f2cb47d799e1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 5 Sep 2022 11:18:22 +0300 Subject: [PATCH 04/33] Collect cargo build timings --- .github/workflows/build_and_test.yml | 27 +++++++++++++++++++++------ .github/workflows/codestyle.yml | 5 ++--- README.md | 10 +++++----- rust-toolchain.toml | 8 +++++++- test_runner/fixtures/utils.py | 5 ++++- 5 files changed, 39 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bf9de7d857..7ee694fa16 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,7 +54,11 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] + # TODO this version is currently needed to make build statuses more informative + # and to clear cargo caches in a more transparent way. 
+ # We should rather read this value from the file in the root of the repo, `rust-toolchain.toml` since it's + # truly setting what version of compiler the sources are built with + rust_toolchain: [ '1.60' ] env: BUILD_TYPE: ${{ matrix.build_type }} @@ -100,11 +104,11 @@ jobs: if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FEATURES="" - CARGO_FLAGS="--locked" + CARGO_FLAGS="--locked --timings" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FEATURES="--features profiling" - CARGO_FLAGS="--locked --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV @@ -218,6 +222,17 @@ jobs: name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact path: /tmp/neon + - name: Prepare cargo build timing stats for storing + run: | + mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/" + cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/" + shell: bash -euxo pipefail {0} + - name: Upload cargo build stats + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-build-stats + path: /tmp/neon/cargo-timings/ + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: matrix.build_type == 'debug' @@ -233,7 +248,7 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] + rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -269,7 +284,7 @@ jobs: fail-fast: false matrix: build_type: [ release ] - rust_toolchain: [ 1.58 ] + rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -341,7 +356,7 @@ jobs: fail-fast: false matrix: build_type: [ debug ] - rust_toolchain: [ 1.58 ] + rust_toolchain: 
[ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index bc21054e18..ac6bfe655f 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -24,9 +24,8 @@ jobs: strategy: fail-fast: false matrix: - # If we want to duplicate this job for different - # Rust toolchains (e.g. nightly or 1.37.0), add them here. - rust_toolchain: [1.58] + # TODO read from `rust-toolchain.toml` and do the same in the build and test workflow too. + rust_toolchain: ['1.60'] os: [ubuntu-latest, macos-latest] # To support several Postgres versions, add them here. postgres_version: [v14, v15] diff --git a/README.md b/README.md index eb13b111f5..977afc2a2c 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ This file is automatically picked up by [`rustup`](https://rust-lang.github.io/r rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. -Never rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. +Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. #### Building on Linux @@ -90,8 +90,8 @@ git clone --recursive https://github.com/neondatabase/neon.git cd neon # The preferred and default is to make a debug build. This will create a -# demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`nproc`" +# demonstrably slower build than a release build. 
For a release build, +# use "BUILD_TYPE=release make -j`nproc`" make -j`nproc` ``` @@ -106,8 +106,8 @@ git clone --recursive https://github.com/neondatabase/neon.git cd neon # The preferred and default is to make a debug build. This will create a -# demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# demonstrably slower build than a release build. For a release build, +# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" make -j`sysctl -n hw.logicalcpu` ``` diff --git a/rust-toolchain.toml b/rust-toolchain.toml index ee699464c6..8023348aae 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,11 @@ [toolchain] -channel = "1.60" +# We try to stick to a toolchain version that is widely available on popular distributions, so that most people +# can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later +# version, we can consider updating. As of this writing, 1.60 is available on Debian 'experimental' but not yet on +# 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach +# 'testing' soon (and similarly for the other distributions). +# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. +channel = "1.60" # do update CI matrix values when updating this profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 726116e53c..5fb91344ad 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -155,7 +155,7 @@ def get_scale_for_db(size_mb: int) -> int: ATTACHMENT_NAME_REGEX = re.compile( - r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs" + r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs|.+\.html" ) @@ -180,6 +180,9 @@ def allure_attach_from_dir(dir: Path): elif source.endswith(".svg"): attachment_type = "image/svg+xml" extension = "svg" + elif source.endswith(".html"): + attachment_type = "text/html" + extension = "html" else: attachment_type = "text/plain" extension = attachment.suffix.removeprefix(".") From 648e86e9df9c06f3a961cdcca6f1c23f88272b6e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 16:02:29 +0300 Subject: [PATCH 05/33] Use Debian images with libc 2.31 to build legacy compute tools --- Dockerfile.compute-node.legacy | 4 ++-- rust-toolchain.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node.legacy b/Dockerfile.compute-node.legacy index 7689167156..6653d81019 100644 --- a/Dockerfile.compute-node.legacy +++ b/Dockerfile.compute-node.legacy @@ -22,7 +22,7 @@ FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps # # Image with Postgres build deps # -FROM debian:buster-slim AS build-deps +FROM debian:bullseye-slim AS build-deps RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ libcurl4-openssl-dev libossp-uuid-dev @@ -59,7 +59,7 @@ WORKDIR /pg # # Final compute node image to be exported # -FROM debian:buster-slim +FROM debian:bullseye-slim # libreadline-dev is required to run psql RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev diff --git a/rust-toolchain.toml 
b/rust-toolchain.toml index 8023348aae..1a27e92fec 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -5,7 +5,7 @@ # 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach # 'testing' soon (and similarly for the other distributions). # See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. -channel = "1.60" # do update CI matrix values when updating this +channel = "1.60" # do update GitHub CI cache values for rust builds, when changing this value profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 18dafbb9ba0f49e65b6382acf009255a13861eab Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 16:47:09 +0300 Subject: [PATCH 06/33] Remove deceiving rust version from the CI files --- .../actions/run-python-test-set/action.yml | 5 +---- .github/workflows/build_and_test.yml | 22 +++++-------------- .github/workflows/codestyle.yml | 17 +++++--------- 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f04f5d11b8..4c18641938 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -5,9 +5,6 @@ inputs: build_type: description: 'Type of Rust (neon) and C (postgres) builds. 
Must be "release" or "debug", or "remote" for the remote cluster' required: true - rust_toolchain: - description: 'Rust toolchain version to fetch the caches' - required: false test_selection: description: 'A python test suite to run' required: true @@ -55,7 +52,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact path: /tmp/neon - name: Checkout diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7ee694fa16..d586741d68 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,11 +54,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - # TODO this version is currently needed to make build statuses more informative - # and to clear cargo caches in a more transparent way. - # We should rather read this value from the file in the root of the repo, `rust-toolchain.toml` since it's - # truly setting what version of compiler the sources are built with - rust_toolchain: [ '1.60' ] env: BUILD_TYPE: ${{ matrix.build_type }} @@ -130,8 +125,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + v8-${{ runner.os }}-${{ matrix.build_type }}-cargo- - name: Cache postgres v14 build id: cache_pg_14 @@ -219,7 +214,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact path: /tmp/neon - 
name: Prepare cargo build timing stats for storing @@ -230,7 +225,7 @@ jobs: - name: Upload cargo build stats uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-build-stats + name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats path: /tmp/neon/cargo-timings/ # XXX: keep this after the binaries.list is formed, so the coverage can properly work later @@ -248,7 +243,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -260,7 +254,6 @@ jobs: uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} - rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: regress needs_postgres_source: true run_with_real_s3: true @@ -284,7 +277,6 @@ jobs: fail-fast: false matrix: build_type: [ release ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -296,7 +288,6 @@ jobs: uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} - rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: performance run_in_parallel: false save_perf_report: true @@ -356,7 +347,6 @@ jobs: fail-fast: false matrix: build_type: [ debug ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -373,12 +363,12 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Get coverage artifact diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml 
index ac6bfe655f..53d0f9c5d8 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -24,8 +24,11 @@ jobs: strategy: fail-fast: false matrix: - # TODO read from `rust-toolchain.toml` and do the same in the build and test workflow too. - rust_toolchain: ['1.60'] + # XXX: both OSes have rustup + # * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools + # * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools + # this is all we need to install our toolchain later via rust-toolchain.toml + # so don't install any toolchain explicitly. os: [ubuntu-latest, macos-latest] # To support several Postgres versions, add them here. postgres_version: [v14, v15] @@ -40,14 +43,6 @@ jobs: submodules: true fetch-depth: 2 - - name: Install rust toolchain ${{ matrix.rust_toolchain }} - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: ${{ matrix.rust_toolchain }} - components: rustfmt, clippy - override: true - - name: Check formatting run: cargo fmt --all -- --check @@ -106,7 +101,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v3-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - name: Run cargo clippy run: ./run_clippy.sh From a48f9f377df5c076f0f6afa8b1812709ea334d35 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 10 Sep 2022 01:23:19 +0300 Subject: [PATCH 07/33] Fix typo in issue template --- .github/ISSUE_TEMPLATE/epic-template.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 33ad7b1ef5..7707e0aa67 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -1,6 +1,6 @@ --- name: Epic Template -about: A set of related tasks contributing towards specific outcome, 
comprizing of +about: A set of related tasks contributing towards specific outcome, comprising of more than 1 week of work. title: 'Epic: ' labels: t/Epic From 698d6d0badad9aa2a12b033a33d28c19ffaec79c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 12 Sep 2022 00:07:34 +0300 Subject: [PATCH 08/33] Use stable coverage API with rustc 1.60 --- scripts/coverage | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/scripts/coverage b/scripts/coverage index af0d067419..1dc92e57cc 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -75,8 +75,6 @@ class Cargo: def rustlib_dir(self) -> Path: if not self._rustlib_dir: cmd = [ - 'cargo', - '-Zunstable-options', 'rustc', '--print=target-libdir', ] @@ -397,7 +395,7 @@ class State: # Enable LLVM's source-based coverage # see: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html # see: https://blog.rust-lang.org/inside-rust/2020/11/12/source-based-code-coverage.html - '-Zinstrument-coverage', + '-Cinstrument-coverage', # Link every bit of code to prevent "holes" in coverage report # see: https://doc.rust-lang.org/rustc/codegen-options/index.html#link-dead-code '-Clink-dead-code', @@ -410,10 +408,6 @@ class State: f'--remap-path-prefix {self.cwd}=', ]) - # XXX: God, have mercy on our souls... - # see: https://github.com/rust-lang/rust/pull/90132 - os.environ['RUSTC_BOOTSTRAP'] = '1' - def _merge_profraw(self) -> bool: profdata_path = self.profdata_dir / '-'.join([ self.profraw_prefix, From 40c845e57d7060b1946e3a9e9d6bf076a8847e52 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 11 Sep 2022 21:48:01 +0300 Subject: [PATCH 09/33] Switch to async for all concurrency in the pageserver. Instead of spawning helper threads, we now use Tokio tasks. There are multiple Tokio runtimes, for different kinds of tasks. One for serving libpq client connections, another for background operations like GC and compaction, and so on. 
That's not strictly required, we could use just one runtime, but with this you can still get an overview of what's happening with "top -H". There's one subtle behavior in how TenantState is updated. Before this patch, if you deleted all timelines from a tenant, its GC and compaction loops were stopped, and the tenant went back to Idle state. We no longer do that. The empty tenant stays Active. The changes to test_tenant_tasks.py are related to that. There's still plenty of synchronous code and blocking. For example, we still use blocking std::io functions for all file I/O, and the communication with WAL redo processes still uses low-level unix poll(). We might want to rewrite those later, but this will do for now. The model is that local file I/O is considered to be fast enough that blocking - and preventing other tasks running in the same thread - is acceptable. --- Cargo.lock | 15 +- docs/pageserver-thread-mgmt.md | 47 +- libs/utils/Cargo.toml | 2 + libs/utils/src/lib.rs | 4 +- libs/utils/src/postgres_backend_async.rs | 485 +++++++++++++++ libs/utils/src/seqwait.rs | 53 +- libs/utils/src/seqwait_async.rs | 224 ------- pageserver/Cargo.toml | 5 +- pageserver/src/basebackup.rs | 5 +- pageserver/src/bin/pageserver.rs | 77 ++- pageserver/src/http/routes.rs | 35 +- pageserver/src/layered_repository.rs | 44 +- pageserver/src/layered_repository/timeline.rs | 180 +++--- pageserver/src/lib.rs | 27 +- pageserver/src/page_service.rs | 551 +++++++++--------- pageserver/src/storage_sync.rs | 71 +-- pageserver/src/storage_sync/upload.rs | 2 +- pageserver/src/task_mgr.rs | 463 +++++++++++++++ pageserver/src/tenant_mgr.rs | 255 +++----- pageserver/src/tenant_tasks.rs | 306 +++------- pageserver/src/thread_mgr.rs | 409 ------------- pageserver/src/timelines.rs | 49 +- pageserver/src/walreceiver.rs | 291 ++------- .../src/walreceiver/connection_manager.rs | 87 ++- .../src/walreceiver/walreceiver_connection.rs | 75 +-- test_runner/regress/test_tenant_tasks.py | 15 +- 
workspace_hack/Cargo.toml | 4 +- 27 files changed, 1840 insertions(+), 1941 deletions(-) create mode 100644 libs/utils/src/postgres_backend_async.rs delete mode 100644 libs/utils/src/seqwait_async.rs create mode 100644 pageserver/src/task_mgr.rs delete mode 100644 pageserver/src/thread_mgr.rs diff --git a/Cargo.lock b/Cargo.lock index 563a998601..e9ebcdc5ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1831,6 +1831,8 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", + "async-trait", "byteorder", "bytes", "chrono", @@ -1871,6 +1873,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", + "tokio-util", "toml_edit", "tracing", "url", @@ -3481,9 +3484,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.34" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09" +checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" dependencies = [ "cfg-if", "log", @@ -3505,11 +3508,11 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.26" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f" +checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" dependencies = [ - "lazy_static", + "once_cell", "valuable", ] @@ -3626,6 +3629,7 @@ name = "utils" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "bincode", "byteorder", "bytes", @@ -3653,6 +3657,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", + "tokio-rustls", "tracing", "tracing-subscriber", "workspace_hack", diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index 9ee3e40085..e351c972cb 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -1,26 +1,39 @@ ## Thread 
management -Each thread in the system is tracked by the `thread_mgr` module. It -maintains a registry of threads, and which tenant or timeline they are -operating on. This is used for safe shutdown of a tenant, or the whole -system. +The pageserver uses Tokio for handling concurrency. Everything runs in +Tokio tasks, although some parts are written in blocking style and use +spawn_blocking(). + +Each Tokio task is tracked by the `task_mgr` module. It maintains a +registry of tasks, and which tenant or timeline they are operating +on. ### Handling shutdown -When a tenant or timeline is deleted, we need to shut down all threads -operating on it, before deleting the data on disk. A thread registered -in the thread registry can check if it has been requested to shut down, -by calling `is_shutdown_requested()`. For async operations, there's also -a `shudown_watcher()` async task that can be used to wake up on shutdown. +When a tenant or timeline is deleted, we need to shut down all tasks +operating on it, before deleting the data on disk. There's a function, +`shutdown_tasks`, to request all tasks of a particular tenant or +timeline to shutdown. It will also wait for them to finish. + +A task registered in the task registry can check if it has been +requested to shut down, by calling `is_shutdown_requested()`. There's +also a `shudown_watcher()` Future that can be used with `tokio::select!` +or similar, to wake up on shutdown. + ### Sync vs async -The primary programming model in the page server is synchronous, -blocking code. However, there are some places where async code is -used. Be very careful when mixing sync and async code. - -Async is primarily used to wait for incoming data on network -connections. For example, all WAL receivers have a shared thread pool, -with one async Task for each connection. 
Once a piece of WAL has been -received from the network, the thread calls the blocking functions in +We use async to wait for incoming data on network connections, and to +perform other long-running operations. For example, each WAL receiver +connection is handled by a tokio Task. Once a piece of WAL has been +received from the network, the task calls the blocking functions in the Repository to process the WAL. + +The core storage code in `layered_repository/` is synchronous, with +blocking locks and I/O calls. The current model is that we consider +disk I/Os to be short enough that we perform them while running in a +Tokio task. If that becomes a problem, we should use `spawn_blocking` +before entering the synchronous parts of the code, or switch to using +tokio I/O functions. + +Be very careful when mixing sync and async code! diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 28ad658de4..ce55277f29 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +async-trait = "0.1" anyhow = "1.0" bincode = "1.3" bytes = "1.0.1" @@ -16,6 +17,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" thiserror = "1.0" tokio = { version = "1.17", features = ["macros"]} +tokio-rustls = "0.23" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } nix = "0.23.0" diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index fa7a37adf1..caa7ac6c09 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -14,11 +14,9 @@ pub mod simple_rcu; /// append only ordered map implemented with a Vec pub mod vec_map; -// Async version of SeqWait. Currently unused. 
-// pub mod seqwait_async; - pub mod bin_ser; pub mod postgres_backend; +pub mod postgres_backend_async; pub mod pq_proto; // dealing with connstring parsing and handy access to it's parts diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs new file mode 100644 index 0000000000..383ad3742f --- /dev/null +++ b/libs/utils/src/postgres_backend_async.rs @@ -0,0 +1,485 @@ +//! Server-side asynchronous Postgres connection, as limited as we need. +//! To use, create PostgresBackend and run() it, passing the Handler +//! implementation determining how to process the queries. Currently its API +//! is rather narrow, but we can extend it once required. + +use crate::postgres_backend::AuthType; +use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; +use anyhow::{bail, Context, Result}; +use bytes::{Bytes, BytesMut}; +use rand::Rng; +use std::future::Future; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::task::Poll; +use tracing::{debug, error, trace}; + +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_rustls::TlsAcceptor; + +#[async_trait::async_trait] +pub trait Handler { + /// Handle single query. + /// postgres_backend will issue ReadyForQuery after calling this (this + /// might be not what we want after CopyData streaming, but currently we don't + /// care). + async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>; + + /// Called on startup packet receival, allows to process params. + /// + /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users + /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow + /// to override whole init logic in implementations. 
+ fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + Ok(()) + } + + /// Check auth md5 + fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { + bail!("MD5 auth failed") + } + + /// Check auth jwt + fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { + bail!("JWT auth failed") + } +} + +/// PostgresBackend protocol state. +/// XXX: The order of the constructors matters. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub enum ProtoState { + Initialization, + Encrypted, + Authentication, + Established, + Closed, +} + +#[derive(Clone, Copy)] +pub enum ProcessMsgResult { + Continue, + Break, +} + +/// Always-writeable sock_split stream. +/// May not be readable. See [`PostgresBackend::take_stream_in`] +pub enum Stream { + Unencrypted(tokio::net::TcpStream), + Tls(Box>), + Broken, +} + +impl AsyncWrite for Stream { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Broken => unreachable!(), + } + } + fn poll_flush( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), + Self::Tls(stream) => Pin::new(stream).poll_flush(cx), + Self::Broken => unreachable!(), + } + } + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Broken => unreachable!(), + } + } +} +impl AsyncRead for Stream { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + match self.get_mut() 
{ + Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Broken => unreachable!(), + } + } +} + +pub struct PostgresBackend { + stream: Stream, + // Output buffer. c.f. BeMessage::write why we are using BytesMut here. + buf_out: BytesMut, + + pub state: ProtoState, + + md5_salt: [u8; 4], + auth_type: AuthType, + + peer_addr: SocketAddr, + pub tls_config: Option>, +} + +pub fn query_from_cstring(query_string: Bytes) -> Vec { + let mut query_string = query_string.to_vec(); + if let Some(ch) = query_string.last() { + if *ch == 0 { + query_string.pop(); + } + } + query_string +} + +// Cast a byte slice to a string slice, dropping null terminator if there's one. +fn cstr_to_str(bytes: &[u8]) -> Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); + std::str::from_utf8(without_null).map_err(|e| e.into()) +} + +impl PostgresBackend { + pub fn new( + socket: tokio::net::TcpStream, + auth_type: AuthType, + tls_config: Option>, + ) -> std::io::Result { + let peer_addr = socket.peer_addr()?; + + Ok(Self { + stream: Stream::Unencrypted(socket), + buf_out: BytesMut::with_capacity(10 * 1024), + state: ProtoState::Initialization, + md5_salt: [0u8; 4], + auth_type, + tls_config, + peer_addr, + }) + } + + pub fn get_peer_addr(&self) -> &SocketAddr { + &self.peer_addr + } + + /// Read full message or return None if connection is closed. + pub async fn read_message(&mut self) -> Result> { + use ProtoState::*; + match self.state { + Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, + Authentication | Established => FeMessage::read_fut(&mut self.stream).await, + Closed => Ok(None), + } + } + + /// Flush output buffer into the socket. + pub async fn flush(&mut self) -> std::io::Result<&mut Self> { + self.stream.write_all(&self.buf_out).await?; + self.buf_out.clear(); + Ok(self) + } + + /// Write message into internal output buffer. 
+ pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> { + BeMessage::write(&mut self.buf_out, message)?; + Ok(self) + } + + // Wrapper for run_message_loop() that shuts down the socket when we are done. The shutdown future must be awaited to take effect; a stream left `Broken` by a failed TLS handshake is skipped because polling it hits unreachable!(). + pub async fn run(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + let ret = self.run_message_loop(handler, shutdown_watcher).await; + if !matches!(self.stream, Stream::Broken) { let _ = self.stream.shutdown().await; } + ret + } + + async fn run_message_loop( + &mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + trace!("postgres backend to {:?} started", self.peer_addr); + + tokio::select!( + biased; + + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received during handshake"); + return Ok(()) + }, + + result = async { + while self.state < ProtoState::Established { + if let Some(msg) = self.read_message().await? { + trace!("got message {msg:?} during handshake"); + + match self.process_handshake_message(handler, msg).await? { + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + } else { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + Ok::<(), anyhow::Error>(()) + } => { + // Handshake complete. + result?; + } + ); + + // Authentication completed + let mut query_string = Bytes::new(); + while let Some(msg) = tokio::select!( + biased; + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received in run_message_loop"); + Ok(None) + }, + msg = self.read_message() => { msg }, + )? { + trace!("got message {:?}", msg); + + let result = self.process_message(handler, msg, &mut query_string).await; + self.flush().await?; + match result?
{ + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => break, + } + } + + trace!("postgres backend to {:?} exited", self.peer_addr); + Ok(()) + } + + async fn start_tls(&mut self) -> anyhow::Result<()> { + if let Stream::Unencrypted(plain_stream) = + std::mem::replace(&mut self.stream, Stream::Broken) + { + let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap()); + let tls_stream = acceptor.accept(plain_stream).await?; + + self.stream = Stream::Tls(Box::new(tls_stream)); + return Ok(()); + }; + bail!("TLS already started"); + } + + async fn process_handshake_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + ) -> Result { + assert!(self.state < ProtoState::Established); + let have_tls = self.tls_config.is_some(); + match msg { + FeMessage::StartupPacket(m) => { + trace!("got startup message {m:?}"); + + match m { + FeStartupPacket::SslRequest => { + debug!("SSL requested"); + + self.write_message(&BeMessage::EncryptionResponse(have_tls))?; + if have_tls { + self.start_tls().await?; + self.state = ProtoState::Encrypted; + } + } + FeStartupPacket::GssEncRequest => { + debug!("GSS requested"); + self.write_message(&BeMessage::EncryptionResponse(false))?; + } + FeStartupPacket::StartupMessage { .. } => { + if have_tls && !matches!(self.state, ProtoState::Encrypted) { + self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; + bail!("client did not connect with TLS"); + } + + // NB: startup() may change self.auth_type -- we are using that in proxy code + // to bypass auth for new users. + handler.startup(self, &m)?; + + match self.auth_type { + AuthType::Trust => { + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? + // The async python driver requires a valid server_version + .write_message(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion("14.1"), + ))? 
+ .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + AuthType::MD5 => { + rand::thread_rng().fill(&mut self.md5_salt); + self.write_message(&BeMessage::AuthenticationMD5Password( + self.md5_salt, + ))?; + self.state = ProtoState::Authentication; + } + AuthType::ZenithJWT => { + self.write_message(&BeMessage::AuthenticationCleartextPassword)?; + self.state = ProtoState::Authentication; + } + } + } + FeStartupPacket::CancelRequest { .. } => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + } + + FeMessage::PasswordMessage(m) => { + trace!("got password message '{:?}'", m); + + assert!(self.state == ProtoState::Authentication); + + match self.auth_type { + AuthType::Trust => unreachable!(), + AuthType::MD5 => { + let (_, md5_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_md5(self, md5_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + AuthType::ZenithJWT => { + let (_, jwt_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_jwt(self, jwt_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + } + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + + _ => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + Ok(ProcessMsgResult::Continue) + } + + async fn process_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + unnamed_query_string: &mut Bytes, + ) -> Result { + // Allow only startup and password messages during auth. 
Otherwise client would be able to bypass auth + // TODO: change that to proper top-level match of protocol state with separate message handling for each state + assert!(self.state == ProtoState::Established); + + match msg { + FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { + bail!("protocol violation"); + } + + FeMessage::Query(body) => { + // remove null terminator + let query_string = cstr_to_str(&body)?; + + trace!("got query {:?}", query_string); + // xxx distinguish fatal and recoverable errors? + if let Err(e) = handler.process_query(self, query_string).await { + // ":?" uses the alternate formatting style, which makes anyhow display the + // full cause of the error, not just the top-level context + its trace. + // We don't want to send that in the ErrorResponse though, + // because it's not relevant to the compute node logs. + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + // TODO: untangle convoluted control flow + if e.to_string().contains("failed to run") { + return Ok(ProcessMsgResult::Break); + } + } + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Parse(m) => { + *unnamed_query_string = m.query_string; + self.write_message(&BeMessage::ParseComplete)?; + } + + FeMessage::Describe(_) => { + self.write_message(&BeMessage::ParameterDescription)? + .write_message(&BeMessage::NoData)?; + } + + FeMessage::Bind(_) => { + self.write_message(&BeMessage::BindComplete)?; + } + + FeMessage::Close(_) => { + self.write_message(&BeMessage::CloseComplete)?; + } + + FeMessage::Execute(_) => { + let query_string = cstr_to_str(unnamed_query_string)?; + trace!("got execute {:?}", query_string); + // xxx distinguish fatal and recoverable errors? 
+ if let Err(e) = handler.process_query(self, query_string).await { + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + } + // NOTE there is no ReadyForQuery message. This handler is used + // for basebackup and it uses CopyOut which doesn't require + // ReadyForQuery message and backend just switches back to + // processing mode after sending CopyDone or ErrorResponse. + } + + FeMessage::Sync => { + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Terminate => { + return Ok(ProcessMsgResult::Break); + } + + // We prefer explicit pattern matching to wildcards, because + // this helps us spot the places where new variants are missing + FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { + bail!("unexpected message type: {:?}", msg); + } + } + + Ok(ProcessMsgResult::Continue) + } +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index a531975d60..467b900a13 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -4,9 +4,10 @@ use std::cmp::{Eq, Ordering, PartialOrd}; use std::collections::BinaryHeap; use std::fmt::Debug; use std::mem; -use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Mutex; use std::time::Duration; +use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::time::timeout; /// An error happened while waiting for a number #[derive(Debug, PartialEq, Eq, thiserror::Error)] @@ -141,10 +142,10 @@ where /// /// This call won't complete until someone has called `advance` /// with a number greater than or equal to the one we're waiting for. 
- pub fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { + pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown), + Ok(Some(mut rx)) => rx.changed().await.map_err(|_| SeqWaitError::Shutdown), Err(e) => Err(e), } } @@ -156,13 +157,18 @@ where /// /// If that hasn't happened after the specified timeout duration, /// [`SeqWaitError::Timeout`] will be returned. - pub fn wait_for_timeout(&self, num: V, timeout_duration: Duration) -> Result<(), SeqWaitError> { + pub async fn wait_for_timeout( + &self, + num: V, + timeout_duration: Duration, + ) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e { - std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout, - std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown, - }), + Ok(Some(mut rx)) => match timeout(timeout_duration, rx.changed()).await { + Ok(Ok(())) => Ok(()), + Ok(Err(_)) => Err(SeqWaitError::Shutdown), + Err(_) => Err(SeqWaitError::Timeout), + }, Err(e) => Err(e), } } @@ -179,7 +185,7 @@ where } // Create a new channel. 
- let (tx, rx) = channel(); + let (tx, rx) = channel(()); internal.waiters.push(Waiter { wake_num: num, wake_channel: tx, @@ -235,7 +241,6 @@ mod tests { use super::*; use std::sync::Arc; use std::thread::sleep; - use std::thread::spawn; use std::time::Duration; impl MonotonicCounter for i32 { @@ -248,25 +253,25 @@ mod tests { } } - #[test] - fn seqwait() { + #[tokio::test] + async fn seqwait() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - spawn(move || { - seq2.wait_for(42).expect("wait_for 42"); + tokio::task::spawn(async move { + seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).expect_err("no 999"); + seq2.wait_for(999).await.expect_err("no 999"); }); - spawn(move || { - seq3.wait_for(42).expect("wait_for 42"); - seq3.wait_for(0).expect("wait_for 0"); + tokio::task::spawn(async move { + seq3.wait_for(42).await.expect("wait_for 42"); + seq3.wait_for(0).await.expect("wait_for 0"); }); sleep(Duration::from_secs(1)); let old = seq.advance(99); assert_eq!(old, 0); - seq.wait_for(100).expect("wait_for 100"); + seq.wait_for(100).await.expect("wait_for 100"); // Calling advance with a smaller value is a no-op assert_eq!(seq.advance(98), 100); @@ -275,16 +280,16 @@ mod tests { seq.shutdown(); } - #[test] - fn seqwait_timeout() { + #[tokio::test] + async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - spawn(move || { + tokio::task::spawn(async move { let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout); + let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_secs(1)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. 
let old = seq.advance(99); diff --git a/libs/utils/src/seqwait_async.rs b/libs/utils/src/seqwait_async.rs deleted file mode 100644 index f685e2b569..0000000000 --- a/libs/utils/src/seqwait_async.rs +++ /dev/null @@ -1,224 +0,0 @@ -//! -//! Async version of 'seqwait.rs' -//! -//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. -//! - -#![warn(missing_docs)] - -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::mem; -use std::sync::Mutex; -use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; -use tokio::time::timeout; - -/// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] -#[error("SeqWaitError")] -pub enum SeqWaitError { - /// The wait timeout was reached - Timeout, - /// [`SeqWait::shutdown`] was called - Shutdown, -} - -/// Internal components of a `SeqWait` -struct SeqWaitInt -where - T: Ord, -{ - waiters: BTreeMap, Receiver<()>)>, - current: T, - shutdown: bool, -} - -/// A tool for waiting on a sequence number -/// -/// This provides a way to await the arrival of a number. -/// As soon as the number arrives by another caller calling -/// [`advance`], then the waiter will be woken up. -/// -/// This implementation takes a blocking Mutex on both [`wait_for`] -/// and [`advance`], meaning there may be unexpected executor blocking -/// due to thread scheduling unfairness. There are probably better -/// implementations, but we can probably live with this for now. 
-/// -/// [`wait_for`]: SeqWait::wait_for -/// [`advance`]: SeqWait::advance -/// -pub struct SeqWait -where - T: Ord, -{ - internal: Mutex>, -} - -impl SeqWait -where - T: Ord + Debug + Copy, -{ - /// Create a new `SeqWait`, initialized to a particular number - pub fn new(starting_num: T) -> Self { - let internal = SeqWaitInt { - waiters: BTreeMap::new(), - current: starting_num, - shutdown: false, - }; - SeqWait { - internal: Mutex::new(internal), - } - } - - /// Shut down a `SeqWait`, causing all waiters (present and - /// future) to return an error. - pub fn shutdown(&self) { - let waiters = { - // Prevent new waiters; wake all those that exist. - // Wake everyone with an error. - let mut internal = self.internal.lock().unwrap(); - - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) - - // Drop the lock as we exit this scope. - }; - - // When we drop the waiters list, each Receiver will - // be woken with an error. - // This drop doesn't need to be explicit; it's done - // here to make it easier to read the code and understand - // the order of events. - drop(waiters); - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> { - let mut rx = { - let mut internal = self.internal.lock().unwrap(); - if internal.current >= num { - return Ok(()); - } - if internal.shutdown { - return Err(SeqWaitError::Shutdown); - } - - // If we already have a channel for waiting on this number, reuse it. - if let Some((_, rx)) = internal.waiters.get_mut(&num) { - // an Err from changed() means the sender was dropped. - rx.clone() - } else { - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.insert(num, (tx, rx.clone())); - rx - } - // Drop the lock as we exit this scope. 
- }; - rx.changed().await.map_err(|_| SeqWaitError::Shutdown) - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - /// - /// If that hasn't happened after the specified timeout duration, - /// [`SeqWaitError::Timeout`] will be returned. - pub async fn wait_for_timeout( - &self, - num: T, - timeout_duration: Duration, - ) -> Result<(), SeqWaitError> { - timeout(timeout_duration, self.wait_for(num)) - .await - .unwrap_or(Err(SeqWaitError::Timeout)) - } - - /// Announce a new number has arrived - /// - /// All waiters at this value or below will be woken. - /// - /// `advance` will panic if you send it a lower number than - /// a previous call. - pub fn advance(&self, num: T) { - let wake_these = { - let mut internal = self.internal.lock().unwrap(); - - if internal.current > num { - panic!( - "tried to advance backwards, from {:?} to {:?}", - internal.current, num - ); - } - internal.current = num; - - // split_off will give me all the high-numbered waiters, - // so split and then swap. Everything at or above `num` - // stays. - let mut split = internal.waiters.split_off(&num); - std::mem::swap(&mut split, &mut internal.waiters); - - // `split_at` didn't get the value at `num`; if it's - // there take that too. - if let Some(sleeper) = internal.waiters.remove(&num) { - split.insert(num, sleeper); - } - - split - }; - - for (_wake_num, (tx, _rx)) in wake_these { - // This can fail if there are no receivers. - // We don't care; discard the error. 
- let _ = tx.send(()); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::Arc; - use tokio::time::{sleep, Duration}; - - #[tokio::test] - async fn seqwait() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - let seq3 = Arc::clone(&seq); - tokio::spawn(async move { - seq2.wait_for(42).await.expect("wait_for 42"); - seq2.advance(100); - seq2.wait_for(999).await.expect_err("no 999"); - }); - tokio::spawn(async move { - seq3.wait_for(42).await.expect("wait_for 42"); - seq3.wait_for(0).await.expect("wait_for 0"); - }); - sleep(Duration::from_secs(1)).await; - seq.advance(99); - seq.wait_for(100).await.expect("wait_for 100"); - seq.shutdown(); - } - - #[tokio::test] - async fn seqwait_timeout() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - tokio::spawn(async move { - let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout).await; - assert_eq!(res, Err(SeqWaitError::Timeout)); - }); - sleep(Duration::from_secs(1)).await; - // This will attempt to wake, but nothing will happen - // because the waiter already dropped its Receiver. 
- seq.advance(99); - } -} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 902765f424..e73c73bd9c 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,8 @@ profiling = ["pprof"] failpoints = ["fail/failpoints"] [dependencies] +async-stream = "0.3" +async-trait = "0.1" chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" @@ -24,6 +26,7 @@ itertools = "0.10.3" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } +tokio-util = { version = "0.7.3", features = ["io", "io-util"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } @@ -43,7 +46,7 @@ pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallcl toml_edit = { version = "0.13", features = ["easy"] } scopeguard = "1.1.0" const_format = "0.2.21" -tracing = "0.1.27" +tracing = "0.1.36" signal-hook = "0.3.10" url = "2" nix = "0.23" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index cd99c3c67d..61facc852d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -81,9 +81,8 @@ where // an old LSN and it doesn't have any WAL of its own yet. We will set // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { - // Backup was requested at a particular LSN. Wait for it to arrive. - info!("waiting for {}", req_lsn); - timeline.wait_lsn(req_lsn)?; + // Backup was requested at a particular LSN. The caller should've + // already checked that it's a valid LSN. // If the requested point is the end of the timeline, we can // provide prev_lsn. 
(get_last_record_rlsn() might return it as diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5a43516728..ec71e5b320 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -4,7 +4,7 @@ use remote_storage::GenericRemoteStorage; use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use clap::{App, Arg}; use daemonize::Daemonize; @@ -12,13 +12,15 @@ use daemonize::Daemonize; use fail::FailScenario; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, - thread_mgr::ThreadKind, - virtual_file, LOG_FILE_NAME, + http, page_cache, page_service, profiling, task_mgr, + task_mgr::TaskKind, + task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, + }, + tenant_mgr, virtual_file, LOG_FILE_NAME, }; use utils::{ auth::JwtAuth, - http::endpoint, logging, postgres_backend::AuthType, project_git_version, @@ -286,7 +288,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); - pageserver::tenant_tasks::init_tenant_task_pool()?; + WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?; // initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -307,35 +309,54 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }) .transpose() .context("Failed to init generic remote storage")?; + let remote_index = { + let _rt_guard = BACKGROUND_RUNTIME.enter(); + tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())? 
+ }; - let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?; - - // Spawn a new thread for the http endpoint + // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. // bind before launching separate thread so the error reported before startup exits - let auth_cloned = auth.clone(); - thread_mgr::spawn( - ThreadKind::HttpEndpointListener, - None, - None, - "http_endpoint_thread", - true, - move || { - let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?; - endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) - }, - )?; - // Spawn a thread to listen for libpq connections. It will spawn further threads + // Create a Service from the router above to handle incoming requests. + { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + + let router = http::make_router(conf, auth.clone(), remote_index, remote_storage)?; + let service = + utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(service) + .with_graceful_shutdown(task_mgr::shutdown_watcher()); + + task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::HttpEndpointListener, + None, + None, + "http endpoint listener", + true, + async { + server.await?; + Ok(()) + }, + ); + } + + // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. - thread_mgr::spawn( - ThreadKind::LibpqEndpointListener, + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, None, None, - "libpq endpoint thread", + "libpq endpoint listener", true, - move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type), - )?; + async move { + page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await + }, + ); + // All started up! Now just sit and wait for shutdown signal. 
signals.handle(|signal| match signal { Signal::Quit => { info!( @@ -352,7 +373,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() signal.name() ); profiling::exit_profiler(conf, &profiler_guard); - pageserver::shutdown_pageserver(0); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } }) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 59142bd9b2..78f83511cb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -161,16 +161,14 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; @@ -184,9 +182,10 @@ async fn timeline_create_handler(mut request: Request) -> Result Ok(None), // timeline already exists Err(err) => Err(err), } - }) - .await - .map_err(ApiError::from_err)??; + } + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) + .await + .map_err(ApiError::from_err)?; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -426,12 +425,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; - tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_detach", tenant = %tenant_id).entered(); - tenant_mgr::detach_tenant(conf, tenant_id) - }) - .await - .map_err(ApiError::from_err)??; + tenant_mgr::detach_tenant(conf, tenant_id) + .instrument(info_span!("tenant_detach", tenant = %tenant_id)) + .await + .map_err(ApiError::from_err)?; let mut remote_index = state.remote_index.write().await; remote_index.remove_tenant_entry(&tenant_id); @@ -583,7 +578,7 @@ async fn tenant_create_handler(mut request: Request) -> Result, - // Overridden tenant-specific config parameters. 
// We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. @@ -284,7 +270,7 @@ impl Repository { } /// perform one garbage collection iteration, removing old data files from disk. - /// this function is periodically called by gc thread. + /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. /// /// 'timelineid' specifies the timeline to GC, or None for all. @@ -299,14 +285,6 @@ impl Repository { pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let _guard = match self.file_lock.try_read() { - Ok(g) => g, - Err(_) => { - info!("File lock write acquired, shutting down GC"); - return Ok(GcResult::default()); - } - }; - let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -319,18 +297,10 @@ impl Repository { } /// Perform one compaction iteration. - /// This function is periodically called by compactor thread. + /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. pub fn compaction_iteration(&self) -> Result<()> { - let _guard = match self.file_lock.try_read() { - Ok(g) => g, - Err(_) => { - info!("File lock write acquired, shutting down compaction"); - return Ok(()); - } - }; - // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. 
We don't want to block everything else while the @@ -624,10 +594,7 @@ impl Repository { .load_layer_map(new_disk_consistent_lsn) .context("failed to load layermap")?; - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), new_timeline_id), - timeline: Arc::clone(&new_timeline), - }); + new_timeline.launch_wal_receiver()?; Ok(new_timeline) } @@ -642,7 +609,6 @@ impl Repository { ) -> Repository { Repository { tenant_id, - file_lock: RwLock::new(()), conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), @@ -846,7 +812,7 @@ impl Repository { // See comments in [`Repository::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { // We were requested to shut down. Stop and return with the progress we // made. 
break; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index aa9d636739..60abbe33e6 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -5,16 +5,17 @@ use bytes::Bytes; use fail::fail_point; use itertools::Itertools; use once_cell::sync::OnceCell; +use tokio::task::spawn_blocking; use tracing::*; use std::cmp::{max, min, Ordering}; use std::collections::{HashMap, HashSet}; +use std::fs; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; -use std::{fs, thread}; use crate::layered_repository::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, @@ -46,8 +47,9 @@ use utils::{ use crate::repository::GcResult; use crate::repository::{Key, Value}; -use crate::thread_mgr; -use crate::walreceiver::IS_WAL_RECEIVER; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{page_cache, storage_sync}; @@ -56,7 +58,7 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - tenant_id: ZTenantId, + pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, pub layers: RwLock, @@ -110,11 +112,11 @@ pub struct Timeline { /// to avoid deadlock. write_lock: Mutex<()>, - /// Used to ensure that there is only one thread + /// Used to ensure that there is only one task performing flushing at a time layer_flush_lock: Mutex<()>, /// Layer removal lock. - /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], /// and [`Repository::delete_timeline`]. layer_removal_cs: Mutex<()>, @@ -142,10 +144,7 @@ pub struct Timeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, - // TODO task management should be done outside timeline, managed along with other tasks. - #[allow(clippy::type_complexity)] - initial_size_computation_task: - Mutex>, mpsc::Receiver<()>)>>, + initial_size_computation_started: AtomicBool, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -413,23 +412,23 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead + pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + "wait_lsn cannot be called in WAL receiver" ); - self.metrics.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; + let _timer = self.metrics.wait_lsn_time_histo.start_timer(); + + self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await + .with_context(|| + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + )?; Ok(()) } @@ -587,7 +586,7 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - initial_size_computation_task: Mutex::new(None), + initial_size_computation_started: AtomicBool::new(false), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -598,6 +597,43 @@ impl Timeline { result } + pub fn launch_wal_receiver(self: &Arc) -> anyhow::Result<()> { + if !is_etcd_client_initialized() { + if cfg!(test) { + info!("not launching WAL receiver because etcd client hasn't been initialized"); + return Ok(()); + } else { + panic!("etcd client not initialized"); + } + } + + info!( + "launching WAL receiver for timeline {} of tenant {}", + self.timeline_id, self.tenant_id + ); + let tenant_conf_guard = self.tenant_conf.read().unwrap(); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); + let walreceiver_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); + drop(tenant_conf_guard); + let self_clone = Arc::clone(self); + let _ = spawn_connection_manager_task( + self.conf.broker_etcd_prefix.clone(), + self_clone, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + )?; + + Ok(()) + } + /// /// Scan the timeline directory to populate the layer map. /// Returns all timeline-related files that were found and loaded. 
@@ -715,61 +751,34 @@ impl Timeline { fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { let timeline_id = self.timeline_id; - let mut task_guard = match self.initial_size_computation_task.try_lock() { - Ok(guard) => guard, - Err(_) => { - debug!("Skipping timeline logical size init: task lock is taken already"); - return; - } - }; - - if let Some((old_task, task_finish_signal)) = task_guard.take() { - // TODO rust 1.61 would allow to remove `task_finish_signal` entirely and call `old_task.is_finished()` instead - match task_finish_signal.try_recv() { - // task has either signaled successfully that it finished or panicked and dropped the sender part without signalling - Ok(()) | Err(mpsc::TryRecvError::Disconnected) => { - match old_task.join() { - // we're here due to OnceCell::get not returning the value - Ok(Ok(())) => { - error!("Timeline {timeline_id} size init task finished, yet the size was not updated, rescheduling the computation") - } - Ok(Err(task_error)) => { - error!("Error during timeline {timeline_id} size init: {task_error:?}") - } - Err(e) => error!("Timeline {timeline_id} size init task panicked: {e:?}"), - } - } - // task had not yet finished: no signal was sent and the sender channel is not dropped - Err(mpsc::TryRecvError::Empty) => { - // let the task finish - *task_guard = Some((old_task, task_finish_signal)); - return; - } - } - } - - if task_guard.is_none() { - let thread_timeline = Arc::clone(self); - let (finish_sender, finish_receiver) = mpsc::channel(); - - match thread::Builder::new() - .name(format!( - "Timeline {timeline_id} initial logical size calculation" - )) - .spawn(move || { - let _enter = info_span!("initial_logical_size_calculation", timeline = %timeline_id).entered(); - let calculated_size = thread_timeline.calculate_logical_size(init_lsn)?; - match thread_timeline.current_logical_size.initial_logical_size.set(calculated_size) { + // Atomically check if the timeline size calculation had already started. 
+ // If the flag was not already set, this sets it. + if !self + .initial_size_computation_started + .swap(true, AtomicOrdering::SeqCst) + { + // We need to start the computation task. + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "initial size calculation", + false, + async move { + let calculated_size = self_clone.calculate_logical_size(init_lsn)?; + let result = spawn_blocking(move || { + self_clone.current_logical_size.initial_logical_size.set(calculated_size) + }).await?; + match result { Ok(()) => info!("Successfully calculated initial logical size"), Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), } - - finish_sender.send(()).ok(); Ok(()) - }) { - Ok(guard) => *task_guard = Some((guard, finish_receiver)), - Err(e) => error!("Failed to spawn timeline {timeline_id} size init task: {e}"), - } + } + .instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id)) + ); } } @@ -1099,22 +1108,23 @@ impl Timeline { self.last_freeze_at.store(last_lsn); *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running + // Launch a task to flush the frozen layer to disk, unless + // a task was already running. 
(If the task was running // at the time that we froze the layer, it must've seen the // the layer we just froze before it exited; see comments // in flush_frozen_layers()) if let Ok(guard) = self.layer_flush_lock.try_lock() { drop(guard); let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, Some(self.tenant_id), Some(self.timeline_id), - "layer flush thread", + "layer flush task", false, - move || self_clone.flush_frozen_layers(false), - )?; + async move { self_clone.flush_frozen_layers(false) }, + ); } } } @@ -1123,8 +1133,8 @@ impl Timeline { /// Flush all frozen layers to disk. /// - /// Only one thread at a time can be doing layer-flushing for a - /// given timeline. If 'wait' is true, and another thread is + /// Only one task at a time can be doing layer-flushing for a + /// given timeline. If 'wait' is true, and another task is /// currently doing the flushing, this function will wait for it /// to finish. If 'wait' is false, this function will return /// immediately instead. diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 86bbf25b67..8b9251229e 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,10 +12,10 @@ pub mod profiling; pub mod reltag; pub mod repository; pub mod storage_sync; +pub mod task_mgr; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_tasks; -pub mod thread_mgr; pub mod timelines; pub mod virtual_file; pub mod walingest; @@ -28,7 +28,7 @@ use std::collections::HashMap; use tracing::info; use utils::zid::{ZTenantId, ZTimelineId}; -use crate::thread_mgr::ThreadKind; +use crate::task_mgr::TaskKind; /// Current storage format version /// @@ -52,30 +52,31 @@ pub enum CheckpointConfig { Forced, } -pub fn shutdown_pageserver(exit_code: i32) { - // Shut down the libpq endpoint thread. 
This prevents new connections from +pub async fn shutdown_pageserver(exit_code: i32) { + // Shut down the libpq endpoint task. This prevents new connections from // being accepted. - thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await; - // Shut down any page service threads. - thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); + // Shut down any page service tasks. + task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await; // Shut down all the tenants. This flushes everything to disk and kills - // the checkpoint and GC threads. - tenant_mgr::shutdown_all_tenants(); + // the checkpoint and GC tasks. + tenant_mgr::shutdown_all_tenants().await; // Stop syncing with remote storage. // - // FIXME: Does this wait for the sync thread to finish syncing what's queued up? + // FIXME: Does this wait for the sync tasks to finish syncing what's queued up? // Should it? - thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::StorageSync), None, None).await; // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. - thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None); + // FIXME: We should probably stop accepting commands like attach/detach earlier. 
+ task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await; // There should be nothing left, but let's be sure - thread_mgr::shutdown_threads(None, None, None); + task_mgr::shutdown_tasks(None, None, None).await; info!("Shut down successfully completed"); std::process::exit(exit_code); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 783fcb2412..149144bfe4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -11,17 +11,21 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use futures::{Stream, StreamExt}; use regex::Regex; -use std::io::{self, Read}; +use std::io; use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::Arc; +use tokio_util::io::StreamReader; +use tokio_util::io::SyncIoBridge; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, lsn::Lsn, - postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend}, + postgres_backend::AuthType, + postgres_backend_async::{self, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, simple_rcu::RcuReadGuard, zid::{ZTenantId, ZTimelineId}, @@ -35,9 +39,9 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; +use crate::task_mgr; +use crate::task_mgr::TaskKind; use crate::tenant_mgr; -use crate::thread_mgr; -use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; @@ -201,93 +205,49 @@ impl PagestreamBeMessage { } } -/// Implements Read for the server side of CopyIn -struct CopyInReader<'a> { - pgb: &'a mut PostgresBackend, +fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { + async_stream::try_stream! { + loop { + let msg = tokio::select! 
{ + biased; - /// Overflow buffer for bytes sent in CopyData messages - /// that the reader (caller of read) hasn't asked for yet. - /// TODO use BytesMut? - buf: Vec, + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + let msg = format!("pageserver is shutting down"); + let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg)); + Err(anyhow::anyhow!(msg)) + } - /// Bytes before `buf_begin` are considered as dropped. - /// This allows us to implement O(1) pop_front on Vec. - /// The Vec won't grow large because we only add to it - /// when it's empty. - buf_begin: usize, -} + msg = pgb.read_message() => { msg } + }; -impl<'a> CopyInReader<'a> { - // NOTE: pgb should be in copy in state already - fn new(pgb: &'a mut PostgresBackend) -> Self { - Self { - pgb, - buf: Vec::<_>::new(), - buf_begin: 0, - } - } -} - -impl<'a> Drop for CopyInReader<'a> { - fn drop(&mut self) { - // Finalize copy protocol so that self.pgb can be reused - // TODO instead, maybe take ownership of pgb and give it back at the end - let mut buf: Vec = vec![]; - let _ = self.read_to_end(&mut buf); - } -} - -impl<'a> Read for CopyInReader<'a> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - while !thread_mgr::is_shutdown_requested() { - // Return from buffer if nonempty - if self.buf_begin < self.buf.len() { - let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin); - buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]); - self.buf_begin += bytes_to_read; - return Ok(bytes_to_read); - } - - // Delete garbage - self.buf.clear(); - self.buf_begin = 0; - - // Wait for client to send CopyData bytes - match self.pgb.read_message() { + match msg { Ok(Some(message)) => { let copy_data_bytes = match message { FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => return Ok(0), + FeMessage::CopyDone => { break }, FeMessage::Sync => continue, m => { let msg = format!("unexpected message {:?}", m); - 
self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?; - return Err(io::Error::new(io::ErrorKind::Other, msg)); + pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; + break; } }; - // Return as much as we can, saving the rest in self.buf - let mut reader = copy_data_bytes.reader(); - let bytes_read = reader.read(buf)?; - reader.read_to_end(&mut self.buf)?; - return Ok(bytes_read); + yield copy_data_bytes; } Ok(None) => { let msg = "client closed connection"; - self.pgb.write_message(&BeMessage::ErrorResponse(msg))?; - return Err(io::Error::new(io::ErrorKind::Other, msg)); + pgb.write_message(&BeMessage::ErrorResponse(msg))?; + pgb.flush().await?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; } Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(io::Error::new(io::ErrorKind::Other, e)); - } + Err(io::Error::new(io::ErrorKind::Other, e))?; } - } + }; } - - // Shutting down - let msg = "Importer thread was shut down"; - Err(io::Error::new(io::ErrorKind::Other, msg)) } } @@ -296,61 +256,49 @@ impl<'a> Read for CopyInReader<'a> { /// /// Main loop of the page service. /// -/// Listens for connections, and launches a new handler thread for each. +/// Listens for connections, and launches a new handler task for each. /// -pub fn thread_main( +pub async fn libpq_listener_main( conf: &'static PageServerConf, auth: Option>, listener: TcpListener, auth_type: AuthType, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; - let basic_rt = tokio::runtime::Builder::new_current_thread() - .enable_io() - .build()?; - - let tokio_listener = { - let _guard = basic_rt.enter(); - tokio::net::TcpListener::from_std(listener) - }?; + let tokio_listener = tokio::net::TcpListener::from_std(listener)?; // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = basic_rt.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! 
{ - biased; + while let Some(res) = tokio::select! { + biased; - _ = shutdown_watcher => { - // We were requested to shut down. - None - } - - res = tokio_listener.accept() => { - Some(res) - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + None } - }) { + + res = tokio_listener.accept() => { + Some(res) + } + } { match res { Ok((socket, peer_addr)) => { - // Connection established. Spawn a new thread to handle it. + // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - // PageRequestHandler threads are not associated with any particular - // timeline in the thread manager. In practice most connections will + // PageRequestHandler tasks are not associated with any particular + // timeline in the task manager. In practice most connections will // only deal with a particular timeline, but we don't know which one // yet. - if let Err(err) = thread_mgr::spawn( - ThreadKind::PageRequestHandler, + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::PageRequestHandler, None, None, - "serving Page Service thread", + "serving compute connection task", false, - move || page_service_conn_main(conf, local_auth, socket, auth_type), - ) { - // Thread creation failed. Log the error and continue. - error!("could not spawn page service thread: {:?}", err); - } + page_service_conn_main(conf, local_auth, socket, auth_type), + ); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -364,13 +312,13 @@ pub fn thread_main( Ok(()) } -fn page_service_conn_main( +async fn page_service_conn_main( conf: &'static PageServerConf, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on thread exit. + // Immediately increment the gauge, then create a job to decrement it on task exit. 
// One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); @@ -379,22 +327,17 @@ fn page_service_conn_main( gauge.dec(); } - // We use Tokio to accept the connection, but the rest of the code works with a - // regular socket. Convert. - let socket = socket - .into_std() - .context("could not convert tokio::net:TcpStream to std::net::TcpStream")?; - socket - .set_nonblocking(false) - .context("could not put socket to blocking mode")?; - socket .set_nodelay(true) .context("could not set TCP_NODELAY")?; let mut conn_handler = PageServerHandler::new(conf, auth); - let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - match pgbackend.run(&mut conn_handler) { + let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + + let result = pgbackend + .run(&mut conn_handler, task_mgr::shutdown_watcher) + .await; + match result { Ok(()) => { // we've been requested to shut down Ok(()) @@ -435,92 +378,95 @@ impl PageServerHandler { } } - fn handle_pagerequests( + #[instrument(skip(self, pgb))] + async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - timeline_id: ZTimelineId, tenant_id: ZTenantId, + timeline_id: ZTimelineId, ) -> anyhow::Result<()> { - let _enter = - info_span!("pagestream", timeline = %timeline_id, tenant = %tenant_id).entered(); - // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Check that the timeline exists let timeline = get_local_timeline(tenant_id, timeline_id)?; - /* switch client to COPYBOTH */ + // switch client to COPYBOTH pgb.write_message(&BeMessage::CopyBothResponse)?; + pgb.flush().await?; - while !thread_mgr::is_shutdown_requested() { - let msg = pgb.read_message(); + loop { + let msg = tokio::select! 
{ + biased; - let profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - match msg { - Ok(message) => { - if let Some(message) = message { - trace!("query: {:?}", message); - - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - _ => continue, - }; - - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - - let response = match zenith_fe_msg { - PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_rel_exists_request(&timeline, &req) - }), - PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_nblocks_request(&timeline, &req) - }), - PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_page_at_lsn_request(&timeline, &req) - }), - PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME - .with_label_values(&["get_db_size", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_db_size_request(&timeline, &req) - }), - }; - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough - error!("error reading relation or page version: {:?}", e); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; - } else { - break; - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. 
+ info!("shutdown request received in page handler"); + break; } - Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(e); - } + + msg = pgb.read_message() => { msg } + }; + + let copy_data_bytes = match msg? { + Some(FeMessage::CopyData(bytes)) => bytes, + Some(m) => { + bail!("unexpected message: {m:?} during COPY"); } - } - drop(profiling_guard); + None => break, // client disconnected + }; + + trace!("query: {:?}", copy_data_bytes); + + let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let tenant_str = tenant_id.to_string(); + let timeline_str = timeline_id.to_string(); + + let response = match zenith_fe_msg { + PagestreamFeMessage::Exists(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_rel_exists", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_rel_exists_request(&timeline, &req).await + } + PagestreamFeMessage::Nblocks(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_rel_size", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_nblocks_request(&timeline, &req).await + } + PagestreamFeMessage::GetPage(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_page_at_lsn", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_page_at_lsn_request(&timeline, &req).await + } + PagestreamFeMessage::DbSize(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_db_size", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_db_size_request(&timeline, &req).await + } + }; + + let response = response.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough + error!("error reading relation or page version: {:?}", e); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + pgb.flush().await?; } Ok(()) } - fn handle_import_basebackup( + #[instrument(skip(self, 
pgb))] + async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, tenant_id: ZTenantId, @@ -528,10 +474,7 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, ) -> anyhow::Result<()> { - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let _enter = - info_span!("import basebackup", timeline = %timeline_id, tenant = %tenant_id).entered(); - + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -550,8 +493,24 @@ impl PageServerHandler { // Import basebackup provided via CopyData info!("importing basebackup"); pgb.write_message(&BeMessage::CopyInResponse)?; - let reader = CopyInReader::new(pgb); - import_basebackup_from_tar(&*timeline, reader, base_lsn)?; + pgb.flush().await?; + + // import_basebackup_from_tar() is not async, mainly because the Tar crate + // it uses is not async. So we need to jump through some hoops: + // - convert the input from client connection to a synchronous Read + // - use block_in_place() + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?; + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -563,11 +522,14 @@ impl PageServerHandler { info!("flushing layers"); timeline.checkpoint(CheckpointConfig::Flush)?; + timeline.launch_wal_receiver()?; + info!("done"); Ok(()) } - fn handle_import_wal( + #[instrument(skip(self, pgb))] + async fn handle_import_wal( &self, pgb: &mut PostgresBackend, tenant_id: ZTenantId, @@ 
-575,9 +537,7 @@ impl PageServerHandler { start_lsn: Lsn, end_lsn: Lsn, ) -> anyhow::Result<()> { - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let _enter = - info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered(); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let timeline = repo @@ -591,8 +551,22 @@ impl PageServerHandler { // Import wal provided via CopyData info!("importing wal"); pgb.write_message(&BeMessage::CopyInResponse)?; - let reader = CopyInReader::new(pgb); - import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)?; + pgb.flush().await?; + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| { + import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn) + })?; + info!("wal import complete"); + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } // TODO Does it make sense to overshoot? ensure!(timeline.get_last_record_lsn() >= end_lsn); @@ -619,7 +593,7 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( + async fn wait_or_get_last_lsn( timeline: &Timeline, mut lsn: Lsn, latest: bool, @@ -647,7 +621,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -657,7 +631,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -667,15 +641,15 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, ) -> Result { - let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; @@ -684,14 +658,15 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, ) -> Result { - let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; @@ -700,14 +675,15 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, ) -> Result { - let _enter = 
info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; @@ -719,15 +695,15 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, ) -> Result { - let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) - .entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -736,6 +712,11 @@ impl PageServerHandler { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ + + // FIXME: this profiling now happens at a different place than it used to.
The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -743,23 +724,23 @@ impl PageServerHandler { })) } - fn handle_basebackup_request( + #[instrument(skip(self, pgb))] + async fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, + tenant_id: ZTenantId, timeline_id: ZTimelineId, lsn: Option, prev_lsn: Option, - tenant_id: ZTenantId, full_backup: bool, ) -> anyhow::Result<()> { - let span = info_span!("basebackup", timeline = %timeline_id, tenant = %tenant_id, lsn = field::Empty); - let _enter = span.enter(); - info!("starting"); - // check that the timeline exists let timeline = get_local_timeline(tenant_id, timeline_id)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { + // Backup was requested at a particular LSN. Wait for it to arrive. 
+ info!("waiting for {}", lsn); + timeline.wait_lsn(lsn).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -767,18 +748,22 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message(&BeMessage::CopyOutResponse)?; + pgb.flush().await?; /* Send a tarball of the latest layer on the timeline */ - { - let mut writer = CopyDataSink { pgb }; - + let mut writer = CopyDataSink { + pgb, + rt: tokio::runtime::Handle::current(), + }; + tokio::task::block_in_place(|| { let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; - span.record("lsn", &basebackup.lsn.to_string().as_str()); - basebackup.send_tarball()?; - } + tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str()); + basebackup.send_tarball() + })?; pgb.write_message(&BeMessage::CopyDone)?; - info!("done"); + pgb.flush().await?; + info!("basebackup complete"); Ok(()) } @@ -801,7 +786,8 @@ impl PageServerHandler { } } -impl postgres_backend::Handler for PageServerHandler { +#[async_trait::async_trait] +impl postgres_backend_async::Handler for PageServerHandler { fn check_auth_jwt( &mut self, _pgb: &mut PostgresBackend, @@ -831,11 +817,7 @@ impl postgres_backend::Handler for PageServerHandler { Ok(()) } - fn is_shutdown_requested(&self) -> bool { - thread_mgr::is_shutdown_requested() - } - - fn process_query( + async fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, @@ -849,12 +831,13 @@ impl postgres_backend::Handler for PageServerHandler { params.len() == 2, "invalid param number for pagestream command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, timelineid, tenantid)?; + 
self.handle_pagerequests(pgb, tenant_id, timeline_id) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); @@ -864,10 +847,10 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for basebackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; let lsn = if params.len() == 3 { Some(Lsn::from_str(params[2])?) @@ -876,8 +859,9 @@ impl postgres_backend::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -897,11 +881,11 @@ impl postgres_backend::Handler for PageServerHandler { let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::text_col(b"prev_lsn"), RowDescriptor::text_col(b"last_lsn"), ]))? - .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(end_of_timeline.prev.to_string().as_bytes()), Some(end_of_timeline.last.to_string().as_bytes()), ]))? 
@@ -917,8 +901,8 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for fullbackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { @@ -932,11 +916,12 @@ impl postgres_backend::Handler for PageServerHandler { None }; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { // Import the `base` section (everything but the wal) of a basebackup. // Assumes the tenant already exists on this pageserver. 
@@ -952,18 +937,21 @@ impl postgres_backend::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant = ZTenantId::from_str(params[0])?; - let timeline = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - self.check_permission(Some(tenant))?; + self.check_permission(Some(tenant_id))?; - match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + match self + .handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? 
} }; } else if query_string.starts_with("import wal ") { @@ -974,24 +962,27 @@ impl postgres_backend::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant = ZTenantId::from_str(params[0])?; - let timeline = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; let start_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - self.check_permission(Some(tenant))?; + self.check_permission(Some(tenant_id))?; - match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + match self + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? 
} }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("failpoints ") { ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); @@ -1016,7 +1007,7 @@ impl postgres_backend::Handler for PageServerHandler { bail!("Invalid failpoints format"); } } - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); @@ -1024,7 +1015,7 @@ impl postgres_backend::Handler for PageServerHandler { ensure!(params.len() == 1, "invalid param number for config command"); let tenantid = ZTenantId::from_str(params[0])?; let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"compaction_target_size"), @@ -1035,7 +1026,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"image_creation_threshold"), RowDescriptor::int8_col(b"pitr_interval"), ]))? 
- .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(repo.get_checkpoint_distance().to_string().as_bytes()), Some( repo.get_checkpoint_timeout() @@ -1072,10 +1063,10 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let gc_horizon: u64 = caps .get(4) @@ -1084,8 +1075,8 @@ impl postgres_backend::Handler for PageServerHandler { // Use tenant's pitr setting let pitr = repo.get_pitr_interval(); - let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), RowDescriptor::int8_col(b"layers_needed_by_pitr"), @@ -1094,7 +1085,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"layers_removed"), RowDescriptor::int8_col(b"elapsed"), ]))? - .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(result.layers_total.to_string().as_bytes()), Some(result.layers_needed_by_cutoff.to_string().as_bytes()), Some(result.layers_needed_by_pitr.to_string().as_bytes()), @@ -1121,8 +1112,8 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = get_local_timeline(tenant_id, timeline_id)?; timeline.compact()?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&SINGLE_COL_ROWDESC)? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("checkpoint ") { // Run checkpoint immediately on given timeline. @@ -1140,8 +1131,8 @@ impl postgres_backend::Handler for PageServerHandler { // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). timeline.checkpoint(CheckpointConfig::Forced)?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&SINGLE_COL_ROWDESC)? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { // Locate LSN of last transaction with timestamp less or equal than sppecified // TODO lazy static @@ -1158,7 +1149,7 @@ impl postgres_backend::Handler for PageServerHandler { let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"lsn", )]))?; let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? 
{ @@ -1167,14 +1158,12 @@ impl postgres_backend::Handler for PageServerHandler { LsnForTimestamp::Past(_lsn) => "past".into(), LsnForTimestamp::NoData(_lsn) => "nodata".into(), }; - pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; + pgb.write_message(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } - pgb.flush()?; - Ok(()) } } @@ -1194,6 +1183,7 @@ fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result< /// struct CopyDataSink<'a> { pgb: &'a mut PostgresBackend, + rt: tokio::runtime::Handle, } impl<'a> io::Write for CopyDataSink<'a> { @@ -1205,6 +1195,7 @@ impl<'a> io::Write for CopyDataSink<'a> { // FIXME: flush isn't really required, but makes it easier // to view in wireshark self.pgb.write_message(&BeMessage::CopyData(data))?; + self.rt.block_on(self.pgb.flush())?; trace!("CopyData sent for {} bytes!", data.len()); Ok(data.len()) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 57a964cb67..8ebfa6a935 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -37,7 +37,7 @@ //! | access to this storage | //! +------------------------+ //! -//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. +//! First, during startup, the pageserver inits the storage sync task with the async loop, or leaves the loop uninitialised, if configured so. //! The loop inits the storage connection and checks the remote files stored. //! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). //! 
Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can @@ -158,7 +158,6 @@ use once_cell::sync::OnceCell; use remote_storage::GenericRemoteStorage; use tokio::{ fs, - runtime::Runtime, time::{Duration, Instant}, }; use tracing::*; @@ -174,9 +173,10 @@ use crate::{ exponential_backoff, layered_repository::metadata::{metadata_path, TimelineMetadata}, storage_sync::index::RemoteIndex, + task_mgr, + task_mgr::TaskKind, + task_mgr::BACKGROUND_RUNTIME, tenant_mgr::attach_local_tenants, - thread_mgr, - thread_mgr::ThreadKind, }; use crate::{ metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, @@ -264,7 +264,7 @@ impl SyncQueue { .unwrap() .0; - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { return (HashMap::new(), q.len()); } } @@ -574,7 +574,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
-pub fn spawn_storage_sync_thread( +pub fn spawn_storage_sync_task( conf: &'static PageServerConf, local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet)>, storage: GenericRemoteStorage, @@ -590,11 +590,6 @@ pub fn spawn_storage_sync_thread( None => bail!("Could not get sync queue during the sync loop step, aborting"), }; - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .context("Failed to create storage sync runtime")?; - // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time: // * we need to list every timeline for tenant on S3, that might be a costly operation // * we need to download every timeline for the tenant, to activate it in memory @@ -616,7 +611,7 @@ pub fn spawn_storage_sync_thread( } } - let applicable_index_parts = runtime.block_on(download_index_parts( + let applicable_index_parts = BACKGROUND_RUNTIME.block_on(download_index_parts( conf, &storage, keys_for_index_part_downloads, @@ -625,7 +620,7 @@ pub fn spawn_storage_sync_thread( let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; let mut local_timeline_init_statuses = schedule_first_sync_tasks( - &mut runtime.block_on(remote_index.write()), + &mut BACKGROUND_RUNTIME.block_on(remote_index.write()), sync_queue, timelines_to_sync, ); @@ -634,31 +629,30 @@ pub fn spawn_storage_sync_thread( .extend(empty_tenants.0.into_iter()); let remote_index_clone = remote_index.clone(); - thread_mgr::spawn( - ThreadKind::StorageSync, + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::StorageSync, None, None, - "Remote storage sync thread", + "Remote storage sync task", false, - move || { + async move { storage_sync_loop( - runtime, conf, (storage, remote_index_clone, sync_queue), max_sync_errors, - ); + ) + .await; Ok(()) }, - ) - .context("Failed to spawn remote storage sync thread")?; + ); Ok(SyncStartupData { remote_index, local_timeline_init_statuses, }) } -fn 
storage_sync_loop( - runtime: Runtime, +async fn storage_sync_loop( conf: &'static PageServerConf, (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, @@ -669,7 +663,7 @@ fn storage_sync_loop( let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { info!("Shutdown requested, stopping"); break; } @@ -683,20 +677,19 @@ fn storage_sync_loop( } // Concurrently perform all the tasks in the batch - let loop_step = runtime.block_on(async { - tokio::select! { - step = process_batches( - conf, - max_sync_errors, - loop_storage, - &index, - batched_tasks, - sync_queue, - ) - .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step), - _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), - } - }); + let loop_step = tokio::select! { + step = process_batches( + conf, + max_sync_errors, + loop_storage, + &index, + batched_tasks, + sync_queue, + ) + .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step) + , + _ = task_mgr::shutdown_watcher() => ControlFlow::Break(()), + }; match loop_step { ControlFlow::Continue(updated_tenants) => { @@ -708,7 +701,7 @@ fn storage_sync_loop( updated_tenants.len() ); let mut timelines_to_attach = TenantTimelineValues::new(); - let index_accessor = runtime.block_on(index.read()); + let index_accessor = index.read().await; for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { Some(tenant_entry) => tenant_entry, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 7070f941f5..a4285e426b 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -153,7 +153,7 @@ pub(super) async fn upload_timeline_layers<'a>( // We have run the upload sync task, but the file we wanted to upload is gone. 
// This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to // retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and - // run compaction/gc threads, removing redundant files from disk. + // run compaction/gc tasks, removing redundant files from disk. // It's not good to pause GC/compaction because of those and we would rather skip such uploads. // // Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance). diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs new file mode 100644 index 0000000000..2aa803d119 --- /dev/null +++ b/pageserver/src/task_mgr.rs @@ -0,0 +1,463 @@ +//! +//! This module provides centralized handling of tokio tasks in the Page Server. +//! +//! We provide a few basic facilities: +//! - A global registry of tasks that lists what kind of tasks they are, and +//! which tenant or timeline they are working on +//! +//! - The ability to request a task to shut down. +//! +//! +//! # How it works? +//! +//! There is a global hashmap of all the tasks (`TASKS`). Whenever a new +//! task is spawned, a PageServerTask entry is added there, and when a +//! task dies, it removes itself from the hashmap. If you want to kill a +//! task, you can scan the hashmap to find it. +//! +//! # Task shutdown +//! +//! To kill a task, we rely on co-operation from the victim. Each task is +//! expected to periodically call the `is_shutdown_requested()` function, and +//! if it returns true, exit gracefully. In addition to that, when waiting for +//! the network or other long-running operation, you can use +//! `shutdown_watcher()` function to get a Future that will become ready if +//! the current task has been requested to shut down. You can use that with +//! Tokio select!(). +//! +//! +//! TODO: This would be a good place to also handle panics in a somewhat sane way. +//! 
Depending on what task panics, we might want to kill the whole server, or +//! only a single tenant or timeline. +//! + +// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. +// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. +#![allow(clippy::declare_interior_mutable_const)] + +use std::collections::HashMap; +use std::future::Future; +use std::panic::AssertUnwindSafe; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; + +use futures::FutureExt; +use tokio::runtime::Runtime; +use tokio::sync::watch; +use tokio::task::JoinHandle; +use tokio::task_local; + +use tracing::{debug, error, info, warn}; + +use once_cell::sync::Lazy; + +use utils::zid::{ZTenantId, ZTimelineId}; + +use crate::shutdown_pageserver; + +// +// There are four runtimes: +// +// Compute request runtime +// - used to handle connections from compute nodes. Any tasks related to satisfying +// GetPage requests, base backups, import, and other such compute node operations +// are handled by the Compute request runtime +// - page_service.rs +// - this includes layer downloads from remote storage, if a layer is needed to +// satisfy a GetPage request +// +// Management request runtime +// - used to handle HTTP API requests +// +// WAL receiver runtime: +// - used to handle WAL receiver connections. +// - and to receiver updates from etcd +// +// Background runtime +// - layer flushing +// - garbage collection +// - compaction +// - remote storage uploads +// - initial tenant loading +// +// Everything runs in a tokio task. If you spawn new tasks, spawn it using the correct +// runtime. +// +// There might be situations when one task needs to wait for a task running in another +// Runtime to finish. For example, if a background operation needs a layer from remote +// storage, it will start to download it. 
If a background operation needs a remote layer, +// and the download was already initiated by a GetPage request, the background task +// will wait for the download - running in the Page server runtime - to finish. +// Another example: the initial tenant loading tasks are launched in the background ops +// runtime. If a GetPage request comes in before the load of a tenant has finished, the +// GetPage request will wait for the tenant load to finish. +// +// The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to +// protect data structures. Let's keep it that way. Synchronous code is easier to debug +// and analyze, and there's a lot of hairy, low-level, performance critical code there. +// +// It's nice to have different runtimes, so that you can quickly eyeball how much CPU +// time each class of operations is taking, with 'top -H' or similar. +// +// It's also good to avoid hogging all threads that would be needed to process +// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't +// happen, but still. 
+// +pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("compute request worker") + .enable_all() + .build() + .expect("Failed to create compute request runtime") +}); + +pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("mgmt request worker") + .enable_all() + .build() + .expect("Failed to create mgmt request runtime") +}); + +pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("walreceiver worker") + .enable_all() + .build() + .expect("Failed to create walreceiver runtime") +}); + +pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("background op worker") + .enable_all() + .build() + .expect("Failed to create background op runtime") +}); + +pub struct PageserverTaskId(u64); + +/// Each task that we track is associated with a "task ID". It's just an +/// increasing number that we assign. Note that it is different from tokio::task::Id. +static NEXT_TASK_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); + +/// Global registry of tasks +static TASKS: Lazy>>> = + Lazy::new(|| Mutex::new(HashMap::new())); + +task_local! { + // There is a Tokio watch channel for each task, which can be used to signal the + // task that it needs to shut down. This task local variable holds the receiving + // end of the channel. The sender is kept in the global registry, so that anyone + // can send the signal to request task shutdown. + static SHUTDOWN_RX: watch::Receiver; + + // Each task holds reference to its own PageServerTask here. + static CURRENT_TASK: Arc; +} + +/// +/// There are many kinds of tasks in the system. Some are associated with a particular +/// tenant or timeline, while others are global. +/// +/// Note that we don't try to limit how many task of a certain kind can be running +/// at the same time. 
+/// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum TaskKind { + // libpq listener task. It just accepts connection and spawns a + // PageRequestHandler task for each connection. + LibpqEndpointListener, + + // HTTP endpoint listener. + HttpEndpointListener, + + // Task that handles a single connection. A PageRequestHandler task + // starts detached from any particular tenant or timeline, but it can be + // associated with one later, after receiving a command from the client. + PageRequestHandler, + + // Manages the WAL receiver connection for one timeline. It subscribes to + // events from etcd, decides which safekeeper to connect to. It spawns a + // separate WalReceiverConnection task to handle each connection. + WalReceiverManager, + + // Handles a connection to a safekeeper, to stream WAL to a timeline. + WalReceiverConnection, + + // Garbage collection worker. One per tenant + GarbageCollector, + + // Compaction. One per tenant. + Compaction, + + // Initial logical size calculation + InitialLogicalSizeCalculation, + + // Task that flushes frozen in-memory layers to disk + LayerFlushTask, + + // Task that manages the remote upload queue + StorageSync, + + // task that handles the initial downloading of all tenants + InitialLoad, + + // task that handles attaching a tenant + Attach, +} + +#[derive(Default)] +struct MutableTaskState { + /// Tenant and timeline that this task is associated with. + tenant_id: Option, + timeline_id: Option, + + /// Handle for waiting for the task to exit. It can be None, if the + /// the task has already exited. + join_handle: Option>, +} + +struct PageServerTask { + #[allow(dead_code)] // unused currently + task_id: PageserverTaskId, + + kind: TaskKind, + + name: String, + + // To request task shutdown, send 'true' to the channel to notify the task. 
+ shutdown_tx: watch::Sender, + + mutable: Mutex, +} + +/// Launch a new task +/// Note: if shutdown_process_on_error is set to true failure +/// of the task will lead to shutdown of entire process +pub fn spawn( + runtime: &tokio::runtime::Handle, + kind: TaskKind, + tenant_id: Option, + timeline_id: Option, + name: &str, + shutdown_process_on_error: bool, + future: F, +) -> PageserverTaskId +where + F: Future> + Send + 'static, +{ + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed); + let task = Arc::new(PageServerTask { + task_id: PageserverTaskId(task_id), + kind, + name: name.to_string(), + shutdown_tx, + mutable: Mutex::new(MutableTaskState { + tenant_id, + timeline_id, + join_handle: None, + }), + }); + + TASKS.lock().unwrap().insert(task_id, Arc::clone(&task)); + + let mut task_mut = task.mutable.lock().unwrap(); + + let task_name = name.to_string(); + let task_cloned = Arc::clone(&task); + let join_handle = runtime.spawn(task_wrapper( + task_name, + task_id, + task_cloned, + shutdown_rx, + shutdown_process_on_error, + future, + )); + task_mut.join_handle = Some(join_handle); + drop(task_mut); + + // The task is now running. Nothing more to do here + PageserverTaskId(task_id) +} + +/// This wrapper function runs in a newly-spawned task. It initializes the +/// task-local variables and calls the payload function. +async fn task_wrapper( + task_name: String, + task_id: u64, + task: Arc, + shutdown_rx: watch::Receiver, + shutdown_process_on_error: bool, + future: F, +) where + F: Future> + Send + 'static, +{ + debug!("Starting task '{}'", task_name); + + let result = SHUTDOWN_RX + .scope( + shutdown_rx, + CURRENT_TASK.scope(task, { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. 
+ AssertUnwindSafe(future).catch_unwind() + }), + ) + .await; + task_finish(result, task_name, task_id, shutdown_process_on_error).await; +} + +async fn task_finish( + result: std::result::Result< + anyhow::Result<()>, + std::boxed::Box, + >, + task_name: String, + task_id: u64, + shutdown_process_on_error: bool, +) { + // Remove our entry from the global hashmap. + let task = TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); + + let mut shutdown_process = false; + { + let task_mut = task.mutable.lock().unwrap(); + + match result { + Ok(Ok(())) => { + debug!("Task '{}' exited normally", task_name); + } + Ok(Err(err)) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + Err(err) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + } + } + + if shutdown_process { + shutdown_pageserver(1).await; + } +} + +// expected to be called from the task of the given id. +pub fn associate_with(tenant_id: Option, timeline_id: Option) { + CURRENT_TASK.with(|ct| { + let mut task_mut = ct.mutable.lock().unwrap(); + task_mut.tenant_id = tenant_id; + task_mut.timeline_id = timeline_id; + }); +} + +/// Is there a task running that matches the criteria + +/// Signal and wait for tasks to shut down. +/// +/// +/// The arguments are used to select the tasks to kill. 
Any None arguments are +/// ignored. For example, to shut down all WalReceiver tasks: +/// +/// shutdown_tasks(Some(TaskKind::WalReceiver), None, None) +/// +/// Or to shut down all tasks for given timeline: +/// +/// shutdown_tasks(None, Some(tenantid), Some(timelineid)) +/// +pub async fn shutdown_tasks( + kind: Option, + tenant_id: Option, + timeline_id: Option, +) { + let mut victim_tasks = Vec::new(); + + { + let tasks = TASKS.lock().unwrap(); + for task in tasks.values() { + let task_mut = task.mutable.lock().unwrap(); + if (kind.is_none() || Some(task.kind) == kind) + && (tenant_id.is_none() || task_mut.tenant_id == tenant_id) + && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) + { + let _ = task.shutdown_tx.send_replace(true); + victim_tasks.push(Arc::clone(task)); + } + } + } + + for task in victim_tasks { + let join_handle = { + let mut task_mut = task.mutable.lock().unwrap(); + info!("waiting for {} to shut down", task.name); + let join_handle = task_mut.join_handle.take(); + drop(task_mut); + join_handle + }; + if let Some(join_handle) = join_handle { + let _ = join_handle.await; + } else { + // Possibly one of: + // * The task had not even fully started yet. + // * It was shut down concurrently and already exited + } + } +} + +pub fn current_task_kind() -> Option { + CURRENT_TASK.try_with(|ct| ct.kind).ok() +} + +/// A Future that can be used to check if the current task has been requested to +/// shut down. +pub async fn shutdown_watcher() { + let mut shutdown_rx = SHUTDOWN_RX + .try_with(|rx| rx.clone()) + .expect("shutdown_requested() called in an unexpected task or thread"); + + while !*shutdown_rx.borrow() { + if shutdown_rx.changed().await.is_err() { + break; + } + } +} + +/// Has the current task been requested to shut down? 
+pub fn is_shutdown_requested() -> bool { + if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) { + *shutdown_rx.borrow() + } else { + if !cfg!(test) { + warn!("is_shutdown_requested() called in an unexpected task or thread"); + } + false + } +} diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index baa58f5eb5..db256b0f65 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -5,14 +5,14 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; use crate::layered_repository::ephemeral_file::is_ephemeral_file; use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; -use crate::layered_repository::{Repository, Timeline}; +use crate::layered_repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; +use crate::task_mgr::{self, TaskKind}; use crate::tenant_config::TenantConfOpt; -use crate::thread_mgr::ThreadKind; -use crate::walredo::PostgresRedoManager; -use crate::{thread_mgr, timelines, walreceiver, TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::Context; +use crate::walredo::{PostgresRedoManager, WalRedoManager}; +use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; +use anyhow::{ensure, Context}; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use std::collections::hash_map::{self, Entry}; @@ -21,34 +21,24 @@ use std::ffi::OsStr; use std::fmt; use std::path::{Path, PathBuf}; use std::sync::Arc; -use tokio::sync::mpsc; use tracing::*; -pub use tenants_state::try_send_timeline_update; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::crashsafe_dir; +use utils::zid::{ZTenantId, ZTimelineId}; mod tenants_state { - use anyhow::ensure; use once_cell::sync::Lazy; use std::{ collections::HashMap, sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, }; - use tokio::sync::mpsc; - use tracing::{debug, error}; 
use utils::zid::ZTenantId; - use crate::tenant_mgr::{LocalTimelineUpdate, Tenant}; + use crate::tenant_mgr::Tenant; static TENANTS: Lazy>> = Lazy::new(|| RwLock::new(HashMap::new())); - /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, - /// so that it can enable/disable corresponding processes. - static TIMELINE_UPDATE_SENDER: Lazy< - RwLock>>, - > = Lazy::new(|| RwLock::new(None)); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { TENANTS .read() @@ -60,39 +50,6 @@ mod tenants_state { .write() .expect("Failed to write() tenants lock, it got poisoned") } - - pub(super) fn set_timeline_update_sender( - timeline_updates_sender: mpsc::UnboundedSender, - ) -> anyhow::Result<()> { - let mut sender_guard = TIMELINE_UPDATE_SENDER - .write() - .expect("Failed to write() timeline_update_sender lock, it got poisoned"); - ensure!(sender_guard.is_none(), "Timeline update sender already set"); - *sender_guard = Some(timeline_updates_sender); - Ok(()) - } - - pub fn try_send_timeline_update(update: LocalTimelineUpdate) { - match TIMELINE_UPDATE_SENDER - .read() - .expect("Failed to read() timeline_update_sender lock, it got poisoned") - .as_ref() - { - Some(sender) => { - if let Err(e) = sender.send(update) { - error!("Failed to send timeline update: {}", e); - } - } - None => debug!("Timeline update sender is not enabled, cannot send update {update:?}"), - } - } - - pub(super) fn stop_timeline_update_sender() { - TIMELINE_UPDATE_SENDER - .write() - .expect("Failed to write() timeline_update_sender lock, it got poisoned") - .take(); - } } struct Tenant { @@ -103,9 +60,6 @@ struct Tenant { #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] pub enum TenantState { - // All data for this tenant is complete on local disk, but we haven't loaded the Repository, - // Timeline and Layer structs into memory yet, so it cannot be accessed yet. 
- //Ready, // This tenant exists on local disk, and the layer map has been loaded into memory. // The local disk might have some newer files that don't exist in cloud storage yet. Active, @@ -139,10 +93,6 @@ pub fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result { let _entered = info_span!("init_tenant_mgr").entered(); - let (timeline_updates_sender, timeline_updates_receiver) = - mpsc::unbounded_channel::(); - tenants_state::set_timeline_update_sender(timeline_updates_sender)?; - walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; let local_tenant_files = local_tenant_timeline_files(conf) .context("Failed to collect local tenant timeline files")?; @@ -156,7 +106,7 @@ pub fn init_tenant_mgr( let SyncStartupData { remote_index, local_timeline_init_statuses, - } = storage_sync::spawn_storage_sync_thread( + } = storage_sync::spawn_storage_sync_task( conf, local_tenant_files, storage, @@ -185,27 +135,6 @@ pub fn init_tenant_mgr( Ok(remote_index) } -pub enum LocalTimelineUpdate { - Detach { - id: ZTenantTimelineId, - // used to signal to the detach caller that walreceiver successfully terminated for specified id - join_confirmation_sender: std::sync::mpsc::Sender<()>, - }, - Attach { - id: ZTenantTimelineId, - timeline: Arc, - }, -} - -impl std::fmt::Debug for LocalTimelineUpdate { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Detach { id, .. } => f.debug_tuple("Detach").field(id).finish(), - Self::Attach { id, .. } => f.debug_tuple("Attach").field(id).finish(), - } - } -} - /// Reads local files to load tenants and their timelines given into pageserver's memory. /// Ignores other timelines that might be present for tenant, but were not passed as a parameter. /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", @@ -274,24 +203,26 @@ fn load_local_repo( /// /// Shut down all tenants. 
This runs as part of pageserver shutdown. /// -pub fn shutdown_all_tenants() { - tenants_state::stop_timeline_update_sender(); - let mut m = tenants_state::write_tenants(); - let mut tenantids = Vec::new(); - for (tenantid, tenant) in m.iter_mut() { - match tenant.state { - TenantState::Active | TenantState::Idle | TenantState::Stopping => { - tenant.state = TenantState::Stopping; - tenantids.push(*tenantid) +pub async fn shutdown_all_tenants() { + let tenantids = { + let mut m = tenants_state::write_tenants(); + let mut tenantids = Vec::new(); + for (tenantid, tenant) in m.iter_mut() { + match tenant.state { + TenantState::Active | TenantState::Idle | TenantState::Stopping => { + tenant.state = TenantState::Stopping; + tenantids.push(*tenantid) + } + TenantState::Broken => {} } - TenantState::Broken => {} } - } - drop(m); + drop(m); + tenantids + }; - thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - // Ok, no background threads running anymore. Flush any remaining data in + // Ok, no background tasks running anymore. Flush any remaining data in // memory to disk. 
// // We assume that any incoming connections that might request pages from @@ -314,7 +245,40 @@ pub fn shutdown_all_tenants() { } } -pub fn create_tenant_repository( +fn create_repo( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: ZTenantId, + wal_redo_manager: Arc, + remote_index: RemoteIndex, +) -> anyhow::Result> { + let repo_dir = conf.tenant_path(&tenant_id); + ensure!( + !repo_dir.exists(), + "cannot create new tenant repo: '{}' directory already exists", + tenant_id + ); + + // top-level dir may exist if we are creating it through CLI + crashsafe_dir::create_dir_all(&repo_dir) + .with_context(|| format!("could not create directory {}", repo_dir.display()))?; + crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; + info!("created directory structure in {}", repo_dir.display()); + + // Save tenant's config + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + + Ok(Arc::new(Repository::new( + conf, + tenant_conf, + wal_redo_manager, + tenant_id, + remote_index, + conf.remote_storage_config.is_some(), + ))) +} + +pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, @@ -327,17 +291,12 @@ pub fn create_tenant_repository( } Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); - let repo = timelines::create_repo( - conf, - tenant_conf, - tenant_id, - wal_redo_manager, - remote_index, - )?; + let repo = create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; v.insert(Tenant { - state: TenantState::Idle, + state: TenantState::Active, repo, }); + crate::tenant_tasks::start_background_loops(tenant_id); Ok(Some(tenant_id)) } } @@ -360,13 +319,15 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { } pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| 
format!("Tenant not found for id {tenant_id}"))?; - let old_state = tenant.state; - tenant.state = new_state; - drop(m); + let old_state = { + let mut m = tenants_state::write_tenants(); + let tenant = m + .get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {tenant_id}"))?; + let old_state = tenant.state; + tenant.state = new_state; + old_state + }; match (old_state, new_state) { (TenantState::Broken, TenantState::Broken) @@ -389,24 +350,15 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. - // TODO maybe use tokio::sync::watch instead? - crate::tenant_tasks::start_compaction_loop(tenant_id)?; - crate::tenant_tasks::start_gc_loop(tenant_id)?; + crate::tenant_tasks::start_background_loops(tenant_id); } (TenantState::Idle, TenantState::Stopping) => { info!("stopping idle tenant {tenant_id}"); } (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { - info!("stopping tenant {tenant_id} threads due to new state {new_state}"); - thread_mgr::shutdown_threads( - Some(ThreadKind::WalReceiverManager), - Some(tenant_id), - None, - ); + info!("stopping tenant {tenant_id} tasks due to new state {new_state}"); - // Wait until all gc/compaction tasks finish - let repo = get_repository_for_tenant(tenant_id)?; - let _guard = repo.file_lock.write().unwrap(); + // Note: The caller is responsible for waiting for any tasks to finish. 
} } @@ -422,28 +374,28 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result<()> { +pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the threads to join + // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join // we cannot create new timelines and tenants, and that can take quite some time, // it can even become stuck due to a bug making whole pageserver unavailable for some operations // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests // will synchronize and either fail with the not found error or succeed - let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, timeline_id), - join_confirmation_sender: sender, - }); - debug!("waiting for wal receiver to shutdown"); - let _ = receiver.recv(); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(tenant_id), + Some(timeline_id), + ) + .await; debug!("wal receiver shutdown confirmed"); - debug!("waiting for threads to shutdown"); - thread_mgr::shutdown_threads(None, None, Some(timeline_id)); - debug!("thread shutdown completed"); + + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; + info!("timeline task shutdown completed"); match tenants_state::read_tenants().get(&tenant_id) { Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} 
not found in local tenant state"), @@ -452,36 +404,17 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow Ok(()) } -pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> { +pub async fn detach_tenant( + conf: &'static PageServerConf, + tenant_id: ZTenantId, +) -> anyhow::Result<()> { set_tenant_state(tenant_id, TenantState::Stopping)?; - // shutdown the tenant and timeline threads: gc, compaction, page service threads) - thread_mgr::shutdown_threads(None, Some(tenant_id), None); + // shutdown all tenant and timeline tasks: gc, compaction, page service) + task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - let mut walreceiver_join_handles = Vec::new(); - let removed_tenant = { + { let mut tenants_accessor = tenants_state::write_tenants(); - tenants_accessor.remove(&tenant_id) - }; - if let Some(tenant) = removed_tenant { - for (timeline_id, _) in tenant.repo.list_timelines() { - let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, timeline_id), - join_confirmation_sender: sender, - }); - walreceiver_join_handles.push((timeline_id, receiver)); - } - } - - // wait for wal receivers to stop without holding the lock, because walreceiver - // will attempt to change tenant state which is protected by the same global tenants lock. - // TODO do we need a timeout here? how to handle it? 
- // recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631 - // need to use crossbeam-channel - for (timeline_id, join_handle) in walreceiver_join_handles { - info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); - join_handle.recv().ok(); - info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); + tenants_accessor.remove(&tenant_id); } // If removal fails there will be no way to successfully retry detach, diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 4e9a5fc6ec..9aaafe7f92 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -1,270 +1,130 @@ //! This module contains functions to serve per-tenant background processes, //! such as compaction and GC -use std::collections::HashMap; -use std::ops::ControlFlow; use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant_mgr; use crate::tenant_mgr::TenantState; -use crate::thread_mgr::ThreadKind; -use crate::{tenant_mgr, thread_mgr}; -use anyhow::{self, Context}; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use once_cell::sync::OnceCell; -use tokio::sync::mpsc; -use tokio::sync::watch; use tracing::*; use utils::zid::ZTenantId; +pub fn start_background_loops(tenant_id: ZTenantId) { + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::Compaction, + Some(tenant_id), + None, + &format!("compactor for tenant {tenant_id}"), + false, + compaction_loop(tenant_id), + ); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::GarbageCollector, + Some(tenant_id), + None, + &format!("garbage collector for tenant {tenant_id}"), + false, + gc_loop(tenant_id), + ); +} + /// /// Compaction task's main loop /// -async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { - loop { - trace!("waking up"); +async fn compaction_loop(tenant_id: ZTenantId) -> 
anyhow::Result<()> { + info!("starting compaction loop for {tenant_id}"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + let result = async { + loop { + trace!("waking up"); + + // Run blocking part of the task - // Run blocking part of the task - let period: Result, _> = tokio::task::spawn_blocking(move || { // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - return Ok(ControlFlow::Break(())); + if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { + break Ok(()); } - - // Break if we're not allowed to write to disk - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // This should not fail. If someone started us, it means that the tenant exists. + // And before you remove a tenant, you have to wait until all the associated tasks + // exit. + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; // Run compaction - let compaction_period = repo.get_compaction_period(); - repo.compaction_iteration()?; - Ok(ControlFlow::Continue(compaction_period)) - }) - .await; - - // Decide whether to sleep or break - let sleep_duration = match period { - Ok(Ok(ControlFlow::Continue(period))) => period, - Ok(Ok(ControlFlow::Break(()))) => break, - Ok(Err(e)) => { + let mut sleep_duration = repo.get_compaction_period(); + if let Err(e) = repo.compaction_iteration() { error!("Compaction failed, retrying: {}", e); - Duration::from_secs(2) + sleep_duration = Duration::from_secs(2) } - Err(e) => { - error!("Compaction join error, retrying: {}", e); - Duration::from_secs(2) - } - }; - // Sleep - tokio::select! { - _ = cancel.changed() => { - trace!("received cancellation request"); - break; - }, - _ = tokio::time::sleep(sleep_duration) => {}, + // Sleep + tokio::select! 
{ + _ = task_mgr::shutdown_watcher() => { + trace!("received cancellation request"); + break Ok(()); + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } } } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - trace!( + info!( "compaction loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenantid) + tenant_mgr::get_tenant_state(tenant_id) ); -} - -static START_GC_LOOP: OnceCell> = OnceCell::new(); -static START_COMPACTION_LOOP: OnceCell> = OnceCell::new(); - -/// Spawn a task that will periodically schedule garbage collection until -/// the tenant becomes inactive. This should be called on tenant -/// activation. -pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> { - START_GC_LOOP - .get() - .context("Failed to get START_GC_LOOP")? - .blocking_send(tenantid) - .context("Failed to send to START_GC_LOOP channel")?; - Ok(()) -} - -/// Spawn a task that will periodically schedule compaction until -/// the tenant becomes inactive. This should be called on tenant -/// activation. -pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> { - START_COMPACTION_LOOP - .get() - .context("failed to get START_COMPACTION_LOOP")? 
- .blocking_send(tenantid) - .context("failed to send to START_COMPACTION_LOOP")?; - Ok(()) -} - -/// Spawn the TenantTaskManager -/// This needs to be called before start_gc_loop or start_compaction_loop -pub fn init_tenant_task_pool() -> anyhow::Result<()> { - let runtime = tokio::runtime::Builder::new_multi_thread() - .thread_name("tenant-task-worker") - .enable_all() - .on_thread_start(|| { - thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker") - }) - .on_thread_stop(thread_mgr::deregister) - .build()?; - - let (gc_send, mut gc_recv) = mpsc::channel::(100); - START_GC_LOOP - .set(gc_send) - .expect("Failed to set START_GC_LOOP"); - - let (compaction_send, mut compaction_recv) = mpsc::channel::(100); - START_COMPACTION_LOOP - .set(compaction_send) - .expect("Failed to set START_COMPACTION_LOOP"); - - // TODO this is getting repetitive - let mut gc_loops = HashMap::>::new(); - let mut compaction_loops = HashMap::>::new(); - - thread_mgr::spawn( - ThreadKind::TenantTaskManager, - None, - None, - "Tenant task manager main thread", - true, - move || { - runtime.block_on(async move { - let mut futures = FuturesUnordered::new(); - loop { - tokio::select! 
{ - _ = thread_mgr::shutdown_watcher() => { - // Send cancellation to all tasks - for (_, cancel) in gc_loops.drain() { - cancel.send(()).ok(); - } - for (_, cancel) in compaction_loops.drain() { - cancel.send(()).ok(); - } - - // Exit after all tasks finish - while let Some(result) = futures.next().await { - match result { - Ok(()) => { - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - }, - Err(e) => { - TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); - error!("loop join error {}", e) - }, - } - } - break; - }, - tenantid = gc_recv.recv() => { - let tenantid = tenantid.expect("Gc task channel closed unexpectedly"); - - // Spawn new task, request cancellation of the old one if exists - let (cancel_send, cancel_recv) = watch::channel(()); - let handle = tokio::spawn(gc_loop(tenantid, cancel_recv) - .instrument(info_span!("gc loop", tenant = %tenantid))); - if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) { - old_cancel_send.send(()).ok(); - } - - // Update metrics, remember handle - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - futures.push(handle); - }, - tenantid = compaction_recv.recv() => { - let tenantid = tenantid.expect("Compaction task channel closed unexpectedly"); - - // Spawn new task, request cancellation of the old one if exists - let (cancel_send, cancel_recv) = watch::channel(()); - let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv) - .instrument(info_span!("compaction loop", tenant = %tenantid))); - if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) { - old_cancel_send.send(()).ok(); - } - - // Update metrics, remember handle - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - futures.push(handle); - }, - result = futures.next() => { - // Log and count any unhandled panics - match result { - Some(Ok(())) => { - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - }, - Some(Err(e)) => { - TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); - 
error!("loop join error {}", e) - }, - None => {}, - }; - }, - } - } - }); - Ok(()) - }, - )?; - - Ok(()) + result } /// /// GC task's main loop /// -async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { - loop { - trace!("waking up"); +async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { + info!("starting gc loop for {tenant_id}"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + let result = async { + loop { + trace!("waking up"); - // Run blocking part of the task - let period: Result, _> = tokio::task::spawn_blocking(move || { // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - return Ok(ControlFlow::Break(())); + if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { + break Ok(()); } - - // Break if we're not allowed to write to disk - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // This should not fail. If someone started us, it means that the tenant exists. + // And before you remove a tenant, you have to wait until all the associated tasks + // exit. + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; // Run gc let gc_period = repo.get_gc_period(); let gc_horizon = repo.get_gc_horizon(); + let mut sleep_duration = gc_period; if gc_horizon > 0 { - repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; + if let Err(e) = repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false) + { + error!("Gc failed, retrying: {}", e); + sleep_duration = Duration::from_secs(2) + } } - Ok(ControlFlow::Continue(gc_period)) - }) - .await; - - // Decide whether to sleep or break - let sleep_duration = match period { - Ok(Ok(ControlFlow::Continue(period))) => period, - Ok(Ok(ControlFlow::Break(()))) => break, - Ok(Err(e)) => { - error!("Gc failed, retrying: {}", e); - Duration::from_secs(2) + // Sleep + tokio::select! 
{ + _ = task_mgr::shutdown_watcher() => { + trace!("received cancellation request"); + break Ok(()); + }, + _ = tokio::time::sleep(sleep_duration) => {}, } - Err(e) => { - error!("Gc join error, retrying: {}", e); - Duration::from_secs(2) - } - }; - - // Sleep - tokio::select! { - _ = cancel.changed() => { - trace!("received cancellation request"); - break; - }, - _ = tokio::time::sleep(sleep_duration) => {}, } } - trace!( + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + info!( "GC loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenantid) + tenant_mgr::get_tenant_state(tenant_id) ); + result } diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs deleted file mode 100644 index cdd38febbc..0000000000 --- a/pageserver/src/thread_mgr.rs +++ /dev/null @@ -1,409 +0,0 @@ -//! -//! This module provides centralized handling of threads in the Page Server. -//! -//! We provide a few basic facilities: -//! - A global registry of threads that lists what kind of threads they are, and -//! which tenant or timeline they are working on -//! -//! - The ability to request a thread to shut down. -//! -//! -//! # How it works? -//! -//! There is a global hashmap of all the threads (`THREADS`). Whenever a new -//! thread is spawned, a PageServerThread entry is added there, and when a -//! thread dies, it removes itself from the hashmap. If you want to kill a -//! thread, you can scan the hashmap to find it. -//! -//! # Thread shutdown -//! -//! To kill a thread, we rely on co-operation from the victim. Each thread is -//! expected to periodically call the `is_shutdown_requested()` function, and -//! if it returns true, exit gracefully. In addition to that, when waiting for -//! the network or other long-running operation, you can use -//! `shutdown_watcher()` function to get a Future that will become ready if -//! the current thread has been requested to shut down. You can use that with -//! 
Tokio select!(), but note that it relies on thread-local storage, so it -//! will only work with the "current-thread" Tokio runtime! -//! -//! -//! TODO: This would be a good place to also handle panics in a somewhat sane way. -//! Depending on what thread panics, we might want to kill the whole server, or -//! only a single tenant or timeline. -//! - -use std::cell::RefCell; -use std::collections::HashMap; -use std::panic; -use std::panic::AssertUnwindSafe; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::{Arc, Mutex}; -use std::thread; -use std::thread::JoinHandle; - -use tokio::sync::watch; - -use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - -use utils::zid::{ZTenantId, ZTimelineId}; - -use crate::shutdown_pageserver; - -/// Each thread that we track is associated with a "thread ID". It's just -/// an increasing number that we assign, not related to any system thread -/// id. -static NEXT_THREAD_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); - -/// Global registry of threads -static THREADS: Lazy>>> = - Lazy::new(|| Mutex::new(HashMap::new())); - -// There is a Tokio watch channel for each thread, which can be used to signal the -// thread that it needs to shut down. This thread local variable holds the receiving -// end of the channel. The sender is kept in the global registry, so that anyone -// can send the signal to request thread shutdown. -thread_local!(static SHUTDOWN_RX: RefCell>> = RefCell::new(None)); - -// Each thread holds reference to its own PageServerThread here. -thread_local!(static CURRENT_THREAD: RefCell>> = RefCell::new(None)); - -/// -/// There are many kinds of threads in the system. Some are associated with a particular -/// tenant or timeline, while others are global. -/// -/// Note that we don't try to limit how may threads of a certain kind can be running -/// at the same time. -/// -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum ThreadKind { - // libpq listener thread. 
It just accepts connection and spawns a - // PageRequestHandler thread for each connection. - LibpqEndpointListener, - - // HTTP endpoint listener. - HttpEndpointListener, - - // Thread that handles a single connection. A PageRequestHandler thread - // starts detached from any particular tenant or timeline, but it can be - // associated with one later, after receiving a command from the client. - PageRequestHandler, - - // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL. - WalReceiverManager, - - // Thread that schedules new compaction and gc jobs - TenantTaskManager, - - // Worker thread for tenant tasks thread pool - TenantTaskWorker, - - // Thread that flushes frozen in-memory layers to disk - LayerFlushThread, - - // Thread for synchronizing pageserver layer files with the remote storage. - // Shared by all tenants. - StorageSync, -} - -#[derive(Default)] -struct MutableThreadState { - /// Tenant and timeline that this thread is associated with. - tenant_id: Option, - timeline_id: Option, - - /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. OR if this thread is managed externally - /// and was not spawned through thread_mgr.rs::spawn function. - join_handle: Option>, -} - -struct PageServerThread { - thread_id: u64, - - kind: ThreadKind, - - name: String, - - // To request thread shutdown, set the flag, and send a dummy message to the - // channel to notify it. 
- shutdown_requested: AtomicBool, - shutdown_tx: watch::Sender<()>, - - mutable: Mutex, -} - -/// Launch a new thread -/// Note: if shutdown_process_on_error is set to true failure -/// of the thread will lead to shutdown of entire process -pub fn spawn( - kind: ThreadKind, - tenant_id: Option, - timeline_id: Option, - name: &str, - shutdown_process_on_error: bool, - f: F, -) -> std::io::Result -where - F: FnOnce() -> anyhow::Result<()> + Send + 'static, -{ - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - let thread = Arc::new(PageServerThread { - thread_id, - kind, - name: name.to_string(), - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - mutable: Mutex::new(MutableThreadState { - tenant_id, - timeline_id, - join_handle: None, - }), - }); - - THREADS - .lock() - .unwrap() - .insert(thread_id, Arc::clone(&thread)); - - let mut thread_mut = thread.mutable.lock().unwrap(); - - let thread_cloned = Arc::clone(&thread); - let thread_name = name.to_string(); - let join_handle = match thread::Builder::new() - .name(name.to_string()) - .spawn(move || { - thread_wrapper( - thread_name, - thread_id, - thread_cloned, - shutdown_rx, - shutdown_process_on_error, - f, - ) - }) { - Ok(handle) => handle, - Err(err) => { - error!("Failed to spawn thread '{}': {}", name, err); - // Could not spawn the thread. Remove the entry - THREADS.lock().unwrap().remove(&thread_id); - return Err(err); - } - }; - thread_mut.join_handle = Some(join_handle); - drop(thread_mut); - - // The thread is now running. Nothing more to do here - Ok(thread_id) -} - -/// This wrapper function runs in a newly-spawned thread. 
It initializes the -/// thread-local variables and calls the payload function -fn thread_wrapper( - thread_name: String, - thread_id: u64, - thread: Arc, - shutdown_rx: watch::Receiver<()>, - shutdown_process_on_error: bool, - f: F, -) where - F: FnOnce() -> anyhow::Result<()> + Send + 'static, -{ - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - CURRENT_THREAD.with(|ct| { - *ct.borrow_mut() = Some(thread); - }); - - debug!("Starting thread '{}'", thread_name); - - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - let result = panic::catch_unwind(AssertUnwindSafe(f)); - - // Remove our entry from the global hashmap. - let thread = THREADS - .lock() - .unwrap() - .remove(&thread_id) - .expect("no thread in registry"); - - let thread_mut = thread.mutable.lock().unwrap(); - match result { - Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), - Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - shutdown_pageserver(1); - } else { - error!( - "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - } - } - Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - shutdown_pageserver(1); - } else { - error!( - "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - } - } - } -} - -// expected to be called from the thread of the given id. 
-pub fn associate_with(tenant_id: Option, timeline_id: Option) { - CURRENT_THREAD.with(|ct| { - let borrowed = ct.borrow(); - let mut thread_mut = borrowed.as_ref().unwrap().mutable.lock().unwrap(); - thread_mut.tenant_id = tenant_id; - thread_mut.timeline_id = timeline_id; - }); -} - -/// Is there a thread running that matches the criteria - -/// Signal and wait for threads to shut down. -/// -/// -/// The arguments are used to select the threads to kill. Any None arguments are -/// ignored. For example, to shut down all WalReceiver threads: -/// -/// shutdown_threads(Some(ThreadKind::WalReceiver), None, None) -/// -/// Or to shut down all threads for given timeline: -/// -/// shutdown_threads(None, Some(timelineid), None) -/// -pub fn shutdown_threads( - kind: Option, - tenant_id: Option, - timeline_id: Option, -) { - let mut victim_threads = Vec::new(); - - let threads = THREADS.lock().unwrap(); - for thread in threads.values() { - let thread_mut = thread.mutable.lock().unwrap(); - if (kind.is_none() || Some(thread.kind) == kind) - && (tenant_id.is_none() || thread_mut.tenant_id == tenant_id) - && (timeline_id.is_none() || thread_mut.timeline_id == timeline_id) - { - thread.shutdown_requested.store(true, Ordering::Relaxed); - // FIXME: handle error? - let _ = thread.shutdown_tx.send(()); - victim_threads.push(Arc::clone(thread)); - } - } - drop(threads); - - for thread in victim_threads { - let mut thread_mut = thread.mutable.lock().unwrap(); - info!("waiting for {} to shut down", thread.name); - if let Some(join_handle) = thread_mut.join_handle.take() { - drop(thread_mut); - let _ = join_handle.join(); - } else { - // Possibly one of: - // * The thread had not even fully started yet. - // * It was shut down concurrently and already exited - // * Is managed through `register`/`deregister` fns without providing a join handle - } - } -} - -/// A Future that can be used to check if the current thread has been requested to -/// shut down. 
-pub async fn shutdown_watcher() { - let _ = SHUTDOWN_RX - .with(|rx| { - rx.borrow() - .as_ref() - .expect("shutdown_requested() called in an unexpected thread") - .clone() - }) - .changed() - .await; -} - -/// Has the current thread been requested to shut down? -pub fn is_shutdown_requested() -> bool { - CURRENT_THREAD.with(|ct| { - if let Some(ct) = ct.borrow().as_ref() { - ct.shutdown_requested.load(Ordering::Relaxed) - } else { - if !cfg!(test) { - warn!("is_shutdown_requested() called in an unexpected thread"); - } - false - } - }) -} - -/// Needed to register threads that were not spawned through spawn function. -/// For example tokio blocking threads. This function is expected to be used -/// in tandem with `deregister`. -/// NOTE: threads registered through this function cannot be joined -pub fn register(kind: ThreadKind, name: &str) { - CURRENT_THREAD.with(|ct| { - let mut borrowed = ct.borrow_mut(); - if borrowed.is_some() { - panic!("thread already registered") - }; - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - - let thread = Arc::new(PageServerThread { - thread_id, - kind, - name: name.to_owned(), - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - mutable: Mutex::new(MutableThreadState { - tenant_id: None, - timeline_id: None, - join_handle: None, - }), - }); - - *borrowed = Some(Arc::clone(&thread)); - - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - - THREADS.lock().unwrap().insert(thread_id, thread); - }); -} - -// Expected to be used in tandem with `register`. 
See the doc for `register` for more details -pub fn deregister() { - CURRENT_THREAD.with(|ct| { - let mut borrowed = ct.borrow_mut(); - let thread = match borrowed.take() { - Some(thread) => thread, - None => panic!("calling deregister on unregistered thread"), - }; - - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = None; - }); - - THREADS.lock().unwrap().remove(&thread.thread_id) - }); -} diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 9356893908..35dec54d5c 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,7 +2,7 @@ //! Timeline management code // -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, Context, Result}; use remote_storage::path_with_suffix_extension; use std::{ @@ -14,21 +14,15 @@ use std::{ use tracing::*; use utils::{ - crashsafe_dir, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; +use crate::config::PageServerConf; +use crate::layered_repository::{Repository, Timeline}; use crate::tenant_mgr; use crate::CheckpointConfig; -use crate::{ - config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, -}; use crate::{import_datadir, TEMP_FILE_SUFFIX}; -use crate::{ - layered_repository::{Repository, Timeline}, - walredo::WalRedoManager, -}; #[derive(Debug, Clone, Copy)] pub struct PointInTime { @@ -36,39 +30,6 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn create_repo( - conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, - wal_redo_manager: Arc, - remote_index: RemoteIndex, -) -> Result> { - let repo_dir = conf.tenant_path(&tenant_id); - ensure!( - !repo_dir.exists(), - "cannot create new tenant repo: '{}' directory already exists", - tenant_id - ); - - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; - info!("created 
directory structure in {}", repo_dir.display()); - - // Save tenant's config - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; - - Ok(Arc::new(Repository::new( - conf, - tenant_conf, - wal_redo_manager, - tenant_id, - remote_index, - conf.remote_storage_config.is_some(), - ))) -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -158,7 +119,7 @@ fn bootstrap_timeline( /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, /// a new unique ID is generated. /// -pub(crate) fn create_timeline( +pub(crate) async fn create_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, new_timeline_id: Option, @@ -187,7 +148,7 @@ pub(crate) fn create_timeline( // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. *lsn = lsn.align(); - ancestor_timeline.wait_lsn(*lsn)?; + ancestor_timeline.wait_lsn(*lsn).await?; let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index d6420e1d18..deac299747 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -23,131 +23,61 @@ mod connection_manager; mod walreceiver_connection; +use crate::config::PageServerConf; +use crate::task_mgr::WALRECEIVER_RUNTIME; + use anyhow::{ensure, Context}; use etcd_broker::Client; use itertools::Itertools; -use std::cell::Cell; -use std::collections::{hash_map, HashMap, HashSet}; +use once_cell::sync::OnceCell; use std::future::Future; -use std::num::NonZeroU64; use std::sync::Arc; -use std::thread_local; -use std::time::Duration; -use tokio::{ - select, - sync::{mpsc, watch}, - task::JoinHandle, -}; +use tokio::sync::watch; use tracing::*; use url::Url; -use crate::config::PageServerConf; -use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; -use 
crate::thread_mgr::{self, ThreadKind}; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +pub use connection_manager::spawn_connection_manager_task; -thread_local! { - // Boolean that is true only for WAL receiver threads - // - // This is used in `wait_lsn` to guard against usage that might lead to a deadlock. - pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); -} +static ETCD_CLIENT: OnceCell = OnceCell::new(); -/// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline. -/// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities. -pub fn init_wal_receiver_main_thread( - conf: &'static PageServerConf, - mut timeline_updates_receiver: mpsc::UnboundedReceiver, -) -> anyhow::Result<()> { +/// +/// Initialize the etcd client. This must be called once at page server startup. +/// +pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> { let etcd_endpoints = conf.broker_endpoints.clone(); ensure!( !etcd_endpoints.is_empty(), "Cannot start wal receiver: etcd endpoints are empty" ); - let broker_prefix = &conf.broker_etcd_prefix; - info!( - "Starting wal receiver main thread, etcd endpoints: {}", - etcd_endpoints.iter().map(Url::to_string).join(", ") - ); - let runtime = tokio::runtime::Builder::new_multi_thread() - .thread_name("wal-receiver-runtime-thread") - .enable_all() - .on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true))) - .build() - .context("Failed to create storage sync runtime")?; - let etcd_client = runtime - .block_on(Client::connect(etcd_endpoints, None)) + let etcd_client = Client::connect(etcd_endpoints.clone(), None) + .await .context("Failed to connect to etcd")?; - thread_mgr::spawn( - ThreadKind::WalReceiverManager, - None, - None, - "WAL receiver manager main thread", - true, - move || { - runtime.block_on(async move { - let mut local_timeline_wal_receivers = HashMap::new(); - loop { - select! 
{ - _ = thread_mgr::shutdown_watcher() => { - info!("Shutdown signal received"); - shutdown_all_wal_connections(&mut local_timeline_wal_receivers).await; - break; - }, - _ = wal_receiver_main_thread_loop_step( - broker_prefix, - &etcd_client, - &mut timeline_updates_receiver, - &mut local_timeline_wal_receivers, - ) => {}, - } - } - }.instrument(info_span!("wal_receiver_main"))); + // FIXME: Should we still allow the pageserver to start, if etcd + // doesn't work? It could still serve GetPage requests, with the + // data it has locally and from what it can download from remote + // storage + if ETCD_CLIENT.set(etcd_client).is_err() { + panic!("etcd already initialized"); + } - info!("Wal receiver main thread stopped"); - Ok(()) - }, - ) - .map(|_thread_id| ()) - .context("Failed to spawn wal receiver main thread") + info!( + "Initialized etcd client with endpoints: {}", + etcd_endpoints.iter().map(Url::to_string).join(", ") + ); + Ok(()) } -async fn shutdown_all_wal_connections( - local_timeline_wal_receivers: &mut HashMap>>, -) { - info!("Shutting down all WAL connections"); - let mut broker_join_handles = Vec::new(); - for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { - for (timeline_id, handles) in timelines { - handles.cancellation.send(()).ok(); - broker_join_handles.push(( - ZTenantTimelineId::new(tenant_id, timeline_id), - handles.handle, - )); - } - } +/// +/// Get a handle to the etcd client +/// +pub fn get_etcd_client() -> &'static etcd_broker::Client { + ETCD_CLIENT.get().expect("etcd client not initialized") +} - let mut tenants = HashSet::with_capacity(broker_join_handles.len()); - for (id, broker_join_handle) in broker_join_handles { - tenants.insert(id.tenant_id); - debug!("Waiting for wal broker for timeline {id} to finish"); - if let Err(e) = broker_join_handle.await { - error!("Failed to join on wal broker for timeline {id}: {e}"); - } - } - if let Err(e) = tokio::task::spawn_blocking(move || { - for tenant_id in tenants { - if 
let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { - error!("Failed to make tenant {tenant_id} idle: {e:?}"); - } - } - }) - .await - { - error!("Failed to await a task to make all tenants idle: {e:?}"); - } +pub fn is_etcd_client_initialized() -> bool { + ETCD_CLIENT.get().is_some() } /// A handle of an asynchronous task. @@ -157,8 +87,7 @@ async fn shutdown_all_wal_connections( /// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission. /// That may lead to certain events not being observed by the listener. #[derive(Debug)] -struct TaskHandle { - handle: JoinHandle>, +pub struct TaskHandle { events_receiver: watch::Receiver>, cancellation: watch::Sender<()>, } @@ -167,7 +96,7 @@ struct TaskHandle { pub enum TaskEvent { Started, NewEvent(E), - End(Result<(), String>), + End, } impl TaskHandle { @@ -184,164 +113,28 @@ impl TaskHandle { let events_sender = Arc::new(events_sender); let sender = Arc::clone(&events_sender); - let handle = tokio::task::spawn(async move { + let _ = WALRECEIVER_RUNTIME.spawn(async move { events_sender.send(TaskEvent::Started).ok(); task(sender, cancellation_receiver).await }); TaskHandle { - handle, events_receiver, cancellation, } } async fn next_task_event(&mut self) -> TaskEvent { - select! { - next_task_event = self.events_receiver.changed() => match next_task_event { - Ok(()) => self.events_receiver.borrow().clone(), - Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await, - }, - task_completion_result = join_on_handle(&mut self.handle) => task_completion_result, + match self.events_receiver.changed().await { + Ok(()) => self.events_receiver.borrow().clone(), + Err(_task_channel_part_dropped) => TaskEvent::End, } } /// Aborts current task, waiting for it to finish. 
- async fn shutdown(self) { + pub async fn shutdown(mut self) { self.cancellation.send(()).ok(); - if let Err(e) = self.handle.await { - error!("Task failed to shut down: {e}") - } + // wait until the sender is dropped + while self.events_receiver.changed().await.is_ok() {} } } - -async fn join_on_handle(handle: &mut JoinHandle>) -> TaskEvent { - match handle.await { - Ok(task_result) => TaskEvent::End(task_result), - Err(e) => { - if e.is_cancelled() { - TaskEvent::End(Ok(())) - } else { - TaskEvent::End(Err(format!("WAL receiver task panicked: {e}"))) - } - } - } -} - -/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. -/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. -/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. -/// -/// Cannot fail, should always try to process the next timeline event even if the other one was not processed properly. -async fn wal_receiver_main_thread_loop_step<'a>( - broker_prefix: &'a str, - etcd_client: &'a Client, - timeline_updates_receiver: &'a mut mpsc::UnboundedReceiver, - local_timeline_wal_receivers: &'a mut HashMap>>, -) { - // Only react on updates from [`tenant_mgr`] on local timeline attach/detach. - match timeline_updates_receiver.recv().await { - Some(update) => { - info!("Processing timeline update: {update:?}"); - match update { - // Timeline got detached, stop all related tasks and remove public timeline data. 
- LocalTimelineUpdate::Detach { - id, - join_confirmation_sender, - } => { - match local_timeline_wal_receivers.get_mut(&id.tenant_id) { - Some(wal_receivers) => { - if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) { - o.remove().shutdown().await - } - if wal_receivers.is_empty() { - if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Idle).await { - error!("Failed to make tenant idle for id {id}: {e:#}"); - } - } - } - None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"), - }; - if let Err(e) = join_confirmation_sender.send(()) { - warn!("cannot send wal_receiver shutdown confirmation {e}") - } else { - info!("confirm walreceiver shutdown for {id}"); - } - } - // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. - LocalTimelineUpdate::Attach { id, timeline } => { - let timeline_connection_managers = local_timeline_wal_receivers - .entry(id.tenant_id) - .or_default(); - - if timeline_connection_managers.is_empty() { - if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Active).await - { - error!("Failed to make tenant active for id {id}: {e:#}"); - return; - } - } - - let vacant_connection_manager_entry = - match timeline_connection_managers.entry(id.timeline_id) { - hash_map::Entry::Occupied(_) => { - debug!("Attepted to readd an existing timeline {id}, ignoring"); - return; - } - hash_map::Entry::Vacant(v) => v, - }; - - let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = - match fetch_tenant_settings(id.tenant_id).await { - Ok(settings) => settings, - Err(e) => { - error!("Failed to fetch tenant settings for id {id}: {e:#}"); - return; - } - }; - - vacant_connection_manager_entry.insert( - connection_manager::spawn_connection_manager_task( - id, - broker_prefix.to_owned(), - etcd_client.clone(), - timeline, - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - ), - ); - } - } - } - None => 
{ - info!("Local timeline update channel closed"); - shutdown_all_wal_connections(local_timeline_wal_receivers).await; - } - } -} - -async fn fetch_tenant_settings( - tenant_id: ZTenantId, -) -> anyhow::Result<(Duration, Duration, NonZeroU64)> { - tokio::task::spawn_blocking(move || { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - Ok::<_, anyhow::Error>(( - repo.get_wal_receiver_connect_timeout(), - repo.get_lagging_wal_timeout(), - repo.get_max_lsn_wal_lag(), - )) - }) - .await - .with_context(|| format!("Failed to join on tenant {tenant_id} settings fetch task"))? -} - -async fn change_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - tokio::task::spawn_blocking(move || { - tenant_mgr::set_tenant_state(tenant_id, new_state) - .with_context(|| format!("Failed to activate tenant {tenant_id}")) - }) - .await - .with_context(|| format!("Failed to spawn activation task for tenant {tenant_id}"))? 
-} diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 0261203049..1fcb768ddf 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -17,6 +17,9 @@ use std::{ }; use crate::layered_repository::Timeline; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::task_mgr::WALRECEIVER_RUNTIME; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -26,7 +29,10 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use crate::{ + exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, +}; use utils::{ lsn::Lsn, zid::{NodeId, ZTenantTimelineId}, @@ -35,29 +41,38 @@ use utils::{ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; /// Spawns the loop to take care of the timeline's WAL streaming connection. 
-pub(super) fn spawn_connection_manager_task( - id: ZTenantTimelineId, +pub fn spawn_connection_manager_task( broker_loop_prefix: String, - mut client: Client, - local_timeline: Arc, + timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, -) -> TaskHandle<()> { - TaskHandle::spawn(move |_, mut cancellation| { +) -> anyhow::Result<()> { + let mut etcd_client = get_etcd_client().clone(); + + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverManager, + Some(tenant_id), + Some(timeline_id), + &format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id), + false, async move { info!("WAL receiver broker started, connecting to etcd"); let mut walreceiver_state = WalreceiverState::new( - id, - local_timeline, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, ); loop { select! { - _ = cancellation.changed() => { - info!("Broker subscription init cancelled, shutting down"); + _ = task_mgr::shutdown_watcher() => { + info!("WAL receiver shutdown requested, shutting down"); + // Kill current connection, if any if let Some(wal_connection) = walreceiver_state.wal_connection.take() { wal_connection.connection_task.shutdown().await; @@ -67,14 +82,15 @@ pub(super) fn spawn_connection_manager_task( _ = connection_manager_loop_step( &broker_loop_prefix, - &mut client, + &mut etcd_client, &mut walreceiver_state, ) => {}, } } } - .instrument(info_span!("wal_connection_manager", id = %id)) - }) + .instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + ); + Ok(()) } /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. 
@@ -85,7 +101,10 @@ async fn connection_manager_loop_step( etcd_client: &mut Client, walreceiver_state: &mut WalreceiverState, ) { - let id = walreceiver_state.id; + let id = ZTenantTimelineId { + tenant_id: walreceiver_state.timeline.tenant_id, + timeline_id: walreceiver_state.timeline.timeline_id, + }; // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, // running the entire loop step as much as possible to an end. @@ -98,6 +117,14 @@ async fn connection_manager_loop_step( loop { let time_until_next_retry = walreceiver_state.time_until_next_retry(); + // These things are happening concurrently: + // + // - keep receiving WAL on the current connection + // - if the shared state says we need to change connection, disconnect and return + // - this runs in a separate task and we receive updates via a watch channel + // - change connection if the rules decide so, or if the current connection dies + // - receive updates from broker + // - this might change the current desired connection select! { broker_connection_result = &mut broker_subscription.watcher_handle => { cleanup_broker_connection(broker_connection_result, walreceiver_state); @@ -110,7 +137,8 @@ async fn connection_manager_loop_step( None => None, } } => { - let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard"); + let wal_connection = walreceiver_state.wal_connection.as_mut() + .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { TaskEvent::Started => {}, TaskEvent::NewEvent(status) => { @@ -123,16 +151,14 @@ async fn connection_manager_loop_step( } wal_connection.status = status; }, - TaskEvent::End(end_result) => { - match end_result { - Ok(()) => debug!("WAL receiving task finished"), - Err(e) => warn!("WAL receiving task failed: {e}"), - }; + TaskEvent::End => { + debug!("WAL receiving task finished"); walreceiver_state.drop_old_connection(false).await; }, } }, + // Got a new update from etcd broker_update = broker_subscription.value_updates.recv() => { match broker_update { Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), @@ -241,8 +267,9 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { id: ZTenantTimelineId, + /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. 
@@ -299,15 +326,18 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( - id: ZTenantTimelineId, - local_timeline: Arc, + timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, ) -> Self { + let id = ZTenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }; Self { id, - local_timeline, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, @@ -323,10 +353,11 @@ impl WalreceiverState { let id = self.id; let connect_timeout = self.wal_connect_timeout; + let timeline = Arc::clone(&self.timeline); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( - id, + timeline, &new_wal_source_connstr, events_sender.as_ref(), cancellation, @@ -520,7 +551,7 @@ impl WalreceiverState { let current_lsn = match existing_wal_connection.status.streaming_lsn { Some(lsn) => lsn, - None => self.local_timeline.get_last_record_lsn(), + None => self.timeline.get_last_record_lsn(), }; let current_commit_lsn = existing_wal_connection .status @@ -1328,7 +1359,7 @@ mod tests { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - local_timeline: harness + timeline: harness .load() .create_empty_timeline(TIMELINE_ID, Lsn(0)) .expect("Failed to create an empty timeline for dummy wal connection manager"), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 4c30481e02..e8fa9f9aca 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -21,11 +21,17 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ - layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest, + layered_repository::{Timeline, WalReceiverInfo}, + task_mgr, + 
task_mgr::TaskKind, + task_mgr::WALRECEIVER_RUNTIME, + tenant_mgr, + walingest::WalIngest, walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; +use utils::zid::ZTenantTimelineId; +use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; /// Status of the connection. #[derive(Debug, Clone)] @@ -48,7 +54,7 @@ pub struct WalConnectionStatus { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. pub async fn handle_walreceiver_connection( - id: ZTenantTimelineId, + timeline: Arc, wal_source_connstr: &str, events_sender: &watch::Sender>, mut cancellation: watch::Receiver<()>, @@ -83,24 +89,31 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. let mut connection_cancellation = cancellation.clone(); - tokio::spawn( + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverConnection, + Some(timeline.tenant_id), + Some(timeline.timeline_id), + "walreceiver connection", + false, async move { select! 
{ - connection_result = connection => match connection_result{ - Ok(()) => info!("Walreceiver db connection closed"), - Err(connection_error) => { - if connection_error.is_closed() { - info!("Connection closed regularly: {connection_error}") - } else { - warn!("Connection aborted: {connection_error}") - } - } - }, + connection_result = connection => match connection_result{ + Ok(()) => info!("Walreceiver db connection closed"), + Err(connection_error) => { + if connection_error.is_closed() { + info!("Connection closed regularly: {connection_error}") + } else { + warn!("Connection aborted: {connection_error}") + } + } + }, - _ = connection_cancellation.changed() => info!("Connection cancelled"), + _ = connection_cancellation.changed() => info!("Connection cancelled"), } + Ok(()) } - .instrument(info_span!("safekeeper_handle_db")), + .instrument(info_span!("walreceiver connection")), ); // Immediately increment the gauge, then create a job to decrement it on task exit. @@ -117,10 +130,6 @@ pub async fn handle_walreceiver_connection( let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = id; connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); @@ -130,17 +139,10 @@ pub async fn handle_walreceiver_connection( return Ok(()); } - let (repo, timeline) = tokio::task::spawn_blocking(move || { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - let timeline = repo.get_timeline(timeline_id) - .with_context(|| { - format!("local timeline {timeline_id} not found for tenant {tenant_id}") - })?; - Ok::<_, anyhow::Error>((repo, timeline)) - }) - .await - .with_context(|| format!("Failed to spawn blocking task to get repository and timeline for tenant {tenant_id} timeline {timeline_id}"))??; + let tenant_id = timeline.tenant_id; + 
let timeline_id = timeline.timeline_id; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; // // Start streaming the WAL, from where we left off previously. @@ -273,11 +275,12 @@ pub async fn handle_walreceiver_connection( } } - let timeline_to_check = Arc::clone(&timeline); - tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) - .await - .with_context(|| format!("Spawned checkpoint check task panicked for timeline {id}"))? - .with_context(|| format!("Failed to check checkpoint distance for timeline {id}"))?; + timeline.check_checkpoint_distance().with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; if let Some(last_lsn) = status_update { let remote_index = repo.get_remote_index(); diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index befa4616be..315ec7f306 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_until from fixtures.types import ZTenantId, ZTimelineId @@ -39,9 +40,6 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for t in timelines: client.timeline_delete(tenant, t) - def assert_idle(tenant): - assert get_state(tenant) == "Idle" - # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) @@ -51,18 +49,21 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Stop compute pg.stop() - # Detach all tenants and wait for them to go idle - # TODO they should be already idle since there are no active computes + # Delete all timelines on all tenants for tenant_info in client.tenant_list(): tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) - wait_until(10, 0.2, lambda: 
assert_idle(tenant_id)) - # Assert that all tasks finish quickly after tenants go idle + # Assert that all tasks finish quickly after tenant is detached + assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + client.tenant_detach(tenant) + client.tenant_detach(env.initial_tenant) + def assert_tasks_finish(): tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended assert tasks_panicked == 0 diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index bfe61b9ced..096b3a5d70 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,9 +47,9 @@ scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } -tokio-util = { version = "0.7", features = ["codec", "io", "tracing"] } +tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } +tracing-core = { version = "0.1", features = ["once_cell", "std", "valuable"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } From 2a837d7de71a3f8bd74bbaa0d85f056bdac6f861 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: 
Tue, 13 Sep 2022 00:04:33 +0300 Subject: [PATCH 10/33] Create tenants in temporary directory first (#2426) --- pageserver/src/layered_repository.rs | 59 ++++++++++--- pageserver/src/tenant_mgr.rs | 127 ++++++++++++++++++++------- test_runner/regress/test_tenants.py | 41 ++++++++- 3 files changed, 182 insertions(+), 45 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 768bdd396b..ecc0bfe3b5 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -21,6 +21,8 @@ use std::collections::BTreeSet; use std::collections::HashMap; use std::fs; use std::fs::File; +use std::fs::OpenOptions; +use std::io::Write; use std::num::NonZeroU64; use std::ops::Bound::Included; use std::path::Path; @@ -38,6 +40,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::metrics::STORAGE_TIME; use crate::repository::GcResult; use crate::task_mgr; +use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -663,14 +666,14 @@ impl Repository { } pub fn persist_tenant_config( - conf: &'static PageServerConf, - tenant_id: ZTenantId, + target_config_path: &Path, tenant_conf: TenantConfOpt, + first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - let target_config_path = TenantConf::path(conf, tenant_id); - info!("save tenantconf to {}", target_config_path.display()); + info!("persisting tenantconf to {}", target_config_path.display()); + // TODO this will prepend comments endlessly let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. @@ -681,12 +684,48 @@ impl Repository { // Convert the config to a toml file. 
conf_content += &toml_edit::easy::to_string(&tenant_conf)?; - fs::write(&target_config_path, conf_content).with_context(|| { - format!( - "Failed to write config file into path '{}'", - target_config_path.display() - ) - }) + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("Failed to write toml bytes into file") + .and_then(|_| { + target_config_file + .sync_all() + .context("Faile to fsync config file") + }) + .with_context(|| { + format!( + "Failed to write config file into path '{}'", + target_config_path.display() + ) + })?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + target_config_path + .parent() + .context("Config file does not have a parent") + .and_then(|target_config_parent| { + File::open(target_config_parent).context("Failed to open config parent") + }) + .and_then(|tenant_dir| { + tenant_dir + .sync_all() + .context("Failed to fsync config parent") + }) + .with_context(|| { + format!( + "Failed to fsync on firts save for config {}", + target_config_path.display() + ) + })?; + } + + Ok(()) } // diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index db256b0f65..a9f015229f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,16 +9,14 @@ use crate::layered_repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::task_mgr::{self, TaskKind}; -use crate::tenant_config::TenantConfOpt; +use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::walredo::{PostgresRedoManager, WalRedoManager}; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::{ensure, Context}; -use remote_storage::GenericRemoteStorage; -use serde::{Deserialize, Serialize}; -use 
std::collections::hash_map::{self, Entry}; -use std::collections::{HashMap, HashSet}; +use anyhow::Context; +use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; +use std::collections::{hash_map, HashMap, HashSet}; use std::ffi::OsStr; -use std::fmt; +use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; use tracing::*; @@ -58,7 +56,7 @@ struct Tenant { repo: Arc, } -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TenantState { // This tenant exists on local disk, and the layer map has been loaded into memory. // The local disk might have some newer files that don't exist in cloud storage yet. @@ -74,8 +72,8 @@ pub enum TenantState { Broken, } -impl fmt::Display for TenantState { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for TenantState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Active => f.write_str("Active"), Self::Idle => f.write_str("Idle"), @@ -252,21 +250,71 @@ fn create_repo( wal_redo_manager: Arc, remote_index: RemoteIndex, ) -> anyhow::Result> { - let repo_dir = conf.tenant_path(&tenant_id); - ensure!( - !repo_dir.exists(), - "cannot create new tenant repo: '{}' directory already exists", - tenant_id + let target_tenant_directory = conf.tenant_path(&tenant_id); + anyhow::ensure!( + !target_tenant_directory.exists(), + "cannot create new tenant repo: '{tenant_id}' directory already exists", ); - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; - info!("created directory structure in {}", repo_dir.display()); + let temporary_tenant_dir = + path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX); + debug!( + 
"Creating temporary directory structure in {}", + temporary_tenant_dir.display() + ); - // Save tenant's config - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + let temporary_tenant_timelines_dir = rebase_directory( + &conf.timelines_path(&tenant_id), + &target_tenant_directory, + &temporary_tenant_dir, + )?; + let temporary_tenant_config_path = rebase_directory( + &TenantConf::path(conf, tenant_id), + &target_tenant_directory, + &temporary_tenant_dir, + )?; + + // top-level dir may exist if we are creating it through CLI + crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| { + format!( + "could not create temporary tenant directory {}", + temporary_tenant_dir.display() + ) + })?; + // first, create a config in the top-level temp directory, fsync the file + Repository::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; + // then, create a subdirectory in the top-level temp directory, fsynced + crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { + format!( + "could not create temporary tenant timelines directory {}", + temporary_tenant_timelines_dir.display() + ) + })?; + + fail::fail_point!("tenant-creation-before-tmp-rename", |_| { + anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); + }); + + // move-rename tmp directory with all files synced into a permanent directory, fsync its parent + fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| { + format!( + "failed to move temporary tenant directory {} into the permanent one {}", + temporary_tenant_dir.display(), + target_tenant_directory.display() + ) + })?; + let target_dir_parent = target_tenant_directory.parent().with_context(|| { + format!( + "Failed to get tenant dir parent for {}", + target_tenant_directory.display() + ) + })?; + fs::File::open(target_dir_parent)?.sync_all()?; + + info!( + "created directory structure in {}", + target_tenant_directory.display() + ); 
Ok(Arc::new(Repository::new( conf, @@ -278,6 +326,17 @@ fn create_repo( ))) } +fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result { + let relative_path = original_path.strip_prefix(base).with_context(|| { + format!( + "Failed to strip base prefix '{}' off path '{}'", + base.display(), + original_path.display() + ) + })?; + Ok(new_base.join(relative_path)) +} + pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, @@ -285,11 +344,11 @@ pub fn create_tenant( remote_index: RemoteIndex, ) -> anyhow::Result> { match tenants_state::write_tenants().entry(tenant_id) { - Entry::Occupied(_) => { + hash_map::Entry::Occupied(_) => { debug!("tenant {tenant_id} already exists"); Ok(None) } - Entry::Vacant(v) => { + hash_map::Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let repo = create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; v.insert(Tenant { @@ -310,7 +369,7 @@ pub fn update_tenant_config( info!("configuring tenant {tenant_id}"); get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + Repository::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; Ok(()) } @@ -424,7 +483,7 @@ pub async fn detach_tenant( // we will attempt to remove files which no longer exist. 
This can be fixed by having shutdown // mechanism for repository that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); - std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { + fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( "Failed to remove local tenant directory '{}'", local_tenant_directory.display() @@ -472,7 +531,7 @@ fn local_tenant_timeline_files( let mut local_tenant_timeline_files = TenantTimelineValues::new(); let tenants_dir = config.tenants_path(); - for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + for tenants_dir_entry in fs::read_dir(&tenants_dir) .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? { match &tenants_dir_entry { @@ -483,7 +542,7 @@ fn local_tenant_timeline_files( "Found temporary tenant directory, removing: {}", tenant_dir_path.display() ); - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { + if let Err(e) = fs::remove_dir_all(&tenant_dir_path) { error!( "Failed to remove temporary directory '{}': {:?}", tenant_dir_path.display(), @@ -545,7 +604,7 @@ fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result { .is_none(); if directory_is_empty { - std::fs::remove_dir_all(&tenant_dir_path).with_context(|| { + fs::remove_dir_all(&tenant_dir_path).with_context(|| { format!( "Failed to remove empty directory '{}'", tenant_dir_path.display(), @@ -582,7 +641,7 @@ fn collect_timelines_for_tenant( let timelines_dir = config.timelines_path(&tenant_id); let mut tenant_timelines = HashMap::new(); - for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + for timelines_dir_entry in fs::read_dir(&timelines_dir) .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))? 
{ match timelines_dir_entry { @@ -593,7 +652,7 @@ fn collect_timelines_for_tenant( "Found temporary timeline directory, removing: {}", timeline_dir.display() ); - if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + if let Err(e) = fs::remove_dir_all(&timeline_dir) { error!( "Failed to remove temporary directory '{}': {:?}", timeline_dir.display(), @@ -660,7 +719,7 @@ fn collect_timeline_files( .parse::() .context("Could not parse timeline id out of the timeline dir name")?; let timeline_dir_entries = - std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; for entry in timeline_dir_entries { let entry_path = entry.context("Failed to list timeline dir entry")?.path(); if entry_path.is_file() { @@ -671,7 +730,7 @@ fn collect_timeline_files( continue; } else if is_temporary(&entry_path) { info!("removing temp timeline file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { + fs::remove_file(&entry_path).with_context(|| { format!( "failed to remove temp download file at {}", entry_path.display() @@ -695,7 +754,7 @@ fn collect_timeline_files( None => anyhow::bail!("No metadata file found in the timeline directory"), }; let metadata = TimelineMetadata::from_bytes( - &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, ) .context("Failed to parse timeline metadata file bytes")?; diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 767f94d167..bd53aae25c 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,16 +1,55 @@ import os from contextlib import closing from datetime import datetime +from pathlib import Path from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import 
PAGESERVER_PER_TENANT_METRICS, parse_metrics -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.types import Lsn, ZTenantId from prometheus_client.samples import Sample +def test_tenant_creation_fails(neon_simple_env: NeonEnv): + tenants_dir = Path(neon_simple_env.repo_dir) / "tenants" + initial_tenants = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + initial_tenant_dirs = set([d for d in tenants_dir.iterdir()]) + + neon_simple_env.pageserver.safe_psql("failpoints tenant-creation-before-tmp-rename=return") + with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): + _ = neon_simple_env.neon_cli.create_tenant() + + new_tenants = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + assert initial_tenants == new_tenants, "should not create new tenants" + + new_tenant_dirs = list(set([d for d in tenants_dir.iterdir()]) - initial_tenant_dirs) + assert len(new_tenant_dirs) == 1, "should have new tenant directory created" + tmp_tenant_dir = new_tenant_dirs[0] + assert str(tmp_tenant_dir).endswith( + ".___temp" + ), "new tenant directory created should be a temporary one" + + neon_simple_env.pageserver.stop() + neon_simple_env.pageserver.start() + + tenants_after_restart = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + dirs_after_restart = set([d for d in tenants_dir.iterdir()]) + assert ( + tenants_after_restart == initial_tenants + ), "should load all non-corrupt tenants after restart" + assert ( + dirs_after_restart == initial_tenant_dirs + ), "pageserver should clean its temp tenant dirs on restart" + + @pytest.mark.parametrize("with_safekeepers", [False, True]) def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): if with_safekeepers: From 4f7557fb58145022450bfb926913b9016c19aab9 
Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 09:45:45 +0100 Subject: [PATCH 11/33] github/workflows: Create projects using API (#2403) * github/actions: add neon projects related actions * workflows/benchmarking: create projects using API * workflows/pg_clients: create projects using API --- .../actions/neon-project-create/action.yml | 81 +++++++++++++ .../actions/neon-project-delete/action.yml | 54 +++++++++ .github/workflows/benchmarking.yml | 113 +++++++++++------- .github/workflows/pg_clients.yml | 18 ++- 4 files changed, 223 insertions(+), 43 deletions(-) create mode 100644 .github/actions/neon-project-create/action.yml create mode 100644 .github/actions/neon-project-delete/action.yml diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml new file mode 100644 index 0000000000..d4fced4196 --- /dev/null +++ b/.github/actions/neon-project-create/action.yml @@ -0,0 +1,81 @@ +name: 'Create Neon Project' +description: 'Create Neon Project using API' + +inputs: + api_key: + desctiption: 'Neon API key' + required: true + environment: + desctiption: 'dev (aka captest) or stage' + required: true + region_id: + desctiption: 'Region ID, if not set the project will be created in the default region' + required: false +outputs: + dsn: + description: 'Created Project DSN (for main database)' + value: ${{ steps.create-neon-project.outputs.dsn }} + project_id: + description: 'Created Project ID' + value: ${{ steps.create-neon-project.outputs.project_id }} + +runs: + using: "composite" + steps: + - name: Parse Input + id: parse-input + shell: bash -euxo pipefail {0} + run: | + case "${ENVIRONMENT}" in + dev) + API_HOST=console.dev.neon.tech + REGION_ID=${REGION_ID:-eu-west-1} + ;; + staging) + API_HOST=console.stage.neon.tech + REGION_ID=${REGION_ID:-us-east-1} + ;; + *) + echo 2>&1 "Unknown environment=${ENVIRONMENT}. 
Allowed 'dev' or 'staging' only" + exit 1 + ;; + esac + + echo "::set-output name=api_host::${API_HOST}" + echo "::set-output name=region_id::${REGION_ID}" + env: + ENVIRONMENT: ${{ inputs.environment }} + REGION_ID: ${{ inputs.region_id }} + + - name: Create Neon Project + id: create-neon-project + # A shell without `set -x` to not to expose password/dsn in logs + shell: bash -euo pipefail {0} + run: | + project=$(curl \ + "https://${API_HOST}/api/v1/projects" \ + --fail \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" \ + --data "{ + \"project\": { + \"platform_id\": \"serverless\", + \"region_id\": \"${REGION_ID}\", + \"settings\": { } + } + }") + + # Mask password + echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')" + + dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main + echo "::add-mask::${dsn}" + echo "::set-output name=dsn::${dsn}" + + project_id=$(echo $project | jq --raw-output '.id') + echo "::set-output name=project_id::${project_id}" + env: + API_KEY: ${{ inputs.api_key }} + API_HOST: ${{ steps.parse-input.outputs.api_host }} + REGION_ID: ${{ steps.parse-input.outputs.region_id }} diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml new file mode 100644 index 0000000000..e7c6f58901 --- /dev/null +++ b/.github/actions/neon-project-delete/action.yml @@ -0,0 +1,54 @@ +name: 'Delete Neon Project' +description: 'Delete Neon Project using API' + +inputs: + api_key: + desctiption: 'Neon API key' + required: true + environment: + desctiption: 'dev (aka captest) or stage' + required: true + project_id: + desctiption: 'ID of the Project to delete' + required: true + +runs: + using: "composite" + steps: + - name: Parse Input + id: parse-input + shell: bash -euxo pipefail {0} + run: | + case "${ENVIRONMENT}" in + dev) + 
API_HOST=console.dev.neon.tech + ;; + staging) + API_HOST=console.stage.neon.tech + ;; + *) + echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only" + exit 1 + ;; + esac + + echo "::set-output name=api_host::${API_HOST}" + env: + ENVIRONMENT: ${{ inputs.environment }} + + - name: Delete Neon Project + shell: bash -euxo pipefail {0} + run: | + # Allow PROJECT_ID to be empty/null for cases when .github/actions/neon-project-create failed + if [ -n "${PROJECT_ID}" ]; then + curl -X "POST" \ + "https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \ + --fail \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + fi + env: + API_KEY: ${{ inputs.api_key }} + PROJECT_ID: ${{ inputs.project_id }} + API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4c58dda6b6..49fbc74dd6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -14,6 +14,13 @@ on: - cron: '36 4 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually + inputs: + environment: + description: 'Environment to run remote tests on (dev or staging)' + required: false + region_id: + description: 'Use a particular region. If empty the default one will be used' + false: true defaults: run: @@ -62,19 +69,12 @@ jobs: echo Pgbench $POSTGRES_DISTRIB_DIR/bin/pgbench --version - # FIXME cluster setup is skipped due to various changes in console API - # for now pre created cluster is used. When API gain some stability - # after massive changes dynamic cluster setup will be revived. - # So use pre created cluster. 
It needs to be started manually, but stop is automatic after 5 minutes of inactivity - - name: Setup cluster - env: - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - run: | - set -e - - echo "Starting cluster" - # wake up the cluster - $POSTGRES_DISTRIB_DIR/bin/psql $BENCHMARK_CONNSTR -c "SELECT 1" + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: ${{ github.event.inputs.environment || 'staging' }} + api_key: ${{ ( github.event.inputs.environment || 'staging' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }} - name: Run benchmark # pgbench is installed system wide from official repo @@ -97,7 +97,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" PLATFORM: "neon-staging" - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally run: | # just to be sure that no data was cached on self hosted runner @@ -115,6 +115,14 @@ jobs: run: | REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh + - name: Delete Neon Project + if: ${{ always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: staging + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 @@ -131,11 +139,12 @@ jobs: POSTGRES_DISTRIB_DIR: /usr TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote + SAVE_PERF_REPORT: true strategy: fail-fast: false matrix: - connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ] + platform: [ neon-captest, rds-aurora ] runs-on: dev container: @@ -147,38 +156,52 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Calculate 
platform - id: calculate-platform - env: - CONNSTR: ${{ matrix.connstr }} - run: | - if [ "${CONNSTR}" = "BENCHMARK_CAPTEST_CONNSTR" ]; then - PLATFORM=neon-captest - elif [ "${CONNSTR}" = "BENCHMARK_RDS_CONNSTR" ]; then - PLATFORM=rds-aurora - else - echo 2>&1 "Unknown CONNSTR=${CONNSTR}. Allowed are BENCHMARK_CAPTEST_CONNSTR, and BENCHMARK_RDS_CONNSTR only" - exit 1 - fi - - echo "::set-output name=PLATFORM::${PLATFORM}" - - name: Install Deps run: | sudo apt -y update sudo apt install -y postgresql-14 + - name: Create Neon Project + if: matrix.platform == 'neon-captest' + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: ${{ github.event.inputs.environment || 'dev' }} + api_key: ${{ ( github.event.inputs.environment || 'dev' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${PLATFORM}" in + neon-captest) + CONNSTR=${{ steps.create-neon-project.outputs.dsn }} + ;; + rds-aurora) + CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} + ;; + *) + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest' or 'rds-aurora'" + exit 1 + ;; + esac + + echo "::set-output name=connstr::${CONNSTR}" + + psql ${CONNSTR} -c "SELECT version();" + env: + PLATFORM: ${{ matrix.platform }} + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -188,25 +211,25 @@ jobs: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - - name: Benchmark simple-update + - name: Benchmark select-only uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + 
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -216,6 +239,14 @@ jobs: action: generate build_type: ${{ env.BUILD_TYPE }} + - name: Delete Neon Project + if: ${{ matrix.platform == 'neon-captest' && always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: dev + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_CAPTEST_API_KEY }} + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index bf14865db2..d04d002811 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -47,11 +47,17 @@ jobs: shell: bash -euxo pipefail {0} run: ./scripts/pysync + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: staging + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + - name: Run pytest env: REMOTE_ENV: 1 - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 shell: bash -euxo pipefail {0} run: | @@ -65,6 +71,14 @@ jobs: -m "remote_cluster" \ -rA "test_runner/pg_clients" + - name: Delete Neon Project + if: ${{ always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: staging + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. 
# It will be fixed after switching to gen2 runner - name: Upload python test logs From f44afbaf62efb2910cefb671457fe60ada9163d5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 13 Sep 2022 12:26:20 +0300 Subject: [PATCH 12/33] Changes of neon extension to support local prefetch (#2369) * Changes of neon extension to support local prefetch * Catch exceptions in pageserver_receive * Bump posgres version * Bump posgres version * Bump posgres version * Bump posgres version --- pgxn/neon/libpagestore.c | 158 +++++++++++++++++++++-------------- pgxn/neon/pagestore_client.h | 6 +- pgxn/neon/pagestore_smgr.c | 139 ++++++++++++++++++++++++++++-- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 5 files changed, 233 insertions(+), 74 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 649fc1037e..d0572e66cb 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -43,11 +43,6 @@ PGconn *pageserver_conn = NULL; char *page_server_connstring_raw; -static ZenithResponse *pageserver_call(ZenithRequest *request); -page_server_api api = { - .request = pageserver_call -}; - static void pageserver_connect() { @@ -154,60 +149,86 @@ retry: } -static ZenithResponse * -pageserver_call(ZenithRequest *request) +static void +pageserver_disconnect(void) +{ + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. 
+ */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } +} + +static void +pageserver_send(ZenithRequest *request) { StringInfoData req_buff; + + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + pageserver_connect(); + + req_buff = zm_pack_request(request); + + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output + * and TCP buffer. + */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + { + char* msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); + neon_log(ERROR, "failed to send page request: %s", msg); + } + pfree(req_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } +} + +static ZenithResponse * +pageserver_receive(void) +{ StringInfoData resp_buff; ZenithResponse *resp; PG_TRY(); { - /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } - - if (!connected) - pageserver_connect(); - - req_buff = zm_pack_request(request); - - /* - * Send request. - * - * In principle, this could block if the output buffer is full, and we - * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output - * and TCP buffer. 
- */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) - { - neon_log(ERROR, "failed to send page request: %s", - PQerrorMessage(pageserver_conn)); - } - pfree(req_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) request); - - neon_log(PageStoreTrace, "sent request: %s", msg); - pfree(msg); - } - /* read response */ resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); resp_buff.cursor = 0; - if (resp_buff.len == -1) - neon_log(ERROR, "end of COPY"); - else if (resp_buff.len == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - + if (resp_buff.len < 0) + { + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + } resp = zm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); @@ -221,20 +242,7 @@ pageserver_call(ZenithRequest *request) } PG_CATCH(); { - /* - * If anything goes wrong while we were sending a request, it's not - * clear what state the connection is in. For example, if we sent the - * request but didn't receive a response yet, we might receive the - * response some time later after we have already sent a new unrelated - * request. Close the connection to avoid getting confused. 
- */ - if (connected) - { - neon_log(LOG, "dropping connection to page server due to error"); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } + pageserver_disconnect(); PG_RE_THROW(); } PG_END_TRY(); @@ -243,6 +251,32 @@ pageserver_call(ZenithRequest *request) } +static void +pageserver_flush(void) +{ + if (PQflush(pageserver_conn)) + { + char* msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); + neon_log(ERROR, "failed to flush page requests: %s", msg); + } +} + +static ZenithResponse * +pageserver_call(ZenithRequest* request) +{ + pageserver_send(request); + pageserver_flush(); + return pageserver_receive(); +} + +page_server_api api = { + .request = pageserver_call, + .send = pageserver_send, + .flush = pageserver_flush, + .receive = pageserver_receive +}; + static bool check_zenith_id(char **newval, void **extra, GucSource source) { diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 93ea6771eb..5b21abc1bd 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -142,7 +142,10 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { ZenithResponse *(*request) (ZenithRequest *request); -} page_server_api; + void (*send) (ZenithRequest *request); + ZenithResponse *(*receive) (void); + void (*flush) (void); +} page_server_api; extern page_server_api *page_server; @@ -171,6 +174,7 @@ extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +extern void zenith_reset_prefetch(SMgrRelation reln); extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index d49df7af58..ebf899dfdb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -57,6 +57,8 @@ #include "postmaster/interrupt.h" 
#include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/relfilenode.h" +#include "storage/buf_internals.h" #include "storage/md.h" #include "fmgr.h" #include "miscadmin.h" @@ -110,6 +112,49 @@ typedef enum static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +/* + * Prefetch implementation: + * Prefetch is performed locally by each backend. + * There can be up to MAX_PREFETCH_REQUESTS registered using smgr_prefetch + * before smgr_read. All this requests are appended to primary smgr_read request. + * It is assumed that pages will be requested in prefetch order. + * Reading of prefetch responses is delayed until them are actually needed (smgr_read). + * It make it possible to parallelize processing and receiving of prefetched pages. + * In case of prefetch miss or any other SMGR request other than smgr_read, + * all prefetch responses has to be consumed. + */ + +#define MAX_PREFETCH_REQUESTS 128 + +BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; +BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; +int n_prefetch_requests; +int n_prefetch_responses; +int n_prefetched_buffers; +int n_prefetch_hits; +int n_prefetch_misses; +XLogRecPtr prefetch_lsn; + +static void +consume_prefetch_responses(void) +{ + for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { + ZenithResponse* resp = page_server->receive(); + pfree(resp); + } + n_prefetched_buffers = 0; + n_prefetch_responses = 0; +} + +static ZenithResponse* +page_server_request(void const* req) +{ + consume_prefetch_responses(); + return page_server->request((ZenithRequest*)req); +} + + StringInfoData zm_pack_request(ZenithRequest *msg) { @@ -735,7 +780,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) .forknum = forkNum }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -948,6 +993,16 @@ zenith_close(SMgrRelation reln, 
ForkNumber forknum) mdclose(reln, forknum); } + +/* + * zenith_reset_prefetch() -- reoe all previously rgistered prefeth requests + */ +void +zenith_reset_prefetch(SMgrRelation reln) +{ + n_prefetch_requests = 0; +} + /* * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -971,9 +1026,15 @@ zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); - return true; + if (n_prefetch_requests < MAX_PREFETCH_REQUESTS) + { + prefetch_requests[n_prefetch_requests].rnode = reln->smgr_rnode.node; + prefetch_requests[n_prefetch_requests].forkNum = forknum; + prefetch_requests[n_prefetch_requests].blockNum = blocknum; + n_prefetch_requests += 1; + return true; + } + return false; } /* @@ -1022,7 +1083,47 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno XLogRecPtr request_lsn, bool request_latest, char *buffer) { ZenithResponse *resp; + int i; + /* + * Try to find prefetched page. + * It is assumed that pages will be requested in the same order as them are prefetched, + * but some other backend may load page in shared buffers, so some prefetch responses should + * be skipped. + */ + for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) + { + resp = page_server->receive(); + if (resp->tag == T_ZenithGetPageResponse && + RelFileNodeEquals(prefetch_responses[i].rnode, rnode) && + prefetch_responses[i].forkNum == forkNum && + prefetch_responses[i].blockNum == blkno) + { + char* page = ((ZenithGetPageResponse *) resp)->page; + /* + * Check if prefetched page is still relevant. + * If it is updated by some other backend, then it should not + * be requested from smgr unless it is evicted from shared buffers. + * In the last case last_evicted_lsn should be updated and + * request_lsn should be greater than prefetch_lsn. 
+ * Maximum with page LSN is used because page returned by page server + * may have LSN either greater either smaller than requested. + */ + if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn) + { + n_prefetched_buffers = i+1; + n_prefetch_hits += 1; + n_prefetch_requests = 0; + memcpy(buffer, page, BLCKSZ); + pfree(resp); + return; + } + } + pfree(resp); + } + n_prefetched_buffers = 0; + n_prefetch_responses = 0; + n_prefetch_misses += 1; { ZenithGetPageRequest request = { .req.tag = T_ZenithGetPageRequest, @@ -1032,10 +1133,29 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno .forknum = forkNum, .blkno = blkno }; - - resp = page_server->request((ZenithRequest *) &request); + if (n_prefetch_requests > 0) + { + /* Combine all prefetch requests with primary request */ + page_server->send((ZenithRequest *) &request); + for (i = 0; i < n_prefetch_requests; i++) + { + request.rnode = prefetch_requests[i].rnode; + request.forknum = prefetch_requests[i].forkNum; + request.blkno = prefetch_requests[i].blockNum; + prefetch_responses[i] = prefetch_requests[i]; + page_server->send((ZenithRequest *) &request); + } + page_server->flush(); + n_prefetch_responses = n_prefetch_requests; + n_prefetch_requests = 0; + prefetch_lsn = request_lsn; + resp = page_server->receive(); + } + else + { + resp = page_server->request((ZenithRequest *) &request); + } } - switch (resp->tag) { case T_ZenithGetPageResponse: @@ -1305,7 +1425,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) .forknum = forknum, }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -1365,7 +1485,7 @@ zenith_dbsize(Oid dbNode) .dbNode = dbNode, }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -1680,6 +1800,7 @@ static const struct f_smgr zenith_smgr = .smgr_unlink = zenith_unlink, .smgr_extend = zenith_extend, .smgr_prefetch = 
zenith_prefetch, + .smgr_reset_prefetch = zenith_reset_prefetch, .smgr_read = zenith_read, .smgr_write = zenith_write, .smgr_writeback = zenith_writeback, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index e8518d3fc8..114676d2ed 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit e8518d3fc85e3da420d2f5a2742a21386e6585ec +Subproject commit 114676d2edd5307226d9448ec467821fdb77467d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 313769bb62..b1dbd93e2b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 313769bb6229f46380e24d8f6ff535f9185458af +Subproject commit b1dbd93e2b1691e93860f7e59b9e1fe5a6e79786 From 1a8c8b04d70bd82a20055e2653c4aa593e3bfc34 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 7 Sep 2022 18:01:49 +0300 Subject: [PATCH 13/33] Merge Repository and Tenant entities, rework tenant background jobs --- control_plane/src/bin/neon_local.rs | 8 +- pageserver/src/basebackup.rs | 2 +- pageserver/src/bin/dump_layerfile.rs | 2 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/bin/update_metadata.rs | 2 +- pageserver/src/config.rs | 2 +- pageserver/src/http/models.rs | 5 +- pageserver/src/http/openapi_spec.yml | 4 +- pageserver/src/http/routes.rs | 123 ++++--- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 4 +- pageserver/src/page_cache.rs | 2 +- pageserver/src/page_service.rs | 56 ++-- pageserver/src/pgdatadir_mapping.rs | 10 +- pageserver/src/storage_sync.rs | 34 +- pageserver/src/storage_sync/delete.rs | 6 +- pageserver/src/storage_sync/download.rs | 10 +- pageserver/src/storage_sync/index.rs | 8 +- pageserver/src/storage_sync/upload.rs | 12 +- .../src/{layered_repository.rs => tenant.rs} | 255 +++++++++----- .../{layered_repository => tenant}/blob_io.rs | 2 +- .../block_io.rs | 2 +- .../delta_layer.rs | 12 +- .../disk_btree.rs | 2 +- .../disk_btree_test_data.rs | 0 .../ephemeral_file.rs | 14 +- .../filename.rs | 0 
.../image_layer.rs | 12 +- .../inmemory_layer.rs | 12 +- .../layer_map.rs | 6 +- .../metadata.rs | 4 +- .../par_fsync.rs | 0 .../storage_layer.rs | 0 .../timeline.rs | 4 +- pageserver/src/tenant_mgr.rs | 312 +++++++----------- pageserver/src/tenant_tasks.rs | 147 ++++++--- pageserver/src/timelines.rs | 31 +- pageserver/src/walingest.rs | 25 +- .../src/walreceiver/connection_manager.rs | 20 +- .../src/walreceiver/walreceiver_connection.rs | 7 +- test_runner/regress/test_broken_timeline.py | 4 +- test_runner/regress/test_tenant_tasks.py | 8 +- test_runner/regress/test_timeline_delete.py | 5 +- 43 files changed, 615 insertions(+), 563 deletions(-) rename pageserver/src/{layered_repository.rs => tenant.rs} (88%) rename pageserver/src/{layered_repository => tenant}/blob_io.rs (98%) rename pageserver/src/{layered_repository => tenant}/block_io.rs (98%) rename pageserver/src/{layered_repository => tenant}/delta_layer.rs (98%) rename pageserver/src/{layered_repository => tenant}/disk_btree.rs (99%) rename pageserver/src/{layered_repository => tenant}/disk_btree_test_data.rs (100%) rename pageserver/src/{layered_repository => tenant}/ephemeral_file.rs (97%) rename pageserver/src/{layered_repository => tenant}/filename.rs (100%) rename pageserver/src/{layered_repository => tenant}/image_layer.rs (97%) rename pageserver/src/{layered_repository => tenant}/inmemory_layer.rs (96%) rename pageserver/src/{layered_repository => tenant}/layer_map.rs (98%) rename pageserver/src/{layered_repository => tenant}/metadata.rs (98%) rename pageserver/src/{layered_repository => tenant}/par_fsync.rs (100%) rename pageserver/src/{layered_repository => tenant}/storage_layer.rs (100%) rename pageserver/src/{layered_repository => tenant}/timeline.rs (99%) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 828d6a2e5a..e3160db53b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -543,13 +543,7 @@ fn 
handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an match tenant_match.subcommand() { Some(("list", _)) => { for t in pageserver.tenant_list()? { - println!( - "{} {}", - t.id, - t.state - .map(|s| s.to_string()) - .unwrap_or_else(|| String::from("")) - ); + println!("{} {:?}", t.id, t.state); } } Some(("create", create_match)) => { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 61facc852d..eca6a3c87f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index 87390a1b06..7e766ce859 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -3,8 +3,8 @@ //! A handy tool for debugging, that's all. 
use anyhow::Result; use clap::{App, Arg}; -use pageserver::layered_repository::dump_layerfile_from_path; use pageserver::page_cache; +use pageserver::tenant::dump_layerfile_from_path; use pageserver::virtual_file; use std::path::PathBuf; use utils::project_git_version; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ec71e5b320..679c6f76e7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -182,7 +182,7 @@ fn initialize_config( cfg_file_path.display() ); } else { - // We're initializing the repo, so there's no config file yet + // We're initializing the tenant, so there's no config file yet ( DEFAULT_CONFIG_FILE .parse::() diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 983fdb8647..3339564b0f 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -3,7 +3,7 @@ //! A handy tool for debugging, that's all. use anyhow::Result; use clap::{App, Arg}; -use pageserver::layered_repository::metadata::TimelineMetadata; +use pageserver::tenant::metadata::TimelineMetadata; use std::path::PathBuf; use std::str::FromStr; use utils::{lsn::Lsn, project_git_version}; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index fb70ea327d..56171f46e3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -19,7 +19,7 @@ use utils::{ zid::{NodeId, ZTenantId, ZTimelineId}, }; -use crate::layered_repository::TIMELINES_SEGMENT_NAME; +use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; pub mod defaults { diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 7c7d7f7b0c..0ccf23776c 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -7,8 +7,7 @@ use utils::{ zid::{NodeId, ZTenantId, ZTimelineId}, }; -// These enums are used in the API response fields. 
-use crate::tenant_mgr::TenantState; +use crate::tenant::TenantState; #[serde_as] #[derive(Serialize, Deserialize)] @@ -108,7 +107,7 @@ impl TenantConfigRequest { pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: ZTenantId, - pub state: Option, + pub state: TenantState, pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 6beb938d6a..b9a62d0f32 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -489,6 +489,7 @@ components: type: object required: - id + - state properties: id: type: string @@ -573,7 +574,6 @@ components: required: - last_record_lsn - disk_consistent_lsn - - timeline_state properties: last_record_lsn: type: string @@ -581,8 +581,6 @@ components: disk_consistent_lsn: type: string format: hex - timeline_state: - type: string ancestor_timeline_id: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 78f83511cb..36ba2e9b66 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,9 +11,9 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::Timeline; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; +use crate::tenant::{TenantState, Timeline}; use crate::tenant_config::TenantConfOpt; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ @@ -132,12 +132,11 @@ fn list_local_timelines( include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> Result> { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("Failed to get repo for tenant {tenant_id}"))?; - let repo_timelines = repo.list_timelines(); 
+ let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + let timelines = tenant.list_timelines(); - let mut local_timeline_info = Vec::with_capacity(repo_timelines.len()); - for (timeline_id, repository_timeline) in repo_timelines { + let mut local_timeline_info = Vec::with_capacity(timelines.len()); + for (timeline_id, repository_timeline) in timelines { local_timeline_info.push(( timeline_id, local_timeline_info_from_timeline( @@ -201,23 +200,31 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; - let local_timeline_infos = tokio::task::spawn_blocking(move || { + let timelines = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - list_local_timelines( - tenant_id, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) + Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines()) }) .await .map_err(ApiError::from_err)??; - let mut response_data = Vec::with_capacity(local_timeline_infos.len()); - for (timeline_id, local_timeline_info) in local_timeline_infos { + let mut response_data = Vec::with_capacity(timelines.len()); + for (timeline_id, timeline) in timelines { + let local = match local_timeline_info_from_timeline( + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) { + Ok(local) => Some(local), + Err(e) => { + error!("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}"); + None + } + }; + response_data.push(TimelineInfo { tenant_id, timeline_id, - local: Some(local_timeline_info), + local, remote: get_state(&request) .remote_index .read() @@ -259,28 +266,25 @@ async fn timeline_detail_handler(request: Request) -> Result(local_timeline) + let timeline = tokio::task::spawn_blocking(move || { + tenant_mgr::get_tenant(tenant_id, 
true)?.get_timeline(timeline_id) }) .await - .ok() - .and_then(|r| r.ok()) - .flatten(); + .map_err(ApiError::from_err)?; + + let local_timeline_info = match timeline.and_then(|timeline| { + local_timeline_info_from_timeline( + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) + }) { + Ok(local_info) => Some(local_info), + Err(e) => { + error!("Failed to get local timeline info: {e:#}"); + None + } + }; let remote_timeline_info = { let remote_index_read = get_state(&request).remote_index.read().await; @@ -294,25 +298,26 @@ async fn timeline_detail_handler(request: Request) -> Result((local_timeline_info, remote_timeline_info)) } .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) - .await; + .await?; if local_timeline_info.is_none() && remote_timeline_info.is_none() { - return Err(ApiError::NotFound(format!( + Err(ApiError::NotFound(format!( "Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely" - ))); + ))) + } else { + json_response( + StatusCode::OK, + TimelineInfo { + tenant_id, + timeline_id, + local: local_timeline_info, + remote: remote_timeline_info, + }, + ) } - - let timeline_info = TimelineInfo { - tenant_id, - timeline_id, - local: local_timeline_info, - remote: remote_timeline_info, - }; - - json_response(StatusCode::OK, timeline_info) } // TODO makes sense to provide tenant config right away the same way as it handled in tenant_create @@ -320,10 +325,10 @@ async fn tenant_attach_handler(request: Request) -> Result, let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - info!("Handling tenant attach {}", tenant_id); + info!("Handling tenant attach {tenant_id}"); tokio::task::spawn_blocking(move || { - if tenant_mgr::get_tenant_state(tenant_id).is_some() { + if tenant_mgr::get_tenant(tenant_id, false).is_ok() { anyhow::bail!("Tenant is already present locally") }; Ok(()) @@ -426,7 
+431,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, ApiErro check_permission(&request, Some(tenant_id))?; // if tenant is in progress of downloading it can be absent in global tenant map - let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id)) + let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false)) .await .map_err(ApiError::from_err)?; @@ -494,13 +499,25 @@ async fn tenant_status(request: Request) -> Result, ApiErro false }); + let tenant_state = match tenant { + Ok(tenant) => tenant.current_state(), + Err(e) => { + error!("Failed to get local tenant state: {e:#}"); + if has_in_progress_downloads { + TenantState::Paused + } else { + TenantState::Broken + } + } + }; + let current_physical_size = match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false)) .await .map_err(ApiError::from_err)? { Err(err) => { - // Getting local timelines can fail when no local repo is on disk (e.g, when tenant data is being downloaded). + // Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded). // In that case, put a warning message into log and operate normally. 
warn!("Failed to get local timelines for tenant {tenant_id}: {err}"); None diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index f8f614f8f4..ee0780f4b2 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -11,9 +11,9 @@ use bytes::Bytes; use tracing::*; use walkdir::WalkDir; -use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; use postgres_ffi::v14::relfile_utils::*; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 8b9251229e..5742568079 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,7 +3,6 @@ pub mod config; pub mod http; pub mod import_datadir; pub mod keyspace; -pub mod layered_repository; pub mod metrics; pub mod page_cache; pub mod page_service; @@ -13,6 +12,7 @@ pub mod reltag; pub mod repository; pub mod storage_sync; pub mod task_mgr; +pub mod tenant; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_tasks; @@ -181,7 +181,7 @@ mod backoff_defaults_tests { #[cfg(test)] mod tests { - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; use super::*; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 27b1400243..15c3c22dd6 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -53,8 +53,8 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::layered_repository::writeback_ephemeral_file; use crate::repository::Key; +use crate::tenant::writeback_ephemeral_file; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 149144bfe4..b03dab20e0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -34,13 +34,13 @@ use utils::{ use 
crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; -use crate::layered_repository::Timeline; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; use crate::tenant_mgr; use crate::CheckpointConfig; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; @@ -477,8 +477,8 @@ impl PageServerHandler { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?; + let timeline = tenant_mgr::get_tenant(tenant_id, true)? + .create_empty_timeline(timeline_id, base_lsn)?; // TODO mark timeline as not ready until it reaches end_lsn. // We might have some wal to import as well, and we should prevent compute @@ -539,10 +539,7 @@ impl PageServerHandler { ) -> anyhow::Result<()> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo - .get_timeline(timeline_id) - .with_context(|| format!("Timeline {timeline_id} was not found"))?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; ensure!(timeline.get_last_record_lsn() == start_lsn); // TODO leave clean state on error. 
For now you can use detach to clean @@ -770,7 +767,7 @@ impl PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenantid: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -782,7 +779,7 @@ impl PageServerHandler { .claims .as_ref() .expect("claims presence already checked"); - auth::check_permission(claims, tenantid) + auth::check_permission(claims, tenant_id) } } @@ -809,7 +806,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } info!( - "jwt auth succeeded for scope: {:#?} by tenantid: {:?}", + "jwt auth succeeded for scope: {:#?} by tenant id: {:?}", data.claims.scope, data.claims.tenant_id, ); @@ -1013,8 +1010,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); - let tenantid = ZTenantId::from_str(params[0])?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let tenant_id = ZTenantId::from_str(params[0])?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -1027,25 +1024,27 @@ impl postgres_backend_async::Handler for PageServerHandler { RowDescriptor::int8_col(b"pitr_interval"), ]))? 
.write_message(&BeMessage::DataRow(&[ - Some(repo.get_checkpoint_distance().to_string().as_bytes()), + Some(tenant.get_checkpoint_distance().to_string().as_bytes()), Some( - repo.get_checkpoint_timeout() + tenant + .get_checkpoint_timeout() .as_secs() .to_string() .as_bytes(), ), - Some(repo.get_compaction_target_size().to_string().as_bytes()), + Some(tenant.get_compaction_target_size().to_string().as_bytes()), Some( - repo.get_compaction_period() + tenant + .get_compaction_period() .as_secs() .to_string() .as_bytes(), ), - Some(repo.get_compaction_threshold().to_string().as_bytes()), - Some(repo.get_gc_horizon().to_string().as_bytes()), - Some(repo.get_gc_period().as_secs().to_string().as_bytes()), - Some(repo.get_image_creation_threshold().to_string().as_bytes()), - Some(repo.get_pitr_interval().as_secs().to_string().as_bytes()), + Some(tenant.get_compaction_threshold().to_string().as_bytes()), + Some(tenant.get_gc_horizon().to_string().as_bytes()), + Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), + Some(tenant.get_image_creation_threshold().to_string().as_bytes()), + Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? 
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("do_gc ") { @@ -1066,16 +1065,16 @@ impl postgres_backend_async::Handler for PageServerHandler { let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; let gc_horizon: u64 = caps .get(4) .map(|h| h.as_str().parse()) - .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?; + .unwrap_or_else(|| Ok(tenant.get_gc_horizon()))?; // Use tenant's pitr setting - let pitr = repo.get_pitr_interval(); - let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + let pitr = tenant.get_pitr_interval(); + let result = tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), @@ -1169,12 +1168,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result> { - tenant_mgr::get_repository_for_tenant(tenant_id) - .and_then(|repo| { - repo.get_timeline(timeline_id) - .context("No timeline in tenant's repository") - }) - .with_context(|| format!("Could not get timeline {timeline_id} in tenant {tenant_id}")) + tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id)) } /// diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ba48a77961..2454b6f54f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,9 +7,9 @@ //! Clarify that) //! 
use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; +use crate::tenant::Timeline; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; @@ -1398,16 +1398,12 @@ fn is_slru_block_key(key: Key) -> bool { && key.field6 != 0xffffffff // and not SlruSegSize } -// -//-- Tests that should work the same with any Repository/Timeline implementation. -// - #[cfg(test)] pub fn create_test_timeline( - repo: &crate::layered_repository::Repository, + tenant: &crate::tenant::Tenant, timeline_id: utils::zid::ZTimelineId, ) -> Result> { - let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 8ebfa6a935..c104dba298 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -46,10 +46,10 @@ //! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. //! If the storage sync loop was successfully started before, pageserver schedules the layer files and the updated metadata file for upload, every time a layer is flushed to disk. //! The uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). -//! See [`crate::layered_repository`] for the upload calls and the adjacent logic. +//! See [`crate::tenant`] for the upload calls and the adjacent logic. //! -//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], -//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state. +//! 
Synchronization logic is able to communicate back with updated timeline sync states, submitted via [`crate::tenant_mgr::attach_local_tenants`] function. +//! Tenant manager applies corresponding timeline updates in pageserver's in-memory state. //! Such submissions happen in two cases: //! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future //! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory @@ -171,11 +171,11 @@ use self::{ use crate::{ config::PageServerConf, exponential_backoff, - layered_repository::metadata::{metadata_path, TimelineMetadata}, storage_sync::index::RemoteIndex, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, + tenant::metadata::{metadata_path, TimelineMetadata}, tenant_mgr::attach_local_tenants, }; use crate::{ @@ -714,17 +714,17 @@ async fn storage_sync_loop( }; if tenant_entry.has_in_progress_downloads() { - info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration"); + info!("Tenant {tenant_id} has pending timeline downloads, skipping tenant registration"); continue; } else { info!( - "Tenant {tenant_id} download completed. Picking to register in repository" + "Tenant {tenant_id} download completed. Picking to register in tenant" ); // Here we assume that if tenant has no in-progress downloads that // means that it is the last completed timeline download that triggered // sync status update. So we look at the index for available timelines - // and register them all at once in a repository for download - // to be submitted in a single operation to repository + // and register them all at once in a tenant for download + // to be submitted in a single operation to tenant // so it can apply them at once to internal timeline map. 
timelines_to_attach.0.insert( tenant_id, @@ -737,9 +737,7 @@ async fn storage_sync_loop( } drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - if let Err(e) = attach_local_tenants(conf, &index, timelines_to_attach) { - error!("Failed to attach new timelines: {e:?}"); - }; + attach_local_tenants(conf, &index, timelines_to_attach); } } ControlFlow::Break(()) => { @@ -1038,13 +1036,7 @@ async fn update_local_metadata( timeline_id, } = sync_id; tokio::task::spawn_blocking(move || { - crate::layered_repository::save_metadata( - conf, - timeline_id, - tenant_id, - &cloned_metadata, - true, - ) + crate::tenant::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) }) .await .with_context(|| { @@ -1411,12 +1403,12 @@ fn register_sync_status( mod test_utils { use utils::lsn::Lsn; - use crate::layered_repository::repo_harness::RepoHarness; + use crate::tenant::harness::TenantHarness; use super::*; pub(super) async fn create_local_timeline( - harness: &RepoHarness<'_>, + harness: &TenantHarness<'_>, timeline_id: ZTimelineId, filenames: &[&str], metadata: TimelineMetadata, @@ -1456,7 +1448,7 @@ mod test_utils { #[cfg(test)] mod tests { use super::test_utils::dummy_metadata; - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; use hex_literal::hex; use utils::lsn::Lsn; diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 794ecbaeb3..945f5fded8 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -112,8 +112,8 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::test_utils::{create_local_timeline, dummy_metadata}, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use remote_storage::{LocalFs, RemoteStorage}; @@ -121,7 +121,7 @@ mod tests { #[tokio::test] async 
fn delete_timeline_negative() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline_negative")?; + let harness = TenantHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( @@ -154,7 +154,7 @@ mod tests { #[tokio::test] async fn delete_timeline() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline")?; + let harness = TenantHarness::create("delete_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 91ee557b79..32f228b447 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -17,7 +17,7 @@ use tokio::{ use tracing::{debug, error, info, warn}; use crate::{ - config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, + config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, TEMP_FILE_SUFFIX, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -425,18 +425,18 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, }, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use super::*; #[tokio::test] async fn download_timeline() -> anyhow::Result<()> { - let harness = RepoHarness::create("download_timeline")?; + let harness = TenantHarness::create("download_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -537,7 +537,7 @@ mod tests { #[tokio::test] async fn download_timeline_negatives() -> 
anyhow::Result<()> { - let harness = RepoHarness::create("download_timeline_negatives")?; + let harness = TenantHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( @@ -596,7 +596,7 @@ mod tests { #[tokio::test] async fn test_download_index_part() -> anyhow::Result<()> { - let harness = RepoHarness::create("test_download_index_part")?; + let harness = TenantHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index b17bb40da4..cff14cde49 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -15,7 +15,7 @@ use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; use tracing::log::warn; -use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata}; +use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata}; use utils::{ lsn::Lsn, zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, @@ -340,11 +340,11 @@ mod tests { use std::collections::BTreeSet; use super::*; - use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; #[test] fn index_part_conversion() { - let harness = RepoHarness::create("index_part_conversion").unwrap(); + let harness = TenantHarness::create("index_part_conversion").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); let metadata = TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); @@ -462,7 +462,7 @@ mod tests { #[test] fn index_part_conversion_negatives() { - let harness = RepoHarness::create("index_part_conversion_negatives").unwrap(); + let harness = 
TenantHarness::create("index_part_conversion_negatives").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); let metadata = TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index a4285e426b..bd09e6b898 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -15,9 +15,7 @@ use super::{ LayersUpload, SyncData, SyncQueue, }; use crate::metrics::NO_LAYERS_UPLOAD; -use crate::{ - config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, -}; +use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path}; /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part( @@ -202,18 +200,18 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, }, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use super::{upload_index_part, *}; #[tokio::test] async fn regular_layer_upload() -> anyhow::Result<()> { - let harness = RepoHarness::create("regular_layer_upload")?; + let harness = TenantHarness::create("regular_layer_upload")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -301,7 +299,7 @@ mod tests { // Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario. 
#[tokio::test] async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { - let harness = RepoHarness::create("layer_upload_after_local_fs_update")?; + let harness = TenantHarness::create("layer_upload_after_local_fs_update")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -396,7 +394,7 @@ mod tests { #[tokio::test] async fn test_upload_index_part() -> anyhow::Result<()> { - let harness = RepoHarness::create("test_upload_index_part")?; + let harness = TenantHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/tenant.rs similarity index 88% rename from pageserver/src/layered_repository.rs rename to pageserver/src/tenant.rs index ecc0bfe3b5..4ef810faba 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/tenant.rs @@ -1,6 +1,6 @@ //! //! Timeline repository implementation that keeps old data in files on disk, and -//! the recent changes in memory. See layered_repository/*_layer.rs files. +//! the recent changes in memory. See tenant/*_layer.rs files. //! The functions here are responsible for locating the correct layer for the //! get/put call, walking back the timeline branching history as needed. //! @@ -12,6 +12,7 @@ //! use anyhow::{bail, ensure, Context, Result}; +use tokio::sync::watch; use tracing::*; use std::cmp::min; @@ -71,24 +72,26 @@ use storage_layer::Layer; pub use timeline::Timeline; // re-export this function so that page_cache.rs can use it. 
-pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; +pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; // re-export for use in storage_sync.rs -pub use crate::layered_repository::metadata::save_metadata; +pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver -pub use crate::layered_repository::timeline::WalReceiverInfo; +pub use crate::tenant::timeline::WalReceiverInfo; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// -/// Repository consists of multiple timelines. Keep them in a hash table. +/// Tenant consists of multiple timelines. Keep them in a hash table. /// -pub struct Repository { +pub struct Tenant { // Global pageserver config parameters pub conf: &'static PageServerConf, + state: watch::Sender, + // Overridden tenant-specific config parameters. // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. @@ -114,17 +117,40 @@ pub struct Repository { upload_layers: bool, } +/// A state of a tenant in pageserver's memory. +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TenantState { + /// Tenant is fully operational, its background jobs might be running or not. + Active { background_jobs_running: bool }, + /// A tenant is recognized by pageserver, but not yet ready to operate: + /// e.g. not present locally and being downloaded or being read into memory from the file system. + Paused, + /// A tenant is recognized by the pageserver, but no longer used for any operations, as it failed to get activated. + Broken, +} + /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. -impl Repository { +impl Tenant { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. 
It doesn't change internal state in any way. - pub fn get_timeline(&self, timeline_id: ZTimelineId) -> Option> { - self.timelines.lock().unwrap().get(&timeline_id).cloned() + pub fn get_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result> { + self.timelines + .lock() + .unwrap() + .get(&timeline_id) + .with_context(|| { + format!( + "Timeline {} was not found for tenant {}", + timeline_id, + self.tenant_id() + ) + }) + .map(Arc::clone) } - /// Lists timelines the repository contains. - /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. + /// Lists timelines the tenant contains. + /// Up to tenant's implementation to omit certain timelines that are not considered ready for use. pub fn list_timelines(&self) -> Vec<(ZTimelineId, Arc)> { self.timelines .lock() @@ -425,6 +451,54 @@ impl Repository { pub fn get_remote_index(&self) -> &RemoteIndex { &self.remote_index } + + pub fn current_state(&self) -> TenantState { + *self.state.borrow() + } + + pub fn is_active(&self) -> bool { + matches!(self.current_state(), TenantState::Active { .. }) + } + + pub fn should_run_tasks(&self) -> bool { + matches!( + self.current_state(), + TenantState::Active { + background_jobs_running: true + } + ) + } + + /// Changes tenant status to active, if it was not broken before. + /// Otherwise, ignores the state change, logging an error. 
+ pub fn activate(&self, enable_background_jobs: bool) { + self.set_state(TenantState::Active { + background_jobs_running: enable_background_jobs, + }); + } + + pub fn set_state(&self, new_state: TenantState) { + match (self.current_state(), new_state) { + (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { + debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (TenantState::Broken, _) => { + error!("Ignoring state update {new_state:?} for broken tenant"); + } + (_, new_state) => { + self.state.send_replace(new_state); + if self.should_run_tasks() { + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. + crate::tenant_tasks::start_background_loops(self.tenant_id); + } + } + } + } + + pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + self.state.subscribe() + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -471,7 +545,7 @@ fn tree_sort_timelines( } /// Private functions -impl Repository { +impl Tenant { pub fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -609,8 +683,9 @@ impl Repository { tenant_id: ZTenantId, remote_index: RemoteIndex, upload_layers: bool, - ) -> Repository { - Repository { + ) -> Tenant { + let (state, _) = watch::channel(TenantState::Paused); + Tenant { tenant_id, conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), @@ -619,6 +694,7 @@ impl Repository { walredo_mgr, remote_index, upload_layers, + state, } } @@ -848,7 +924,7 @@ impl Repository { // compaction (both require `layer_removal_cs` lock), // but the GC iteration can run concurrently with branch creation. // - // See comments in [`Repository::branch_timeline`] for more information + // See comments in [`Tenant::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. 
for timeline in gc_timelines { if task_mgr::is_shutdown_requested() { @@ -881,7 +957,7 @@ impl Repository { } } -impl Drop for Repository { +impl Drop for Tenant { fn drop(&mut self) { remove_tenant_metrics(&self.tenant_id); } @@ -910,7 +986,7 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { } #[cfg(test)] -pub mod repo_harness { +pub mod harness { use bytes::{Bytes, BytesMut}; use once_cell::sync::Lazy; use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; @@ -920,8 +996,8 @@ pub mod repo_harness { use crate::storage_sync::index::RemoteIndex; use crate::{ config::PageServerConf, - layered_repository::Repository, repository::Key, + tenant::Tenant, walrecord::ZenithWalRecord, walredo::{WalRedoError, WalRedoManager}, }; @@ -968,7 +1044,7 @@ pub mod repo_harness { } } - pub struct RepoHarness<'a> { + pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: ZTenantId, @@ -979,7 +1055,7 @@ pub mod repo_harness { ), } - impl<'a> RepoHarness<'a> { + impl<'a> TenantHarness<'a> { pub fn create(test_name: &'static str) -> Result { Self::create_internal(test_name, false) } @@ -1016,14 +1092,14 @@ pub mod repo_harness { }) } - pub fn load(&self) -> Repository { - self.try_load().expect("failed to load test repo") + pub fn load(&self) -> Tenant { + self.try_load().expect("failed to load test tenant") } - pub fn try_load(&self) -> Result { + pub fn try_load(&self) -> Result { let walredo_mgr = Arc::new(TestRedoManager); - let repo = Repository::new( + let tenant = Tenant::new( self.conf, TenantConfOpt::from(self.tenant_conf), walredo_mgr, @@ -1031,7 +1107,7 @@ pub mod repo_harness { RemoteIndex::default(), false, ); - // populate repo with locally available timelines + // populate tenant with locally available timelines let mut timelines_to_load = HashMap::new(); for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines 
dir") @@ -1043,12 +1119,13 @@ pub mod repo_harness { .unwrap() .to_string_lossy() .parse()?; + let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; timelines_to_load.insert(timeline_id, timeline_metadata); } - repo.init_attach_timelines(timelines_to_load)?; + tenant.init_attach_timelines(timelines_to_load)?; - Ok(repo) + Ok(tenant) } pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { @@ -1110,8 +1187,8 @@ mod tests { use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; - use crate::layered_repository::repo_harness::*; use crate::repository::{Key, Value}; + use crate::tenant::harness::*; use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; @@ -1122,8 +1199,8 @@ mod tests { #[test] fn test_basic() -> Result<()> { - let repo = RepoHarness::create("test_basic")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_basic")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1144,10 +1221,10 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { - let repo = RepoHarness::create("no_duplicate_timelines")?.load(); - let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); + let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0)) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -1170,8 +1247,8 @@ mod tests { /// #[test] fn test_branch() -> Result<()> { - let repo = RepoHarness::create("test_branch")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_branch")?.load(); + 
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); use std::str::from_utf8; @@ -1193,8 +1270,8 @@ mod tests { //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); let new_writer = newtline.writer(); @@ -1263,19 +1340,20 @@ mod tests { #[test] fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + .load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 // FIXME: this doesn't actually remove any layer currently, given how the checkpointing // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. 
- repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); @@ -1292,11 +1370,12 @@ mod tests { #[test] fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { - let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); @@ -1336,36 +1415,37 @@ mod tests { #[test] fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { - let repo = - RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant 
.get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); Ok(()) } #[test] fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // Check that the data is still accessible on the branch. 
assert_eq!( @@ -1379,16 +1459,17 @@ mod tests { #[test] fn timeline_load() -> Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = RepoHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME)?; { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + let tenant = harness.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } - let repo = harness.load(); - repo.get_timeline(TIMELINE_ID) + let tenant = harness.load(); + tenant + .get_timeline(TIMELINE_ID) .expect("cannot load timeline"); Ok(()) @@ -1397,18 +1478,18 @@ mod tests { #[test] fn timeline_load_with_ancestor() -> Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = RepoHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = harness.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); @@ -1417,14 +1498,14 @@ mod tests { } // check that both of them are initially unloaded - let repo = harness.load(); + let tenant = harness.load(); // check that both, child and ancestor are loaded - let _child_tline = repo + let _child_tline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("cannot get child timeline loaded"); - let _ancestor_tline = repo + let _ancestor_tline = tenant .get_timeline(TIMELINE_ID) .expect("cannot get ancestor timeline loaded"); @@ -1434,11 
+1515,11 @@ mod tests { #[test] fn corrupt_metadata() -> Result<()> { const TEST_NAME: &str = "corrupt_metadata"; - let harness = RepoHarness::create(TEST_NAME)?; - let repo = harness.load(); + let harness = TenantHarness::create(TEST_NAME)?; + let tenant = harness.load(); - repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - drop(repo); + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1473,8 +1554,8 @@ mod tests { #[test] fn test_images() -> Result<()> { - let repo = RepoHarness::create("test_images")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_images")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1523,8 +1604,8 @@ mod tests { // #[test] fn test_bulk_insert() -> Result<()> { - let repo = RepoHarness::create("test_bulk_insert")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_bulk_insert")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let mut lsn = Lsn(0x10); @@ -1563,8 +1644,8 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { - let repo = RepoHarness::create("test_random_updates")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_random_updates")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 1000; @@ -1633,8 +1714,8 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { - let repo = RepoHarness::create("test_traverse_branches")?.load(); - let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_traverse_branches")?.load(); + let mut tline = 
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 1000; @@ -1667,8 +1748,8 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo + tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tline = tenant .get_timeline(new_tline_id) .expect("Should have the branched timeline"); tline_id = new_tline_id; @@ -1712,8 +1793,8 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { - let repo = RepoHarness::create("test_traverse_ancestors")?.load(); - let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -1728,8 +1809,8 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo + tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tline = tenant .get_timeline(new_tline_id) .expect("Should have the branched timeline"); tline_id = new_tline_id; diff --git a/pageserver/src/layered_repository/blob_io.rs b/pageserver/src/tenant/blob_io.rs similarity index 98% rename from pageserver/src/layered_repository/blob_io.rs rename to pageserver/src/tenant/blob_io.rs index a4c6186056..78ecbcb9c1 100644 --- a/pageserver/src/layered_repository/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -11,8 +11,8 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
-use crate::layered_repository::block_io::{BlockCursor, BlockReader}; use crate::page_cache::PAGE_SZ; +use crate::tenant::block_io::{BlockCursor, BlockReader}; use std::cmp::min; use std::io::{Error, ErrorKind}; diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/tenant/block_io.rs similarity index 98% rename from pageserver/src/layered_repository/block_io.rs rename to pageserver/src/tenant/block_io.rs index 5e32b8833a..bbcdabe1cd 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -60,7 +60,7 @@ where /// the underlying BlockReader. For example: /// /// ```no_run -/// # use pageserver::layered_repository::block_io::{BlockReader, FileBlockReader}; +/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; /// # let reader: FileBlockReader = todo!(); /// let cursor = reader.block_cursor(); /// let buf = cursor.read_blk(1); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs similarity index 98% rename from pageserver/src/layered_repository/delta_layer.rs rename to pageserver/src/tenant/delta_layer.rs index af02f84bc0..ff6d3652f9 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -24,15 +24,13 @@ //! "values" part. //! 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; -use crate::layered_repository::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; -use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; +use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::filename::{DeltaFileName, PathOrConf}; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::virtual_file::VirtualFile; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs similarity index 99% rename from pageserver/src/layered_repository/disk_btree.rs rename to pageserver/src/tenant/disk_btree.rs index c130a42a8e..33255dbd82 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -25,7 +25,7 @@ use std::{cmp::Ordering, io, result}; use thiserror::Error; use tracing::error; -use crate::layered_repository::block_io::{BlockReader, BlockWriter}; +use crate::tenant::block_io::{BlockReader, BlockWriter}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. 
pub const VALUE_SZ: usize = 5; diff --git a/pageserver/src/layered_repository/disk_btree_test_data.rs b/pageserver/src/tenant/disk_btree_test_data.rs similarity index 100% rename from pageserver/src/layered_repository/disk_btree_test_data.rs rename to pageserver/src/tenant/disk_btree_test_data.rs diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs similarity index 97% rename from pageserver/src/layered_repository/ephemeral_file.rs rename to pageserver/src/tenant/ephemeral_file.rs index a1b2d68cd5..c675e4e778 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -2,11 +2,11 @@ //! used to keep in-memory layers spilled on disk. use crate::config::PageServerConf; -use crate::layered_repository::blob_io::BlobWriter; -use crate::layered_repository::block_io::BlockReader; use crate::page_cache; use crate::page_cache::PAGE_SZ; use crate::page_cache::{ReadBufResult, WriteBufResult}; +use crate::tenant::blob_io::BlobWriter; +use crate::tenant::block_io::BlockReader; use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::cmp::min; @@ -330,13 +330,13 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error { #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::blob_io::{BlobCursor, BlobWriter}; - use crate::layered_repository::block_io::BlockCursor; + use crate::tenant::blob_io::{BlobCursor, BlobWriter}; + use crate::tenant::block_io::BlockCursor; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn repo_harness( + fn harness( test_name: &str, ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> { let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -368,7 +368,7 @@ mod tests { #[test] fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?; + let (conf, tenantid, timelineid) = 
harness("ephemeral_files")?; let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -399,7 +399,7 @@ mod tests { #[test] fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?; + let (conf, tenantid, timelineid) = harness("ephemeral_blobs")?; let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/tenant/filename.rs similarity index 100% rename from pageserver/src/layered_repository/filename.rs rename to pageserver/src/tenant/filename.rs diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/tenant/image_layer.rs similarity index 97% rename from pageserver/src/layered_repository/image_layer.rs rename to pageserver/src/tenant/image_layer.rs index 4fe771bb3f..518643241d 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -20,15 +20,13 @@ //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; -use crate::layered_repository::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::layered_repository::filename::{ImageFileName, PathOrConf}; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value, KEY_SIZE}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; +use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::filename::{ImageFileName, PathOrConf}; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::virtual_file::VirtualFile; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs similarity index 96% rename from pageserver/src/layered_repository/inmemory_layer.rs rename to pageserver/src/tenant/inmemory_layer.rs index 5f269a868f..0e7b215b1e 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -5,14 +5,12 @@ //! its position in the file, is kept in memory, though. //! 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter}; -use crate::layered_repository::block_io::BlockReader; -use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter}; -use crate::layered_repository::ephemeral_file::EphemeralFile; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::repository::{Key, Value}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter}; +use crate::tenant::block_io::BlockReader; +use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter}; +use crate::tenant::ephemeral_file::EphemeralFile; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::walrecord; use anyhow::{bail, ensure, Result}; use std::cell::RefCell; diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/tenant/layer_map.rs similarity index 98% rename from pageserver/src/layered_repository/layer_map.rs rename to pageserver/src/tenant/layer_map.rs index 88dcf32409..c24e3976fb 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -10,11 +10,11 @@ //! corresponding files are written to disk. //! 
-use crate::layered_repository::inmemory_layer::InMemoryLayer; -use crate::layered_repository::storage_layer::Layer; -use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; +use crate::tenant::inmemory_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; +use crate::tenant::storage_layer::{range_eq, range_overlaps}; use anyhow::Result; use std::collections::VecDeque; use std::ops::Range; diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/tenant/metadata.rs similarity index 98% rename from pageserver/src/layered_repository/metadata.rs rename to pageserver/src/tenant/metadata.rs index 910dba4644..4ea2b7d55b 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,4 +1,4 @@ -//! Every image of a certain timeline from [`crate::layered_repository::Repository`] +//! Every image of a certain timeline from [`crate::tenant::Tenant`] //! has a metadata that needs to be stored persistently. //! //! 
Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of @@ -216,7 +216,7 @@ pub fn save_metadata( #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; #[test] fn metadata_serializes_correctly() { diff --git a/pageserver/src/layered_repository/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs similarity index 100% rename from pageserver/src/layered_repository/par_fsync.rs rename to pageserver/src/tenant/par_fsync.rs diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs similarity index 100% rename from pageserver/src/layered_repository/storage_layer.rs rename to pageserver/src/tenant/storage_layer.rs diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/tenant/timeline.rs similarity index 99% rename from pageserver/src/layered_repository/timeline.rs rename to pageserver/src/tenant/timeline.rs index 60abbe33e6..c96ad99909 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,7 +17,7 @@ use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering} use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; -use crate::layered_repository::{ +use crate::tenant::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, filename::{DeltaFileName, ImageFileName}, @@ -118,7 +118,7 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Repository::delete_timeline`]. + /// and [`Tenant::delete_timeline`]. 
layer_removal_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index a9f015229f..a8a9926c77 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,26 +1,31 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use crate::config::PageServerConf; -use crate::http::models::TenantInfo; -use crate::layered_repository::ephemeral_file::is_ephemeral_file; -use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; -use crate::layered_repository::Repository; -use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; -use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; -use crate::task_mgr::{self, TaskKind}; -use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::walredo::{PostgresRedoManager, WalRedoManager}; -use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::Context; -use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; use std::collections::{hash_map, HashMap, HashSet}; use std::ffi::OsStr; use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; + +use anyhow::Context; use tracing::*; +use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; + +use crate::config::PageServerConf; +use crate::http::models::TenantInfo; +use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; +use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::{ + ephemeral_file::is_ephemeral_file, + metadata::{TimelineMetadata, METADATA_FILE_NAME}, + Tenant, TenantState, +}; +use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::walredo::PostgresRedoManager; +use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; + use utils::crashsafe_dir; use 
utils::zid::{ZTenantId, ZTimelineId}; @@ -28,64 +33,31 @@ mod tenants_state { use once_cell::sync::Lazy; use std::{ collections::HashMap, - sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, + sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; use utils::zid::ZTenantId; - use crate::tenant_mgr::Tenant; + use crate::tenant::Tenant; - static TENANTS: Lazy>> = + static TENANTS: Lazy>>> = Lazy::new(|| RwLock::new(HashMap::new())); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { + pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { TENANTS .read() .expect("Failed to read() tenants lock, it got poisoned") } - pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap> { + pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { TENANTS .write() .expect("Failed to write() tenants lock, it got poisoned") } } -struct Tenant { - state: TenantState, - /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. - repo: Arc, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub enum TenantState { - // This tenant exists on local disk, and the layer map has been loaded into memory. - // The local disk might have some newer files that don't exist in cloud storage yet. - Active, - // Tenant is active, but there is no walreceiver connection. - Idle, - // This tenant exists on local disk, and the layer map has been loaded into memory. - // The local disk might have some newer files that don't exist in cloud storage yet. - // The tenant cannot be accessed anymore for any reason, but graceful shutdown. 
- Stopping, - - // Something went wrong loading the tenant state - Broken, -} - -impl std::fmt::Display for TenantState { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Active => f.write_str("Active"), - Self::Idle => f.write_str("Idle"), - Self::Stopping => f.write_str("Stopping"), - Self::Broken => f.write_str("Broken"), - } - } -} - /// Initialize repositories with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) -/// are scheduled for download and added to the repository once download is completed. +/// are scheduled for download and added to the tenant once download is completed. pub fn init_tenant_mgr( conf: &'static PageServerConf, remote_storage: Option, @@ -128,7 +100,7 @@ pub fn init_tenant_mgr( ) }; - attach_local_tenants(conf, &remote_index, tenants_to_attach)?; + attach_local_tenants(conf, &remote_index, tenants_to_attach); Ok(remote_index) } @@ -141,7 +113,7 @@ pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, tenants_to_attach: TenantTimelineValues, -) -> anyhow::Result<()> { +) { let _entered = info_span!("attach_local_tenants").entered(); let number_of_tenants = tenants_to_attach.0.len(); @@ -152,104 +124,109 @@ pub fn attach_local_tenants( ); debug!("Timelines to attach: {local_timelines:?}"); - let repository = load_local_repo(conf, tenant_id, remote_index) - .context("Failed to load repository for tenant")?; - - let repo = Arc::clone(&repository); + let tenant = load_local_tenant(conf, tenant_id, remote_index); { match tenants_state::write_tenants().entry(tenant_id) { hash_map::Entry::Occupied(_) => { - anyhow::bail!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + continue; } hash_map::Entry::Vacant(v) => { - v.insert(Tenant { - state: 
TenantState::Idle, - repo, - }); + v.insert(Arc::clone(&tenant)); + } + } + } + + if tenant.current_state() == TenantState::Broken { + warn!("Skipping timeline load for broken tenant {tenant_id}") + } else { + let has_timelines = !local_timelines.is_empty(); + match tenant.init_attach_timelines(local_timelines) { + Ok(()) => { + info!("successfully loaded local timelines for tenant {tenant_id}"); + tenant.activate(has_timelines); + } + Err(e) => { + error!("Failed to attach tenant timelines: {e:?}"); + tenant.set_state(TenantState::Broken); } } } - // XXX: current timeline init enables walreceiver that looks for tenant in the state, so insert the tenant entry before - repository - .init_attach_timelines(local_timelines) - .context("Failed to attach timelines for tenant")?; } - info!("Processed {number_of_tenants} local tenants during attach"); - Ok(()) + info!("Processed {number_of_tenants} local tenants during attach") } -fn load_local_repo( +fn load_local_tenant( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> anyhow::Result> { - let repository = Repository::new( +) -> Arc { + let tenant = Arc::new(Tenant::new( conf, TenantConfOpt::default(), Arc::new(PostgresRedoManager::new(conf, tenant_id)), tenant_id, remote_index.clone(), conf.remote_storage_config.is_some(), - ); - let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; - repository.update_tenant_config(tenant_conf); - - Ok(Arc::new(repository)) + )); + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } + } + tenant } /// /// Shut down all tenants. This runs as part of pageserver shutdown. 
/// pub async fn shutdown_all_tenants() { - let tenantids = { + let tenants_to_shut_down = { let mut m = tenants_state::write_tenants(); - let mut tenantids = Vec::new(); - for (tenantid, tenant) in m.iter_mut() { - match tenant.state { - TenantState::Active | TenantState::Idle | TenantState::Stopping => { - tenant.state = TenantState::Stopping; - tenantids.push(*tenantid) - } - TenantState::Broken => {} + let mut tenants_to_shut_down = Vec::with_capacity(m.len()); + for (_, tenant) in m.drain() { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_state(TenantState::Paused); + tenants_to_shut_down.push(tenant) } } drop(m); - tenantids + tenants_to_shut_down }; + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; // Ok, no background tasks running anymore. Flush any remaining data in // memory to disk. // // We assume that any incoming connections that might request pages from - // the repository have already been terminated by the caller, so there + // the tenant have already been terminated by the caller, so there // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. 
- for tenant_id in tenantids { + for tenant in tenants_to_shut_down { + let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); - match get_repository_for_tenant(tenant_id) { - Ok(repo) => { - if let Err(err) = repo.checkpoint() { - error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); - } - } - Err(err) => { - error!("Could not get repository for tenant {tenant_id} during shutdown: {err:?}"); - } + + if let Err(err) = tenant.checkpoint() { + error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); } } } -fn create_repo( +fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, - wal_redo_manager: Arc, - remote_index: RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result<()> { let target_tenant_directory = conf.tenant_path(&tenant_id); anyhow::ensure!( !target_tenant_directory.exists(), @@ -282,7 +259,7 @@ fn create_repo( ) })?; // first, create a config in the top-level temp directory, fsync the file - Repository::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; + Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; // then, create a subdirectory in the top-level temp directory, fsynced crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( @@ -312,18 +289,11 @@ fn create_repo( fs::File::open(target_dir_parent)?.sync_all()?; info!( - "created directory structure in {}", + "created tenant directory structure in {}", target_tenant_directory.display() ); - Ok(Arc::new(Repository::new( - conf, - tenant_conf, - wal_redo_manager, - tenant_id, - remote_index, - conf.remote_storage_config.is_some(), - ))) + Ok(()) } fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result { @@ -350,12 +320,17 @@ pub fn create_tenant( } hash_map::Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); - let repo = 
create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; - v.insert(Tenant { - state: TenantState::Active, - repo, - }); - crate::tenant_tasks::start_background_loops(tenant_id); + create_tenant_files(conf, tenant_conf, tenant_id)?; + let tenant = Arc::new(Tenant::new( + conf, + tenant_conf, + wal_redo_manager, + tenant_id, + remote_index, + conf.remote_storage_config.is_some(), + )); + tenant.activate(false); + v.insert(tenant); Ok(Some(tenant_id)) } } @@ -367,70 +342,23 @@ pub fn update_tenant_config( tenant_id: ZTenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - - Repository::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; + get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); + Tenant::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; Ok(()) } -pub fn get_tenant_state(tenantid: ZTenantId) -> Option { - Some(tenants_state::read_tenants().get(&tenantid)?.state) -} - -pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - let old_state = { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| format!("Tenant not found for id {tenant_id}"))?; - let old_state = tenant.state; - tenant.state = new_state; - old_state - }; - - match (old_state, new_state) { - (TenantState::Broken, TenantState::Broken) - | (TenantState::Active, TenantState::Active) - | (TenantState::Idle, TenantState::Idle) - | (TenantState::Stopping, TenantState::Stopping) => { - debug!("tenant {tenant_id} already in state {new_state}"); - } - (TenantState::Broken, ignored) => { - debug!("Ignoring {ignored} since tenant {tenant_id} is in broken state"); - } - (_, TenantState::Broken) => { - debug!("Setting tenant {tenant_id} status to broken"); - } - (TenantState::Stopping, ignored) => { - debug!("Ignoring {ignored} since 
tenant {tenant_id} is in stopping state"); - } - (TenantState::Idle, TenantState::Active) => { - info!("activating tenant {tenant_id}"); - - // Spawn gc and compaction loops. The loops will shut themselves - // down when they notice that the tenant is inactive. - crate::tenant_tasks::start_background_loops(tenant_id); - } - (TenantState::Idle, TenantState::Stopping) => { - info!("stopping idle tenant {tenant_id}"); - } - (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { - info!("stopping tenant {tenant_id} tasks due to new state {new_state}"); - - // Note: The caller is responsible for waiting for any tasks to finish. - } - } - - Ok(()) -} - -pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { +/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. +/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. +pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found"))?; - - Ok(Arc::clone(&tenant.repo)) + .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; + if active_only && !tenant.is_active() { + anyhow::bail!("Tenant {tenant_id} is not active") + } else { + Ok(Arc::clone(tenant)) + } } pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { @@ -455,9 +383,14 @@ pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; info!("timeline task shutdown completed"); - match tenants_state::read_tenants().get(&tenant_id) { - Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, - None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), 
+ match get_tenant(tenant_id, true) { + Ok(tenant) => { + tenant.delete_timeline(timeline_id)?; + if tenant.list_timelines().is_empty() { + tenant.activate(false); + } + } + Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), } Ok(()) @@ -467,21 +400,24 @@ pub async fn detach_tenant( conf: &'static PageServerConf, tenant_id: ZTenantId, ) -> anyhow::Result<()> { - set_tenant_state(tenant_id, TenantState::Stopping)?; + let tenant = match { + let mut tenants_accessor = tenants_state::write_tenants(); + tenants_accessor.remove(&tenant_id) + } { + Some(tenant) => tenant, + None => anyhow::bail!("Tenant not found for id {tenant_id}"), + }; + + tenant.set_state(TenantState::Paused); // shutdown all tenant and timeline tasks: gc, compaction, page service) task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - { - let mut tenants_accessor = tenants_state::write_tenants(); - tenants_accessor.remove(&tenant_id); - } - // If removal fails there will be no way to successfully retry detach, // because the tenant no longer exists in the in-memory map. And it needs to be removed from it - // before we remove files, because it contains references to repository + // before we remove files, because it contains references to tenant // which references ephemeral files which are deleted on drop. So if we keep these references, // we will attempt to remove files which no longer exist. 
This can be fixed by having shutdown - // mechanism for repository that will clean temporary data to avoid any references to ephemeral files + // mechanism for tenant that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( @@ -512,7 +448,7 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { TenantInfo { id: *id, - state: Some(tenant.state), + state: tenant.current_state(), current_physical_size: None, has_in_progress_downloads, } diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 9aaafe7f92..3ef54838af 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -1,12 +1,14 @@ //! This module contains functions to serve per-tenant background processes, //! such as compaction and GC +use std::ops::ControlFlow; +use std::sync::Arc; use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::{Tenant, TenantState}; use crate::tenant_mgr; -use crate::tenant_mgr::TenantState; use tracing::*; use utils::zid::ZTenantId; @@ -18,7 +20,10 @@ pub fn start_background_loops(tenant_id: ZTenantId) { None, &format!("compactor for tenant {tenant_id}"), false, - compaction_loop(tenant_id), + async move { + compaction_loop(tenant_id).await; + Ok(()) + }, ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), @@ -27,43 +32,50 @@ pub fn start_background_loops(tenant_id: ZTenantId) { None, &format!("garbage collector for tenant {tenant_id}"), false, - gc_loop(tenant_id), + async move { + gc_loop(tenant_id).await; + Ok(()) + }, ); } /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { +async fn compaction_loop(tenant_id: ZTenantId) { + let wait_duration = Duration::from_secs(2); + info!("starting compaction loop for {tenant_id}"); 
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - let result = async { + async { loop { trace!("waking up"); + let tenant = tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received compaction cancellation request"); + return; + }, + tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(tenant) => tenant, + }, + }; + // Run blocking part of the task - // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { - break Ok(()); - } - // This should not fail. If someone started us, it means that the tenant exists. - // And before you remove a tenant, you have to wait until all the associated tasks - // exit. - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - // Run compaction - let mut sleep_duration = repo.get_compaction_period(); - if let Err(e) = repo.compaction_iteration() { - error!("Compaction failed, retrying: {}", e); - sleep_duration = Duration::from_secs(2) + let mut sleep_duration = tenant.get_compaction_period(); + if let Err(e) = tenant.compaction_iteration() { + error!("Compaction failed, retrying: {e:#}"); + sleep_duration = wait_duration; } // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - trace!("received cancellation request"); - break Ok(()); + info!("received compaction cancellation request during idling"); + break ; }, _ = tokio::time::sleep(sleep_duration) => {}, } @@ -72,49 +84,49 @@ async fn compaction_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { .await; TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - info!( - "compaction loop stopped. 
State is {:?}", - tenant_mgr::get_tenant_state(tenant_id) - ); - result + trace!("compaction loop stopped."); } /// /// GC task's main loop /// -async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { +async fn gc_loop(tenant_id: ZTenantId) { + let wait_duration = Duration::from_secs(2); + info!("starting gc loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - let result = async { + async { loop { trace!("waking up"); - // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { - break Ok(()); - } - // This should not fail. If someone started us, it means that the tenant exists. - // And before you remove a tenant, you have to wait until all the associated tasks - // exit. - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received GC cancellation request"); + return; + }, + tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(tenant) => tenant, + }, + }; // Run gc - let gc_period = repo.get_gc_period(); - let gc_horizon = repo.get_gc_horizon(); + let gc_period = tenant.get_gc_period(); + let gc_horizon = tenant.get_gc_horizon(); let mut sleep_duration = gc_period; if gc_horizon > 0 { - if let Err(e) = repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false) + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false) { - error!("Gc failed, retrying: {}", e); - sleep_duration = Duration::from_secs(2) + error!("Gc failed, retrying: {e:#}"); + sleep_duration = wait_duration; } } // Sleep tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { - trace!("received cancellation request"); - break Ok(()); + info!("received GC cancellation request during idling"); + break; }, _ = tokio::time::sleep(sleep_duration) => {}, } @@ -122,9 +134,50 @@ async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { } .await; TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - info!( - "GC loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenant_id) - ); - result + trace!("GC loop stopped."); +} + +async fn wait_for_active_tenant( + tenant_id: ZTenantId, + wait: Duration, +) -> ControlFlow<(), Arc> { + let tenant = loop { + match tenant_mgr::get_tenant(tenant_id, false) { + Ok(tenant) => break tenant, + Err(e) => { + error!("Failed to get a tenant {tenant_id}: {e:#}"); + tokio::time::sleep(wait).await; + } + } + }; + + // if the tenant has a proper status already, no need to wait for anything + if tenant.should_run_tasks() { + ControlFlow::Continue(tenant) + } else { + let mut tenant_state_updates = tenant.subscribe_for_state_updates(); + loop { + match tenant_state_updates.changed().await { + Ok(()) => { + let new_state = *tenant_state_updates.borrow(); + match new_state { + TenantState::Active { + background_jobs_running: true, + } => { + debug!("Tenant state changed to active with background jobs enabled, continuing the task loop"); + return ControlFlow::Continue(tenant); + } + state => { + debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}"); + tokio::time::sleep(wait).await; + } + } + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop"); + return ControlFlow::Break(()); + } + } + } + } } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 35dec54d5c..69d14babf0 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,34 +2,28 @@ //! 
Timeline management code // -use anyhow::{bail, Context, Result}; -use remote_storage::path_with_suffix_extension; - use std::{ fs, path::Path, process::{Command, Stdio}, sync::Arc, }; + +use anyhow::{bail, Context, Result}; use tracing::*; +use remote_storage::path_with_suffix_extension; use utils::{ lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; -use crate::layered_repository::{Repository, Timeline}; +use crate::tenant::{Tenant, Timeline}; use crate::tenant_mgr; use crate::CheckpointConfig; use crate::{import_datadir, TEMP_FILE_SUFFIX}; -#[derive(Debug, Clone, Copy)] -pub struct PointInTime { - pub timeline_id: ZTimelineId, - pub lsn: Lsn, -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -69,7 +63,7 @@ fn bootstrap_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId, - repo: &Repository, + tenant: &Tenant, ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. @@ -89,7 +83,7 @@ fn bootstrap_timeline( // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. 
- let timeline = repo.create_empty_timeline(timeline_id, lsn)?; + let timeline = tenant.create_empty_timeline(timeline_id, lsn)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -127,16 +121,16 @@ pub(crate) async fn create_timeline( mut ancestor_start_lsn: Option, ) -> Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - debug!("timeline {} already exists", new_timeline_id); + debug!("timeline {new_timeline_id} already exists"); return Ok(None); } let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { - let ancestor_timeline = repo + let ancestor_timeline = tenant .get_timeline(ancestor_timeline_id) .context("Cannot branch off the timeline that's not present in pageserver")?; @@ -162,10 +156,13 @@ pub(crate) async fn create_timeline( } } - repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + tenant.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? } - None => bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?, + None => bootstrap_timeline(conf, tenant_id, new_timeline_id, &tenant)?, }; + // Have added new timeline into the tenant, now its background tasks are needed. 
+ tenant.activate(true); + Ok(Some(loaded_timeline)) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 57592a46d3..45d0916dec 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -30,9 +30,9 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; -use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use crate::walrecord::*; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::v14::pg_constants; @@ -1022,16 +1022,13 @@ impl<'a> WalIngest<'a> { } } -/// -/// Tests that should work the same with any Repository/Timeline implementation. -/// #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::repo_harness::*; - use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::create_test_timeline; + use crate::tenant::harness::*; + use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -1061,8 +1058,8 @@ mod tests { #[test] fn test_relsize() -> Result<()> { - let repo = RepoHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_relsize")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1189,8 +1186,8 @@ mod tests { // and then created it again within the same layer. 
#[test] fn test_drop_extend() -> Result<()> { - let repo = RepoHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_drop_extend")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1229,8 +1226,8 @@ mod tests { // and then extended it again within the same layer. #[test] fn test_truncate_extend() -> Result<()> { - let repo = RepoHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_truncate_extend")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1317,8 +1314,8 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[test] fn test_large_rel() -> Result<()> { - let repo = RepoHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_large_rel")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 1fcb768ddf..69e400f291 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,10 +16,10 @@ use std::{ time::Duration, }; -use crate::layered_repository::Timeline; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::tenant::Timeline; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -767,11 +767,11 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { use super::*; - use 
crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; #[test] fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -857,7 +857,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -948,7 +948,7 @@ mod tests { #[test] fn no_connection_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("no_connection_candidate")?; + let harness = TenantHarness::create("no_connection_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -1053,7 +1053,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = RepoHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -1117,7 +1117,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1204,7 +1204,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_connection_threshhold_current_candidate")?; + let 
harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1276,7 +1276,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1353,7 +1353,7 @@ mod tests { const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - fn dummy_state(harness: &RepoHarness) -> WalreceiverState { + fn dummy_state(harness: &TenantHarness) -> WalreceiverState { WalreceiverState { id: ZTenantTimelineId { tenant_id: harness.tenant_id, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index e8fa9f9aca..6f1fbc2c9d 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -21,10 +21,10 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ - layered_repository::{Timeline, WalReceiverInfo}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, + tenant::{Timeline, WalReceiverInfo}, tenant_mgr, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -141,8 +141,7 @@ pub async fn handle_walreceiver_connection( let tenant_id = timeline.tenant_id; let timeline_id = timeline.timeline_id; - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; // // Start streaming the WAL, from where we left off previously. 
@@ -283,7 +282,7 @@ pub async fn handle_walreceiver_connection( })?; if let Some(last_lsn) = status_update { - let remote_index = repo.get_remote_index(); + let remote_index = tenant.get_remote_index(); let timeline_remote_consistent_lsn = remote_index .read() .await diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 1d083b3ef9..ce3a74930e 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -71,7 +71,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # First timeline would not get loaded into pageserver due to corrupt metadata file with pytest.raises( - Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}" + Exception, match=f"Timeline {timeline1} was not found for tenant {tenant1}" ) as err: pg1.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") @@ -80,7 +80,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # We don't have the remote storage enabled, which means timeline is in an incorrect state, # it's not loaded at all with pytest.raises( - Exception, match=f"Could not get timeline {timeline2} in tenant {tenant2}" + Exception, match=f"Timeline {timeline2} was not found for tenant {tenant2}" ) as err: pg2.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 315ec7f306..1214d703d0 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -40,11 +40,16 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for t in timelines: client.timeline_delete(tenant, t) + def assert_active_without_jobs(tenant): + assert get_state(tenant) == {"Active": {"background_jobs_running": False}} + # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() 
env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) - assert get_state(tenant) == "Active" + assert get_state(tenant) == { + "Active": {"background_jobs_running": True} + }, "Pageserver should activate a tenant and start background jobs if timelines are loaded" # Stop compute pg.stop() @@ -53,6 +58,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for tenant_info in client.tenant_list(): tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_active_without_jobs(tenant_id)) # Assert that all tasks finish quickly after tenant is detached assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index a5dadc535b..5a20dbd232 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -18,7 +18,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): invalid_tenant_id = ZTenantId.generate() with pytest.raises( NeonPageserverApiException, - match=f"Tenant {invalid_tenant_id} not found in local tenant state", + match=f"Tenant {invalid_tenant_id} not found in the local state", ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) @@ -64,7 +64,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # check 404 with pytest.raises( - NeonPageserverApiException, match="is not found neither locally nor remotely" + NeonPageserverApiException, + match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} is not found neither locally nor remotely", ): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) From 59d04ab66aa68be3a7b3cd7997182f9b62636190 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 18:24:11 +0100 Subject: [PATCH 14/33] test_runner: redact passwords from log messages (#2434) --- test_runner/fixtures/log_helper.py 
| 13 +++++++++++++ test_runner/fixtures/neon_fixtures.py | 3 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 17f2402391..7d112fce89 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,5 +1,6 @@ import logging import logging.config +import re """ This file configures logging to use in python tests. @@ -29,6 +30,17 @@ LOGGING = { } +class PasswordFilter(logging.Filter): + """Filter out password from logs.""" + + # Good enough to filter our passwords produced by PgProtocol.connstr + FILTER = re.compile(r"(\s*)password=[^\s]+(\s*)") + + def filter(self, record: logging.LogRecord) -> bool: + record.msg = self.FILTER.sub(r"\1password=\2", str(record.msg)) + return True + + def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. @@ -38,5 +50,6 @@ def getLogger(name="root") -> logging.Logger: # default logger for tests log = getLogger() +log.addFilter(PasswordFilter()) logging.config.dictConfig(LOGGING) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b47e560325..69c6d31315 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -125,7 +125,8 @@ def pytest_configure(config): if env_neon_bin: neon_binpath = env_neon_bin else: - neon_binpath = os.path.join(base_dir, "target/debug") + build_type = os.environ.get("BUILD_TYPE", "debug") + neon_binpath = os.path.join(base_dir, "target", build_type) log.info(f"neon_binpath is {neon_binpath}") if not os.path.exists(os.path.join(neon_binpath, "pageserver")): raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) From db0c49148db3bbc74d314313b601e6f1e7c0be3a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 20:07:16 +0300 Subject: [PATCH 15/33] clean up metrics in handle_pagerequests --- pageserver/src/page_service.rs | 53 
+++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b03dab20e0..388f40f916 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -362,6 +362,39 @@ async fn page_service_conn_main( } } +struct PageRequestMetrics { + get_rel_exists: metrics::Histogram, + get_rel_size: metrics::Histogram, + get_page_at_lsn: metrics::Histogram, + get_db_size: metrics::Histogram, +} + +impl PageRequestMetrics { + fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + + let get_rel_exists = + SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]); + + let get_rel_size = + SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]); + + let get_page_at_lsn = + SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]); + + let get_db_size = + SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]); + + Self { + get_rel_exists, + get_rel_size, + get_page_at_lsn, + get_db_size, + } + } +} + #[derive(Debug)] struct PageServerHandler { conf: &'static PageServerConf, @@ -396,6 +429,8 @@ impl PageServerHandler { pgb.write_message(&BeMessage::CopyBothResponse)?; pgb.flush().await?; + let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id); + loop { let msg = tokio::select! 
{ biased; @@ -420,32 +455,22 @@ impl PageServerHandler { trace!("query: {:?}", copy_data_bytes); let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_str = tenant_id.to_string(); - let timeline_str = timeline_id.to_string(); let response = match zenith_fe_msg { PagestreamFeMessage::Exists(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_rel_exists.start_timer(); self.handle_get_rel_exists_request(&timeline, &req).await } PagestreamFeMessage::Nblocks(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_rel_size", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_rel_size.start_timer(); self.handle_get_nblocks_request(&timeline, &req).await } PagestreamFeMessage::GetPage(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_page_at_lsn.start_timer(); self.handle_get_page_at_lsn_request(&timeline, &req).await } PagestreamFeMessage::DbSize(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_db_size", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_db_size.start_timer(); self.handle_db_size_request(&timeline, &req).await } }; From d4d57ea2ddb49c6d40b90e171188dbeecee8f9fe Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 19:26:26 +0100 Subject: [PATCH 16/33] github/workflows: fix project creation via API (#2437) --- .github/actions/neon-project-create/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index d4fced4196..ba81afaaff 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -60,7 +60,7 @@ runs: --header "Authorization: Bearer ${API_KEY}" \ --data 
"{ \"project\": { - \"platform_id\": \"serverless\", + \"platform_id\": \"aws\", \"region_id\": \"${REGION_ID}\", \"settings\": { } } From 1d53173e62673aecc9e2c73ab6ba6f0488249207 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 20:41:26 +0300 Subject: [PATCH 17/33] update openapi spec (tenant state has changed) --- pageserver/src/http/openapi_spec.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b9a62d0f32..1f2eba05ec 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -494,7 +494,13 @@ components: id: type: string state: - type: string + oneOf: + - type: string + - type: object + properties: + background_jobs_running: + type: boolean + current_physical_size: type: integer has_in_progress_downloads: From 32b7259d5e639e3dd16e3758a1534f0f47d9a6f2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 22:37:20 +0300 Subject: [PATCH 18/33] Timeline data management RFC (#2152) --- docs/SUMMARY.md | 1 + docs/rfcs/017-timeline-data-management.md | 413 ++++++++++++++++++ .../lock_legend.svg | 4 + .../proposed_timeline_data_access_sync_1.svg | 4 + .../proposed_timeline_data_access_sync_2.svg | 4 + .../proposed_timeline_tenant_state.svg | 4 + .../timeline_data_access_sync_1.svg | 4 + .../timeline_data_access_sync_2.svg | 4 + .../timeline_tenant_state.svg | 4 + 9 files changed, 442 insertions(+) create mode 100644 docs/rfcs/017-timeline-data-management.md create mode 100644 docs/rfcs/images/017-timeline-data-management/lock_legend.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg create mode 100644 
docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 95ac512ea8..fb6467ffd5 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -79,4 +79,5 @@ - [014-storage-lsm](rfcs/014-storage-lsm.md) - [015-storage-messaging](rfcs/015-storage-messaging.md) - [016-connection-routing](rfcs/016-connection-routing.md) +- [017-timeline-data-management](rfcs/017-timeline-data-management.md) - [cluster-size-limits](rfcs/cluster-size-limits.md) diff --git a/docs/rfcs/017-timeline-data-management.md b/docs/rfcs/017-timeline-data-management.md new file mode 100644 index 0000000000..a8ca3c7ca9 --- /dev/null +++ b/docs/rfcs/017-timeline-data-management.md @@ -0,0 +1,413 @@ +# Name + +Tenant and timeline data management in pageserver + +## Summary + +This RFC attempts to describe timeline-related data management as it's done now in pageserver, highlight current complexities caused by this and propose a set of changes to mitigate them. + +The main goal is to prepare for future [on-demand layer downloads](https://github.com/neondatabase/neon/issues/2029), yet timeline data is one of the core primitive of pageserver, so a number of other RFCs are affected either. +Due to that, this document won't have a single implementation, rather requiring a set of code changes to achieve the final state. + +RFC considers the repository at the `main` branch, commit [`28243d68e60ffc7e69f158522f589f7d2e09186d`](https://github.com/neondatabase/neon/tree/28243d68e60ffc7e69f158522f589f7d2e09186d) on the time of writing. + +## Motivation + +In recent discussions, it became more clear that timeline-related code becomes harder to change: it consists of multiple disjoint modules, each requiring a synchronization to access. 
+The lower the code is, the more complex the sync gets since many concurrent processes are involved and require orchestration to keep the data consistent. +As the number of modules and isolated data grows per timeline, more questions and corner cases arise: + +- https://github.com/neondatabase/neon/issues/1559 + right now it's not straightened out what to do when the synchronization task fails too many times: every separate module's data has to be treated differently. + +- https://github.com/neondatabase/neon/issues/1751 + GC and compaction file activities are not well known outside their tasks code, causing race bugs + +- https://github.com/neondatabase/neon/issues/2003 + Even the tenant management gets affected: we have to alter its state based on timeline state, yet the data for making the decision is separated and the synchronisation logic has bugs + +- more issues were brought in discussions, but apparently they were too specific to the code to mention them in the issues. + For instance, `tenant_mgr` itself is a static object that we can not mock anyhow, which reduces our capabilities to test the data synchronization logic. + In fact, we have zero Rust tests that cover the case of synchronizing more than one module's data. + +On demand layer downloads would require us to dynamically manage the layer files, which we are almost not doing at all on the module level, resulting in most of their APIs dealing with timelines, rather than the layer files. +The disjoint data would require data synchronization with possibly a chain of lock acquisitions, some async and some sync, and it would be hard to unit test it with the current code state. + +This neither helps to easily start the on-demand download epic, nor makes it easy to add more timeline-related code on top, whatever the task is. 
+We have to develop a vision on a number of topics before progressing safely: + +- timeline and tenant data structure and how we should access it +- sync and async worlds and in what way that should evolve +- unit tests for the complex logic + +This RFC aims to provide a general overview of the existing situation and propose ways to improve it. +The changes proposed are quite big and no single PR is expected to do the adjustments, they should gradually be done during the on-demand download work later. + +## What is a timeline and its data + +First, we need to define what data we want to manage per timeline. +Currently, the data every timeline operates is: + +- a set of layer files, on the FS + + Never updated files, created after pageserver's checkpoints and compaction runs, can be removed from the local FS due to compaction, gc or timeline deletion. + +- a set of layer files, on the remote storage + + Identically named and placed in tenant subdirectories files on the remote storage (S3), copied by a special background sync thread + +- a `metadata` file, on the FS + + Updated after every checkpoint with the newer `disk_consistent_lsn` and `latest_gc_cutoff_lsn` values. Used to quickly restore timeline's basic metadata on pageserver restart. + Also contains data about the ancestor, if the timeline was branched off another timeline. + +- an `index_part.json` file, on the remote storage + + Contains `metadata` file contents and a list of layer files, available in the current S3 "directory" for the timeline. + Used to avoid potentially slow and expensive `S3 list` command, updated by the remote storage sync thread after every operation with the remote layer files. + +- LayerMap and PageCache, in memory + + Dynamic, used to store and retrieve the page data to users. + +- timeline info, in memory + + LSNs, walreceiver data, `RemoteTimelineIndex` and other data to share via HTTP API and internal processes. 
+ +- metrics data, in memory + + Data to push or provide to Prometheus, Opentelemetry, etc. + +Besides the data, every timeline currently needs an etcd connection to receive WAL events and connect to safekeepers. + +Timeline could be an ancestor to another one, forming a dependency tree, which is implicit right now: every time relations are looked up in place, based on the corresponding `TimelineMetadata` struct contents. +Yet, there's knowledge on a tenant as a group of timelines, belonging to a single user which is used in GC and compaction tasks, run on every tenant. +`tenant_mgr` manages tenant creation and its task startup, along with the remote storage sync for timeline layers. + +Last file being managed per-tenant is the tenant config file, created and updated on the local FS to hold tenant-specific configuration between restarts. +It's not yet anyhow synchronized with the remote storage, so only exists on the local FS. + +### How the data is stored + +We have multiple places where timeline data is stored: + +- `tenant_mgr` [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L43) a static `static ref TENANTS: RwLock>` with the `Tenant` having the `local_timelines: HashMap>` inside + +- same `Tenant` above has actually two references to timelines: another via its `repo: Arc` with `pub type RepositoryImpl = LayeredRepository;` that [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L178) `Mutex>` + +- `RemoteTimelineIndex` [contains](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync/index.rs#L84) the metadata about timelines on the remote storage (S3) for sync reasons and possible HTTP API queries + +- `walreceiver` [stores](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver.rs#L60) the metadata for 
possible HTTP API queries and its [internal state](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver/connection_manager.rs#L245) with a reference to the timeline, its current connections and etcd subscription (if any) + +- `PageCache` contains timeline-related data, and is created globally for the whole pageserver + +- implicitly, we also have files on local FS, that contain timeline state. We operate on those files and for some operations (GC, compaction) yet we don't anyhow synchronize the access to the files per se: there are more high-level locks, ensuring only one of a group of operations is running at a time. + + In practice though, `LayerMap` and layer files are tightly coupled together: current low-level code requires a timeline to be loaded into the memory to work with it, and the code removes the layer files after removing the entry from the `LayerMap` first. + +Based on this, a high-level pageserver's module diagram with data and entities could be: + +![timeline tenant state diagram](./images/017-timeline-data-management/timeline_tenant_state.svg) + +A few comments on the diagram: + +- the diagram does not show all the data and replaces a few newtypes and type aliases (for example, completely ignores "unloaded" timelines due to reasons described below) + + It aims to show main data and means of synchronizing it. + +- modules tend to isolate their data inside and provide access to it via API + +Due to multitenancy, that results in a common pattern for storing both tenant and timeline data: `RwLock` or `Mutex` around the `HashMap`, gc and compaction tasks also use the same lock pattern to ensure no concurrent runs are happening. + +- part of the modules is asynchronous, while the other is not, that complicates the data access + +Currently, anything that's not related to tasks (walreceiver, storage sync, GC, compaction) is blocking. 
+ +Async tasks that try to access the data in the sync world, have to call the `std::sync::Mutex::lock` method, which blocks the thread the calling async task runs on, also blocking other async tasks running in the same thread. Methods of `std::sync::RwLock` have the same issues, forcing async tasks either to block or spawn another, "blocking" task on a separate thread. + +Sync tasks that try to access the data in the async world, cannot use `.await` hence have to have some `Runtime` doing those calls for them. [`tokio::sync::Mutex`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.Mutex.html#method.blocking_lock) and [`tokio::sync::RwLock`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.RwLock.html#method.blocking_read) provide an API to simplify such calls. Similarly, both `std::sync` and `tokio::sync` have channels that are able to communicate in one direction without blocking or requiring `.await` calls, hence can be used to connect both worlds without locking. + +Some modules are in transition: they are started as async "blocking" tasks, yet all of their code below the entry point is fully synchronous. The current idea is to move them further towards async, but it's not yet done. + +- locks are used in two different ways: + + - `RwLock>` ones to hold the shared data and ensure its atomic updates + - `Mutex<()>` for synchronizing the tasks, used to implicitly order the data access + + The "shared data" locks of the first kind are mainly accessed briefly to either look up or alter the data, yet there are a few notable exceptions, such as + `latest_gc_cutoff_lsn: RwLock` that is explicitly held in a few places to prevent the GC thread from progressing. Those are covered later in the data access diagrams. + +- some synchronizations are not yet implemented + +E.g. asynchronous storage sync module does not synchronize with almost synchronous GC and compaction tasks when the layer files are uploaded to the remote storage. 
+That occasionally results in the files being deleted before the storage upload task is run for this layer, but due to the incremental nature of the layer files, we can handle such situations without issues. + +- `LayeredRepository` covers lots of responsibilities: GC and compaction task synchronisation, timeline access (`local_timelines` in `Tenant` is not used directly before the timeline from the repository is accessed), layer flushing to FS, layer sync to remote storage scheduling, etc. + +### How is this data accessed? + +There are multiple ways the data is accessed, from different sources: + +1. [HTTP requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/http/routes.rs) + +High-level CRUD API for managing tenants, timelines and getting data about them. +Current API list (modified for readability): + +```rust +.get("/v1/status", status_handler) // pageserver status +.get("/v1/tenant", tenant_list_handler) +.post("/v1/tenant", tenant_create_handler) // can create "empty" timelines or branch off the existing ones +.get("/v1/tenant/:tenant_id", tenant_status) // the only tenant public metadata +.put("/v1/tenant/config", tenant_config_handler) // tenant config data and local file manager +.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) +.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) +.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) // download entire tenant from the remote storage and load its timelines memory +.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) // delete all tenant timelines from memory, remote corresponding storage and local FS files +.get("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler) +.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler) +.get("/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", wal_receiver_get_handler) // get walreceiver stats metadata +``` + +Overall, neither 
HTTP operation goes below `LayeredRepository` level and does not interact with layers: instead, they manage tenant and timeline entities, their configuration and metadata. + +`GET` data is small (relative to layer files contents), updated via brief `.write()/.lock()` calls and read via copying/cloning the data to release the lock soon. +It does not mean that the operations themselves are short, e.g. `tenant_attach_handler` downloads multiple files from the remote storage which might take time, yet the final data is inserted in memory via one brief write under the lock. + +Non-`GET` operations mostly follow the same rule, with two differences: + +- `tenant_detach_handler` has to wait for its background tasks to stop before shutting down, which requires more work with locks +- `timeline_create_handler` currently requires GC to be paused before branching the timeline, which requires orchestrating too. + This is the only HTTP operation, able to load the timeline into memory: rest of the operations are reading the metadata or, as in `tenant_attach_handler`, schedule a deferred task to download timeline and load it into memory. + +"Timeline data synchronization" section below describes both complex cases in more details. + +2. [libpq requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/page_service.rs) + +Is the main interface of pageserver, intended to handle libpq (and similar) requests. +Operates on `LayeredTimeline` and, lower, `LayerMap` modules; all timelines accessed during the operation are loaded into memory immediately (if not loaded already), operations bail on timeline load errors. + +- `pagestream` + + Page requests: `get_rel_exists`, `get_rel_size`, `get_page_at_lsn`, `get_db_size` + + Main API points, intended to be used by `compute` to show the data to the user. 
All require requests to be made at a certain Lsn; if this Lsn is not available in memory, request processing is paused until it becomes available, or bails after a timeout. + +- `basebackup` and `fullbackup` + + Options to generate postgres-compatible backup archives. + +- `import basebackup` + +- `import wal` + + Import the `pg_wal` section of the basebackup archive. + +- `get_last_record_rlsn`, `get_lsn_by_timestamp` + +"Metadata" retrieval methods, that still require internal knowledge about layers. + +- `set`, `failpoints`, `show` + +Utility methods to support various edge cases or help with debugging/testing. + +- `do_gc`, `compact`, `checkpoint` + +Manual triggers for corresponding tenant tasks (GC, compaction) and inmemory layer flushing on disk (checkpointing), with upload task scheduling as a follow-up. + +Apart from loading into memory, every timeline layer has to be accessed using a specific set of locking primitives, especially if a write operation happens: otherwise, GC or compaction might spoil the data. User API is implicitly affected by this synchronization during branching, when a GC has to be orchestrated properly before the new timeline could be branched off the existing one. +See "Timeline data synchronization" section for the united synchronization diagram on the topic. + +3. internal access + +Entities within pageserver that update files on local FS and remote storage, metadata in memory; they have to use internal data for those operations. +Places that access internal, lower data are also required to have the corresponding timeline successfully loaded into memory and accessed with corresponding synchronization. + +If ancestors' data is accessed via its child branch, it means more than one timeline has to be loaded into memory entirely and more locking primitives are involved. 
+Right now, all ancestors are resolved in-place: every place that has to check timeline's ancestor has to lock the timelines map, check if one is loaded into the memory, load it there or bail if it's not present, and get the information required and so on. + +- periodic GC and compaction tasks + +Alter metadata (GC info), in-memory data (layer relations, page caches, etc.) and layer files on disk. +Same as its libpq counterparts, needs full synchronization with the low level layer management code. + +- storage sync task + +Alters metadata (`RemoteTimelineIndex`), layer files on remote storage (upload, delete) and local FS (download) and in-memory data (registers downloaded timelines in the repository). +Currently, does not know anything about layer files contents, rather focusing on the file structure and metadata file updates: due to the fact that the layer files cannot be updated (only created or deleted), storage sync is able to back up the files to the remote storage without further low-level synchronizations: only when the timeline is downloaded, a load operation is needed to run, possibly pausing GC and compaction tasks. + +- walreceiver and walingest task + +Per timeline, subscribes for etcd events from safekeeper and eventually spawns a walreceiver connection task to receive WAL from a safekeeper node. +Fills memory with data, eventually triggering a checkpoint task that creates a new layer file in the local FS and schedules a remote storage sync upload task. +During WAL receiving, also updates a separate in-memory data structure with the walreceiver stats, used later via HTTP API. + +Layer updates require low-level set of sync primitives used to preserve the data consistency. + +- checkpoint (layer freeze) task + +Periodic, short-lived tasks to generate a new layer file in the FS. 
Requires low level synchronization in the end, when the layer is being registered after creating and has additional mode to ensure only one concurrent compaction happens at a time. + +### Timeline data synchronization + +Here's a high-level timeline data access diagram, considering the synchronization locks, based on the state diagram above. + +For brevity, diagrams do not show `RwLock>` data accesses, considering them almost instant to happen. +`RwLock` is close to be an exception to the previous rule, since it's taken in a multiple places to ensure all layers are inserted correctly. +Yet the only long operation in the current code is a `.write()` lock on the map during its creation, while all other lock usages tend to be short in the current code. +Note though, that due to current "working with loaded timeline only", prevailing amount of the locks taken on the struct are `.write()` locks, not the `.read()` ones. +To simplify the diagrams, these accesses are now considered "fast" data access, not the synchronization attempts. + +`write_lock` synchronization diagram: + +![timeline data access synchronization(1)](./images/017-timeline-data-management/timeline_data_access_sync_1.svg) + +Comments: + +- `write_lock: Mutex<()>` ensures that all timeline data being written into **in-memory layers** is done without races, one concurrent write at a time +- `layer_flush_lock: Mutex<()>` and layer flushing seems to be slightly bloated with various ways to create a layer on disk and write it in memory + The lock itself seem to repeat `write_lock` purpose when it touches in-memory layers, and also to limit the on-disk layer creations. 
+ Yet the latter is not really done consistently, since remote storage sync manages to download and register the new layers without touching the locks +- `freeze_inmem_layer(true)` that touches both `write_lock` and `layer_flush_lock` seems not very aligned with the rest of the locks to those primitives; it also now restricts the layer creation concurrency even more, yet there are various `freeze_inmem_layer(false)` that are ignoring those restrictions at the same time + +![timeline data access synchronization(2)](./images/017-timeline-data-management/timeline_data_access_sync_2.svg) + +Comments: + +- `partitioning: Mutex<(KeyPartitioning, Lsn)>` lock is a data sync lock that's not used to synchronize the tasks (all other such kinds were considered "almost instant" and omitted from the diagram), yet is very similar to what `write_lock` and `layer_flush_lock` do: it ensures the timeline in-memory data is up-to-date with the layer files state on disk, which is what `LayerMap` is for. + +- there are multiple locks that do similar task management operations: + - `gc_cs: Mutex<()>` and `latest_gc_cutoff_lsn: RwLock` ensure that branching and gc are not run concurrently + - `layer_removal_cs: Mutex<()>` lock ensures gc, compaction and timeline deletion via HTTP API do not run concurrently + - `file_lock: RwLock<()>` is used as a semaphore, to ensure "all" gc and compaction tasks are shut down and do not start + Yet that lock covers only the gc and compaction runs from the internal loops: those triggered by a libpq call are neither cancelled nor waited upon. + +Those operations do not seem to belong to a timeline. Moreover, some of those could be eliminated entirely due to duplication of their tasks. 
+ +## Proposed implementation + +### How to structure timeline data access better + +- adjust tenant state handling + +Current [`TenantState`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L108) [changes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L317) mainly indicates whether GC and compaction tasks are running or not; another state, `Broken` shows only in case any timeline does not load during startup. + +We could start both GC and compaction tasks at the time the tenant is created and adjust the tasks to throttle/sleep on timeline absence and wake up when the first one is added. +The latter becomes more important on download on demand, since we won't have the entire timeline in reach to verify its correctness. Moreover, if any network connection happens, the timeline could fail temporarily and entire tenant should be marked as broken due to that. + +Since nothing verifies the `TenantState` via HTTP API currently, it makes sense to remove the whole state entirely and don't write the code to synchronize its changes. +Instead, we could indicate internal issues for every timeline and have a better API to "stop" timeline processing without deleting its data, making our API less restrictive. + +- remove the "unloaded" status for the timeline + +Current approach to timeline management [assumes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L486-L493) + +```rust +#[derive(Clone)] +enum LayeredTimelineEntry { + Loaded(Arc), + Unloaded { + id: ZTimelineId, + metadata: TimelineMetadata, + }, +} +``` + +supposes that timelines have to be in `Unloaded` state. + +The difference between both variants is whether its layer map was loaded from disk and kept in memory (Loaded) or not (Unloaded). 
+The idea behind such separation was to lazy load timelines in memory with all their layers only after its first access and potentially unload them later. + +Yet now there's no public API methods, that deal with unloaded timelines' layers: all of them either bail when such timeline is worked on, or load it into memory and continue working. +Moreover, every timeline in the local FS is loaded on pageserver startup now, so only two places where `Unloaded` variant is used are branching and timeline attach, with both loading the timeline into memory before the end of the operation. +Even if that loading into memory bails for some reason, next GC or compaction task periodic run would load such timeline into memory. +There are a few timeline methods that return timeline metadata without loading its layers, but such metadata also comes from the `metadata` FS file, not the layer files (so no page info could be retrieved without loading the entire layer map first). + +With the layer on-demand download, it's not feasible anymore to wait for the entire layer map to be loaded into the memory, since it might not even be available on the local FS when requested: `LayerMap` needs to be changed to contain metadata to retrieve the missing layers and handle partially present on the local FS timeline state. + +To accommodate to that and move away from the redundant status, a timeline should always be "loaded" with its metadata read from the disk and its layer map prepared to be downloaded when requested, per layer. + +Layers in the layer map, on the other hand, could be in various state: loaded, unloaded, downloading, downloading failed, etc. and their state has to be handled instead, if we want to support on-demand download in the future. + +This way, tenants and timelines could always try to serve requests and do their internal tasks periodically, trying to recover. 
+ +- scale down the remote storage sync to per layer file, not per timeline as now + +Due to the reasons from the previous bullet, current remote storage model needs its timeline download approach to be changed. +Right now, a timeline is marked as "ready" only after all its layers on the remote storage are downloaded on the local storage. +With the on-demand download approach, only remote storage timeline metadata should be downloaded from S3, leaving the rest of the layers ready for download if/when it's requested. + +Note: while the remote storage sync should operate per layer, it should stay global for all tenants, to better manage S3 limits and sync queue priorities. +Yet the only place using remote storage should be the layer map. + +- encapsulate `tenant_mgr` logic into a regular Rust struct, unite with part of the `Repository` and anything else needed to manage the timeline data in a single place and to test it independently + +[`Repository`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/repository.rs#L187) trait gets closer to `tenant_mgr` in terms of functionality: there are two background task-related functions, that are run on all timelines of a tenant: `gc_iteration` (it does allow running on a single timeline, but GC task runs it on all timelines) and `compaction_iteration` that are related to service tasks, not the data storage; and the metadata management functions, also not really related to the timeline contents. + +`tenant_mgr` proxies some of the `Repository` calls, yet both service tasks use `tenant_mgr` to access the data they need, creating a circular dependency between their APIs. +To avoid excessive synchronization between components, taking multiple locks for that and static state, we can organize the data access and updates in one place. +One potential benefit Rust gets from this is the ability to track and manage timeline resources, if all the related data is located in one place. 
+ +- move `RemoteStorage` usage from `LayeredRepository` into `LayerMap`, as the rest of the layer-based entities (layer files, etc.) + +Layer == file in our model, since pageserver always either tries to load the LayerMap from disk for the timeline not in memory, or assumes the file contents matches its memory. +`LayeredRepository` is one of the most loaded objects currently and not everything from it deserves unification with the `tenant_mgr`. +In particular, layer files need to be better prepared for future download on demand functionality, where every layer could be dynamically loaded and unloaded from memory and local FS. +Current amount of locks and sync-async separation would make it hard to implement truly dynamic (un)loading; moreover, we would need retries with backoffs, since the unloaded layer files are most probably not available on the local FS either and network is not always reliable. + +One of the solutions to the issue is already being developed for the remote storage sync: [SyncQueue](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync.rs#L463) +The queue is able to batch CRUD layer operations (both for local and remote FS contexts) and reorder them to increase the sync speed. +Similar approach could be generalized for all layer modifications, including in-memory ones such as GC or compaction: this way, we could manage all layer modifications and reads in one place with lesser locks and tests that are closer to unit tests. + +- change the approach to locking synchronization + +A number of locks in the timeline seem to be used to coordinate gc, compaction tasks and related processes. +It should be done in a task manager or other place, external to the timeline. + +Timeline contents still needs to be synchronized, considering the task work, so fields like `latest_gc_cutoff_lsn: RwLock` are expected to stay for that purpose, but general amount of locks should be reduced. 
+ +### Putting it all together + +If the proposal bullets are applied to the diagrams above, the state could be represented as: + +![timeline timeline tenant state](./images/017-timeline-data-management/proposed_timeline_tenant_state.svg) + +The reordering aims to put all tasks into separate modules, with strictly defined interfaces and as little knowledge about other components as possible. +This way, all timeline data is now in the `data_storage`, including the GC, walreceiver, `RemoteTimelineIndex`, `LayerMap`, etc. with some API to get the data in the way, +more convenient for the data sync system inside. +So far, it seems that a few maps with `Arc<RwLock<SeparateData>>` are enough, with actual data operations added inside each `SeparateData` struct, if needed. + +`page_cache` is proposed to be placed into the same `data_storage` since it contains tenant timelines' data: this way, all metadata and data is in the same struct, simplifying things with Rust's borrow checker and allowing us to share internals between data modules and later might simplify timeline in-memory size tracking. + +`task_manager` is related to data storage and manages all tenant and timeline tasks, manages shared resources (runtimes, thread pools, etcd connection, etc.) and synchronizes tasks. +All locks such as `gc_cs` belong to this module tree, as primitives inherently related to the task synchronization. +Tasks have to access timelines and their metadata, but should do that through `data_storage` API and similar. + +`task_manager` should (re)start, stop and track all tasks that are run in it, selecting an appropriate runtime depending on a task kind (we have async/sync task separation, CPU and IO bound tasks separation, ...) +Some locks, such as the `layer_removal_cs` one, are not needed, if the only component that starts the tasks ensures they don't run concurrently. 
+ +`LayeredTimeline` is still split into two parts: a more high-level one with whatever primitives are needed to sync its state, and the actual state storage with `LayerMap` and other low level entities. +Only `LayerMap` knows what storage its layer files are taken from (inmem, local FS, etc.), and it's responsible for synchronizing the layers when needed, as well as reacting to sync events, successful or not. + +Last but not least, `tenant config file` has to be backed up to the remote storage, as tenant-specific information for all timelines. +Tenant and timelines have volatile information that's now partially mixed with constant information (e.g. fields in `metadata` file); that model should be better split and handled, in case we want to properly support its backups and synchronization. + +![proposed timeline data access synchronization(1)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg) + +There's still a need to keep inmemory layer buffer synchronized during layer freezing, yet that could happen on a layer level, not on a timeline level, as `write_lock` used to be, so we could lower the sync primitives one layer deeper, preparing us for download on demand feature, where multiple layers could be concurrently streamed and written from various data sources. + +Flushing the frozen layer requires creating a new layer on disk and further remote storage upload, so `LayerMap` has to get those flushed bytes and queue them later: no need to block in the timeline itself for anything again, rather locking on the layer level, if needed. 
+ +![proposed timeline data access synchronization(2)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg) + +Lock diagrams legend: + +![lock diagrams legend](./images/017-timeline-data-management/lock_legend.svg) + +After the frozen layers are flushed, something has to ensure that the layer structure is intact, so a repartitioning lock is needed still, and could also guard the layer map structure changes, since both are needed either way. +This locking belongs to the `LowLevelLayeredTimeline` from the proposed data structure diagram, as the place with all such data being held. + +Similarly, branching is still required to be done after certain Lsn in our current model, but this needs only one lock to synchronize and that could be the `gc_cs: Mutex<()>` lock. +It raises the question of where this lock has to be placed, it's the only place that requires pausing a GC task during external, HTTP request handling. +The right place for the lock seems to be the `task_manager` that could manage GC in more fine-grained way to accommodate the incoming branching request. + +There's no explicit lock sync between GC, compaction or other mutually exclusive tasks: it is a job of the `task_manager` to ensure those are not run concurrently. diff --git a/docs/rfcs/images/017-timeline-data-management/lock_legend.svg b/docs/rfcs/images/017-timeline-data-management/lock_legend.svg new file mode 100644 index 0000000000..d6d2bc00ae --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/lock_legend.svg @@ -0,0 +1,4 @@ + + + +
Lock interaction legend:

Lock interaction legend:...
LOCK NAME
LOCK NAME
LOCK NAME
LOCK NAME
Event flow
Event flow
or
or
lock acquisition, 
every lock is shown with a single line
Different lines of the same shape denote different locks
lock acquisition,...
Continuous lock acquisition,
lock release is explicitly shown later
Continuous lock acquisition,...
Lock release
Lock release
Instant lock acquisition and release
Instant lock acquisition and rele...
Lock details (RwLock/Mutex)
are shown on the corresponding arrows
and lock names
Lock details (RwLock/Mutex)...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg new file mode 100644 index 0000000000..d1c97d1738 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg @@ -0,0 +1,4 @@ + + + +
walreceiver loop
walreceiver loop
DatadirModification::flush after every file
DatadirModification::flush aft...
HTTP API call
to create an empty timeline
HTTP API call...
libpq call
to import basebackup archive
libpq call...
libpq call
to import wal
libpq call...
zenith.signal
file processed
zenith.signal...
process timeline wal
(walingest)
process timeline wal...
DatadirModification::commit
DatadirModification::commit
process timeline wal
(walingest)
process timeline wal...
process timeline wal
(walingest)
process timeline wal...
process timeline files
process timeline files
DatadirModification::commit
DatadirModification::commit
layer_write_lock.lock()
layer_write_lock.lock()
timeline::writer call
timeline::writer call
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
after all files processed
after all files processed
and
and
timeline::writer call
timeline::writer call
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
libpq call
to checkpoint
libpq call...
checkpoint(Forced)
checkpoint(Forced)
libpq call
to do_gc
libpq call...
checkpoint(Flush)
checkpoint(Flush)
shutdown() system call
shutdown() system call
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex inside the repo
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex i...
held through entire freezing
held through entire freezing
flush_frozen_layers
schedules the operation into LayerMap
flush_frozen_layers...

freeze_inmem_layer(true)

freeze_inmem_layer(true)...
checkpoint(Flush)
checkpoint(Flush)
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg new file mode 100644 index 0000000000..81918fcd98 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg @@ -0,0 +1,4 @@ + + + +
libpq pagerequest calls
basebackup
libpq pagerequest calls...
libpq do_gc call
libpq do_gc call
periodic GC
periodic GC
checkpoint(Forced)
checkpoint(Forced)
periodic compaction
periodic compaction
gc
gc
compact
compact
partitioning.lock()
partitioning.lock()
gc
gc
compact
compact
HTTP API call
to branch a timeline
HTTP API call...
checkpoint(Forced)
checkpoint(Forced)
takes the lock when ready to do gc
holds during entire operation
takes the lock when ready to do gc...
gc_cs.lock()
gc_cs.lock()
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
other checkpoint sources
other checkpoint sources
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
holds lock during
entire operation
holds lock during...
holds lock during
entire branching
holds lock during...
wait_or_get_last_lsn
@
page request Lsn
wait_or_get_last_lsn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg new file mode 100644 index 0000000000..207017fb1b --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg @@ -0,0 +1,4 @@ + + + +sLayer 1Layer 2
contained in
contained in
metadataLayer 1
...
...
...
...
index_part.json
Files in the remote storage
Files in the remote storage
Files in the local FS
Files in the local FS
Tenanta number of maps with Arc&lt;RwLock&lt;Data&gt;&gt; patternfor tenants, timelines, gc, walreceiver, remote storage, etc. metadataLayeredTimelinewrite_lock: Mutex&lt;()&gt;latest_gc_cutoff_lsn: RwLock&lt;Lsn&gt;process: Mutex&lt;Option&lt;PostgresRedoProcess&gt;&gt; inside               PostgresRedoManagercompactionPeriodically runs on all tenant timelines, each processed separately. Merges (removes and adds) layer fileswalreceiver tasksetcd subscriptions, periodic timeline writes and checkpointstenant config fileLowLevelLayeredTimelinepartitioning: Mutex&lt;(KeyPartitioning, Lsn)&gt;layers: RwLock&lt;LayerMap&gt;
tenant contains timeline layer data
tenant con...
remote storage syncstorage sync queue and S3 connectionsperiodically writes into the remote indexgcPeriodically runs on all tenant timelines, with shared context.Removes layer files
Tasks interact with layers, via LayerMap
Tasks interact with layers, via LayerMap
task_managerruntime, threadpools, shared connections (etcd), etc.logic to manage tenant/timeline taskstenant config file in any form
layer map schedules sync tasks
and calls logic on their completion
layer map schedules sync tasks...
page cachematerialized_page_map: RwLock<HashMap<...>>ephemeral_page_map: RwLock<HashMap<...>>immutable_page_map: RwLock<HashMap<...>>tenant storageHashMap<TenantId, Tenant>Tenant state information, its sync and task manager interaction
layer map manages local and remote files
in a queue-based manner
layer map manages local and remote files...
tasks update or read metadata via the storage
tasks update or read metadata via the storage
Legend:
Legend:
interaction between components,
arrows show which component does the data access
interaction between components,...
data relation,
arrows show where current data is contained in
data relation,...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg new file mode 100644 index 0000000000..b968fedd8c --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg @@ -0,0 +1,4 @@ + + + +
walreceiver loop
walreceiver loop
DatadirModification::flush after every file
DatadirModification::flush aft...
HTTP API call
to create an empty timeline
HTTP API call...
libpq call
to import basebackup archive
libpq call...
libpq call
to import wal
libpq call...
zenith.signal
file processed
zenith.signal...
process timeline wal
(walingest)
process timeline wal...
DatadirModification::commit
DatadirModification::commit
process timeline wal
(walingest)
process timeline wal...
process timeline wal
(walingest)
process timeline wal...
process timeline files
process timeline files
DatadirModification::commit
DatadirModification::commit
write_lock.lock()
w...
timeline::writer call
timeline::writer call
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
after all files processed
after all files processed
and
and
timeline::writer call
timeline::writer call
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
checkpoint(Flush)
checkpoint(Flush)
check_checkpoint_distance
check_checkpoint_distance
checkpoint(Forced)
checkpoint(Forced)
libpq call
to checkpoint
libpq call...
checkpoint(Forced)
checkpoint(Forced)
libpq call
to do_gc
libpq call...
checkpoint(Flush)
checkpoint(Flush)
shutdown() system call
shutdown() system call
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex inside the repo
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex i...
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
held through entire freezing
h...
 layer_flush_lock.lock() 
...
skips both flushes if the lock is taken
s...
skips the flush if the lock is taken 
s...
always waits for the lock
and runs
frozen layers flush 
holding the lock
always waits f...
flush_frozen_layers(false)
flush_frozen_layers(false)

freeze_inmem_layer(true)

freeze_inmem_layer(true)...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg new file mode 100644 index 0000000000..382d834517 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg @@ -0,0 +1,4 @@ + + + +
libpq pagerequest calls
basebackup
libpq pagerequest calls...
libpq do_gc call
libpq do_gc call
periodic GC
periodic GC
checkpoint(Forced)
checkpoint(Forced)
periodic compaction
periodic compaction
tenant idle/detach
shutdown
tenant idle/detach...
gc
gc
compact
compact
lock is held for
almost entire operations
lock is held for...
RwLock(file_lock)
RwLock(file_lock)
read
read
read
read
write
write
HTTP API call
delete timeline
HTTP API call...
layer_removal_cs.lock()
layer_removal_cs.lock()
lock is held for
the entire operation
lock is held for...
partitioning.lock()
partitioning.lock()
gc
gc
compact
compact
HTTP API call
to branch a timeline
HTTP API call...
gc_cs.lock()
gc_cs.lock()
held during entire
branching
held during entire...
checkpoint(Forced)
checkpoint(Forced)
write updated value,
release the lock
write updated value,...
RwLock(latest_gc_cutoff_lsn)
RwLock(latest_gc_cutoff_lsn)
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
other checkpoint sources
other checkpoint sources
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
holds read during
entire operation
holds read during...
holds read during
entire branching
holds read during...
wait_or_get_last_lsn
@
page request Lsn
wait_or_get_last_lsn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg b/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg new file mode 100644 index 0000000000..c4bc36f309 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg @@ -0,0 +1,4 @@ + + + +                                             Tasks                                                                                                                                                                   StateLayer 1Layer 2
contained in
contained in
metadataLayer 1
...
...
...
...
index_part.json
Files in the remote storage
Files in the remote storage
Files in the local FS
Files in the local FS
LayeredRepositorytimelines: Mutex<HashMap<TimelineId, LayeredTimeline>>gc_cs: Mutex<()>file_lock: RwLock<()>tenant_conf: Arc<RwLock<TenantConfOpt>>remote_index: Arc<RwLock<HashMap<                        TenantTimelineId, RemoteTimelineMetadata>>tenant_mgrstatic ref TENANTS: RwLock<HashMap<TenantId, Tenant>>Tenantstate: TenantStaterepo: Arc<LayeredRepository>local_timelines: HashMap<TimelineId, Arc<DatadirTimelineImpl>>PageCachematerialized_page_map: RwLock<HashMap<...>>ephemeral_page_map: RwLock<HashMap<...>>immutable_page_map: RwLock<HashMap<...>>DatadirTimelineImplpartitioning: Mutex<(KeyPartitioning, Lsn)>tline: Arc<LayeredTimeline>compactionPeriodically runs on all tenant timelines, each processed separately. Merges (removes and adds) layer fileswalreceiver tasksetcd subscriptions, periodic timeline writes and checkpointstenant config fileLayeredTimelinewrite_lock: Mutex<()>layer_flush_lock: Mutex<()>layer_removal_cs: Mutex<()>latest_gc_cutoff_lsn: RwLock<Lsn>tenant_conf: Arc<RwLock<TenantConfOpt>>gc_info: RwLock<GcInfo>process: Mutex<Option<PostgresRedoPorcess>> inside               PostgresRedoManagerlayers: RwLock<LayerMap>layer flush taskPer timeline, moves in-memory data to disk when scheduled (adds layers)remote storage sync taskstorage sync queue and S3 connectionsperiodically writes into the remote indexgcPeriodically runs on all tenant timelines, with shared context.Removes layer files
Backed by repository:
Backed by repository:
get page requests lookup and update
get page requests lookup and update
flushes new files on disk, loads existing into memory
flushes new files on disk, loads existing into memory
Tasks interact with files on disk, full CRUD
Remote storage sync task is the only one to interact with other storage
Tasks interact with files on disk, full CRUD...
schedules layer sync
schedules layer sync
Text is not SVG - cannot display
\ No newline at end of file From 35761ac6b6f4daee78bcaabd083e88ec3b877958 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Tue, 13 Sep 2022 23:55:18 +0200 Subject: [PATCH 19/33] docs/sourcetree: add info about IDE config (#2332) --- docs/sourcetree.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index f3bc9230e2..339a90e0ba 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -134,3 +134,42 @@ Also consider: To add new package or change an existing one you can use `poetry add` or `poetry update` or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case. More details are available in poetry's [documentation](https://python-poetry.org/docs/). + +## Configuring IDEs +Neon consists of three projects in different languages which use different project models. + +* A bunch of Rust crates, all available from the root `Cargo.toml`. +* Integration tests in Python in the `test_runner` directory. Some stand-alone Python scripts exist as well. +* Postgres and our Postgres extensions in C built with Makefiles under `vendor/postgres` and `pgxn`. + +### CLion +You can use CLion with the [Rust plugin](https://plugins.jetbrains.com/plugin/8182-rust) to develop Neon. It should pick up Rust and Python projects whenever you open Neon's repository as a project. We have not tried setting up a debugger, though. + +C code requires some extra care, as it's built via Make, not CMake. Some of our developers have successfully used [compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_generate) for CLion. It is a JSON file which lists all C source files and corresponding compilation keys. CLion can use it instead of `CMakeLists.txt`. To set up a project with a compilation database: + +1. Clone the Neon repository and install all dependencies, including Python. Do not open it with CLion just yet. +2. 
Run the following commands in the repository's root: + ```bash + # Install a `compiledb` tool which can parse make's output and generate the compilation database. + poetry add -D compiledb + # Run Make without actually compiling code so we can generate the compilation database. It still may take a few minutes. + make --dry-run --print-directory --keep-going --assume-new=* postgres neon-pg-ext | poetry run compiledb --verbose --no-build + # Uninstall the tool + poetry remove -D compiledb + # Make sure the compile_commands.json file is not committed. + echo /compile_commands.json >>.git/info/exclude + ``` +3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. _Do not_ open the directory as a project, open the file. +4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain). +5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code. +6. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. + +You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter. + +Whenever you change layout of C files, you may need to regenerate the compilation database. No need to re-create the CLion project, changes should be picked up automatically. + +Known issues (fixes and suggestions are welcome): + +* Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead. +* CLion does not support non-local Python interpreters, unlike PyCharm. E.g. 
if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited. +* Cargo Clippy diagnostics in CLion may take a lot of resources. From ba8698bbcbc4f3a4d46e0eeaa48cec3191c0d440 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 21:06:10 +0300 Subject: [PATCH 20/33] update neon_local output in readme --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 977afc2a2c..03ed57a0fa 100644 --- a/README.md +++ b/README.md @@ -125,16 +125,18 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > ./target/debug/neon_local init -initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c -created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50 -initial timeline de200bd42b49cc1814412c7e592dd6e9 created -pageserver init succeeded +Starting pageserver at '127.0.0.1:64000' in '.neon' + +Pageserver started +Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7 +Stopping pageserver gracefully...done! 
# start pageserver and safekeeper > ./target/debug/neon_local start +Starting etcd broker using /usr/bin/etcd Starting pageserver at '127.0.0.1:64000' in '.neon' + Pageserver started -initializing for sk 1 for 7676 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1' Safekeeper started From 260ec20a0218f3da95a2393c9ba377049967dcb2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 23:58:27 +0300 Subject: [PATCH 21/33] Refotmat pgxn code, add typedefs.list that was used --- pgxn/neon/inmem_smgr.c | 28 +- pgxn/neon/libpagestore.c | 25 +- pgxn/neon/libpqwalproposer.c | 237 +- pgxn/neon/neon.c | 9 +- pgxn/neon/neon.h | 2 +- pgxn/neon/pagestore_client.h | 19 +- pgxn/neon/pagestore_smgr.c | 169 +- pgxn/neon/walproposer.c | 682 +++--- pgxn/neon/walproposer.h | 343 +-- pgxn/neon/walproposer_utils.c | 142 +- pgxn/neon/walproposer_utils.h | 26 +- pgxn/neon_test_utils/neontest.c | 30 +- pgxn/typedefs.list | 3776 +++++++++++++++++++++++++++++++ 13 files changed, 4691 insertions(+), 797 deletions(-) create mode 100644 pgxn/typedefs.list diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c index 13fd4d50b6..4926d759e8 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon/inmem_smgr.c @@ -188,10 +188,10 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { /* * We assume the buffer cache is large enough to hold all the buffers - * needed for most operations. Overflowing to this "in-mem smgr" in rare - * cases is OK. But if we find that we're using more than WARN_PAGES, - * print a warning so that we get alerted and get to investigate why - * we're accessing so many buffers. + * needed for most operations. Overflowing to this "in-mem smgr" in + * rare cases is OK. But if we find that we're using more than + * WARN_PAGES, print a warning so that we get alerted and get to + * investigate why we're accessing so many buffers. */ elog(used_pages >= WARN_PAGES ? 
WARNING : DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", @@ -207,7 +207,9 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, pg = used_pages; used_pages++; INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); - } else { + } + else + { elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -226,14 +228,14 @@ BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { /* - * It's not clear why a WAL redo function would call smgrnblocks(). - * During recovery, at least before reaching consistency, the size of a - * relation could be arbitrarily small, if it was truncated after the - * record being replayed, or arbitrarily large if it was extended - * afterwards. But one place where it's called is in - * XLogReadBufferExtended(): it extends the relation, if it's smaller than - * the requested page. That's a waste of time in the WAL redo - * process. Pretend that all relations are maximally sized to avoid it. + * It's not clear why a WAL redo function would call smgrnblocks(). During + * recovery, at least before reaching consistency, the size of a relation + * could be arbitrarily small, if it was truncated after the record being + * replayed, or arbitrarily large if it was extended afterwards. But one + * place where it's called is in XLogReadBufferExtended(): it extends the + * relation, if it's smaller than the requested page. That's a waste of + * time in the WAL redo process. Pretend that all relations are maximally + * sized to avoid it. */ return MaxBlockNumber; } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index d0572e66cb..55285a6345 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -153,11 +153,11 @@ static void pageserver_disconnect(void) { /* - * If anything goes wrong while we were sending a request, it's not - * clear what state the connection is in. 
For example, if we sent the - * request but didn't receive a response yet, we might receive the - * response some time later after we have already sent a new unrelated - * request. Close the connection to avoid getting confused. + * If anything goes wrong while we were sending a request, it's not clear + * what state the connection is in. For example, if we sent the request + * but didn't receive a response yet, we might receive the response some + * time later after we have already sent a new unrelated request. Close + * the connection to avoid getting confused. */ if (connected) { @@ -191,12 +191,13 @@ pageserver_send(ZenithRequest *request) * * In principle, this could block if the output buffer is full, and we * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output - * and TCP buffer. + * practice, our requests are small enough to always fit in the output and + * TCP buffer. */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { - char* msg = PQerrorMessage(pageserver_conn); + char *msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); neon_log(ERROR, "failed to send page request: %s", msg); } @@ -205,6 +206,7 @@ pageserver_send(ZenithRequest *request) if (message_level_is_interesting(PageStoreTrace)) { char *msg = zm_to_string((ZenithMessage *) request); + neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); } @@ -255,15 +257,16 @@ static void pageserver_flush(void) { if (PQflush(pageserver_conn)) - { - char* msg = PQerrorMessage(pageserver_conn); + { + char *msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); neon_log(ERROR, "failed to flush page requests: %s", msg); } } static ZenithResponse * -pageserver_call(ZenithRequest* request) +pageserver_call(ZenithRequest *request) { pageserver_send(request); pageserver_flush(); diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c index 2b2b7a1a6a..1f739f3722 
100644 --- a/pgxn/neon/libpqwalproposer.c +++ b/pgxn/neon/libpqwalproposer.c @@ -7,38 +7,40 @@ /* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ struct WalProposerConn { - PGconn* pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from libpqprop_async_read */ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from + * libpqprop_async_read */ }; /* Prototypes for exported functions */ -static char* libpqprop_error_message(WalProposerConn* conn); -static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); -static WalProposerConn* libpqprop_connect_start(char* conninfo); -static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); -static bool libpqprop_send_query(WalProposerConn* conn, char* query); -static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); -static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn); -static void libpqprop_finish(WalProposerConn* conn); -static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); -static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); -static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); +static char *libpqprop_error_message(WalProposerConn * conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn * conn); +static WalProposerConn * libpqprop_connect_start(char *conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn * conn); +static bool libpqprop_send_query(WalProposerConn * conn, char *query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn * conn); +static pgsocket libpqprop_socket(WalProposerConn * conn); +static int 
libpqprop_flush(WalProposerConn * conn); +static void libpqprop_finish(WalProposerConn * conn); +static PGAsyncReadResult libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size); -static WalProposerFunctionsType PQWalProposerFunctions = { +static WalProposerFunctionsType PQWalProposerFunctions = +{ libpqprop_error_message, - libpqprop_status, - libpqprop_connect_start, - libpqprop_connect_poll, - libpqprop_send_query, - libpqprop_get_query_result, - libpqprop_socket, - libpqprop_flush, - libpqprop_finish, - libpqprop_async_read, - libpqprop_async_write, - libpqprop_blocking_write, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_socket, + libpqprop_flush, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, + libpqprop_blocking_write, }; /* Module initialization */ @@ -52,7 +54,7 @@ pg_init_libpqwalproposer(void) /* Helper function */ static bool -ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking) { /* If we're already correctly blocking or nonblocking, all good */ if (is_nonblocking == conn->is_nonblocking) @@ -67,14 +69,14 @@ ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) } /* Exported function definitions */ -static char* -libpqprop_error_message(WalProposerConn* conn) +static char * +libpqprop_error_message(WalProposerConn * conn) { return PQerrorMessage(conn->pg_conn); } static WalProposerConnStatusType -libpqprop_status(WalProposerConn* conn) +libpqprop_status(WalProposerConn * conn) { switch (PQstatus(conn->pg_conn)) { @@ -87,35 +89,38 @@ libpqprop_status(WalProposerConn* conn) } } -static WalProposerConn* 
-libpqprop_connect_start(char* conninfo) +static WalProposerConn * +libpqprop_connect_start(char *conninfo) { - WalProposerConn* conn; - PGconn* pg_conn; + WalProposerConn *conn; + PGconn *pg_conn; pg_conn = PQconnectStart(conninfo); + /* - * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the - * behavior of PQconnectStart here. + * Allocation of a PQconn can fail, and will return NULL. We want to fully + * replicate the behavior of PQconnectStart here. */ if (!pg_conn) return NULL; /* - * And in theory this allocation can fail as well, but it's incredibly unlikely if we just - * successfully allocated a PGconn. + * And in theory this allocation can fail as well, but it's incredibly + * unlikely if we just successfully allocated a PGconn. * - * palloc will exit on failure though, so there's not much we could do if it *did* fail. + * palloc will exit on failure though, so there's not much we could do if + * it *did* fail. */ conn = palloc(sizeof(WalProposerConn)); conn->pg_conn = pg_conn; - conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ conn->recvbuf = NULL; return conn; } static WalProposerConnectPollStatusType -libpqprop_connect_poll(WalProposerConn* conn) +libpqprop_connect_poll(WalProposerConn * conn) { WalProposerConnectPollStatusType return_val; @@ -134,26 +139,34 @@ libpqprop_connect_poll(WalProposerConn* conn) return_val = WP_CONN_POLLING_OK; break; - /* There's a comment at its source about this constant being unused. We'll expect it's never - * returned. */ + /* + * There's a comment at its source about this constant being + * unused. We'll expect it's never returned. 
+ */ case PGRES_POLLING_ACTIVE: elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); - /* This return is never actually reached, but it's here to make the compiler happy */ + + /* + * This return is never actually reached, but it's here to make + * the compiler happy + */ return WP_CONN_POLLING_FAILED; default: Assert(false); - return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ } return return_val; } static bool -libpqprop_send_query(WalProposerConn* conn, char* query) +libpqprop_send_query(WalProposerConn * conn, char *query) { - /* We need to be in blocking mode for sending the query to run without - * requiring a call to PQflush */ + /* + * We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush + */ if (!ensure_nonblocking_status(conn, false)) return false; @@ -165,13 +178,13 @@ libpqprop_send_query(WalProposerConn* conn, char* query) } static WalProposerExecStatusType -libpqprop_get_query_result(WalProposerConn* conn) +libpqprop_get_query_result(WalProposerConn * conn) { - PGresult* result; + PGresult *result; WalProposerExecStatusType return_val; /* Marker variable if we need to log an unexpected success result */ - char* unexpected_success = NULL; + char *unexpected_success = NULL; /* Consume any input that we might be missing */ if (!PQconsumeInput(conn->pg_conn)) @@ -182,8 +195,11 @@ libpqprop_get_query_result(WalProposerConn* conn) result = PQgetResult(conn->pg_conn); - /* PQgetResult returns NULL only if getting the result was successful & there's no more of the - * result to get. */ + + /* + * PQgetResult returns NULL only if getting the result was successful & + * there's no more of the result to get. 
+ */ if (!result) { elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); @@ -191,7 +207,7 @@ libpqprop_get_query_result(WalProposerConn* conn) } /* Helper macro to reduce boilerplate */ - #define UNEXPECTED_SUCCESS(msg) \ +#define UNEXPECTED_SUCCESS(msg) \ return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ unexpected_success = msg; \ break; @@ -199,12 +215,12 @@ libpqprop_get_query_result(WalProposerConn* conn) switch (PQresultStatus(result)) { - /* "true" success case */ + /* "true" success case */ case PGRES_COPY_BOTH: return_val = WP_EXEC_SUCCESS_COPYBOTH; break; - /* Unexpected success case */ + /* Unexpected success case */ case PGRES_EMPTY_QUERY: UNEXPECTED_SUCCESS("empty query return"); case PGRES_COMMAND_OK: @@ -220,7 +236,7 @@ libpqprop_get_query_result(WalProposerConn* conn) case PGRES_PIPELINE_SYNC: UNEXPECTED_SUCCESS("pipeline sync point"); - /* Failure cases */ + /* Failure cases */ case PGRES_BAD_RESPONSE: case PGRES_NONFATAL_ERROR: case PGRES_FATAL_ERROR: @@ -230,7 +246,7 @@ libpqprop_get_query_result(WalProposerConn* conn) default: Assert(false); - return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ } if (unexpected_success) @@ -240,19 +256,19 @@ libpqprop_get_query_result(WalProposerConn* conn) } static pgsocket -libpqprop_socket(WalProposerConn* conn) +libpqprop_socket(WalProposerConn * conn) { return PQsocket(conn->pg_conn); } static int -libpqprop_flush(WalProposerConn* conn) +libpqprop_flush(WalProposerConn * conn) { return (PQflush(conn->pg_conn)); } static void -libpqprop_finish(WalProposerConn* conn) +libpqprop_finish(WalProposerConn * conn) { if (conn->recvbuf != NULL) PQfreemem(conn->recvbuf); @@ -267,9 +283,9 @@ libpqprop_finish(WalProposerConn* conn) * to this function. 
*/ static PGAsyncReadResult -libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount) { - int result; + int result; if (conn->recvbuf != NULL) { @@ -285,12 +301,11 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) return PG_ASYNC_READ_FAIL; } - /* The docs for PQgetCopyData list the return values as: - * 0 if the copy is still in progress, but no "complete row" is - * available - * -1 if the copy is done - * -2 if an error occured - * (> 0) if it was successful; that value is the amount transferred. + /* + * The docs for PQgetCopyData list the return values as: 0 if the copy is + * still in progress, but no "complete row" is available -1 if the copy is + * done -2 if an error occured (> 0) if it was successful; that value is + * the amount transferred. * * The protocol we use between walproposer and safekeeper means that we * *usually* wouldn't expect to see that the copy is done, but this can @@ -304,25 +319,28 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) *buf = NULL; return PG_ASYNC_READ_TRY_AGAIN; case -1: - { - /* - * If we get -1, it's probably because of a server error; the - * safekeeper won't normally send a CopyDone message. - * - * We can check PQgetResult to make sure that the server failed; - * it'll always result in PGRES_FATAL_ERROR - */ - ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message. 
+ * + * We can check PQgetResult to make sure that the server + * failed; it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); - if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); - /* If there was actually an error, it'll be properly reported by - * calls to PQerrorMessage -- we don't have to do anything else */ - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } + /* + * If there was actually an error, it'll be properly reported + * by calls to PQerrorMessage -- we don't have to do anything + * else + */ + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } case -2: *amount = 0; *buf = NULL; @@ -336,23 +354,25 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) } static PGAsyncWriteResult -libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) +libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size) { - int result; + int result; /* If we aren't in non-blocking mode, switch to it. 
*/ if (!ensure_nonblocking_status(conn, true)) return PG_ASYNC_WRITE_FAIL; - /* The docs for PQputcopyData list the return values as: - * 1 if the data was queued, - * 0 if it was not queued because of full buffers, or - * -1 if an error occured + /* + * The docs for PQputcopyData list the return values as: 1 if the data was + * queued, 0 if it was not queued because of full buffers, or -1 if an + * error occured */ result = PQputCopyData(conn->pg_conn, buf, size); - /* We won't get a result of zero because walproposer always empties the - * connection's buffers before sending more */ + /* + * We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more + */ Assert(result != 0); switch (result) @@ -366,16 +386,17 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) elog(FATAL, "invalid return %d from PQputCopyData", result); } - /* After queueing the data, we still need to flush to get it to send. - * This might take multiple tries, but we don't want to wait around - * until it's done. + /* + * After queueing the data, we still need to flush to get it to send. This + * might take multiple tries, but we don't want to wait around until it's + * done. 
* - * PQflush has the following returns (directly quoting the docs): - * 0 if sucessful, - * 1 if it was unable to send all the data in the send queue yet - * -1 if it failed for some reason + * PQflush has the following returns (directly quoting the docs): 0 if + * sucessful, 1 if it was unable to send all the data in the send queue + * yet -1 if it failed for some reason */ - switch (result = PQflush(conn->pg_conn)) { + switch (result = PQflush(conn->pg_conn)) + { case 0: return PG_ASYNC_WRITE_SUCCESS; case 1: @@ -388,16 +409,18 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) } static bool -libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) +libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size) { - int result; + int result; /* If we are in non-blocking mode, switch out of it. */ if (!ensure_nonblocking_status(conn, false)) return false; - /* Ths function is very similar to libpqprop_async_write. For more - * information, refer to the comments there */ + /* + * Ths function is very similar to libpqprop_async_write. 
For more + * information, refer to the comments there + */ if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) return false; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 62d2624e56..5346680b0b 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -29,7 +29,8 @@ PG_MODULE_MAGIC; void _PG_init(void); -void _PG_init(void) +void +_PG_init(void) { pg_init_libpagestore(); pg_init_libpqwalproposer(); @@ -59,9 +60,9 @@ pg_cluster_size(PG_FUNCTION_ARGS) Datum backpressure_lsns(PG_FUNCTION_ARGS) { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; Datum values[3]; bool nulls[3]; TupleDesc tupdesc; diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 2c66bc7bf0..dad9c1b508 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -16,4 +16,4 @@ extern void pg_init_libpagestore(void); extern void pg_init_libpqwalproposer(void); extern void pg_init_walproposer(void); -#endif /* NEON_H */ +#endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 5b21abc1bd..7dc38c13fb 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -83,8 +83,8 @@ typedef struct typedef struct { ZenithRequest req; - Oid dbNode; -} ZenithDbSizeRequest; + Oid dbNode; +} ZenithDbSizeRequest; typedef struct @@ -123,12 +123,13 @@ typedef struct { ZenithMessageTag tag; int64 db_size; -} ZenithDbSizeResponse; +} ZenithDbSizeResponse; typedef struct { ZenithMessageTag tag; - char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error + * message */ } ZenithErrorResponse; extern StringInfoData zm_pack_request(ZenithRequest *msg); @@ -142,12 +143,12 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { ZenithResponse *(*request) (ZenithRequest *request); - void (*send) (ZenithRequest *request); + void (*send) (ZenithRequest *request); ZenithResponse 
*(*receive) (void); - void (*flush) (void); + void (*flush) (void); } page_server_api; -extern page_server_api *page_server; +extern page_server_api * page_server; extern char *page_server_connstring; extern char *zenith_timeline; @@ -179,7 +180,7 @@ extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber block char *buffer); extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, bool request_latest, char *buffer); extern void zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); @@ -217,7 +218,7 @@ extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); /* utils for zenith relsize cache */ extern void relsize_hash_init(void); -extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ebf899dfdb..504ae60d4a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -94,7 +94,9 @@ const int SmgrTrace = DEBUG5; page_server_api *page_server; /* GUCs */ -char *page_server_connstring; // with substituted password +char *page_server_connstring; + +//with substituted password char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; @@ -107,7 +109,7 @@ typedef enum UNLOGGED_BUILD_PHASE_1, UNLOGGED_BUILD_PHASE_2, UNLOGGED_BUILD_NOT_PERMANENT -} UnloggedBuildPhase; +} UnloggedBuildPhase; static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = 
UNLOGGED_BUILD_NOT_IN_PROGRESS; @@ -127,31 +129,33 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; #define MAX_PREFETCH_REQUESTS 128 -BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; -BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; -int n_prefetch_requests; -int n_prefetch_responses; -int n_prefetched_buffers; -int n_prefetch_hits; -int n_prefetch_misses; -XLogRecPtr prefetch_lsn; +BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; +BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; +int n_prefetch_requests; +int n_prefetch_responses; +int n_prefetched_buffers; +int n_prefetch_hits; +int n_prefetch_misses; +XLogRecPtr prefetch_lsn; static void consume_prefetch_responses(void) { - for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { - ZenithResponse* resp = page_server->receive(); + for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) + { + ZenithResponse *resp = page_server->receive(); + pfree(resp); } n_prefetched_buffers = 0; n_prefetch_responses = 0; } -static ZenithResponse* -page_server_request(void const* req) +static ZenithResponse * +page_server_request(void const *req) { consume_prefetch_responses(); - return page_server->request((ZenithRequest*)req); + return page_server->request((ZenithRequest *) req); } @@ -196,11 +200,11 @@ zm_pack_request(ZenithRequest *msg) { ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->dbNode); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); - break; + break; } case T_ZenithGetPageRequest: { @@ -546,21 +550,22 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, else if (lsn == InvalidXLogRecPtr) { /* - * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, - * and we can just ignore that in Zenith. 
We do need to remember the new size, - * though, so that smgrnblocks() returns the right answer after the rel has - * been extended. We rely on the relsize cache for that. + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Zenith. We do need + * to remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. We rely on the + * relsize cache for that. * - * A completely empty heap page doesn't need to be WAL-logged, either. The - * heapam can leave such a page behind, if e.g. an insert errors out after - * initializing the page, but before it has inserted the tuple and WAL-logged - * the change. When we read the page from the page server, it will come back - * as all-zeros. That's OK, the heapam will initialize an all-zeros page on - * first use. + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. * - * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies - * that the page was not WAL-logged, and its contents will be lost when it's - * evicted. + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. */ if (PageIsNew(buffer)) { @@ -691,9 +696,9 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc * Is it possible that the last-written LSN is ahead of last flush * LSN? Generally not, we shouldn't evict a page from the buffer cache * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. 
However, such case does exist at index building, - * _bt_blwritepage logs the full page without flushing WAL before - * smgrextend (files are fsynced before build ends). + * "WAL before data" rule. However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends). */ #if PG_VERSION_NUM >= 150000 flushlsn = GetFlushRecPtr(NULL); @@ -728,10 +733,12 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) switch (reln->smgr_relpersistence) { case 0: + /* - * We don't know if it's an unlogged rel stored locally, or permanent - * rel stored in the page server. First check if it exists locally. - * If it does, great. Otherwise check if it exists in the page server. + * We don't know if it's an unlogged rel stored locally, or + * permanent rel stored in the page server. First check if it + * exists locally. If it does, great. Otherwise check if it exists + * in the page server. */ if (mdexists(reln, forkNum)) return true; @@ -755,11 +762,11 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) /* * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server - * will error out if you check that, because the whole dbdir for tablespace - * 0, db 0 doesn't exists. We possibly should change the page server to - * accept that and return 'false', to be consistent with mdexists(). But - * we probably also should fix pg_table_size() to not call smgrexists() - * with bogus relfilenode. + * will error out if you check that, because the whole dbdir for + * tablespace 0, db 0 doesn't exists. We possibly should change the page + * server to accept that and return 'false', to be consistent with + * mdexists(). But we probably also should fix pg_table_size() to not call + * smgrexists() with bogus relfilenode. * * For now, handle that special case here. 
*/ @@ -880,13 +887,13 @@ void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { /* - * Might or might not exist locally, depending on whether it's - * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is - * set). Try to unlink, it won't do any harm if the file doesn't - * exist. + * Might or might not exist locally, depending on whether it's an unlogged + * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to + * unlink, it won't do any harm if the file doesn't exist. */ mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) { + if (!RelFileNodeBackendIsTemp(rnode)) + { forget_cached_relsize(rnode.node, forkNum); } } @@ -926,8 +933,9 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, /* * Check that the cluster size limit has not been exceeded. * - * Temporary and unlogged relations are not included in the cluster size measured - * by the page server, so ignore those. Autovacuum processes are also exempt. + * Temporary and unlogged relations are not included in the cluster size + * measured by the page server, so ignore those. Autovacuum processes are + * also exempt. 
*/ if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && @@ -937,10 +945,10 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", - max_cluster_size), - errhint("This limit is defined by neon.max_cluster_size GUC"))); + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), + errhint("This limit is defined by neon.max_cluster_size GUC"))); } zenith_wallog_page(reln, forkNum, blkno, buffer); @@ -987,8 +995,8 @@ void zenith_close(SMgrRelation reln, ForkNumber forknum) { /* - * Let md.c close it, if it had it open. Doesn't hurt to do this - * even for permanent relations that have no local storage. + * Let md.c close it, if it had it open. Doesn't hurt to do this even for + * permanent relations that have no local storage. */ mdclose(reln, forknum); } @@ -1079,17 +1087,18 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, * While function is defined in the zenith extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. */ -void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) +void +zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) { ZenithResponse *resp; - int i; + int i; /* - * Try to find prefetched page. - * It is assumed that pages will be requested in the same order as them are prefetched, - * but some other backend may load page in shared buffers, so some prefetch responses should - * be skipped. + * Try to find prefetched page. 
It is assumed that pages will be requested + * in the same order as them are prefetched, but some other backend may + * load page in shared buffers, so some prefetch responses should be + * skipped. */ for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) { @@ -1099,19 +1108,20 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno prefetch_responses[i].forkNum == forkNum && prefetch_responses[i].blockNum == blkno) { - char* page = ((ZenithGetPageResponse *) resp)->page; + char *page = ((ZenithGetPageResponse *) resp)->page; + /* - * Check if prefetched page is still relevant. - * If it is updated by some other backend, then it should not - * be requested from smgr unless it is evicted from shared buffers. - * In the last case last_evicted_lsn should be updated and - * request_lsn should be greater than prefetch_lsn. - * Maximum with page LSN is used because page returned by page server - * may have LSN either greater either smaller than requested. + * Check if prefetched page is still relevant. If it is updated by + * some other backend, then it should not be requested from smgr + * unless it is evicted from shared buffers. In the last case + * last_evicted_lsn should be updated and request_lsn should be + * greater than prefetch_lsn. Maximum with page LSN is used + * because page returned by page server may have LSN either + * greater either smaller than requested. 
*/ if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn) { - n_prefetched_buffers = i+1; + n_prefetched_buffers = i + 1; n_prefetch_hits += 1; n_prefetch_requests = 0; memcpy(buffer, page, BLCKSZ); @@ -1133,6 +1143,7 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno .forknum = forkNum, .blkno = blkno }; + if (n_prefetch_requests > 0) { /* Combine all prefetch requests with primary request */ @@ -1471,8 +1482,8 @@ int64 zenith_dbsize(Oid dbNode) { ZenithResponse *resp; - int64 db_size; - XLogRecPtr request_lsn; + int64 db_size; + XLogRecPtr request_lsn; bool latest; RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; @@ -1564,10 +1575,12 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) XLogFlush(lsn); /* - * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, - * or update LSN for "dummy" metadata block. Second approach seems more efficient. If the relation is extended - * again later, the extension will update the last-written LSN for the extended pages, so there's no harm in - * leaving behind obsolete entries for the truncated chunks. + * Truncate may affect several chunks of relations. So we should either + * update last written LSN for all of them, or update LSN for "dummy" + * metadata block. Second approach seems more efficient. If the relation + * is extended again later, the extension will update the last-written LSN + * for the extended pages, so there's no harm in leaving behind obsolete + * entries for the truncated chunks. 
*/ SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a769a5216b..05257ced4c 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -88,8 +88,9 @@ WalProposerFunctionsType *WalProposerFunctions = NULL; static int n_safekeepers = 0; static int quorum = 0; static Safekeeper safekeeper[MAX_SAFEKEEPERS]; -static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ +static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to + * safekeepers */ static ProposerGreeting greetRequest; static VoteRequest voteRequest; /* Vote request for safekeeper */ static WaitEventSet *waitEvents; @@ -99,6 +100,7 @@ static AppendResponse quorumFeedback; * record-aligned (first record which might not yet received by someone). */ static XLogRecPtr truncateLsn; + /* * Term of the proposer. We want our term to be highest and unique, * so we collect terms from safekeepers quorum, choose max and +1. 
@@ -116,7 +118,7 @@ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; -static WalproposerShmemState *walprop_shared; +static WalproposerShmemState * walprop_shared; /* Prototypes for private functions */ static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId); @@ -138,7 +140,7 @@ static void RecvAcceptorGreeting(Safekeeper *sk); static void SendVoteRequest(Safekeeper *sk); static void RecvVoteResponse(Safekeeper *sk); static void HandleElectedProposer(void); -static term_t GetHighestTerm(TermHistory *th); +static term_t GetHighestTerm(TermHistory * th); static term_t GetEpoch(Safekeeper *sk); static void DetermineEpochStartLsn(void); static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); @@ -155,7 +157,7 @@ static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); static void HandleSafekeeperResponse(void); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); -static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); @@ -175,7 +177,8 @@ static void walproposer_shmem_request(void); #endif -void pg_init_walproposer(void) +void +pg_init_walproposer(void) { if (!process_shared_preload_libraries_in_progress) return; @@ -194,50 +197,53 @@ void pg_init_walproposer(void) WalProposerStart = &WalProposerStartImpl; } -static void nwp_register_gucs(void) +static void +nwp_register_gucs(void) { DefineCustomStringVariable( - "neon.safekeepers", - "List of Neon WAL acceptors (host:port)", - NULL, /* long_desc */ - &wal_acceptors_list, /* valueAddr */ - "", /* bootValue */ - 
PGC_POSTMASTER, - GUC_LIST_INPUT, /* extensions can't use GUC_LIST_QUOTE */ - NULL, NULL, NULL - ); + "neon.safekeepers", + "List of Neon WAL acceptors (host:port)", + NULL, /* long_desc */ + &wal_acceptors_list, /* valueAddr */ + "", /* bootValue */ + PGC_POSTMASTER, + GUC_LIST_INPUT, /* extensions can't use + * GUC_LIST_QUOTE */ + NULL, NULL, NULL + ); DefineCustomIntVariable( - "neon.safekeeper_reconnect_timeout", - "Timeout for reconnecting to offline wal acceptor.", - NULL, - &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, /* default, min, max */ - PGC_SIGHUP, /* context */ - GUC_UNIT_MS, /* flags */ - NULL, NULL, NULL - ); + "neon.safekeeper_reconnect_timeout", + "Timeout for reconnecting to offline wal acceptor.", + NULL, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, /* default, min, max */ + PGC_SIGHUP, /* context */ + GUC_UNIT_MS, /* flags */ + NULL, NULL, NULL + ); DefineCustomIntVariable( - "neon.safekeeper_connect_timeout", - "Timeout after which give up connection attempt to safekeeper.", - NULL, - &wal_acceptor_connect_timeout, - 5000, 0, INT_MAX, - PGC_SIGHUP, - GUC_UNIT_MS, - NULL, NULL, NULL - ); + "neon.safekeeper_connect_timeout", + "Timeout after which give up connection attempt to safekeeper.", + NULL, + &wal_acceptor_connect_timeout, + 5000, 0, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MS, + NULL, NULL, NULL + ); } /* shmem handling */ -static void nwp_prepare_shmem(void) +static void +nwp_prepare_shmem(void) { #if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = walproposer_shmem_request; + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = walproposer_shmem_request; #else RequestAddinShmemSpace(WalproposerShmemSize()); #endif @@ -260,7 +266,8 @@ walproposer_shmem_request(void) } #endif -static void nwp_shmem_startup_hook(void) +static void +nwp_shmem_startup_hook(void) { if (prev_shmem_startup_hook_type) prev_shmem_startup_hook_type(); @@ -275,7 +282,7 @@ void 
WalProposerMain(Datum main_arg) { #if PG_VERSION_NUM >= 150000 - TimeLineID tli; + TimeLineID tli; #endif /* Establish signal handlers. */ @@ -286,7 +293,7 @@ WalProposerMain(Datum main_arg) BackgroundWorkerUnblockSignals(); #if PG_VERSION_NUM >= 150000 - // FIXME pass proper tli to WalProposerInit ? + /* FIXME pass proper tli to WalProposerInit ? */ GetXLogReplayRecPtr(&tli); WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier()); #else @@ -339,7 +346,7 @@ WalProposerPoll(void) { while (true) { - Safekeeper *sk; + Safekeeper *sk; int rc; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); @@ -356,8 +363,8 @@ WalProposerPoll(void) AdvancePollState(sk, event.events); /* - * If the timeout expired, attempt to reconnect to any safekeepers that - * we dropped + * If the timeout expired, attempt to reconnect to any safekeepers + * that we dropped */ ReconnectSafekeepers(); @@ -371,7 +378,7 @@ WalProposerPoll(void) ResetLatch(MyLatch); break; } - if (rc == 0) /* timeout expired: poll state */ + if (rc == 0) /* timeout expired: poll state */ { TimestampTz now; @@ -390,12 +397,12 @@ WalProposerPoll(void) now = GetCurrentTimestamp(); for (int i = 0; i < n_safekeepers; i++) { - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; if ((sk->state == SS_CONNECTING_WRITE || - sk->state == SS_CONNECTING_READ) && + sk->state == SS_CONNECTING_READ) && TimestampDifferenceExceeds(sk->startedConnAt, now, - wal_acceptor_connect_timeout)) + wal_acceptor_connect_timeout)) { elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", sk->host, sk->port, wal_acceptor_connect_timeout); @@ -472,7 +479,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) */ safekeeper[n_safekeepers].conninfo[0] = '\0'; initStringInfo(&safekeeper[n_safekeepers].outbuf); - safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); + 
safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); if (safekeeper[n_safekeepers].xlogreader == NULL) elog(FATAL, "Failed to allocate xlog reader"); safekeeper[n_safekeepers].flushWrite = false; @@ -504,7 +511,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); #if PG_VERSION_NUM >= 150000 -// FIXME don't use hardcoded timeline id +/* FIXME don't use hardcoded timeline id */ greetRequest.timeline = 1; #else greetRequest.timeline = ThisTimeLineID; @@ -589,7 +596,7 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove) for (int i = 0; i < n_safekeepers; i++) { uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; sk->eventPos = -1; @@ -647,12 +654,21 @@ ResetConnection(Safekeeper *sk) */ if (sk->conninfo[0] == '\0') { - int written = 0; + int written = 0; + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); - // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, - // so it is better to be defensive and check that everything aligns well + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + + /* + * currently connection string is not that long, but once we pass + * something like jwt we might overflow the buffer, + */ + + /* + * so it is better to be defensive and check that everything aligns + * well + */ if (written > MAXCONNINFO || written < 0) elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } @@ -762,8 +778,8 @@ static void 
AdvancePollState(Safekeeper *sk, uint32 events) { /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. + * Sanity check. We assume further down that the operations don't block + * because the socket is ready. */ AssertEventsOkForState(events, sk); @@ -777,12 +793,12 @@ AdvancePollState(Safekeeper *sk, uint32 events) case SS_OFFLINE: elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", sk->host, sk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ /* - * Both connecting states run the same logic. The only - * difference is the events they're expecting + * Both connecting states run the same logic. The only difference + * is the events they're expecting */ case SS_CONNECTING_READ: case SS_CONNECTING_WRITE: @@ -797,20 +813,22 @@ AdvancePollState(Safekeeper *sk, uint32 events) break; /* - * Finish handshake comms: receive information about the safekeeper. + * Finish handshake comms: receive information about the + * safekeeper. */ case SS_HANDSHAKE_RECV: RecvAcceptorGreeting(sk); break; /* - * Voting is an idle state - we don't expect any events to trigger. - * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are - * transferred from SS_VOTING to sending actual vote requests. + * Voting is an idle state - we don't expect any events to + * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see how + * nodes are transferred from SS_VOTING to sending actual vote + * requests. */ case SS_VOTING: elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk->state)); ResetConnection(sk); return; @@ -824,8 +842,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) /* * AsyncFlush ensures we only move on to SS_ACTIVE once the flush - * completes. 
If we still have more to do, we'll wait until the next - * poll comes along. + * completes. If we still have more to do, we'll wait until the + * next poll comes along. */ if (!AsyncFlush(sk)) return; @@ -839,7 +857,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_IDLE: elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk->state)); ResetConnection(sk); return; @@ -864,19 +882,17 @@ HandleConnectionEvent(Safekeeper *sk) { case WP_CONN_POLLING_OK: elog(LOG, "connected with node %s:%s", sk->host, - sk->port); + sk->port); /* - * We have to pick some event to update event set. - * We'll eventually need the socket to be readable, - * so we go with that. + * We have to pick some event to update event set. We'll + * eventually need the socket to be readable, so we go with that. */ new_events = WL_SOCKET_READABLE; break; /* - * If we need to poll to finish connecting, - * continue doing that + * If we need to poll to finish connecting, continue doing that */ case WP_CONN_POLLING_READING: sk->state = SS_CONNECTING_READ; @@ -889,13 +905,12 @@ HandleConnectionEvent(Safekeeper *sk) case WP_CONN_POLLING_FAILED: elog(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to + * If connecting failed, we don't want to restart the connection + * because that might run us into a loop. Instead, shut it down -- + * it'll naturally restart at a slower interval on calls to * ReconnectSafekeepers. 
*/ ShutdownConnection(sk); @@ -903,9 +918,8 @@ HandleConnectionEvent(Safekeeper *sk) } /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. + * Because PQconnectPoll can change the socket, we have to un-register the + * old event and re-register an event on the new socket. */ HackyRemoveWalProposerEvent(sk); sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); @@ -926,7 +940,7 @@ SendStartWALPush(Safekeeper *sk) if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); ShutdownConnection(sk); return; } @@ -940,8 +954,7 @@ RecvStartWALPushResult(Safekeeper *sk) switch (walprop_get_query_result(sk->conn)) { /* - * Successful result, move on to starting the - * handshake + * Successful result, move on to starting the handshake */ case WP_EXEC_SUCCESS_COPYBOTH: @@ -949,31 +962,31 @@ RecvStartWALPushResult(Safekeeper *sk) break; /* - * Needs repeated calls to finish. Wait until the - * socket is readable + * Needs repeated calls to finish. 
Wait until the socket is + * readable */ case WP_EXEC_NEEDS_INPUT: /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set + * SS_WAIT_EXEC_RESULT is always reached through an event, so we + * don't need to update the event set */ break; case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); ShutdownConnection(sk); return; /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" + * Unexpected result -- funamdentally an error, but we want to + * produce a custom message, rather than a generic "something went + * wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -988,8 +1001,8 @@ static void SendProposerGreeting(Safekeeper *sk) { /* - * On failure, logging & resetting the connection is handled. - * We just need to handle the control flow. + * On failure, logging & resetting the connection is handled. We just need + * to handle the control flow. */ BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); } @@ -998,12 +1011,12 @@ static void RecvAcceptorGreeting(Safekeeper *sk) { /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. + * If our reading doesn't immediately succeed, any necessary error + * handling or state setting is taken care of. We can leave any other work + * until later. 
*/ sk->greetResponse.apm.tag = 'g'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse)) return; /* Protocol is all good, move to voting. */ @@ -1033,37 +1046,34 @@ RecvAcceptorGreeting(Safekeeper *sk) { /* Another compute with higher term is running. */ elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, propTerm); + sk->host, sk->port, + sk->greetResponse.term, propTerm); } /* - * Check if we have quorum. If there aren't enough safekeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. + * Check if we have quorum. If there aren't enough safekeepers, wait and + * do nothing. We'll eventually get a task when the election starts. * * If we do have quorum, we can start an election. */ if (n_connected < quorum) { /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. + * SS_VOTING is an idle state; read-ready indicates the connection + * closed. */ UpdateEventSet(sk, WL_SOCKET_READABLE); } else { /* - * Now send voting request to the cohort and wait - * responses + * Now send voting request to the cohort and wait responses */ for (int j = 0; j < n_safekeepers; j++) { /* * Remember: SS_VOTING indicates that the safekeeper is - * participating in voting, but hasn't sent anything - * yet. + * participating in voting, but hasn't sent anything yet. 
*/ if (safekeeper[j].state == SS_VOTING) SendVoteRequest(&safekeeper[j]); @@ -1087,28 +1097,27 @@ static void RecvVoteResponse(Safekeeper *sk) { sk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->voteResponse)) return; elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. + * In case of acceptor rejecting our vote, bail out, but only if either it + * already lives in strictly higher term (concurrent compute spotted) or + * we are not elected yet and thus need the vote. 
*/ if ((!sk->voteResponse.voteGiven) && (sk->voteResponse.term > propTerm || n_votes < quorum)) { elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, propTerm); + sk->host, sk->port, + sk->voteResponse.term, propTerm); } Assert(sk->voteResponse.term == propTerm); @@ -1116,7 +1125,7 @@ RecvVoteResponse(Safekeeper *sk) n_votes++; if (n_votes < quorum) { - sk->state = SS_IDLE; /* can't do much yet, no quorum */ + sk->state = SS_IDLE; /* can't do much yet, no quorum */ } else if (n_votes > quorum) { @@ -1146,16 +1155,16 @@ HandleElectedProposer(void) DetermineEpochStartLsn(); /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them + * Check if not all safekeepers are up-to-date, we need to download WAL + * needed to synchronize them */ if (truncateLsn < propEpochStartLsn) { elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); @@ -1175,18 +1184,17 @@ HandleElectedProposer(void) /* * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. + * after this point. There will be no safekeeper with state SS_IDLE also, + * because that state is used only for quorum waiting. 
*/ if (syncSafekeepers) { /* - * Send empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. + * Send empty message to enforce receiving feedback even from nodes + * who are fully recovered; this is required to learn they switched + * epoch which finishes sync-safeekepers who doesn't generate any real + * new records. Will go away once we switch to async acks. */ BroadcastAppendRequest(); @@ -1200,7 +1208,7 @@ HandleElectedProposer(void) /* latest term in TermHistory, or 0 is there is no entries */ static term_t -GetHighestTerm(TermHistory *th) +GetHighestTerm(TermHistory * th) { return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; } @@ -1276,8 +1284,8 @@ DetermineEpochStartLsn(void) } /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was - * committed yet. Start streaming then from the basebackup LSN. + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing + * was committed yet. Start streaming then from the basebackup LSN. */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { @@ -1322,24 +1330,24 @@ DetermineEpochStartLsn(void) ); /* - * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since - * which we are going to write according to the consensus. If not, we must - * bail out, as clog and other non rel data is inconsistent. + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN + * since which we are going to write according to the consensus. If not, + * we must bail out, as clog and other non rel data is inconsistent. */ if (!syncSafekeepers) { /* - * Basebackup LSN always points to the beginning of the record (not the - * page), as StartupXLOG most probably wants it this way. 
Safekeepers - * don't skip header as they need continious stream of data, so - * correct LSN for comparison. + * Basebackup LSN always points to the beginning of the record (not + * the page), as StartupXLOG most probably wants it this way. + * Safekeepers don't skip header as they need continious stream of + * data, so correct LSN for comparison. */ if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) { /* - * However, allow to proceed if previously elected leader was me; plain - * restart of walproposer not intervened by concurrent compute (who could - * generate WAL) is ok. + * However, allow to proceed if previously elected leader was me; + * plain restart of walproposer not intervened by concurrent + * compute (who could generate WAL) is ok. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == walprop_shared->mineLastElectedTerm))) @@ -1407,7 +1415,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec { Assert(buf[0] == 'w' || buf[0] == 'k'); if (buf[0] == 'k') - continue; /* keepalive */ + continue; /* keepalive */ memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); rec_start_lsn = pg_ntoh64(rec_start_lsn); @@ -1457,18 +1465,20 @@ SendProposerElected(Safekeeper *sk) { ProposerElected msg; TermHistory *th; - term_t lastCommonTerm; - int i; + term_t lastCommonTerm; + int i; /* - * Determine start LSN by comparing safekeeper's log term switch history and - * proposer's, searching for the divergence point. + * Determine start LSN by comparing safekeeper's log term switch history + * and proposer's, searching for the divergence point. * * Note: there is a vanishingly small chance of no common point even if * there is some WAL on safekeeper, if immediately after bootstrap compute - * wrote some WAL on single sk and died; we stream since the beginning then. + * wrote some WAL on single sk and died; we stream since the beginning + * then. 
*/ th = &sk->voteResponse.termHistory; + /* * If any WAL is present on the sk, it must be authorized by some term. * OTOH, without any WAL there are no term swiches in the log. @@ -1485,7 +1495,7 @@ SendProposerElected(Safekeeper *sk) /* term must begin everywhere at the same point */ Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); } - i--; /* step back to the last common term */ + i--; /* step back to the last common term */ if (i < 0) { /* safekeeper is empty or no common point, start from the beginning */ @@ -1500,17 +1510,17 @@ SendProposerElected(Safekeeper *sk) * to the truncateLsn before, but now current safekeeper tells * otherwise. * - * Also we have a special condition here, which is empty safekeeper - * with no history. In combination with a gap, that can happen when - * we introduce a new safekeeper to the cluster. This is a rare case, - * which is triggered manually for now, and should be treated with - * care. + * Also we have a special condition here, which is empty + * safekeeper with no history. In combination with a gap, that can + * happen when we introduce a new safekeeper to the cluster. This + * is a rare case, which is triggered manually for now, and should + * be treated with care. */ /* - * truncateLsn will not change without ack from current safekeeper, - * and it's aligned to the WAL record, so we can safely start - * streaming from this point. + * truncateLsn will not change without ack from current + * safekeeper, and it's aligned to the WAL record, so we can + * safely start streaming from this point. */ sk->startStreamingAt = truncateLsn; @@ -1533,9 +1543,10 @@ SendProposerElected(Safekeeper *sk) } else { - XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - sk->voteResponse.flushLsn); + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? 
th->entries[i + 1].lsn : + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); } } @@ -1595,8 +1606,8 @@ static void StartStreaming(Safekeeper *sk) { /* - * This is the only entrypoint to state SS_ACTIVE. It's executed - * exactly once for a connection. + * This is the only entrypoint to state SS_ACTIVE. It's executed exactly + * once for a connection. */ sk->state = SS_ACTIVE; sk->streamingAt = sk->startStreamingAt; @@ -1617,7 +1628,10 @@ SendMessageToNode(Safekeeper *sk) { Assert(sk->state == SS_ACTIVE); - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + /* + * Note: we always send everything to the safekeeper until WOULDBLOCK or + * nothing left to send + */ HandleActiveState(sk, WL_SOCKET_WRITEABLE); } @@ -1633,7 +1647,7 @@ BroadcastAppendRequest() } static void -PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +PrepareAppendRequest(AppendRequestHeader * req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); req->tag = 'a'; @@ -1652,7 +1666,7 @@ PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr e static void HandleActiveState(Safekeeper *sk, uint32 events) { - uint32 newEvents = WL_SOCKET_READABLE; + uint32 newEvents = WL_SOCKET_READABLE; if (events & WL_SOCKET_WRITEABLE) if (!SendAppendRequests(sk)) @@ -1666,10 +1680,10 @@ HandleActiveState(Safekeeper *sk, uint32 events) * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. * - * LSN comparison checks if we have pending unsent messages. This check isn't - * necessary now, because we always send append messages immediately after - * arrival. But it's good to have it here in case we change this behavior - * in the future. + * LSN comparison checks if we have pending unsent messages. This check + * isn't necessary now, because we always send append messages immediately + * after arrival. 
But it's good to have it here in case we change this + * behavior in the future. */ if (sk->streamingAt != availableLsn || sk->flushWrite) newEvents |= WL_SOCKET_WRITEABLE; @@ -1689,15 +1703,16 @@ HandleActiveState(Safekeeper *sk, uint32 events) static bool SendAppendRequests(Safekeeper *sk) { - XLogRecPtr endLsn; + XLogRecPtr endLsn; AppendRequestHeader *req; PGAsyncWriteResult writeResult; WALReadError errinfo; - bool sentAnything = false; + bool sentAnything = false; if (sk->flushWrite) { if (!AsyncFlush(sk)) + /* * AsyncFlush failed, that could happen if the socket is closed or * we have nothing to write and should wait for writeable socket. @@ -1716,7 +1731,8 @@ SendAppendRequests(Safekeeper *sk) endLsn += MAX_SEND_SIZE; /* if we went beyond available WAL, back off */ - if (endLsn > availableLsn) { + if (endLsn > availableLsn) + { endLsn = availableLsn; } @@ -1734,21 +1750,21 @@ SendAppendRequests(Safekeeper *sk) resetStringInfo(&sk->outbuf); /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); /* write the WAL itself */ enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); if (!WALRead(sk->xlogreader, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn, - #if PG_VERSION_NUM >= 150000 - // FIXME don't use hardcoded timelineid here - 1, - #else - ThisTimeLineID, - #endif - &errinfo)) + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, +#if PG_VERSION_NUM >= 150000 + /* FIXME don't use hardcoded timelineid here */ + 1, +#else + ThisTimeLineID, +#endif + &errinfo)) { WALReadRaiseError(&errinfo); } @@ -1766,17 +1782,19 @@ SendAppendRequests(Safekeeper *sk) break; case PG_ASYNC_WRITE_TRY_FLUSH: + /* * We still need to call PQflush some more to finish the job. - * Caller function will handle this by setting right event set. 
+ * Caller function will handle this by setting right event + * set. */ sk->flushWrite = true; return true; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); ShutdownConnection(sk); return false; default: @@ -1800,17 +1818,17 @@ static bool RecvAppendResponses(Safekeeper *sk) { XLogRecPtr minQuorumLsn; - bool readAnything = false; + bool readAnything = false; while (true) { /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. + * If our reading doesn't immediately succeed, any necessary error + * handling or state setting is taken care of. We can leave any other + * work until later. */ sk->appendResponse.apm.tag = 'a'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->appendResponse)) break; ereport(DEBUG2, @@ -1824,8 +1842,8 @@ RecvAppendResponses(Safekeeper *sk) { /* Another compute with higher term is running. 
*/ elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, propTerm); + sk->host, sk->port, + sk->appendResponse.term, propTerm); } readAnything = true; @@ -1851,11 +1869,11 @@ RecvAppendResponses(Safekeeper *sk) /* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ void -ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf) { - uint8 nkeys; - int i; - int32 len; + uint8 nkeys; + int i; + int32 len; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1863,54 +1881,65 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *r for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); } else if (strcmp(key, "ps_flushlsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length 
rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); } else if (strcmp(key, "ps_applylsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); } else if (strcmp(key, "ps_replytime") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_replytime = pq_getmsgint64(reply_message); + pq_getmsgint(reply_message, sizeof(int32)); + //read value length + rf->ps_replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); + rf->ps_replytime, replyTimeStr); pfree(replyTimeStr); } } else { - len = pq_getmsgint(reply_message, sizeof(int32)); // read value length - // Skip unknown keys to support backward compatibile protocol changes - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + len = pq_getmsgint(reply_message, sizeof(int32)); + //read value length + + /* + * Skip unknown keys to support backward compatibile protocol + * changes + */ + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1952,9 +1981,10 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) static XLogRecPtr CalculateMinFlushLsn(void) { - XLogRecPtr lsn = n_safekeepers > 0 - ? 
safekeeper[0].appendResponse.flushLsn - : InvalidXLogRecPtr; + XLogRecPtr lsn = n_safekeepers > 0 + ? safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + for (int i = 1; i < n_safekeepers; i++) { lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); @@ -2006,8 +2036,8 @@ WalproposerShmemInit(void) LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); walprop_shared = ShmemInitStruct("Walproposer shared state", - sizeof(WalproposerShmemState), - &found); + sizeof(WalproposerShmemState), + &found); if (!found) { @@ -2021,7 +2051,7 @@ WalproposerShmemInit(void) } void -replication_feedback_set(ReplicationFeedback *rf) +replication_feedback_set(ReplicationFeedback * rf) { SpinLockAcquire(&walprop_shared->mutex); memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); @@ -2044,10 +2074,11 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe * Get ReplicationFeedback fields from the most advanced safekeeper */ static void -GetLatestZentihFeedback(ReplicationFeedback *rf) +GetLatestZentihFeedback(ReplicationFeedback * rf) { - int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) { if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) @@ -2064,12 +2095,12 @@ GetLatestZentihFeedback(ReplicationFeedback *rf) rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + 
LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); replication_feedback_set(rf); } @@ -2080,7 +2111,7 @@ HandleSafekeeperResponse(void) HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; XLogRecPtr diskConsistentLsn; - XLogRecPtr minFlushLsn; + XLogRecPtr minFlushLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); @@ -2088,7 +2119,7 @@ HandleSafekeeperResponse(void) if (!syncSafekeepers) { - // Get ReplicationFeedback fields from the most advanced safekeeper + /* Get ReplicationFeedback fields from the most advanced safekeeper */ GetLatestZentihFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } @@ -2102,11 +2133,15 @@ HandleSafekeeperResponse(void) /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( - // write_lsn - This is what durably stored in WAL service. + /* write_lsn - This is what durably stored in WAL service. */ quorumFeedback.flushLsn, - //flush_lsn - This is what durably stored in WAL service. + /* flush_lsn - This is what durably stored in WAL service. */ quorumFeedback.flushLsn, - //apply_lsn - This is what processed and durably saved at pageserver. + + /* + * apply_lsn - This is what processed and durably saved at + * pageserver. + */ quorumFeedback.rf.ps_flushlsn, GetCurrentTimestamp(), false); } @@ -2128,15 +2163,14 @@ HandleSafekeeperResponse(void) * flushed to all safekeepers. We must always start streaming from the * beginning of the record, which simplifies decoding on the far end. * - * Advanced truncateLsn should be not further than nearest commitLsn. 
- * This prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on record boundaries. + * Advanced truncateLsn should be not further than nearest commitLsn. This + * prevents surprising violation of truncateLsn <= commitLsn invariant + * which might occur because 1) truncateLsn can be advanced immediately + * once chunk is broadcast to all safekeepers, and commitLsn generally + * can't be advanced based on feedback from safekeeper who is still in the + * previous epoch (similar to 'leader can't commit entries from previous + * term' in Raft); 2) chunks we read from WAL and send are plain sheets of + * bytes, but safekeepers ack only on record boundaries. */ minFlushLsn = CalculateMinFlushLsn(); if (minFlushLsn > truncateLsn) @@ -2144,8 +2178,8 @@ HandleSafekeeperResponse(void) truncateLsn = minFlushLsn; /* - * Advance the replication slot to free up old WAL files. Note - * that slot doesn't exist if we are in syncSafekeepers mode. + * Advance the replication slot to free up old WAL files. Note that + * slot doesn't exist if we are in syncSafekeepers mode. 
*/ if (MyReplicationSlot) PhysicalConfirmReceivedLocation(truncateLsn); @@ -2170,7 +2204,7 @@ HandleSafekeeperResponse(void) n_synced = 0; for (int i = 0; i < n_safekeepers; i++) { - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; /* alive safekeeper which is not synced yet; wait for it */ @@ -2225,11 +2259,11 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * failed, a warning is emitted and the connection is reset. */ static bool -AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) { - char *buf; - int buf_size; - uint64 tag; + char *buf; + int buf_size; + uint64 tag; StringInfoData s; if (!(AsyncRead(sk, &buf, &buf_size))) @@ -2252,54 +2286,56 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) switch (tag) { case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + 
msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); - pq_getmsgend(&s); - return true; - } + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } default: - { - Assert(false); - return false; - } + { + Assert(false); + return false; + } } } @@ -2367,7 +2403,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta ShutdownConnection(sk); return false; default: - Assert(false); + Assert(false); return false; } } @@ -2409,19 +2445,19 @@ AsyncFlush(Safekeeper *sk) } } -// Check if we need to suspend inserts because of lagging replication. +/* Check if we need to suspend inserts because of lagging replication. 
*/ static uint64 backpressure_lag_impl(void) { if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; #if PG_VERSION_NUM >= 150000 - XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); + XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); #else - XLogRecPtr myFlushLsn = GetFlushRecPtr(); + XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024*1024) @@ -2434,23 +2470,23 @@ backpressure_lag_impl(void) if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag*MB)) + && myFlushLsn > writePtr + max_replication_write_lag * MB)) { - return (myFlushLsn - writePtr - max_replication_write_lag*MB); + return (myFlushLsn - writePtr - max_replication_write_lag * MB); } if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) { - return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); } if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) { - return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); } } return 0; @@ -2458,24 +2494,26 @@ backpressure_lag_impl(void) #define BACK_PRESSURE_DELAY 10000L // 0.01 sec -static bool backpressure_throttling_impl(void) +static bool +backpressure_throttling_impl(void) { - int64 lag; - TimestampTz start, stop; - bool retry = PrevProcessInterruptsCallback - ? 
PrevProcessInterruptsCallback() - : false; + int64 lag; + TimestampTz start, + stop; + bool retry = PrevProcessInterruptsCallback + ? PrevProcessInterruptsCallback() + : false; - // Don't throttle read only transactions and wal sender. + /* Don't throttle read only transactions and wal sender. */ if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) return retry; - // Calculate replicas lag + /* Calculate replicas lag */ lag = backpressure_lag_impl(); if (lag == 0) return retry; - // Suspend writers until replicas catch up + /* Suspend writers until replicas catch up */ set_ps_display("backpressure throttling"); elog(DEBUG2, "backpressure throttling: lag %lu", lag); diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 75167163f3..59e70f33bf 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -14,10 +14,13 @@ #define SK_PROTOCOL_VERSION 2 #define MAX_SAFEKEEPERS 32 -#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ -#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ -#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single + * WAL message */ +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender + * message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender + * message header */ /* * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, @@ -25,12 +28,12 @@ */ #define WL_NO_EVENTS 0 -extern char* wal_acceptors_list; -extern int wal_acceptor_reconnect_timeout; -extern int wal_acceptor_connect_timeout; -extern bool am_wal_proposer; +extern char *wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int 
wal_acceptor_connect_timeout; +extern bool am_wal_proposer; -struct WalProposerConn; /* Defined in libpqwalproposer */ +struct WalProposerConn; /* Defined in libpqwalproposer */ typedef struct WalProposerConn WalProposerConn; struct WalMessage; @@ -44,21 +47,26 @@ typedef enum { /* The full read was successful. buf now points to the data */ PG_ASYNC_READ_SUCCESS, - /* The read is ongoing. Wait until the connection is read-ready, then try - * again. */ + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ PG_ASYNC_READ_TRY_AGAIN, /* Reading failed. Check PQerrorMessage(conn) */ PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; +} PGAsyncReadResult; /* Possible return values from WritePGAsync */ typedef enum { /* The write fully completed */ PG_ASYNC_WRITE_SUCCESS, - /* The write started, but you'll need to call PQflush some more times - * to finish it off. We just tried, so it's best to wait until the - * connection is read- or write-ready to try again. + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. * * If it becomes read-ready, call PQconsumeInput and flush again. If it * becomes write-ready, just call PQflush. @@ -66,7 +74,7 @@ typedef enum PG_ASYNC_WRITE_TRY_FLUSH, /* Writing failed. Check PQerrorMessage(conn) */ PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; +} PGAsyncWriteResult; /* * WAL safekeeper state, which is used to wait for some event. @@ -79,8 +87,8 @@ typedef enum typedef enum { /* - * Does not have an active connection and will stay that way until - * further notice. + * Does not have an active connection and will stay that way until further + * notice. * * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. */ @@ -105,8 +113,8 @@ typedef enum SS_WAIT_EXEC_RESULT, /* - * Executing the receiving half of the handshake. After receiving, moves to - * SS_VOTING. 
+ * Executing the receiving half of the handshake. After receiving, moves + * to SS_VOTING. */ SS_HANDSHAKE_RECV, @@ -120,8 +128,9 @@ typedef enum SS_VOTING, /* - * Already sent voting information, waiting to receive confirmation from the - * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. + * Already sent voting information, waiting to receive confirmation from + * the node. After receiving, moves to SS_IDLE, if the quorum isn't + * reached yet. */ SS_WAIT_VERDICT, @@ -141,7 +150,7 @@ typedef enum * to read. */ SS_ACTIVE, -} SafekeeperState; +} SafekeeperState; /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -156,21 +165,21 @@ typedef uint64 NNodeId; /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting { - uint64 tag; /* message tag */ - uint32 protocolVersion; /* proposer-safekeeper protocol version */ - uint32 pgVersion; - pg_uuid_t proposerId; - uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - uint8 ztenantid[16]; - TimeLineID timeline; - uint32 walSegSize; -} ProposerGreeting; + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; typedef struct AcceptorProposerMessage { - uint64 tag; -} AcceptorProposerMessage; + uint64 tag; +} AcceptorProposerMessage; /* * Acceptor -> Proposer initial response: the highest term acceptor voted for. @@ -180,7 +189,7 @@ typedef struct AcceptorGreeting AcceptorProposerMessage apm; term_t term; NNodeId nodeId; -} AcceptorGreeting; +} AcceptorGreeting; /* * Proposer -> Acceptor vote request. 
@@ -189,36 +198,39 @@ typedef struct VoteRequest { uint64 tag; term_t term; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; /* Element of term switching chain. */ typedef struct TermSwitchEntry { - term_t term; - XLogRecPtr lsn; -} TermSwitchEntry; + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; typedef struct TermHistory { - uint32 n_entries; + uint32 n_entries; TermSwitchEntry *entries; -} TermHistory; +} TermHistory; /* Vote itself, sent from safekeeper to proposer */ -typedef struct VoteResponse { +typedef struct VoteResponse +{ AcceptorProposerMessage apm; - term_t term; - uint64 voteGiven; + term_t term; + uint64 voteGiven; + /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow - * proposer to choose the most advanced one. + * proposer to choose the most advanced one. */ - XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ + XLogRecPtr flushLsn; + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for + * recovery of some safekeeper */ TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -} VoteResponse; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +} VoteResponse; /* * Proposer -> Acceptor message announcing proposer is elected and communicating @@ -226,60 +238,62 @@ typedef struct VoteResponse { */ typedef struct ProposerElected { - uint64 tag; - term_t term; + uint64 tag; + term_t term; /* proposer will send since this point */ - XLogRecPtr startStreamingAt; + XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; -} ProposerElected; + XLogRecPtr timelineStartLsn; +} ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. 
*/ typedef struct AppendRequestHeader { - uint64 tag; - term_t term; /* term of the proposer */ + uint64 tag; + term_t term; /* term of the proposer */ + /* * LSN since which current proposer appends WAL (begin_lsn of its first * record); determines epoch switch point. */ - XLogRecPtr epochStartLsn; - XLogRecPtr beginLsn; /* start position of message in WAL */ - XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + XLogRecPtr epochStartLsn; + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + /* - * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last chunk streamed to everyone) + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) */ - XLogRecPtr truncateLsn; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; /* * Hot standby feedback received from replica */ typedef struct HotStandbyFeedback { - TimestampTz ts; + TimestampTz ts; FullTransactionId xmin; FullTransactionId catalog_xmin; -} HotStandbyFeedback; +} HotStandbyFeedback; -typedef struct ReplicationFeedback +typedef struct ReplicationFeedback { - // current size of the timeline on pageserver - uint64 currentClusterSize; - // standby_status_update fields that safekeeper received from pageserver - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; + /* current size of the timeline on pageserver */ + uint64 currentClusterSize; + /* standby_status_update fields that safekeeper received from pageserver */ + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; TimestampTz ps_replytime; -} ReplicationFeedback; +} 
ReplicationFeedback; typedef struct WalproposerShmemState @@ -288,7 +302,7 @@ typedef struct WalproposerShmemState ReplicationFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; -} WalproposerShmemState; +} WalproposerShmemState; /* * Report safekeeper state to proposer @@ -296,25 +310,26 @@ typedef struct WalproposerShmemState typedef struct AppendResponse { AcceptorProposerMessage apm; + /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. */ - term_t term; - // TODO: add comment - XLogRecPtr flushLsn; - // Safekeeper reports back his awareness about which WAL is committed, as - // this is a criterion for walproposer --sync mode exit - XLogRecPtr commitLsn; + term_t term; + /* TODO: add comment */ + XLogRecPtr flushLsn; + /* Safekeeper reports back his awareness about which WAL is committed, as */ + /* this is a criterion for walproposer --sync mode exit */ + XLogRecPtr commitLsn; HotStandbyFeedback hs; - // Feedback recieved from pageserver includes standby_status_update fields - // and custom zenith feedback. - // This part of the message is extensible. + /* Feedback recieved from pageserver includes standby_status_update fields */ + /* and custom zenith feedback. */ + /* This part of the message is extensible. 
*/ ReplicationFeedback rf; -} AppendResponse; +} AppendResponse; -// ReplicationFeedback is extensible part of the message that is parsed separately -// Other fields are fixed part +/* ReplicationFeedback is extensible part of the message that is parsed separately */ +/* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) @@ -323,9 +338,10 @@ typedef struct AppendResponse */ typedef struct Safekeeper { - char const* host; - char const* port; - char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ + char const *host; + char const *port; + char conninfo[MAXCONNINFO]; /* connection info for + * connecting/reconnecting */ /* * postgres protocol connection to the WAL acceptor @@ -333,46 +349,50 @@ typedef struct Safekeeper * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we * reach SS_ACTIVE; not before. */ - WalProposerConn* conn; + WalProposerConn *conn; + /* * Temporary buffer for the message being sent to the safekeeper. */ StringInfoData outbuf; + /* * WAL reader, allocated for each safekeeper. */ - XLogReaderState* xlogreader; + XLogReaderState *xlogreader; /* * Streaming will start here; must be record boundary. */ - XLogRecPtr startStreamingAt; + XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ - XLogRecPtr streamingAt; /* current streaming position */ - AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + bool flushWrite; /* set to true if we need to call AsyncFlush, + * to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ - int eventPos; /* position in wait event set. 
Equal to -1 if no event */ - SafekeeperState state; /* safekeeper state machine state */ - TimestampTz startedConnAt; /* when connection attempt started */ - AcceptorGreeting greetResponse; /* acceptor greeting */ - VoteResponse voteResponse; /* the vote */ - AppendResponse appendResponse; /* feedback for master */ + int eventPos; /* position in wait event set. Equal to -1 if + * no event */ + SafekeeperState state; /* safekeeper state machine state */ + TimestampTz startedConnAt; /* when connection attempt started */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ } Safekeeper; extern PGDLLIMPORT void WalProposerMain(Datum main_arg); -void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); -void WalProposerPoll(void); -void WalProposerRegister(void); -void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); +void WalProposerPoll(void); +void WalProposerRegister(void); +void ParseReplicationFeedbackMessage(StringInfo reply_message, + ReplicationFeedback * rf); extern void StartProposerReplication(StartReplicationCmd *cmd); -Size WalproposerShmemSize(void); -bool WalproposerShmemInit(void); -void replication_feedback_set(ReplicationFeedback *rf); -void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); +void replication_feedback_set(ReplicationFeedback * rf); +void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ @@ -383,29 +403,37 @@ typedef enum WP_CONN_POLLING_READING, WP_CONN_POLLING_WRITING, WP_CONN_POLLING_OK, + /* * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. * We've removed it here to avoid clutter. 
*/ -} WalProposerConnectPollStatusType; +} WalProposerConnectPollStatusType; /* Re-exported and modified ExecStatusType */ typedef enum { /* We received a single CopyBoth result */ WP_EXEC_SUCCESS_COPYBOTH, - /* Any success result other than a single CopyBoth was received. The specifics of the result - * were already logged, but it may be useful to provide an error message indicating which - * safekeeper messed up. + + /* + * Any success result other than a single CopyBoth was received. The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. * - * Do not expect PQerrorMessage to be appropriately set. */ + * Do not expect PQerrorMessage to be appropriately set. + */ WP_EXEC_UNEXPECTED_SUCCESS, - /* No result available at this time. Wait until read-ready, then call again. Internally, this is - * returned when PQisBusy indicates that PQgetResult would block. */ + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ WP_EXEC_NEEDS_INPUT, /* Catch-all failure. Check PQerrorMessage. */ WP_EXEC_FAILED, -} WalProposerExecStatusType; +} WalProposerExecStatusType; /* Re-exported ConnStatusType */ typedef enum @@ -414,40 +442,39 @@ typedef enum WP_CONNECTION_BAD, /* - * The original ConnStatusType has many more tags, but requests that - * they not be relied upon (except for displaying to the user). We - * don't need that extra functionality, so we collect them into a - * single tag here. + * The original ConnStatusType has many more tags, but requests that they + * not be relied upon (except for displaying to the user). We don't need + * that extra functionality, so we collect them into a single tag here. 
*/ WP_CONNECTION_IN_PROGRESS, -} WalProposerConnStatusType; +} WalProposerConnStatusType; /* Re-exported PQerrorMessage */ -typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); +typedef char *(*walprop_error_message_fn) (WalProposerConn * conn); /* Re-exported PQstatus */ -typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); +typedef WalProposerConnStatusType(*walprop_status_fn) (WalProposerConn * conn); /* Re-exported PQconnectStart */ -typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); +typedef WalProposerConn * (*walprop_connect_start_fn) (char *conninfo); /* Re-exported PQconectPoll */ -typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); +typedef WalProposerConnectPollStatusType(*walprop_connect_poll_fn) (WalProposerConn * conn); /* Blocking wrapper around PQsendQuery */ -typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); +typedef bool (*walprop_send_query_fn) (WalProposerConn * conn, char *query); /* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ -typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); +typedef WalProposerExecStatusType(*walprop_get_query_result_fn) (WalProposerConn * conn); /* Re-exported PQsocket */ -typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); +typedef pgsocket (*walprop_socket_fn) (WalProposerConn * conn); /* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn); +typedef int (*walprop_flush_fn) (WalProposerConn * conn); /* Re-exported PQfinish */ -typedef void (*walprop_finish_fn) (WalProposerConn* conn); +typedef void (*walprop_finish_fn) (WalProposerConn * conn); /* * Ergonomic wrapper around PGgetCopyData @@ -463,9 +490,9 @@ typedef void (*walprop_finish_fn) (WalProposerConn* conn); * performs a bit of extra checking work that's always required and is normally * somewhat verbose. 
*/ -typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, - char** buf, - int* amount); +typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn, + char **buf, + int *amount); /* * Ergonomic wrapper around PQputCopyData + PQflush @@ -474,33 +501,33 @@ typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, * * For information on the meaning of return codes, refer to PGAsyncWriteResult. */ -typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, - void const* buf, - size_t size); +typedef PGAsyncWriteResult(*walprop_async_write_fn) (WalProposerConn * conn, + void const *buf, + size_t size); /* * Blocking equivalent to walprop_async_write_fn * * Returns 'true' if successful, 'false' on failure. */ -typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); +typedef bool (*walprop_blocking_write_fn) (WalProposerConn * conn, void const *buf, size_t size); /* All libpqwalproposer exported functions collected together. 
*/ typedef struct WalProposerFunctionsType { - walprop_error_message_fn walprop_error_message; - walprop_status_fn walprop_status; - walprop_connect_start_fn walprop_connect_start; - walprop_connect_poll_fn walprop_connect_poll; - walprop_send_query_fn walprop_send_query; - walprop_get_query_result_fn walprop_get_query_result; - walprop_socket_fn walprop_socket; - walprop_flush_fn walprop_flush; - walprop_finish_fn walprop_finish; - walprop_async_read_fn walprop_async_read; - walprop_async_write_fn walprop_async_write; - walprop_blocking_write_fn walprop_blocking_write; -} WalProposerFunctionsType; + walprop_error_message_fn walprop_error_message; + walprop_status_fn walprop_status; + walprop_connect_start_fn walprop_connect_start; + walprop_connect_poll_fn walprop_connect_poll; + walprop_send_query_fn walprop_send_query; + walprop_get_query_result_fn walprop_get_query_result; + walprop_socket_fn walprop_socket; + walprop_flush_fn walprop_flush; + walprop_finish_fn walprop_finish; + walprop_async_read_fn walprop_async_read; + walprop_async_write_fn walprop_async_write; + walprop_blocking_write_fn walprop_blocking_write; +} WalProposerFunctionsType; /* Allow the above functions to be "called" with normal syntax */ #define walprop_error_message(conn) \ @@ -536,8 +563,8 @@ typedef struct WalProposerFunctionsType * This pointer is set by the initializer in libpqwalproposer, so that we * can use it later. 
*/ -extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; +extern PGDLLIMPORT WalProposerFunctionsType * WalProposerFunctions; extern uint64 BackpressureThrottlingTime(void); -#endif /* __NEON_WALPROPOSER_H__ */ +#endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index 417a8c4586..e1dcaa081d 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -127,10 +127,10 @@ CompareLsn(const void *a, const void *b) * * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); */ -char* +char * FormatSafekeeperState(SafekeeperState state) { - char* return_val = NULL; + char *return_val = NULL; switch (state) { @@ -171,27 +171,30 @@ FormatSafekeeperState(SafekeeperState state) /* Asserts that the provided events are expected for given safekeeper's state */ void -AssertEventsOkForState(uint32 events, Safekeeper* sk) +AssertEventsOkForState(uint32 events, Safekeeper *sk) { - uint32 expected = SafekeeperStateDesiredEvents(sk->state); + uint32 expected = SafekeeperStateDesiredEvents(sk->state); - /* The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. - * (b) if we are expecting something, there's overlap - * (i.e. `events & expected != 0`) + /* + * The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. (b) if we are expecting something, there's + * overlap (i.e. 
`events & expected != 0`) */ - bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + bool events_ok_for_state; /* long name so the `Assert` is more + * clear later */ if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); else events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { - /* To give a descriptive message in the case of failure, we use elog and - * then an assertion that's guaranteed to fail. */ + /* + * To give a descriptive message in the case of failure, we use elog + * and then an assertion that's guaranteed to fail. + */ elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); Assert(events_ok_for_state); @@ -204,12 +207,12 @@ AssertEventsOkForState(uint32 events, Safekeeper* sk) uint32 SafekeeperStateDesiredEvents(SafekeeperState state) { - uint32 result = WL_NO_EVENTS; + uint32 result = WL_NO_EVENTS; /* If the state doesn't have a modifier, we can check the base state */ switch (state) { - /* Connecting states say what they want in the name */ + /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: result = WL_SOCKET_READABLE; break; @@ -217,33 +220,35 @@ SafekeeperStateDesiredEvents(SafekeeperState state) result = WL_SOCKET_WRITEABLE; break; - /* Reading states need the socket to be read-ready to continue */ + /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: result = WL_SOCKET_READABLE; break; - /* Idle states use read-readiness as a sign that the connection has been - * disconnected. */ + /* + * Idle states use read-readiness as a sign that the connection + * has been disconnected. 
+ */ case SS_VOTING: case SS_IDLE: result = WL_SOCKET_READABLE; break; - /* - * Flush states require write-ready for flushing. - * Active state does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should - * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ + /* + * Flush states require write-ready for flushing. Active state + * does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We + * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ case SS_SEND_ELECTED_FLUSH: case SS_ACTIVE: result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; break; - /* The offline state expects no events. */ + /* The offline state expects no events. */ case SS_OFFLINE: result = WL_NO_EVENTS; break; @@ -263,27 +268,30 @@ SafekeeperStateDesiredEvents(SafekeeperState state) * * The string should not be freed. It should also not be expected to remain the same between * function calls. */ -char* +char * FormatEvents(uint32 events) { static char return_str[8]; /* Helper variable to check if there's extra bits */ - uint32 all_flags = WL_LATCH_SET - | WL_SOCKET_READABLE - | WL_SOCKET_WRITEABLE - | WL_TIMEOUT - | WL_POSTMASTER_DEATH - | WL_EXIT_ON_PM_DEATH - | WL_SOCKET_CONNECTED; + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; - /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an - * sense of what events have been triggered without needing to remember your powers of two. */ + /* + * The formatting here isn't supposed to be *particularly* useful -- it's + * just to give an sense of what events have been triggered without + * needing to remember your powers of two. + */ - return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; - return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; + return_str[0] = (events & WL_LATCH_SET) ? 
'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_'; return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; - return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; + return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_'; return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; @@ -291,7 +299,7 @@ FormatEvents(uint32 events) if (events & (~all_flags)) { elog(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); + events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } @@ -407,21 +415,21 @@ XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) if (walpropFile < 0) { - #if PG_VERSION_NUM >= 150000 - // FIXME Is it ok to use hardcoded value here? - TimeLineID tli = 1; - #else +#if PG_VERSION_NUM >= 150000 + /* FIXME Is it ok to use hardcoded value here? */ + TimeLineID tli = 1; +#else bool use_existent = true; - #endif +#endif /* Create/use new log file */ XLByteToSeg(recptr, walpropSegNo, wal_segment_size); - #if PG_VERSION_NUM >= 150000 +#if PG_VERSION_NUM >= 150000 walpropFile = XLogFileInit(walpropSegNo, tli); walpropFileTLI = tli; - #else +#else walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); walpropFileTLI = ThisTimeLineID; - #endif +#endif } /* Calculate the start offset of the received logs */ @@ -483,6 +491,7 @@ XLogWalPropClose(XLogRecPtr recptr) if (close(walpropFile) != 0) { char xlogfname[MAXFNAMELEN]; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); ereport(PANIC, @@ -508,12 +517,12 @@ StartProposerReplication(StartReplicationCmd *cmd) XLogRecPtr FlushPtr; TimeLineID currTLI; - #if PG_VERSION_NUM < 150000 +#if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); - #endif + 
errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); +#endif /* create xlogreader for physical replication */ xlogreader = @@ -525,7 +534,7 @@ StartProposerReplication(StartReplicationCmd *cmd) if (!xlogreader) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + errmsg("out of memory"))); /* * We assume here that we're logging enough information in the WAL for @@ -542,7 +551,7 @@ StartProposerReplication(StartReplicationCmd *cmd) if (SlotIsLogical(MyReplicationSlot)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot use a logical replication slot for physical replication"))); + errmsg("cannot use a logical replication slot for physical replication"))); /* * We don't need to verify the slot's restart_lsn here; instead we @@ -630,9 +639,9 @@ StartProposerReplication(StartReplicationCmd *cmd) (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", LSN_FORMAT_ARGS(cmd->startpoint), cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); + errdetail("This server's history forked from timeline %u at %X/%X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint)))); } sendTimeLineValidUpto = switchpoint; } @@ -869,14 +878,14 @@ WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, errno = save_errno; ereport(ERROR, (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - xlogfname))); + errmsg("requested WAL segment %s has already been removed", + xlogfname))); } else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); + errmsg("could not open file \"%s\": %m", + path))); } @@ -943,7 +952,7 @@ XLogSendPhysical(void) XLogRecPtr startptr; XLogRecPtr endptr; Size nbytes PG_USED_FOR_ASSERTS_ONLY; - TimeLineID currTLI; + TimeLineID currTLI; /* If requested switch the WAL sender to the stopping state. 
*/ if (got_STOPPING) @@ -1004,8 +1013,8 @@ XLogSendPhysical(void) { /* * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? currTLI was - * updated by the GetStandbyFlushRecPtr() call above. + * still the one recovery is recovering from? currTLI was updated + * by the GetStandbyFlushRecPtr() call above. */ if (sendTimeLine != currTLI) becameHistoric = true; @@ -1043,11 +1052,11 @@ XLogSendPhysical(void) * primary: if the primary subsequently crashes and restarts, standbys * must not have applied any WAL that got lost on the primary. */ - #if PG_VERSION_NUM >= 150000 +#if PG_VERSION_NUM >= 150000 SendRqstPtr = GetFlushRecPtr(NULL); - #else +#else SendRqstPtr = GetFlushRecPtr(); - #endif +#endif } /* @@ -1180,4 +1189,3 @@ XLogSendPhysical(void) set_ps_display(activitymsg); } } - diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h index 4771d3ff82..aa5df5fa43 100644 --- a/pgxn/neon/walproposer_utils.h +++ b/pgxn/neon/walproposer_utils.h @@ -3,17 +3,17 @@ #include "walproposer.h" -int CompareLsn(const void *a, const void *b); -char* FormatSafekeeperState(SafekeeperState state); -void AssertEventsOkForState(uint32 events, Safekeeper* sk); -uint32 SafekeeperStateDesiredEvents(SafekeeperState state); -char* FormatEvents(uint32 events); -bool HexDecodeString(uint8 *result, char *input, int nbytes); -uint32 pq_getmsgint32_le(StringInfo msg); -uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); -void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); -void XLogWalPropClose(XLogRecPtr recptr); +int CompareLsn(const void *a, const void *b); +char *FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper *sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); +char *FormatEvents(uint32 events); +bool HexDecodeString(uint8 *result, char *input, int 
nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); -#endif /* __NEON_WALPROPOSER_UTILS_H__ */ +#endif /* __NEON_WALPROPOSER_UTILS_H__ */ diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 3e30065cd3..07bd7bdd28 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -39,8 +39,8 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); * Linkage to functions in zenith module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +typedef void (*zenith_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; @@ -136,8 +136,8 @@ clear_buffer_cache(PG_FUNCTION_ARGS) /* * Pin the buffer, and release it again. Because we have - * zenith_test_evict==true, this will evict the page from - * the buffer cache if no one else is holding a pin on it. + * zenith_test_evict==true, this will evict the page from the + * buffer cache if no one else is holding a pin on it. */ if (isvalid) { @@ -177,8 +177,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *forkname; uint32 blkno; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(3); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -262,7 +262,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to use raw page functions"))); + errmsg("must be superuser to use raw page functions"))); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || PG_ARGISNULL(3) || PG_ARGISNULL(4)) @@ -271,19 +271,20 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { RelFileNode rnode = { .spcNode = PG_GETARG_OID(0), - .dbNode = PG_GETARG_OID(1), + .dbNode = PG_GETARG_OID(1), .relNode = PG_GETARG_OID(2) }; - ForkNumber forknum = PG_GETARG_UINT32(3); + ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); /* Initialize buffer to copy to */ - bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); @@ -298,7 +299,8 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogFlush(lsn); PG_RETURN_VOID(); } diff --git a/pgxn/typedefs.list b/pgxn/typedefs.list new file mode 100644 index 0000000000..760f384212 --- /dev/null +++ b/pgxn/typedefs.list @@ -0,0 +1,3776 @@ +ACCESS_ALLOWED_ACE +ACL +ACL_SIZE_INFORMATION +AFFIX +ASN1_INTEGER +ASN1_OBJECT +ASN1_STRING +AV +A_ArrayExpr +A_Const +A_Expr +A_Expr_Kind +A_Indices +A_Indirection +A_Star +AbsoluteTime +AccessMethodInfo +AccessPriv +Acl +AclItem +AclMaskHow +AclMode +AclResult +AcquireSampleRowsFunc +ActionList +ActiveSnapshotElt +AddForeignUpdateTargets_function +AffixNode +AffixNodeData +AfterTriggerEvent +AfterTriggerEventChunk +AfterTriggerEventData +AfterTriggerEventList +AfterTriggerShared +AfterTriggerSharedData +AfterTriggersData +AfterTriggersQueryData +AfterTriggersTableData +AfterTriggersTransData +Agg +AggClauseCosts +AggInfo +AggPath +AggSplit +AggState +AggStatePerAgg +AggStatePerGroup +AggStatePerHash +AggStatePerPhase +AggStatePerTrans +AggStrategy +AggTransInfo +Aggref +AggregateInstrumentation +AlenState +Alias +AllocBlock +AllocChunk +AllocPointer +AllocSet +AllocSetContext +AllocSetFreeList +AllocateDesc +AllocateDescKind +AlterCollationStmt +AlterDatabaseSetStmt +AlterDatabaseStmt +AlterDefaultPrivilegesStmt +AlterDomainStmt +AlterEnumStmt +AlterEventTrigStmt +AlterExtensionContentsStmt +AlterExtensionStmt +AlterFdwStmt +AlterForeignServerStmt +AlterFunctionStmt +AlterObjectDependsStmt +AlterObjectSchemaStmt +AlterOpFamilyStmt +AlterOperatorStmt +AlterOwnerStmt +AlterPolicyStmt +AlterPublicationStmt +AlterRoleSetStmt 
+AlterRoleStmt +AlterSeqStmt +AlterStatsStmt +AlterSubscriptionStmt +AlterSubscriptionType +AlterSystemStmt +AlterTSConfigType +AlterTSConfigurationStmt +AlterTSDictionaryStmt +AlterTableCmd +AlterTableMoveAllStmt +AlterTableSpaceOptionsStmt +AlterTableStmt +AlterTableType +AlterTableUtilityContext +AlterTypeRecurseParams +AlterTypeStmt +AlterUserMappingStmt +AlteredTableInfo +AlternativeSubPlan +AmcheckOptions +AnalyzeAttrComputeStatsFunc +AnalyzeAttrFetchFunc +AnalyzeForeignTable_function +AnlExprData +AnlIndexData +AnyArrayType +Append +AppendPath +AppendRelInfo +AppendState +ApplyExecutionData +ApplySubXactData +Archive +ArchiveEntryPtrType +ArchiveFormat +ArchiveHandle +ArchiveMode +ArchiveOpts +ArchiverOutput +ArchiverStage +ArrayAnalyzeExtraData +ArrayBuildState +ArrayBuildStateAny +ArrayBuildStateArr +ArrayCoerceExpr +ArrayConstIterState +ArrayExpr +ArrayExprIterState +ArrayIOData +ArrayIterator +ArrayMapState +ArrayMetaState +ArrayParseState +ArraySubWorkspace +ArrayType +AsyncQueueControl +AsyncQueueEntry +AsyncRequest +AttInMetadata +AttStatsSlot +AttoptCacheEntry +AttoptCacheKey +AttrDefInfo +AttrDefault +AttrMap +AttrMissing +AttrNumber +AttributeOpts +AuthRequest +AutoPrewarmSharedState +AutoVacOpts +AutoVacuumShmemStruct +AutoVacuumWorkItem +AutoVacuumWorkItemType +AuxProcType +BF_ctx +BF_key +BF_word +BF_word_signed +BIGNUM +BIO +BIO_METHOD +BITVECP +BMS_Comparison +BMS_Membership +BN_CTX +BOOL +BOOLEAN +BOX +BTArrayKeyInfo +BTBuildState +BTCycleId +BTDedupInterval +BTDedupState +BTDedupStateData +BTDeletedPageData +BTIndexStat +BTInsertState +BTInsertStateData +BTLeader +BTMetaPageData +BTOneVacInfo +BTOptions +BTPS_State +BTPageOpaque +BTPageOpaqueData +BTPageStat +BTPageState +BTParallelScanDesc +BTPendingFSM +BTScanInsert +BTScanInsertData +BTScanOpaque +BTScanOpaqueData +BTScanPos +BTScanPosData +BTScanPosItem +BTShared +BTSortArrayContext +BTSpool +BTStack +BTStackData +BTVacInfo +BTVacState +BTVacuumPosting +BTVacuumPostingData +BTWriteState 
+BUF_MEM +BYTE +BY_HANDLE_FILE_INFORMATION +Backend +BackendId +BackendParameters +BackendState +BackendType +BackgroundWorker +BackgroundWorkerArray +BackgroundWorkerHandle +BackgroundWorkerSlot +Barrier +BaseBackupCmd +BeginDirectModify_function +BeginForeignInsert_function +BeginForeignModify_function +BeginForeignScan_function +BeginSampleScan_function +BernoulliSamplerData +BgWorkerStartTime +BgwHandleStatus +BinaryArithmFunc +BindParamCbData +BipartiteMatchState +BitmapAnd +BitmapAndPath +BitmapAndState +BitmapHeapPath +BitmapHeapScan +BitmapHeapScanState +BitmapIndexScan +BitmapIndexScanState +BitmapOr +BitmapOrPath +BitmapOrState +Bitmapset +BlobInfo +Block +BlockId +BlockIdData +BlockInfoRecord +BlockNumber +BlockSampler +BlockSamplerData +BlockedProcData +BlockedProcsData +BloomBuildState +BloomFilter +BloomMetaPageData +BloomOpaque +BloomOptions +BloomPageOpaque +BloomPageOpaqueData +BloomScanOpaque +BloomScanOpaqueData +BloomSignatureWord +BloomState +BloomTuple +BlowfishContext +BoolAggState +BoolExpr +BoolExprType +BoolTestType +BooleanTest +BpChar +BrinBuildState +BrinDesc +BrinMemTuple +BrinMetaPageData +BrinOpaque +BrinOpcInfo +BrinOptions +BrinRevmap +BrinSpecialSpace +BrinStatsData +BrinTuple +BrinValues +BtreeCheckState +BtreeLevel +Bucket +BufFile +Buffer +BufferAccessStrategy +BufferAccessStrategyType +BufferCachePagesContext +BufferCachePagesRec +BufferDesc +BufferDescPadded +BufferHeapTupleTableSlot +BufferLookupEnt +BufferStrategyControl +BufferTag +BufferUsage +BuildAccumulator +BuiltinScript +BulkInsertState +BulkInsertStateData +CACHESIGN +CAC_state +CCFastEqualFN +CCHashFN +CEOUC_WAIT_MODE +CFuncHashTabEntry +CHAR +CHECKPOINT +CHKVAL +CIRCLE +CMPDAffix +CONTEXT +COP +CRITICAL_SECTION +CRSSnapshotAction +CState +CTECycleClause +CTEMaterialize +CTESearchClause +CV +CachedExpression +CachedPlan +CachedPlanSource +CallContext +CallStmt +CancelRequestPacket +CaseExpr +CaseTestExpr +CaseWhen +Cash +CastInfo +CatCList +CatCTup +CatCache 
+CatCacheHeader +CatalogId +CatalogIndexState +ChangeVarNodes_context +CheckPoint +CheckPointStmt +CheckpointStatsData +CheckpointerRequest +CheckpointerShmemStruct +Chromosome +CkptSortItem +CkptTsStatus +ClientAuthentication_hook_type +ClientCertMode +ClientCertName +ClientData +ClonePtrType +ClosePortalStmt +ClosePtrType +Clump +ClusterInfo +ClusterParams +ClusterStmt +CmdType +CoalesceExpr +CoerceParamHook +CoerceToDomain +CoerceToDomainValue +CoerceViaIO +CoercionContext +CoercionForm +CoercionPathType +CollAliasData +CollInfo +CollateClause +CollateExpr +CollateStrength +CollectedATSubcmd +CollectedCommand +CollectedCommandType +ColorTrgm +ColorTrgmInfo +ColumnCompareData +ColumnDef +ColumnIOData +ColumnRef +ColumnsHashData +CombinationGenerator +ComboCidEntry +ComboCidEntryData +ComboCidKey +ComboCidKeyData +Command +CommandDest +CommandId +CommandTag +CommandTagBehavior +CommentItem +CommentStmt +CommitTimestampEntry +CommitTimestampShared +CommonEntry +CommonTableExpr +CompareScalarsContext +CompiledExprState +CompositeIOData +CompositeTypeStmt +CompoundAffixFlag +CompressionAlgorithm +CompressorState +ComputeXidHorizonsResult +ConditionVariable +ConditionVariableMinimallyPadded +ConditionalStack +ConfigData +ConfigVariable +ConnCacheEntry +ConnCacheKey +ConnParams +ConnStatusType +ConnType +ConnectionStateEnum +ConnsAllowedState +ConsiderSplitContext +Const +ConstrCheck +ConstrType +Constraint +ConstraintCategory +ConstraintInfo +ConstraintsSetStmt +ControlData +ControlFileData +ConvInfo +ConvProcInfo +ConversionLocation +ConvertRowtypeExpr +CookedConstraint +CopyDest +CopyFormatOptions +CopyFromState +CopyFromStateData +CopyInsertMethod +CopyMultiInsertBuffer +CopyMultiInsertInfo +CopySource +CopyStmt +CopyToState +CopyToStateData +Cost +CostSelector +Counters +CoverExt +CoverPos +CreateAmStmt +CreateCastStmt +CreateConversionStmt +CreateDomainStmt +CreateEnumStmt +CreateEventTrigStmt +CreateExtensionStmt +CreateFdwStmt +CreateForeignServerStmt 
+CreateForeignTableStmt +CreateFunctionStmt +CreateOpClassItem +CreateOpClassStmt +CreateOpFamilyStmt +CreatePLangStmt +CreatePolicyStmt +CreatePublicationStmt +CreateRangeStmt +CreateReplicationSlotCmd +CreateRoleStmt +CreateSchemaStmt +CreateSchemaStmtContext +CreateSeqStmt +CreateStatsStmt +CreateStmt +CreateStmtContext +CreateSubscriptionStmt +CreateTableAsStmt +CreateTableSpaceStmt +CreateTransformStmt +CreateTrigStmt +CreateUserMappingStmt +CreatedbStmt +CredHandle +CteItem +CteScan +CteScanState +CteState +CtlCommand +CtxtHandle +CurrentOfExpr +CustomExecMethods +CustomOutPtrType +CustomPath +CustomScan +CustomScanMethods +CustomScanState +CycleCtr +DBState +DCHCacheEntry +DEADLOCK_INFO +DECountItem +DH +DIR +DNSServiceErrorType +DNSServiceRef +DR_copy +DR_intorel +DR_printtup +DR_sqlfunction +DR_transientrel +DSA +DWORD +DataDumperPtr +DataPageDeleteStack +DatabaseInfo +DateADT +Datum +DatumTupleFields +DbInfo +DbInfoArr +DeClonePtrType +DeadLockState +DeallocateStmt +DeclareCursorStmt +DecodedBkpBlock +DecodingOutputState +DefElem +DefElemAction +DefaultACLInfo +DefineStmt +DeleteStmt +DependencyGenerator +DependencyGeneratorData +DependencyType +DestReceiver +DictISpell +DictInt +DictSimple +DictSnowball +DictSubState +DictSyn +DictThesaurus +DimensionInfo +DirectoryMethodData +DirectoryMethodFile +DisableTimeoutParams +DiscardMode +DiscardStmt +DistanceValue +DistinctExpr +DoStmt +DocRepresentation +DomainConstraintCache +DomainConstraintRef +DomainConstraintState +DomainConstraintType +DomainIOData +DropBehavior +DropOwnedStmt +DropReplicationSlotCmd +DropRoleStmt +DropStmt +DropSubscriptionStmt +DropTableSpaceStmt +DropUserMappingStmt +DropdbStmt +DumpComponents +DumpId +DumpOptions +DumpSignalInformation +DumpableObject +DumpableObjectType +DynamicFileList +DynamicZoneAbbrev +EC_KEY +EDGE +ENGINE +EOM_flatten_into_method +EOM_get_flat_size_method +EPQState +EPlan +EState +EVP_CIPHER +EVP_CIPHER_CTX +EVP_MD +EVP_MD_CTX +EVP_PKEY +EachState +Edge 
+EditableObjectType +ElementsState +EnableTimeoutParams +EndBlobPtrType +EndBlobsPtrType +EndDataPtrType +EndDirectModify_function +EndForeignInsert_function +EndForeignModify_function +EndForeignScan_function +EndSampleScan_function +EnumItem +EolType +EphemeralNameRelationType +EphemeralNamedRelation +EphemeralNamedRelationData +EphemeralNamedRelationMetadata +EphemeralNamedRelationMetadataData +EquivalenceClass +EquivalenceMember +ErrorContextCallback +ErrorData +EstimateDSMForeignScan_function +EstimationInfo +EventTriggerCacheEntry +EventTriggerCacheItem +EventTriggerCacheStateType +EventTriggerData +EventTriggerEvent +EventTriggerInfo +EventTriggerQueryState +ExceptionLabelMap +ExceptionMap +ExclusiveBackupState +ExecAuxRowMark +ExecEvalBoolSubroutine +ExecEvalSubroutine +ExecForeignBatchInsert_function +ExecForeignDelete_function +ExecForeignInsert_function +ExecForeignTruncate_function +ExecForeignUpdate_function +ExecParallelEstimateContext +ExecParallelInitializeDSMContext +ExecPhraseData +ExecProcNodeMtd +ExecRowMark +ExecScanAccessMtd +ExecScanRecheckMtd +ExecStatus +ExecStatusType +ExecuteStmt +ExecutorCheckPerms_hook_type +ExecutorEnd_hook_type +ExecutorFinish_hook_type +ExecutorRun_hook_type +ExecutorStart_hook_type +ExpandedArrayHeader +ExpandedObjectHeader +ExpandedObjectMethods +ExpandedRange +ExpandedRecordFieldInfo +ExpandedRecordHeader +ExplainDirectModify_function +ExplainForeignModify_function +ExplainForeignScan_function +ExplainFormat +ExplainOneQuery_hook_type +ExplainState +ExplainStmt +ExplainWorkersState +ExportedSnapshot +Expr +ExprContext +ExprContextCallbackFunction +ExprContext_CB +ExprDoneCond +ExprEvalOp +ExprEvalOpLookup +ExprEvalRowtypeCache +ExprEvalStep +ExprState +ExprStateEvalFunc +ExtensibleNode +ExtensibleNodeEntry +ExtensibleNodeMethods +ExtensionControlFile +ExtensionInfo +ExtensionMemberId +ExtensionVersionInfo +FDWCollateState +FD_SET +FILE +FILETIME +FILE_INFORMATION_CLASS +FILE_STANDARD_INFORMATION +FSMAddress 
+FSMPage +FSMPageData +FakeRelCacheEntry +FakeRelCacheEntryData +FastPathStrongRelationLockData +FdwInfo +FdwRoutine +FetchDirection +FetchStmt +FieldSelect +FieldStore +File +FileFdwExecutionState +FileFdwPlanState +FileNameMap +FileTag +FinalPathExtraData +FindColsContext +FindSplitData +FindSplitStrat +FixedParallelExecutorState +FixedParallelState +FixedParamState +FlagMode +FlushPosition +FmgrBuiltin +FmgrHookEventType +FmgrInfo +ForBothCellState +ForBothState +ForEachState +ForFiveState +ForFourState +ForThreeState +ForeignAsyncConfigureWait_function +ForeignAsyncNotify_function +ForeignAsyncRequest_function +ForeignDataWrapper +ForeignKeyCacheInfo +ForeignKeyOptInfo +ForeignPath +ForeignScan +ForeignScanState +ForeignServer +ForeignServerInfo +ForeignTable +ForeignTruncateInfo +ForkNumber +FormData_pg_aggregate +FormData_pg_am +FormData_pg_amop +FormData_pg_amproc +FormData_pg_attrdef +FormData_pg_attribute +FormData_pg_auth_members +FormData_pg_authid +FormData_pg_cast +FormData_pg_class +FormData_pg_collation +FormData_pg_constraint +FormData_pg_conversion +FormData_pg_database +FormData_pg_default_acl +FormData_pg_depend +FormData_pg_enum +FormData_pg_event_trigger +FormData_pg_extension +FormData_pg_foreign_data_wrapper +FormData_pg_foreign_server +FormData_pg_foreign_table +FormData_pg_index +FormData_pg_inherits +FormData_pg_language +FormData_pg_largeobject +FormData_pg_largeobject_metadata +FormData_pg_namespace +FormData_pg_opclass +FormData_pg_operator +FormData_pg_opfamily +FormData_pg_partitioned_table +FormData_pg_policy +FormData_pg_proc +FormData_pg_publication +FormData_pg_publication_rel +FormData_pg_range +FormData_pg_replication_origin +FormData_pg_rewrite +FormData_pg_sequence +FormData_pg_sequence_data +FormData_pg_shdepend +FormData_pg_statistic +FormData_pg_statistic_ext +FormData_pg_subscription +FormData_pg_subscription_rel +FormData_pg_tablespace +FormData_pg_transform +FormData_pg_trigger +FormData_pg_ts_config 
+FormData_pg_ts_config_map +FormData_pg_ts_dict +FormData_pg_ts_parser +FormData_pg_ts_template +FormData_pg_type +FormData_pg_user_mapping +Form_pg_aggregate +Form_pg_am +Form_pg_amop +Form_pg_amproc +Form_pg_attrdef +Form_pg_attribute +Form_pg_auth_members +Form_pg_authid +Form_pg_cast +Form_pg_class +Form_pg_collation +Form_pg_constraint +Form_pg_conversion +Form_pg_database +Form_pg_default_acl +Form_pg_depend +Form_pg_enum +Form_pg_event_trigger +Form_pg_extension +Form_pg_foreign_data_wrapper +Form_pg_foreign_server +Form_pg_foreign_table +Form_pg_index +Form_pg_inherits +Form_pg_language +Form_pg_largeobject +Form_pg_largeobject_metadata +Form_pg_namespace +Form_pg_opclass +Form_pg_operator +Form_pg_opfamily +Form_pg_partitioned_table +Form_pg_policy +Form_pg_proc +Form_pg_publication +Form_pg_publication_rel +Form_pg_range +Form_pg_replication_origin +Form_pg_rewrite +Form_pg_sequence +Form_pg_sequence_data +Form_pg_shdepend +Form_pg_statistic +Form_pg_statistic_ext +Form_pg_subscription +Form_pg_subscription_rel +Form_pg_tablespace +Form_pg_transform +Form_pg_trigger +Form_pg_ts_config +Form_pg_ts_config_map +Form_pg_ts_dict +Form_pg_ts_parser +Form_pg_ts_template +Form_pg_type +Form_pg_user_mapping +FormatNode +FreeBlockNumberArray +FreeListData +FreePageBtree +FreePageBtreeHeader +FreePageBtreeInternalKey +FreePageBtreeLeafKey +FreePageBtreeSearchResult +FreePageManager +FreePageSpanLeader +FromCharDateMode +FromExpr +FullTransactionId +FuncCall +FuncCallContext +FuncCandidateList +FuncDetailCode +FuncExpr +FuncInfo +FuncLookupError +FunctionCallInfo +FunctionCallInfoBaseData +FunctionParameter +FunctionParameterMode +FunctionScan +FunctionScanPerFuncState +FunctionScanState +FuzzyAttrMatchState +GBT_NUMKEY +GBT_NUMKEY_R +GBT_VARKEY +GBT_VARKEY_R +GENERAL_NAME +GISTBuildBuffers +GISTBuildState +GISTDeletedPageContents +GISTENTRY +GISTInsertStack +GISTInsertState +GISTIntArrayBigOptions +GISTIntArrayOptions +GISTNodeBuffer +GISTNodeBufferPage 
+GISTPageOpaque +GISTPageOpaqueData +GISTPageSplitInfo +GISTSTATE +GISTScanOpaque +GISTScanOpaqueData +GISTSearchHeapItem +GISTSearchItem +GISTTYPE +GIST_SPLITVEC +GMReaderTupleBuffer +GV +Gather +GatherMerge +GatherMergePath +GatherMergeState +GatherPath +GatherState +Gene +GeneratePruningStepsContext +GenerationBlock +GenerationChunk +GenerationContext +GenerationPointer +GenericCosts +GenericXLogState +GeqoPrivateData +GetForeignJoinPaths_function +GetForeignModifyBatchSize_function +GetForeignPaths_function +GetForeignPlan_function +GetForeignRelSize_function +GetForeignRowMarkType_function +GetForeignUpperPaths_function +GetState +GiSTOptions +GinBtree +GinBtreeData +GinBtreeDataLeafInsertData +GinBtreeEntryInsertData +GinBtreeStack +GinBuildState +GinChkVal +GinEntries +GinEntryAccumulator +GinIndexStat +GinMetaPageData +GinNullCategory +GinOptions +GinPageOpaque +GinPageOpaqueData +GinPlaceToPageRC +GinPostingList +GinQualCounts +GinScanEntry +GinScanKey +GinScanOpaque +GinScanOpaqueData +GinState +GinStatsData +GinTernaryValue +GinTupleCollector +GinVacuumState +GistBuildMode +GistEntryVector +GistHstoreOptions +GistInetKey +GistNSN +GistOptBufferingMode +GistSortedBuildPageState +GistSplitUnion +GistSplitVector +GistTsVectorOptions +GistVacState +GlobalTransaction +GlobalVisState +GrantRoleStmt +GrantStmt +GrantTargetType +Group +GroupClause +GroupPath +GroupPathExtraData +GroupResultPath +GroupState +GroupVarInfo +GroupingFunc +GroupingSet +GroupingSetData +GroupingSetKind +GroupingSetsPath +GucAction +GucBoolAssignHook +GucBoolCheckHook +GucContext +GucEnumAssignHook +GucEnumCheckHook +GucIntAssignHook +GucIntCheckHook +GucRealAssignHook +GucRealCheckHook +GucShowHook +GucSource +GucStack +GucStackState +GucStringAssignHook +GucStringCheckHook +HANDLE +HASHACTION +HASHBUCKET +HASHCTL +HASHELEMENT +HASHHDR +HASHSEGMENT +HASH_SEQ_STATUS +HCRYPTPROV +HE +HEntry +HIST_ENTRY +HKEY +HLOCAL +HMAC_CTX +HMODULE +HOldEntry +HRESULT +HSParser +HSpool +HStore +HTAB 
+HTSV_Result +HV +Hash +HashAggBatch +HashAggSpill +HashAllocFunc +HashBuildState +HashCompareFunc +HashCopyFunc +HashIndexStat +HashInstrumentation +HashJoin +HashJoinState +HashJoinTable +HashJoinTuple +HashMemoryChunk +HashMetaPage +HashMetaPageData +HashOptions +HashPageOpaque +HashPageOpaqueData +HashPageStat +HashPath +HashScanOpaque +HashScanOpaqueData +HashScanPosData +HashScanPosItem +HashSkewBucket +HashState +HashTapeInfo +HashValueFunc +HbaLine +HbaToken +HeadlineJsonState +HeadlineParsedText +HeadlineWordEntry +HeapCheckContext +HeapScanDesc +HeapTuple +HeapTupleData +HeapTupleFields +HeapTupleForceOption +HeapTupleHeader +HeapTupleHeaderData +HeapTupleTableSlot +HistControl +HotStandbyState +I32 +ICU_Convert_Func +ID +INFIX +INT128 +INTERFACE_INFO +IOFuncSelector +IO_STATUS_BLOCK +IPCompareMethod +ITEM +IV +IdentLine +IdentifierLookup +IdentifySystemCmd +IfStackElem +ImportForeignSchemaStmt +ImportForeignSchemaType +ImportForeignSchema_function +ImportQual +InProgressEnt +IncludeWal +InclusionOpaque +IncrementVarSublevelsUp_context +IncrementalSort +IncrementalSortExecutionStatus +IncrementalSortGroupInfo +IncrementalSortInfo +IncrementalSortPath +IncrementalSortState +Index +IndexAMProperty +IndexAmRoutine +IndexArrayKeyInfo +IndexAttachInfo +IndexAttrBitmapKind +IndexBuildCallback +IndexBuildResult +IndexBulkDeleteCallback +IndexBulkDeleteResult +IndexClause +IndexClauseSet +IndexDeleteCounts +IndexDeletePrefetchState +IndexElem +IndexFetchHeapData +IndexFetchTableData +IndexInfo +IndexList +IndexOnlyScan +IndexOnlyScanState +IndexOptInfo +IndexOrderByDistance +IndexPath +IndexRuntimeKeyInfo +IndexScan +IndexScanDesc +IndexScanState +IndexStateFlagsAction +IndexStmt +IndexTuple +IndexTupleData +IndexUniqueCheck +IndexVacuumInfo +IndxInfo +InferClause +InferenceElem +InfoItem +InhInfo +InheritableSocket +InitSampleScan_function +InitializeDSMForeignScan_function +InitializeWorkerForeignScan_function +InlineCodeBlock +InsertStmt +Instrumentation 
+Int128AggState +Int8TransTypeData +IntRBTreeNode +IntegerSet +InternalDefaultACL +InternalGrant +Interval +IntoClause +InvalidationChunk +InvalidationListHeader +IpcMemoryId +IpcMemoryKey +IpcMemoryState +IpcSemaphoreId +IpcSemaphoreKey +IsForeignPathAsyncCapable_function +IsForeignRelUpdatable_function +IsForeignScanParallelSafe_function +IsoConnInfo +IspellDict +Item +ItemId +ItemIdData +ItemPointer +ItemPointerData +IterateDirectModify_function +IterateForeignScan_function +IterateJsonStringValuesState +JEntry +JHashState +JOBOBJECTINFOCLASS +JOBOBJECT_BASIC_LIMIT_INFORMATION +JOBOBJECT_BASIC_UI_RESTRICTIONS +JOBOBJECT_SECURITY_LIMIT_INFORMATION +JitContext +JitInstrumentation +JitProviderCallbacks +JitProviderCompileExprCB +JitProviderInit +JitProviderReleaseContextCB +JitProviderResetAfterErrorCB +Join +JoinCostWorkspace +JoinExpr +JoinHashEntry +JoinPath +JoinPathExtraData +JoinState +JoinType +JsObject +JsValue +JsonAggState +JsonBaseObjectInfo +JsonHashEntry +JsonIterateStringValuesAction +JsonLexContext +JsonLikeRegexContext +JsonManifestFileField +JsonManifestParseContext +JsonManifestParseState +JsonManifestSemanticState +JsonManifestWALRangeField +JsonParseContext +JsonParseErrorType +JsonPath +JsonPathBool +JsonPathExecContext +JsonPathExecResult +JsonPathGinAddPathItemFunc +JsonPathGinContext +JsonPathGinExtractNodesFunc +JsonPathGinNode +JsonPathGinNodeType +JsonPathGinPath +JsonPathGinPathItem +JsonPathItem +JsonPathItemType +JsonPathKeyword +JsonPathParseItem +JsonPathParseResult +JsonPathPredicateCallback +JsonPathString +JsonSemAction +JsonTokenType +JsonTransformStringValuesAction +JsonTypeCategory +JsonValueList +JsonValueListIterator +Jsonb +JsonbAggState +JsonbContainer +JsonbInState +JsonbIterState +JsonbIterator +JsonbIteratorToken +JsonbPair +JsonbParseState +JsonbSubWorkspace +JsonbTypeCategory +JsonbValue +JumbleState +JunkFilter +KeyArray +KeySuffix +KeyWord +LARGE_INTEGER +LDAP +LDAPMessage +LDAPURLDesc +LDAP_TIMEVAL +LINE 
+LLVMAttributeRef +LLVMBasicBlockRef +LLVMBuilderRef +LLVMIntPredicate +LLVMJitContext +LLVMJitHandle +LLVMMemoryBufferRef +LLVMModuleRef +LLVMOrcJITStackRef +LLVMOrcModuleHandle +LLVMOrcTargetAddress +LLVMPassManagerBuilderRef +LLVMPassManagerRef +LLVMSharedModuleRef +LLVMTargetMachineRef +LLVMTargetRef +LLVMTypeRef +LLVMValueRef +LOCALLOCK +LOCALLOCKOWNER +LOCALLOCKTAG +LOCALPREDICATELOCK +LOCK +LOCKMASK +LOCKMETHODID +LOCKMODE +LOCKTAG +LONG +LONG_PTR +LOOP +LPBYTE +LPCTSTR +LPCWSTR +LPDWORD +LPSECURITY_ATTRIBUTES +LPSERVICE_STATUS +LPSTR +LPTHREAD_START_ROUTINE +LPTSTR +LPVOID +LPWSTR +LSEG +LUID +LVDeadTuples +LVPagePruneState +LVParallelState +LVRelState +LVSavedErrInfo +LVShared +LVSharedIndStats +LWLock +LWLockHandle +LWLockMode +LWLockPadded +LabelProvider +LagTracker +LargeObjectDesc +LastAttnumInfo +Latch +LerpFunc +LexDescr +LexemeEntry +LexemeHashKey +LexemeInfo +LexemeKey +LexizeData +LibraryInfo +Limit +LimitOption +LimitPath +LimitState +LimitStateCond +List +ListCell +ListDictionary +ListParsedLex +ListenAction +ListenActionKind +ListenStmt +LoadStmt +LocalBufferLookupEnt +LocalPgBackendStatus +LocalTransactionId +LocationIndex +LocationLen +LockAcquireResult +LockClauseStrength +LockData +LockInfoData +LockInstanceData +LockMethod +LockMethodData +LockRelId +LockRows +LockRowsPath +LockRowsState +LockStmt +LockTagType +LockTupleMode +LockViewRecurse_context +LockWaitPolicy +LockingClause +LogOpts +LogStmtLevel +LogicalDecodeBeginCB +LogicalDecodeBeginPrepareCB +LogicalDecodeChangeCB +LogicalDecodeCommitCB +LogicalDecodeCommitPreparedCB +LogicalDecodeFilterByOriginCB +LogicalDecodeFilterPrepareCB +LogicalDecodeMessageCB +LogicalDecodePrepareCB +LogicalDecodeRollbackPreparedCB +LogicalDecodeShutdownCB +LogicalDecodeStartupCB +LogicalDecodeStreamAbortCB +LogicalDecodeStreamChangeCB +LogicalDecodeStreamCommitCB +LogicalDecodeStreamMessageCB +LogicalDecodeStreamPrepareCB +LogicalDecodeStreamStartCB +LogicalDecodeStreamStopCB 
+LogicalDecodeStreamTruncateCB +LogicalDecodeTruncateCB +LogicalDecodingContext +LogicalErrorCallbackState +LogicalOutputPluginInit +LogicalOutputPluginWriterPrepareWrite +LogicalOutputPluginWriterUpdateProgress +LogicalOutputPluginWriterWrite +LogicalRepBeginData +LogicalRepCommitData +LogicalRepCtxStruct +LogicalRepMsgType +LogicalRepPartMapEntry +LogicalRepRelId +LogicalRepRelMapEntry +LogicalRepRelation +LogicalRepTupleData +LogicalRepTyp +LogicalRepWorker +LogicalRewriteMappingData +LogicalTape +LogicalTapeSet +LtreeGistOptions +LtreeSignature +MAGIC +MBuf +MCVItem +MCVList +MEMORY_BASIC_INFORMATION +MINIDUMPWRITEDUMP +MINIDUMP_TYPE +MJEvalResult +MTTargetRelLookup +MVDependencies +MVDependency +MVNDistinct +MVNDistinctItem +Material +MaterialPath +MaterialState +MdfdVec +Memoize +MemoizeEntry +MemoizeInstrumentation +MemoizeKey +MemoizePath +MemoizeState +MemoizeTuple +MemoryContext +MemoryContextCallback +MemoryContextCallbackFunction +MemoryContextCounters +MemoryContextData +MemoryContextMethods +MemoryStatsPrintFunc +MergeAppend +MergeAppendPath +MergeAppendState +MergeJoin +MergeJoinClause +MergeJoinState +MergePath +MergeScanSelCache +MetaCommand +MinMaxAggInfo +MinMaxAggPath +MinMaxExpr +MinMaxMultiOptions +MinMaxOp +MinimalTuple +MinimalTupleData +MinimalTupleTableSlot +MinmaxMultiOpaque +MinmaxOpaque +ModifyTable +ModifyTablePath +ModifyTableState +MorphOpaque +MsgType +MultiAssignRef +MultiSortSupport +MultiSortSupportData +MultiXactId +MultiXactMember +MultiXactOffset +MultiXactStateData +MultiXactStatus +MultirangeIOData +MultirangeParseState +MultirangeType +NDBOX +NODE +NTSTATUS +NUMCacheEntry +NUMDesc +NUMProc +NV +Name +NameData +NameHashEntry +NamedArgExpr +NamedLWLockTranche +NamedLWLockTrancheRequest +NamedTuplestoreScan +NamedTuplestoreScanState +NamespaceInfo +NestLoop +NestLoopParam +NestLoopState +NestPath +NewColumnValue +NewConstraint +NextSampleBlock_function +NextSampleTuple_function +NextValueExpr +Node +NodeTag +NonEmptyRange 
+Notification +NotificationHash +NotificationList +NotifyStmt +Nsrt +NullIfExpr +NullTest +NullTestType +NullableDatum +Numeric +NumericAggState +NumericDigit +NumericSortSupport +NumericSumAccum +NumericVar +OM_uint32 +OP +OSAPerGroupState +OSAPerQueryState +OSInfo +OSSLCipher +OSSLDigest +OVERLAPPED +ObjectAccessDrop +ObjectAccessNamespaceSearch +ObjectAccessPostAlter +ObjectAccessPostCreate +ObjectAccessType +ObjectAddress +ObjectAddressAndFlags +ObjectAddressExtra +ObjectAddressStack +ObjectAddresses +ObjectClass +ObjectPropertyType +ObjectType +ObjectWithArgs +Offset +OffsetNumber +OffsetVarNodes_context +Oid +OidOptions +OkeysState +OldSnapshotControlData +OldSnapshotTimeMapping +OldToNewMapping +OldToNewMappingData +OnCommitAction +OnCommitItem +OnConflictAction +OnConflictClause +OnConflictExpr +OnConflictSetState +OpBtreeInterpretation +OpClassCacheEnt +OpExpr +OpFamilyMember +OpFamilyOpFuncGroup +OpclassInfo +Operator +OperatorElement +OpfamilyInfo +OprCacheEntry +OprCacheKey +OprInfo +OprProofCacheEntry +OprProofCacheKey +OutputContext +OutputPluginCallbacks +OutputPluginOptions +OutputPluginOutputType +OverrideSearchPath +OverrideStackEntry +OverridingKind +PACE_HEADER +PACL +PATH +PBOOL +PCtxtHandle +PFN +PFN_NTQUERYINFORMATIONFILE +PGAlignedBlock +PGAlignedXLogBlock +PGAsyncStatusType +PGCALL2 +PGChecksummablePage +PGContextVisibility +PGEvent +PGEventConnDestroy +PGEventConnReset +PGEventId +PGEventProc +PGEventRegister +PGEventResultCopy +PGEventResultCreate +PGEventResultDestroy +PGFInfoFunction +PGFileType +PGFunction +PGLZ_HistEntry +PGLZ_Strategy +PGMessageField +PGModuleMagicFunction +PGNoticeHooks +PGOutputData +PGPROC +PGP_CFB +PGP_Context +PGP_MPI +PGP_PubKey +PGP_S2K +PGPing +PGQueryClass +PGRUsage +PGSemaphore +PGSemaphoreData +PGShmemHeader +PGTargetServerType +PGTernaryBool +PGTransactionStatusType +PGVerbosity +PG_Locale_Strategy +PG_Lock_Status +PG_init_t +PGcancel +PGcmdQueueEntry +PGconn +PGdataValue +PGlobjfuncs +PGnotify 
+PGpipelineStatus +PGresAttDesc +PGresAttValue +PGresParamDesc +PGresult +PGresult_data +PHANDLE +PIO_STATUS_BLOCK +PLAINTREE +PLAssignStmt +PLUID_AND_ATTRIBUTES +PLcword +PLpgSQL_case_when +PLpgSQL_condition +PLpgSQL_datum +PLpgSQL_datum_type +PLpgSQL_diag_item +PLpgSQL_exception +PLpgSQL_exception_block +PLpgSQL_execstate +PLpgSQL_expr +PLpgSQL_func_hashkey +PLpgSQL_function +PLpgSQL_getdiag_kind +PLpgSQL_if_elsif +PLpgSQL_label_type +PLpgSQL_nsitem +PLpgSQL_nsitem_type +PLpgSQL_plugin +PLpgSQL_promise_type +PLpgSQL_raise_option +PLpgSQL_raise_option_type +PLpgSQL_rec +PLpgSQL_recfield +PLpgSQL_resolve_option +PLpgSQL_row +PLpgSQL_stmt +PLpgSQL_stmt_assert +PLpgSQL_stmt_assign +PLpgSQL_stmt_block +PLpgSQL_stmt_call +PLpgSQL_stmt_case +PLpgSQL_stmt_close +PLpgSQL_stmt_commit +PLpgSQL_stmt_dynexecute +PLpgSQL_stmt_dynfors +PLpgSQL_stmt_execsql +PLpgSQL_stmt_exit +PLpgSQL_stmt_fetch +PLpgSQL_stmt_forc +PLpgSQL_stmt_foreach_a +PLpgSQL_stmt_fori +PLpgSQL_stmt_forq +PLpgSQL_stmt_fors +PLpgSQL_stmt_getdiag +PLpgSQL_stmt_if +PLpgSQL_stmt_loop +PLpgSQL_stmt_open +PLpgSQL_stmt_perform +PLpgSQL_stmt_raise +PLpgSQL_stmt_return +PLpgSQL_stmt_return_next +PLpgSQL_stmt_return_query +PLpgSQL_stmt_rollback +PLpgSQL_stmt_type +PLpgSQL_stmt_while +PLpgSQL_trigtype +PLpgSQL_type +PLpgSQL_type_type +PLpgSQL_var +PLpgSQL_variable +PLwdatum +PLword +PLyArrayToOb +PLyCursorObject +PLyDatumToOb +PLyDatumToObFunc +PLyExceptionEntry +PLyExecutionContext +PLyObToArray +PLyObToDatum +PLyObToDatumFunc +PLyObToDomain +PLyObToScalar +PLyObToTransform +PLyObToTuple +PLyObject_AsString_t +PLyPlanObject +PLyProcedure +PLyProcedureEntry +PLyProcedureKey +PLyResultObject +PLySRFState +PLySavedArgs +PLyScalarToOb +PLySubtransactionData +PLySubtransactionObject +PLyTransformToOb +PLyTupleToOb +PLyUnicode_FromStringAndSize_t +PLy_elog_impl_t +PMINIDUMP_CALLBACK_INFORMATION +PMINIDUMP_EXCEPTION_INFORMATION +PMINIDUMP_USER_STREAM_INFORMATION +PMSignalData +PMSignalReason +PMState +POLYGON +PQArgBlock 
+PQEnvironmentOption +PQExpBuffer +PQExpBufferData +PQcommMethods +PQconninfoOption +PQnoticeProcessor +PQnoticeReceiver +PQprintOpt +PQsslKeyPassHook_OpenSSL_type +PREDICATELOCK +PREDICATELOCKTAG +PREDICATELOCKTARGET +PREDICATELOCKTARGETTAG +PROCESS_INFORMATION +PROCLOCK +PROCLOCKTAG +PROC_HDR +PROC_QUEUE +PSID +PSID_AND_ATTRIBUTES +PSQL_COMP_CASE +PSQL_ECHO +PSQL_ECHO_HIDDEN +PSQL_ERROR_ROLLBACK +PTEntryArray +PTIterationArray +PTOKEN_PRIVILEGES +PTOKEN_USER +PUTENVPROC +PVOID +PX_Alias +PX_Cipher +PX_Combo +PX_HMAC +PX_MD +Page +PageData +PageGistNSN +PageHeader +PageHeaderData +PageXLogRecPtr +PagetableEntry +Pairs +ParallelAppendState +ParallelBitmapHeapState +ParallelBlockTableScanDesc +ParallelBlockTableScanWorker +ParallelBlockTableScanWorkerData +ParallelCompletionPtr +ParallelContext +ParallelExecutorInfo +ParallelHashGrowth +ParallelHashJoinBatch +ParallelHashJoinBatchAccessor +ParallelHashJoinState +ParallelIndexScanDesc +ParallelReadyList +ParallelSlot +ParallelSlotArray +ParallelSlotResultHandler +ParallelState +ParallelTableScanDesc +ParallelTableScanDescData +ParallelWorkerContext +ParallelWorkerInfo +Param +ParamCompileHook +ParamExecData +ParamExternData +ParamFetchHook +ParamKind +ParamListInfo +ParamPathInfo +ParamRef +ParamsErrorCbData +ParentMapEntry +ParseCallbackState +ParseExprKind +ParseNamespaceColumn +ParseNamespaceItem +ParseParamRefHook +ParseState +ParsedLex +ParsedScript +ParsedText +ParsedWord +ParserSetupHook +ParserState +PartClauseInfo +PartClauseMatchStatus +PartClauseTarget +PartitionBoundInfo +PartitionBoundInfoData +PartitionBoundSpec +PartitionCmd +PartitionDesc +PartitionDescData +PartitionDirectory +PartitionDirectoryEntry +PartitionDispatch +PartitionElem +PartitionHashBound +PartitionKey +PartitionListValue +PartitionMap +PartitionPruneCombineOp +PartitionPruneContext +PartitionPruneInfo +PartitionPruneState +PartitionPruneStep +PartitionPruneStepCombine +PartitionPruneStepOp +PartitionPruningData +PartitionRangeBound 
+PartitionRangeDatum +PartitionRangeDatumKind +PartitionScheme +PartitionSpec +PartitionTupleRouting +PartitionedRelPruneInfo +PartitionedRelPruningData +PartitionwiseAggregateType +PasswordType +Path +PathClauseUsage +PathCostComparison +PathHashStack +PathKey +PathKeysComparison +PathTarget +PatternInfo +PatternInfoArray +Pattern_Prefix_Status +Pattern_Type +PendingFsyncEntry +PendingRelDelete +PendingRelSync +PendingUnlinkEntry +PendingWriteback +PerlInterpreter +Perl_check_t +Perl_ppaddr_t +Permutation +PermutationStep +PermutationStepBlocker +PermutationStepBlockerType +PgArchData +PgBackendGSSStatus +PgBackendSSLStatus +PgBackendStatus +PgBenchExpr +PgBenchExprLink +PgBenchExprList +PgBenchExprType +PgBenchFunction +PgBenchValue +PgBenchValueType +PgChecksumMode +PgFdwAnalyzeState +PgFdwConnState +PgFdwDirectModifyState +PgFdwModifyState +PgFdwOption +PgFdwPathExtraData +PgFdwRelationInfo +PgFdwScanState +PgIfAddrCallback +PgStat_ArchiverStats +PgStat_BackendFunctionEntry +PgStat_Counter +PgStat_FunctionCallUsage +PgStat_FunctionCounts +PgStat_FunctionEntry +PgStat_GlobalStats +PgStat_Msg +PgStat_MsgAnalyze +PgStat_MsgAnlAncestors +PgStat_MsgArchiver +PgStat_MsgAutovacStart +PgStat_MsgBgWriter +PgStat_MsgChecksumFailure +PgStat_MsgConnect +PgStat_MsgDeadlock +PgStat_MsgDisconnect +PgStat_MsgDropdb +PgStat_MsgDummy +PgStat_MsgFuncpurge +PgStat_MsgFuncstat +PgStat_MsgHdr +PgStat_MsgInquiry +PgStat_MsgRecoveryConflict +PgStat_MsgReplSlot +PgStat_MsgResetcounter +PgStat_MsgResetreplslotcounter +PgStat_MsgResetsharedcounter +PgStat_MsgResetsinglecounter +PgStat_MsgResetslrucounter +PgStat_MsgSLRU +PgStat_MsgTabpurge +PgStat_MsgTabstat +PgStat_MsgTempFile +PgStat_MsgVacuum +PgStat_MsgWal +PgStat_SLRUStats +PgStat_Shared_Reset_Target +PgStat_Single_Reset_Type +PgStat_StatDBEntry +PgStat_StatFuncEntry +PgStat_StatReplSlotEntry +PgStat_StatTabEntry +PgStat_SubXactStatus +PgStat_TableCounts +PgStat_TableEntry +PgStat_TableStatus +PgStat_TableXactStatus +PgStat_WalStats 
+PgXmlErrorContext +PgXmlStrictness +Pg_finfo_record +Pg_magic_struct +PipeProtoChunk +PipeProtoHeader +PlaceHolderInfo +PlaceHolderVar +Plan +PlanDirectModify_function +PlanForeignModify_function +PlanInvalItem +PlanRowMark +PlanState +PlannedStmt +PlannerGlobal +PlannerInfo +PlannerParamItem +Point +Pointer +PolicyInfo +PolyNumAggState +Pool +PopulateArrayContext +PopulateArrayState +PopulateRecordCache +PopulateRecordsetState +Port +Portal +PortalHashEnt +PortalStatus +PortalStrategy +PostParseColumnRefHook +PostgresPollingStatusType +PostingItem +PostponedQual +PreParseColumnRefHook +PredClass +PredIterInfo +PredIterInfoData +PredXactList +PredXactListElement +PredicateLockData +PredicateLockTargetType +PrefetchBufferResult +PrepParallelRestorePtrType +PrepareStmt +PreparedStatement +PresortedKeyData +PrewarmType +PrintExtraTocPtrType +PrintTocDataPtrType +PrintfArgType +PrintfArgValue +PrintfTarget +PrinttupAttrInfo +PrivTarget +PrivateRefCountEntry +ProcArrayStruct +ProcLangInfo +ProcSignalBarrierType +ProcSignalHeader +ProcSignalReason +ProcSignalSlot +ProcState +ProcWaitStatus +ProcessUtilityContext +ProcessUtility_hook_type +ProcessingMode +ProgressCommandType +ProjectSet +ProjectSetPath +ProjectSetState +ProjectionInfo +ProjectionPath +ProtocolVersion +PrsStorage +PruneState +PruneStepResult +PsqlScanCallbacks +PsqlScanQuoteType +PsqlScanResult +PsqlScanState +PsqlScanStateData +PsqlSettings +Publication +PublicationActions +PublicationInfo +PublicationPartOpt +PublicationRelInfo +PullFilter +PullFilterOps +PushFilter +PushFilterOps +PushFunction +PyCFunction +PyCodeObject +PyMappingMethods +PyMethodDef +PyModuleDef +PyObject +PySequenceMethods +PyTypeObject +Py_ssize_t +QPRS_STATE +QTN2QTState +QTNode +QUERYTYPE +QUERY_SECURITY_CONTEXT_TOKEN_FN +QualCost +QualItem +Query +QueryCompletion +QueryDesc +QueryEnvironment +QueryInfo +QueryItem +QueryItemType +QueryMode +QueryOperand +QueryOperator +QueryRepresentation +QueryRepresentationOperand +QuerySource 
+QueueBackendStatus +QueuePosition +QuitSignalReason +RBTNode +RBTOrderControl +RBTree +RBTreeIterator +REPARSE_JUNCTION_DATA_BUFFER +RIX +RI_CompareHashEntry +RI_CompareKey +RI_ConstraintInfo +RI_QueryHashEntry +RI_QueryKey +RTEKind +RWConflict +RWConflictPoolHeader +RandomState +Range +RangeBound +RangeBox +RangeFunction +RangeIOData +RangeQueryClause +RangeSubselect +RangeTableFunc +RangeTableFuncCol +RangeTableSample +RangeTblEntry +RangeTblFunction +RangeTblRef +RangeType +RangeVar +RangeVarGetRelidCallback +Ranges +RawColumnDefault +RawParseMode +RawStmt +ReInitializeDSMForeignScan_function +ReScanForeignScan_function +ReadBufPtrType +ReadBufferMode +ReadBytePtrType +ReadExtraTocPtrType +ReadFunc +ReassignOwnedStmt +RecheckForeignScan_function +RecordCacheEntry +RecordCompareData +RecordIOData +RecoveryLockListsEntry +RecoveryPauseState +RecoveryState +RecoveryTargetTimeLineGoal +RecoveryTargetType +RectBox +RecursionContext +RecursiveUnion +RecursiveUnionPath +RecursiveUnionState +RefetchForeignRow_function +RefreshMatViewStmt +RegProcedure +Regis +RegisNode +RegisteredBgWorker +ReindexErrorInfo +ReindexIndexInfo +ReindexObjectType +ReindexParams +ReindexStmt +ReindexType +RelFileNode +RelFileNodeBackend +RelIdCacheEnt +RelInfo +RelInfoArr +RelMapFile +RelMapping +RelOptInfo +RelOptKind +RelSizeEntry +RelTag +RelToCheck +RelToCluster +RelabelType +Relation +RelationData +RelationInfo +RelationPtr +RelationSyncEntry +RelcacheCallbackFunction +RelfilenodeMapEntry +RelfilenodeMapKey +Relids +RelocationBufferInfo +RelptrFreePageBtree +RelptrFreePageManager +RelptrFreePageSpanLeader +RenameStmt +ReopenPtrType +ReorderBuffer +ReorderBufferApplyChangeCB +ReorderBufferApplyTruncateCB +ReorderBufferBeginCB +ReorderBufferChange +ReorderBufferCommitCB +ReorderBufferCommitPreparedCB +ReorderBufferDiskChange +ReorderBufferIterTXNEntry +ReorderBufferIterTXNState +ReorderBufferMessageCB +ReorderBufferPrepareCB +ReorderBufferRollbackPreparedCB +ReorderBufferStreamAbortCB 
+ReorderBufferStreamChangeCB +ReorderBufferStreamCommitCB +ReorderBufferStreamMessageCB +ReorderBufferStreamPrepareCB +ReorderBufferStreamStartCB +ReorderBufferStreamStopCB +ReorderBufferStreamTruncateCB +ReorderBufferTXN +ReorderBufferTXNByIdEnt +ReorderBufferToastEnt +ReorderBufferTupleBuf +ReorderBufferTupleCidEnt +ReorderBufferTupleCidKey +ReorderTuple +RepOriginId +ReparameterizeForeignPathByChild_function +ReplaceVarsFromTargetList_context +ReplaceVarsNoMatchOption +ReplicaIdentityStmt +ReplicationKind +ReplicationSlot +ReplicationSlotCtlData +ReplicationSlotOnDisk +ReplicationSlotPersistency +ReplicationSlotPersistentData +ReplicationState +ReplicationStateCtl +ReplicationStateOnDisk +ResTarget +ReservoirState +ReservoirStateData +ResourceArray +ResourceOwner +ResourceReleaseCallback +ResourceReleaseCallbackItem +ResourceReleasePhase +RestoreOptions +RestorePass +RestrictInfo +Result +ResultRelInfo +ResultState +ReturnSetInfo +ReturnStmt +RevmapContents +RewriteMappingDataEntry +RewriteMappingFile +RewriteRule +RewriteState +RmgrData +RmgrDescData +RmgrId +RmgrIds +RoleSpec +RoleSpecType +RoleStmtType +RollupData +RowCompareExpr +RowCompareType +RowExpr +RowIdentityVarInfo +RowMarkClause +RowMarkType +RowSecurityDesc +RowSecurityPolicy +RuleInfo +RuleLock +RuleStmt +RunningTransactions +RunningTransactionsData +SC_HANDLE +SECURITY_ATTRIBUTES +SECURITY_STATUS +SEG +SERIALIZABLEXACT +SERIALIZABLEXID +SERIALIZABLEXIDTAG +SERVICE_STATUS +SERVICE_STATUS_HANDLE +SERVICE_TABLE_ENTRY +SHM_QUEUE +SID_AND_ATTRIBUTES +SID_IDENTIFIER_AUTHORITY +SID_NAME_USE +SISeg +SIZE_T +SMgrRelation +SMgrRelationData +SMgrSortArray +SOCKADDR +SOCKET +SPELL +SPICallbackArg +SPIExecuteOptions +SPIParseOpenOptions +SPIPlanPtr +SPIPrepareOptions +SPITupleTable +SPLITCOST +SPNode +SPNodeData +SPPageDesc +SQLCmd +SQLDropObject +SQLFunctionCache +SQLFunctionCachePtr +SQLFunctionParseInfo +SQLFunctionParseInfoPtr +SQLValueFunction +SQLValueFunctionOp +SSL +SSLExtensionInfoContext +SSL_CTX 
+STARTUPINFO +STRLEN +SV +SYNCHRONIZATION_BARRIER +SampleScan +SampleScanGetSampleSize_function +SampleScanState +SamplerRandomState +ScalarArrayOpExpr +ScalarArrayOpExprHashEntry +ScalarArrayOpExprHashTable +ScalarIOData +ScalarItem +ScalarMCVItem +Scan +ScanDirection +ScanKey +ScanKeyData +ScanKeywordHashFunc +ScanKeywordList +ScanState +ScanTypeControl +ScannerCallbackState +SchemaQuery +SecBuffer +SecBufferDesc +SecLabelItem +SecLabelStmt +SeenRelsEntry +SelectLimit +SelectStmt +Selectivity +SemTPadded +SemiAntiJoinFactors +SeqScan +SeqScanState +SeqTable +SeqTableData +SerCommitSeqNo +SerialControl +SerializableXactHandle +SerializedActiveRelMaps +SerializedRanges +SerializedReindexState +SerializedSnapshotData +SerializedTransactionState +Session +SessionBackupState +SessionEndType +SetConstraintState +SetConstraintStateData +SetConstraintTriggerData +SetExprState +SetFunctionReturnMode +SetOp +SetOpCmd +SetOpPath +SetOpState +SetOpStatePerGroup +SetOpStrategy +SetOperation +SetOperationStmt +SetQuantifier +SetToDefault +SetupWorkerPtrType +ShDependObjectInfo +SharedAggInfo +SharedBitmapState +SharedDependencyObjectType +SharedDependencyType +SharedExecutorInstrumentation +SharedFileSet +SharedHashInfo +SharedIncrementalSortInfo +SharedInvalCatalogMsg +SharedInvalCatcacheMsg +SharedInvalRelcacheMsg +SharedInvalRelmapMsg +SharedInvalSmgrMsg +SharedInvalSnapshotMsg +SharedInvalidationMessage +SharedJitInstrumentation +SharedMemoizeInfo +SharedRecordTableEntry +SharedRecordTableKey +SharedRecordTypmodRegistry +SharedSortInfo +SharedTuplestore +SharedTuplestoreAccessor +SharedTuplestoreChunk +SharedTuplestoreParticipant +SharedTypmodTableEntry +Sharedsort +ShellTypeInfo +ShippableCacheEntry +ShippableCacheKey +ShmemIndexEnt +ShutdownForeignScan_function +ShutdownInformation +ShutdownMode +SignTSVector +SimpleActionList +SimpleActionListCell +SimpleEcontextStackEntry +SimpleOidList +SimpleOidListCell +SimplePtrList +SimplePtrListCell +SimpleStats +SimpleStringList 
+SimpleStringListCell +SingleBoundSortItem +Size +SkipPages +SlabBlock +SlabChunk +SlabContext +SlabSlot +SlotErrCallbackArg +SlotNumber +SlruCtl +SlruCtlData +SlruErrorCause +SlruPageStatus +SlruScanCallback +SlruShared +SlruSharedData +SlruWriteAll +SlruWriteAllData +SnapBuild +SnapBuildOnDisk +SnapBuildState +Snapshot +SnapshotData +SnapshotType +SockAddr +Sort +SortBy +SortByDir +SortByNulls +SortCoordinate +SortGroupClause +SortItem +SortPath +SortShimExtra +SortState +SortSupport +SortSupportData +SortTuple +SortTupleComparator +SortedPoint +SpGistBuildState +SpGistCache +SpGistDeadTuple +SpGistDeadTupleData +SpGistInnerTuple +SpGistInnerTupleData +SpGistLUPCache +SpGistLastUsedPage +SpGistLeafTuple +SpGistLeafTupleData +SpGistMetaPageData +SpGistNodeTuple +SpGistNodeTupleData +SpGistOptions +SpGistPageOpaque +SpGistPageOpaqueData +SpGistScanOpaque +SpGistScanOpaqueData +SpGistSearchItem +SpGistState +SpGistTypeDesc +SpecialJoinInfo +SpinDelayStatus +SplitInterval +SplitLR +SplitPoint +SplitTextOutputData +SplitVar +SplitedPageLayout +StackElem +StartBlobPtrType +StartBlobsPtrType +StartDataPtrType +StartReplicationCmd +StartupStatusEnum +StatEntry +StatExtEntry +StatMsgType +StateFileChunk +StatisticExtInfo +Stats +StatsBuildData +StatsData +StatsElem +StatsExtInfo +StdAnalyzeData +StdRdOptIndexCleanup +StdRdOptions +Step +StopList +StrategyNumber +StreamCtl +StreamXidHash +StringInfo +StringInfoData +StripnullState +SubLink +SubLinkType +SubPlan +SubPlanState +SubRemoveRels +SubTransactionId +SubXactCallback +SubXactCallbackItem +SubXactEvent +SubXactInfo +SubqueryScan +SubqueryScanPath +SubqueryScanState +SubscriptExecSetup +SubscriptExecSteps +SubscriptRoutines +SubscriptTransform +SubscriptingRef +SubscriptingRefState +Subscription +SubscriptionInfo +SubscriptionRelState +SupportRequestCost +SupportRequestIndexCondition +SupportRequestRows +SupportRequestSelectivity +SupportRequestSimplify +Syn +SyncOps +SyncRepConfigData +SyncRepStandbyData 
+SyncRequestHandler +SyncRequestType +SysFKRelationship +SysScanDesc +SyscacheCallbackFunction +SystemRowsSamplerData +SystemSamplerData +SystemTimeSamplerData +TAR_MEMBER +TBMIterateResult +TBMIteratingState +TBMIterator +TBMSharedIterator +TBMSharedIteratorState +TBMStatus +TBlockState +TIDBitmap +TM_FailureData +TM_IndexDelete +TM_IndexDeleteOp +TM_IndexStatus +TM_Result +TOKEN_DEFAULT_DACL +TOKEN_INFORMATION_CLASS +TOKEN_PRIVILEGES +TOKEN_USER +TParser +TParserCharTest +TParserPosition +TParserSpecial +TParserState +TParserStateAction +TParserStateActionItem +TQueueDestReceiver +TRGM +TSAnyCacheEntry +TSConfigCacheEntry +TSConfigInfo +TSDictInfo +TSDictionaryCacheEntry +TSExecuteCallback +TSLexeme +TSParserCacheEntry +TSParserInfo +TSQuery +TSQueryData +TSQueryParserState +TSQuerySign +TSReadPointer +TSTemplateInfo +TSTernaryValue +TSTokenTypeStorage +TSVector +TSVectorBuildState +TSVectorData +TSVectorParseState +TSVectorStat +TState +TStoreState +TXNEntryFile +TYPCATEGORY +T_Action +T_WorkerStatus +TabStatHashEntry +TabStatusArray +TableAmRoutine +TableAttachInfo +TableDataInfo +TableFunc +TableFuncRoutine +TableFuncScan +TableFuncScanState +TableInfo +TableLikeClause +TableSampleClause +TableScanDesc +TableScanDescData +TableSpaceCacheEntry +TableSpaceOpts +TablespaceList +TablespaceListCell +TapeBlockTrailer +TapeShare +TarMethodData +TarMethodFile +TargetEntry +TclExceptionNameMap +Tcl_DString +Tcl_FileProc +Tcl_HashEntry +Tcl_HashTable +Tcl_Interp +Tcl_NotifierProcs +Tcl_Obj +Tcl_Time +TempNamespaceStatus +TestDecodingData +TestDecodingTxnData +TestSpec +TextFreq +TextPositionState +TheLexeme +TheSubstitute +TidExpr +TidExprType +TidHashKey +TidOpExpr +TidPath +TidRangePath +TidRangeScan +TidRangeScanState +TidScan +TidScanState +TimeADT +TimeLineHistoryCmd +TimeLineHistoryEntry +TimeLineID +TimeOffset +TimeStamp +TimeTzADT +TimeZoneAbbrevTable +TimeoutId +TimeoutType +Timestamp +TimestampTz +TmFromChar +TmToChar +ToastAttrInfo +ToastCompressionId 
+ToastTupleContext +ToastedAttribute +TocEntry +TokenAuxData +TokenizedLine +TrackItem +TransInvalidationInfo +TransState +TransactionId +TransactionState +TransactionStateData +TransactionStmt +TransactionStmtKind +TransformInfo +TransformJsonStringValuesState +TransitionCaptureState +TrgmArc +TrgmArcInfo +TrgmBound +TrgmColor +TrgmColorInfo +TrgmGistOptions +TrgmNFA +TrgmPackArcInfo +TrgmPackedArc +TrgmPackedGraph +TrgmPackedState +TrgmPrefix +TrgmState +TrgmStateKey +TrieChar +Trigger +TriggerData +TriggerDesc +TriggerEvent +TriggerFlags +TriggerInfo +TriggerTransition +TruncateStmt +TsmRoutine +TupOutputState +TupSortStatus +TupStoreStatus +TupleConstr +TupleConversionMap +TupleDesc +TupleHashEntry +TupleHashEntryData +TupleHashIterator +TupleHashTable +TupleQueueReader +TupleTableSlot +TupleTableSlotOps +TuplesortInstrumentation +TuplesortMethod +TuplesortSpaceType +Tuplesortstate +Tuplestorestate +TwoPhaseCallback +TwoPhaseFileHeader +TwoPhaseLockRecord +TwoPhasePgStatRecord +TwoPhasePredicateLockRecord +TwoPhasePredicateRecord +TwoPhasePredicateRecordType +TwoPhasePredicateXactRecord +TwoPhaseRecordOnDisk +TwoPhaseRmgrId +TwoPhaseStateData +Type +TypeCacheEntry +TypeCacheEnumData +TypeCast +TypeCat +TypeFuncClass +TypeInfo +TypeName +U +U32 +U8 +UChar +UCharIterator +UColAttribute +UColAttributeValue +UCollator +UConverter +UErrorCode +UINT +ULARGE_INTEGER +ULONG +ULONG_PTR +UV +UVersionInfo +UnicodeNormalizationForm +UnicodeNormalizationQC +Unique +UniquePath +UniquePathMethod +UniqueState +UnlistenStmt +UnpackTarState +UnresolvedTup +UnresolvedTupData +UpdateStmt +UpperRelationKind +UpperUniquePath +UserAuth +UserMapping +UserOpts +VacAttrStats +VacAttrStatsP +VacErrPhase +VacOptValue +VacuumParams +VacuumRelation +VacuumStmt +ValidateIndexState +Value +ValuesScan +ValuesScanState +Var +VarBit +VarChar +VarParamState +VarString +VarStringSortSupport +Variable +VariableAssignHook +VariableCache +VariableCacheData +VariableSetKind +VariableSetStmt 
+VariableShowStmt +VariableSpace +VariableStatData +VariableSubstituteHook +VersionedQuery +Vfd +ViewCheckOption +ViewOptCheckOption +ViewOptions +ViewStmt +VirtualTransactionId +VirtualTupleTableSlot +VolatileFunctionStatus +Vsrt +WAIT_ORDER +WALAvailability +WALInsertLock +WALInsertLockPadded +WALOpenSegment +WALReadError +WALSegmentCloseCB +WALSegmentContext +WALSegmentOpenCB +WCHAR +WCOKind +WFW_WaitOption +WIDGET +WORD +WORKSTATE +WSABUF +WSADATA +WSANETWORKEVENTS +WSAPROTOCOL_INFO +WaitEvent +WaitEventActivity +WaitEventClient +WaitEventIO +WaitEventIPC +WaitEventSet +WaitEventTimeout +WaitPMResult +WalCloseMethod +WalLevel +Safekeeper +WalMessage +WalRcvData +WalRcvExecResult +WalRcvExecStatus +WalRcvState +WalRcvStreamOptions +WalReceiverConn +WalReceiverFunctionsType +WalSnd +WalSndCtlData +WalSndSendDataCallback +WalSndState +WalTimeSample +WalUsage +WalWriteMethod +Walfile +WindowAgg +WindowAggPath +WindowAggState +WindowClause +WindowClauseSortData +WindowDef +WindowFunc +WindowFuncExprState +WindowFuncLists +WindowObject +WindowObjectData +WindowStatePerAgg +WindowStatePerAggData +WindowStatePerFunc +WithCheckOption +WithClause +WordEntry +WordEntryIN +WordEntryPos +WordEntryPosVector +WordEntryPosVector1 +WorkTableScan +WorkTableScanState +WorkerInfo +WorkerInfoData +WorkerInstrumentation +WorkerJobDumpPtrType +WorkerJobRestorePtrType +Working_State +WriteBufPtrType +WriteBytePtrType +WriteDataCallback +WriteDataPtrType +WriteExtraTocPtrType +WriteFunc +WriteManifestState +WriteTarState +WritebackContext +X509 +X509_EXTENSION +X509_NAME +X509_NAME_ENTRY +X509_STORE +X509_STORE_CTX +XLTW_Oper +XLogCtlData +XLogCtlInsert +XLogDumpConfig +XLogDumpPrivate +XLogDumpStats +XLogLongPageHeader +XLogLongPageHeaderData +XLogPageHeader +XLogPageHeaderData +XLogPageReadCB +XLogPageReadPrivate +XLogReaderRoutine +XLogReaderState +XLogRecData +XLogRecPtr +XLogRecord +XLogRecordBlockCompressHeader +XLogRecordBlockHeader +XLogRecordBlockImageHeader +XLogRecordBuffer 
+XLogRedoAction +XLogSegNo +XLogSource +XLogwrtResult +XLogwrtRqst +XPVIV +XPVMG +XactCallback +XactCallbackItem +XactEvent +XactLockTableWaitInfo +XidBoundsViolation +XidCacheStatus +XidCommitStatus +XidStatus +XmlExpr +XmlExprOp +XmlOptionType +XmlSerialize +XmlTableBuilderData +YYLTYPE +YYSTYPE +YY_BUFFER_STATE +ZenithErrorResponse +ZenithExistsRequest +ZenithExistsResponse +ZenithGetPageRequest +ZenithGetPageResponse +ZenithMessage +ZenithMessageTag +ZenithNblocksRequest +ZenithNblocksResponse +ZenithRequest +ZenithResponse +_SPI_connection +_SPI_plan +__AssignProcessToJobObject +__CreateJobObject +__CreateRestrictedToken +__IsProcessInJob +__QueryInformationJobObject +__SetInformationJobObject +__time64_t +_dev_t +_ino_t +_resultmap +_stringlist +acquireLocksOnSubLinks_context +adjust_appendrel_attrs_context +aff_regex_struct +allocfunc +amadjustmembers_function +ambeginscan_function +ambuild_function +ambuildempty_function +ambuildphasename_function +ambulkdelete_function +amcanreturn_function +amcostestimate_function +amendscan_function +amestimateparallelscan_function +amgetbitmap_function +amgettuple_function +aminitparallelscan_function +aminsert_function +ammarkpos_function +amoptions_function +amparallelrescan_function +amproperty_function +amrescan_function +amrestrpos_function +amvacuumcleanup_function +amvalidate_function +array_iter +array_unnest_fctx +assign_collations_context +autovac_table +av_relation +avl_dbase +avl_node +avl_tree +avw_dbase +backslashResult +backup_manifest_info +backup_manifest_option +base_yy_extra_type +basebackup_options +bgworker_main_type +binaryheap +binaryheap_comparator +bitmapword +bits16 +bits32 +bits8 +bloom_filter +brin_column_state +brin_serialize_callback_type +bytea +cached_re_str +cashKEY +cfp +check_agg_arguments_context +check_function_callback +check_network_data +check_object_relabel_type +check_password_hook_type +check_ungrouped_columns_context +chr +clock_t +cmpEntriesArg +cmpfunc +codes_t +coercion 
+collation_cache_entry +color +colormaprange +compare_context +config_var_value +contain_aggs_of_level_context +convert_testexpr_context +copy_data_source_cb +core_YYSTYPE +core_yy_extra_type +core_yyscan_t +corrupt_items +cost_qual_eval_context +cp_hash_func +create_upper_paths_hook_type +createdb_failure_params +crosstab_HashEnt +crosstab_cat_desc +datapagemap_iterator_t +datapagemap_t +dateKEY +datetkn +dce_uuid_t +decimal +deparse_columns +deparse_context +deparse_expr_cxt +deparse_namespace +destructor +dev_t +digit +disassembledLeaf +dlist_head +dlist_iter +dlist_mutable_iter +dlist_node +ds_state +dsa_area +dsa_area_control +dsa_area_pool +dsa_area_span +dsa_handle +dsa_pointer +dsa_pointer_atomic +dsa_segment_header +dsa_segment_index +dsa_segment_map +dshash_compare_function +dshash_hash +dshash_hash_function +dshash_parameters +dshash_partition +dshash_table +dshash_table_control +dshash_table_handle +dshash_table_item +dsm_control_header +dsm_control_item +dsm_handle +dsm_op +dsm_segment +dsm_segment_detach_callback +eLogType +ean13 +eary +ec_matches_callback_type +ec_member_foreign_arg +ec_member_matches_arg +emit_log_hook_type +eval_const_expressions_context +exec_thread_arg +execution_state +explain_get_index_name_hook_type +f_smgr +fd_set +fe_scram_state +fe_scram_state_enum +fetch_range_request +file_action_t +file_entry_t +file_type_t +filehash_hash +filehash_iterator +filemap_t +fill_string_relopt +finalize_primnode_context +find_dependent_phvs_context +find_expr_references_context +fix_join_expr_context +fix_scan_expr_context +fix_upper_expr_context +flatten_join_alias_vars_context +float4 +float4KEY +float8 +float8KEY +floating_decimal_32 +floating_decimal_64 +fmAggrefPtr +fmExprContextCallbackFunction +fmNodePtr +fmStringInfo +fmgr_hook_type +foreign_glob_cxt +foreign_loc_cxt +freeaddrinfo_ptr_t +freefunc +fsec_t +gbt_vsrt_arg +gbtree_ninfo +gbtree_vinfo +generate_series_fctx +generate_series_numeric_fctx +generate_series_timestamp_fctx 
+generate_series_timestamptz_fctx +generate_subscripts_fctx +get_attavgwidth_hook_type +get_index_stats_hook_type +get_relation_info_hook_type +get_relation_stats_hook_type +getaddrinfo_ptr_t +getnameinfo_ptr_t +gid_t +gin_leafpage_items_state +ginxlogCreatePostingTree +ginxlogDeleteListPages +ginxlogDeletePage +ginxlogInsert +ginxlogInsertDataInternal +ginxlogInsertEntry +ginxlogInsertListPage +ginxlogRecompressDataLeaf +ginxlogSplit +ginxlogUpdateMeta +ginxlogVacuumDataLeafPage +gistxlogDelete +gistxlogPage +gistxlogPageDelete +gistxlogPageReuse +gistxlogPageSplit +gistxlogPageUpdate +grouping_sets_data +gseg_picksplit_item +gss_buffer_desc +gss_cred_id_t +gss_ctx_id_t +gss_name_t +gtrgm_consistent_cache +gzFile +hashfunc +hbaPort +heap_page_items_state +help_handler +hlCheck +hstoreCheckKeyLen_t +hstoreCheckValLen_t +hstorePairs_t +hstoreUniquePairs_t +hstoreUpgrade_t +hyperLogLogState +ifState +ilist +import_error_callback_arg +indexed_tlist +inet +inetKEY +inet_struct +init_function +inline_cte_walker_context +inline_error_callback_arg +ino_t +inquiry +instr_time +int128 +int16 +int16KEY +int2vector +int32 +int32KEY +int32_t +int64 +int64KEY +int8 +internalPQconninfoOption +intptr_t +intset_internal_node +intset_leaf_node +intset_node +intvKEY +itemIdCompact +itemIdCompactData +iterator +jmp_buf +join_search_hook_type +json_aelem_action +json_manifest_error_callback +json_manifest_perfile_callback +json_manifest_perwalrange_callback +json_ofield_action +json_scalar_action +json_struct_action +keyEntryData +key_t +lclContext +lclTocEntry +leafSegmentInfo +leaf_item +libpq_source +line_t +lineno_t +list_sort_comparator +local_relopt +local_relopts +local_source +locale_t +locate_agg_of_level_context +locate_var_of_level_context +locate_windowfunc_context +logstreamer_param +lquery +lquery_level +lquery_variant +ltree +ltree_gist +ltree_level +ltxtquery +mXactCacheEnt +mac8KEY +macKEY +macaddr +macaddr8 +macaddr_sortsupport_state +manifest_file 
+manifest_files_hash +manifest_files_iterator +manifest_wal_range +map_variable_attnos_context +max_parallel_hazard_context +mb2wchar_with_len_converter +mbchar_verifier +mbcharacter_incrementer +mbdisplaylen_converter +mblen_converter +mbstr_verifier +memoize_hash +memoize_iterator +metastring +mix_data_t +mixedStruct +mode_t +movedb_failure_params +mp_digit +mp_int +mp_result +mp_sign +mp_size +mp_small +mp_usmall +mp_word +mpz_t +multirange_bsearch_comparison +mxact +mxtruncinfo +needs_fmgr_hook_type +network_sortsupport_state +nodeitem +normal_rand_fctx +ntile_context +numeric +object_access_hook_type +off_t +oidKEY +oidvector +on_dsm_detach_callback +on_exit_nicely_callback +openssl_tls_init_hook_typ +ossl_EVP_cipher_func +other +output_type +pagetable_hash +pagetable_iterator +pairingheap +pairingheap_comparator +pairingheap_node +parallel_worker_main_type +parse_error_callback_arg +parser_context +partition_method_t +pendingPosition +pgParameterStatus +pg_atomic_flag +pg_atomic_uint32 +pg_atomic_uint64 +pg_checksum_context +pg_checksum_raw_context +pg_checksum_type +pg_conn_host +pg_conn_host_type +pg_conv_map +pg_crc32 +pg_crc32c +pg_cryptohash_ctx +pg_cryptohash_type +pg_ctype_cache +pg_enc +pg_enc2gettext +pg_enc2name +pg_encname +pg_funcptr_t +pg_gssinfo +pg_hmac_ctx +pg_int64 +pg_local_to_utf_combined +pg_locale_t +pg_mb_radix_tree +pg_md5_ctx +pg_on_exit_callback +pg_re_flags +pg_saslprep_rc +pg_sha1_ctx +pg_sha224_ctx +pg_sha256_ctx +pg_sha384_ctx +pg_sha512_ctx +pg_snapshot +pg_stack_base_t +pg_time_t +pg_time_usec_t +pg_tz +pg_tz_cache +pg_tzenum +pg_unicode_decompinfo +pg_unicode_decomposition +pg_unicode_norminfo +pg_unicode_normprops +pg_unicode_recompinfo +pg_utf_to_local_combined +pg_uuid_t +pg_wc_probefunc +pg_wchar +pg_wchar_tbl +pgp_armor_headers_state +pgpid_t +pgsocket +pgsql_thing_t +pgssEntry +pgssGlobalStats +pgssHashKey +pgssSharedState +pgssStoreKind +pgssVersion +pgstat_page +pgstattuple_type +pgthreadlock_t +pid_t +pivot_field 
+planner_hook_type +plperl_array_info +plperl_call_data +plperl_interp_desc +plperl_proc_desc +plperl_proc_key +plperl_proc_ptr +plperl_query_desc +plperl_query_entry +plpgsql_CastHashEntry +plpgsql_CastHashKey +plpgsql_HashEnt +pltcl_call_state +pltcl_interp_desc +pltcl_proc_desc +pltcl_proc_key +pltcl_proc_ptr +pltcl_query_desc +pointer +polymorphic_actuals +pos_trgm +post_parse_analyze_hook_type +postprocess_result_function +pqbool +pqsigfunc +printQueryOpt +printTableContent +printTableFooter +printTableOpt +printTextFormat +printTextLineFormat +printTextLineWrap +printTextRule +printfunc +priv_map +process_file_callback_t +process_sublinks_context +proclist_head +proclist_mutable_iter +proclist_node +promptStatus_t +pthread_barrier_t +pthread_cond_t +pthread_key_t +pthread_mutex_t +pthread_once_t +pthread_t +ptrdiff_t +pull_var_clause_context +pull_varattnos_context +pull_varnos_context +pull_vars_context +pullup_replace_vars_context +pushdown_safety_info +qc_hash_func +qsort_arg_comparator +qsort_comparator +query_pathkeys_callback +radius_attribute +radius_packet +rangeTableEntry_used_context +rank_context +rbt_allocfunc +rbt_combiner +rbt_comparator +rbt_freefunc +reduce_outer_joins_state +reference +regex_arc_t +regex_t +regexp +regexp_matches_ctx +registered_buffer +regmatch_t +regoff_t +regproc +relopt_bool +relopt_enum +relopt_enum_elt_def +relopt_gen +relopt_int +relopt_kind +relopt_parse_elt +relopt_real +relopt_string +relopt_type +relopt_value +relopts_validator +remoteConn +remoteConnHashEnt +remoteDep +rendezvousHashEntry +replace_rte_variables_callback +replace_rte_variables_context +ret_type +rewind_source +rewrite_event +rijndael_ctx +rm_detail_t +role_auth_extra +row_security_policy_hook_type +rsv_callback +saophash_hash +save_buffer +scram_state +scram_state_enum +sem_t +sequence_magic +set_join_pathlist_hook_type +set_rel_pathlist_hook_type +shm_mq +shm_mq_handle +shm_mq_iovec +shm_mq_result +shm_toc +shm_toc_entry +shm_toc_estimator 
+shmem_startup_hook_type +sig_atomic_t +sigjmp_buf +signedbitmapword +sigset_t +size_t +slist_head +slist_iter +slist_mutable_iter +slist_node +slock_t +socket_set +spgBulkDeleteState +spgChooseIn +spgChooseOut +spgChooseResultType +spgConfigIn +spgConfigOut +spgInnerConsistentIn +spgInnerConsistentOut +spgLeafConsistentIn +spgLeafConsistentOut +spgNodePtr +spgPickSplitIn +spgPickSplitOut +spgVacPendingItem +spgxlogAddLeaf +spgxlogAddNode +spgxlogMoveLeafs +spgxlogPickSplit +spgxlogSplitTuple +spgxlogState +spgxlogVacuumLeaf +spgxlogVacuumRedirect +spgxlogVacuumRoot +split_pathtarget_context +split_pathtarget_item +sql_error_callback_arg +sqlparseInfo +sqlparseState +ss_lru_item_t +ss_scan_location_t +ss_scan_locations_t +ssize_t +standard_qp_extra +stemmer_module +stmtCacheEntry +storeInfo +storeRes_func +stream_stop_callback +string +substitute_actual_parameters_context +substitute_actual_srf_parameters_context +substitute_phv_relids_context +svtype +symbol +tablespaceinfo +teSection +temp_tablespaces_extra +test_re_flags +test_regex_ctx +test_shm_mq_header +test_spec +test_start_function +text +timeKEY +time_t +timeout_handler_proc +timeout_params +timerCA +tlist_vinfo +toast_compress_header +transferMode +transfer_thread_arg +trgm +trgm_mb_char +trivalue +tsKEY +ts_parserstate +ts_tokenizer +ts_tokentype +tsearch_readline_state +tuplehash_hash +tuplehash_iterator +type +tzEntry +u1byte +u4byte +u_char +u_int +uchr +uid_t +uint128 +uint16 +uint16_t +uint32 +uint32_t +uint64 +uint64_t +uint8 +uint8_t +uintptr_t +unicodeStyleBorderFormat +unicodeStyleColumnFormat +unicodeStyleFormat +unicodeStyleRowFormat +unicode_linestyle +unit_conversion +unlogged_relation_entry +utf_local_conversion_func +uuidKEY +uuid_rc_t +uuid_sortsupport_state +uuid_t +va_list +vacuumingOptions +validate_string_relopt +varatt_expanded +varattrib_1b +varattrib_1b_e +varattrib_4b +vbits +verifier_context +walrcv_check_conninfo_fn +walrcv_connect_fn +walrcv_create_slot_fn 
+walrcv_disconnect_fn +walrcv_endstreaming_fn +walrcv_exec_fn +walrcv_get_backend_pid_fn +walrcv_get_conninfo_fn +walrcv_get_senderinfo_fn +walrcv_identify_system_fn +walrcv_readtimelinehistoryfile_fn +walrcv_receive_fn +walrcv_send_fn +walrcv_server_version_fn +walrcv_startstreaming_fn +wchar2mb_with_len_converter +wchar_t +win32_deadchild_waitinfo +wint_t +worker_state +worktable +wrap +xl_brin_createidx +xl_brin_desummarize +xl_brin_insert +xl_brin_revmap_extend +xl_brin_samepage_update +xl_brin_update +xl_btree_dedup +xl_btree_delete +xl_btree_insert +xl_btree_mark_page_halfdead +xl_btree_metadata +xl_btree_newroot +xl_btree_reuse_page +xl_btree_split +xl_btree_unlink_page +xl_btree_update +xl_btree_vacuum +xl_clog_truncate +xl_commit_ts_truncate +xl_dbase_create_rec +xl_dbase_drop_rec +xl_end_of_recovery +xl_hash_add_ovfl_page +xl_hash_delete +xl_hash_init_bitmap_page +xl_hash_init_meta_page +xl_hash_insert +xl_hash_move_page_contents +xl_hash_split_allocate_page +xl_hash_split_complete +xl_hash_squeeze_page +xl_hash_update_meta_page +xl_hash_vacuum_one_page +xl_heap_confirm +xl_heap_delete +xl_heap_freeze_page +xl_heap_freeze_tuple +xl_heap_header +xl_heap_inplace +xl_heap_insert +xl_heap_lock +xl_heap_lock_updated +xl_heap_multi_insert +xl_heap_new_cid +xl_heap_prune +xl_heap_rewrite_mapping +xl_heap_truncate +xl_heap_update +xl_heap_vacuum +xl_heap_visible +xl_invalid_page +xl_invalid_page_key +xl_invalidations +xl_logical_message +xl_multi_insert_tuple +xl_multixact_create +xl_multixact_truncate +xl_overwrite_contrecord +xl_parameter_change +xl_relmap_update +xl_replorigin_drop +xl_replorigin_set +xl_restore_point +xl_running_xacts +xl_seq_rec +xl_smgr_create +xl_smgr_truncate +xl_standby_lock +xl_standby_locks +xl_tblspc_create_rec +xl_tblspc_drop_rec +xl_xact_abort +xl_xact_assignment +xl_xact_commit +xl_xact_dbinfo +xl_xact_invals +xl_xact_origin +xl_xact_parsed_abort +xl_xact_parsed_commit +xl_xact_parsed_prepare +xl_xact_prepare +xl_xact_relfilenodes 
+xl_xact_subxacts +xl_xact_twophase +xl_xact_xinfo +xmlBuffer +xmlBufferPtr +xmlChar +xmlDocPtr +xmlErrorPtr +xmlExternalEntityLoader +xmlGenericErrorFunc +xmlNodePtr +xmlNodeSetPtr +xmlParserCtxtPtr +xmlParserInputPtr +xmlStructuredErrorFunc +xmlTextWriter +xmlTextWriterPtr +xmlXPathCompExprPtr +xmlXPathContextPtr +xmlXPathObjectPtr +xmltype +xpath_workspace +xsltSecurityPrefsPtr +xsltStylesheetPtr +xsltTransformContextPtr +yy_parser +yy_size_t +yyscan_t +z_stream +z_streamp +zic_t From b8eb908a3df34f437b4f123461b14b599be4a8b4 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 15:43:53 +0300 Subject: [PATCH 22/33] Rename old project name references --- Cargo.lock | 8 +- Cargo.toml | 2 +- Dockerfile | 16 +- compute_tools/Cargo.toml | 4 +- control_plane/Cargo.toml | 2 +- control_plane/simple.conf | 2 +- control_plane/src/bin/neon_local.rs | 34 +- control_plane/src/compute.rs | 20 +- control_plane/src/local_env.rs | 54 +-- control_plane/src/postgresql_conf.rs | 2 +- control_plane/src/safekeeper.rs | 8 +- control_plane/src/storage.rs | 36 +- docs/authentication.md | 4 +- docs/multitenancy.md | 18 +- docs/pageserver-services.md | 2 +- docs/pageserver-storage.md | 10 +- docs/pageserver-tenant-migration.md | 4 +- docs/rfcs/013-term-history.md | 2 +- docs/rfcs/cluster-size-limits.md | 8 +- docs/sourcetree.md | 11 +- libs/etcd_broker/src/subscription_key.rs | 26 +- libs/postgres_ffi/Cargo.toml | 2 +- libs/postgres_ffi/wal_craft/Cargo.toml | 2 +- libs/utils/Cargo.toml | 4 +- libs/utils/benches/benchmarks.rs | 4 +- libs/utils/src/auth.rs | 14 +- libs/utils/src/http/endpoint.rs | 6 +- libs/utils/src/http/mod.rs | 2 +- libs/utils/src/{zid.rs => id.rs} | 88 ++-- libs/utils/src/lib.rs | 2 +- libs/utils/src/postgres_backend.rs | 12 +- libs/utils/src/postgres_backend_async.rs | 4 +- pageserver/Cargo.toml | 8 +- pageserver/src/bin/dump_layerfile.rs | 2 +- pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/bin/update_metadata.rs | 2 +- pageserver/src/config.rs 
| 16 +- pageserver/src/http/models.rs | 24 +- pageserver/src/http/routes.rs | 42 +- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 10 +- pageserver/src/metrics.rs | 6 +- pageserver/src/page_cache.rs | 14 +- pageserver/src/page_service.rs | 76 ++-- pageserver/src/pgdatadir_mapping.rs | 8 +- pageserver/src/repository.rs | 4 +- pageserver/src/storage_sync.rs | 72 ++-- pageserver/src/storage_sync/delete.rs | 8 +- pageserver/src/storage_sync/download.rs | 40 +- pageserver/src/storage_sync/index.rs | 42 +- pageserver/src/storage_sync/upload.rs | 12 +- pageserver/src/task_mgr.rs | 18 +- pageserver/src/tenant.rs | 100 ++--- pageserver/src/tenant/delta_layer.rs | 84 ++-- pageserver/src/tenant/ephemeral_file.rs | 36 +- pageserver/src/tenant/image_layer.rs | 84 ++-- pageserver/src/tenant/inmemory_layer.rs | 38 +- pageserver/src/tenant/layer_map.rs | 2 +- pageserver/src/tenant/metadata.rs | 20 +- pageserver/src/tenant/storage_layer.rs | 10 +- pageserver/src/tenant/timeline.rs | 16 +- pageserver/src/tenant_config.rs | 6 +- pageserver/src/tenant_mgr.rs | 36 +- pageserver/src/tenant_tasks.rs | 12 +- pageserver/src/timelines.rs | 14 +- pageserver/src/virtual_file.rs | 38 +- pageserver/src/walingest.rs | 26 +- .../src/walreceiver/connection_manager.rs | 18 +- .../src/walreceiver/walreceiver_connection.rs | 8 +- pageserver/src/walrecord.rs | 16 +- pageserver/src/walredo.rs | 80 ++-- pgxn/neon/inmem_smgr.c | 2 +- pgxn/neon/libpagestore.c | 49 ++- pgxn/neon/neon.c | 2 - pgxn/neon/pagestore_client.h | 153 ++++--- pgxn/neon/pagestore_smgr.c | 408 +++++++++--------- pgxn/neon/relsize_cache.c | 6 +- pgxn/neon/walproposer.c | 114 ++--- pgxn/neon/walproposer.h | 38 +- pgxn/neon_test_utils/neontest.c | 32 +- proxy/Cargo.toml | 2 +- pyproject.toml | 2 +- safekeeper/Cargo.toml | 6 +- safekeeper/src/bin/safekeeper.rs | 6 +- safekeeper/src/broker.rs | 10 +- safekeeper/src/control_file.rs | 18 +- safekeeper/src/control_file_upgrade.rs | 25 +- safekeeper/src/handler.rs | 30 +- 
safekeeper/src/http/models.rs | 4 +- safekeeper/src/http/routes.rs | 14 +- safekeeper/src/json_ctrl.rs | 4 +- safekeeper/src/lib.rs | 6 +- safekeeper/src/metrics.rs | 4 +- safekeeper/src/receive_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 36 +- safekeeper/src/send_wal.rs | 8 +- safekeeper/src/timeline.rs | 49 ++- safekeeper/src/wal_backup.rs | 14 +- safekeeper/src/wal_storage.rs | 8 +- scripts/generate_and_push_perf_report.sh | 8 +- scripts/perf_report_template.html | 4 +- test_runner/README.md | 2 +- test_runner/fixtures/benchmark_fixture.py | 6 +- test_runner/fixtures/neon_fixtures.py | 132 +++--- test_runner/fixtures/types.py | 14 +- test_runner/performance/README.md | 2 +- test_runner/regress/test_ancestor_branch.py | 8 +- test_runner/regress/test_auth.py | 4 +- test_runner/regress/test_branch_behind.py | 4 +- test_runner/regress/test_broken_timeline.py | 4 +- test_runner/regress/test_fullbackup.py | 4 +- test_runner/regress/test_gc_aggressive.py | 8 +- test_runner/regress/test_import.py | 12 +- test_runner/regress/test_neon_cli.py | 8 +- test_runner/regress/test_old_request_lsn.py | 4 +- test_runner/regress/test_pageserver_api.py | 24 +- test_runner/regress/test_pitr_gc.py | 4 +- test_runner/regress/test_remote_storage.py | 8 +- test_runner/regress/test_tenant_detach.py | 8 +- test_runner/regress/test_tenant_relocation.py | 22 +- test_runner/regress/test_tenant_tasks.py | 10 +- test_runner/regress/test_tenants.py | 4 +- .../test_tenants_with_remote_storage.py | 8 +- test_runner/regress/test_timeline_delete.py | 6 +- test_runner/regress/test_timeline_size.py | 8 +- test_runner/regress/test_wal_acceptor.py | 64 ++- .../regress/test_wal_acceptor_async.py | 10 +- test_runner/regress/test_wal_restore.py | 4 +- 128 files changed, 1428 insertions(+), 1495 deletions(-) rename libs/utils/src/{zid.rs => id.rs} (76%) diff --git a/Cargo.lock b/Cargo.lock index e9ebcdc5ac..d4234d2b00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2048,7 +2048,7 @@ dependencies = [ 
[[package]] name = "postgres" version = "0.19.2" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", @@ -2061,7 +2061,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "base64", "byteorder", @@ -2079,7 +2079,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.3" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", @@ -3295,7 +3295,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.6" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 1936b261f7..bc2a705558 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,4 +70,4 @@ lto = true # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
[patch.crates-io] -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } diff --git a/Dockerfile b/Dockerfile index 3e173f4d5b..eacb88d168 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ RUN set -e \ && rm -rf pg_install/v15/build \ && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . -# Build zenith binaries +# Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local @@ -60,12 +60,12 @@ RUN set -e \ openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ - && useradd -d /data zenith \ - && chown -R zenith:zenith /data + && useradd -d /data neon \ + && chown -R neon:neon /data -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin # v14 is default for now COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ @@ -73,7 +73,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. 
-RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \ +RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ -c "broker_endpoints=['http://etcd:2379']" \ @@ -82,7 +82,7 @@ RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \ -c "listen_http_addr='0.0.0.0:9898'" VOLUME ["/data"] -USER zenith +USER neon EXPOSE 6400 EXPOSE 9898 CMD ["/bin/bash"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 78b85d0e79..b13f7f191d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,12 +10,12 @@ clap = "3.0" env_logger = "0.9" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 8a79a6e566..ab9df8534c 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -8,7 +8,7 @@ clap = "3.0" comfy-table = "5.0.1" git-version = "0.3.5" tar = "0.4.38" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" 
} serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" toml = "0.5" diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 925e2f14ee..ae60657400 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -1,4 +1,4 @@ -# Minimal zenith environment with one safekeeper. This is equivalent to the built-in +# Minimal neon environment with one safekeeper. This is equivalent to the built-in # defaults that you get with no --config [pageserver] listen_pg_addr = '127.0.0.1:64000' diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e3160db53b..e16fd8764a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -27,10 +27,10 @@ use std::process::exit; use std::str::FromStr; use utils::{ auth::{Claims, Scope}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, project_git_version, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; // Default id of a safekeeper node, if not specified on the command line. @@ -72,7 +72,7 @@ struct TimelineTreeEl { /// Name, recovered from neon config mappings pub name: Option, /// Holds all direct children of this timeline referenced using `timeline_id`. 
- pub children: BTreeSet, + pub children: BTreeSet, } // Main entry point for the 'neon_local' CLI utility @@ -321,7 +321,7 @@ fn main() -> Result<()> { /// fn print_timelines_tree( timelines: Vec, - mut timeline_name_mappings: HashMap, + mut timeline_name_mappings: HashMap, ) -> Result<()> { let mut timelines_hash = timelines .iter() @@ -332,7 +332,7 @@ fn print_timelines_tree( info: t.clone(), children: BTreeSet::new(), name: timeline_name_mappings - .remove(&ZTenantTimelineId::new(t.tenant_id, t.timeline_id)), + .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)), }, ) }) @@ -374,7 +374,7 @@ fn print_timeline( nesting_level: usize, is_last: &[bool], timeline: &TimelineTreeEl, - timelines: &HashMap, + timelines: &HashMap, ) -> Result<()> { let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) { (None, None) => unreachable!("in this case no info for a timeline is found"), @@ -452,8 +452,8 @@ fn print_timeline( /// Connects to the pageserver to query this information. fn get_timeline_infos( env: &local_env::LocalEnv, - tenant_id: &ZTenantId, -) -> Result> { + tenant_id: &TenantId, +) -> Result> { Ok(PageServerNode::from_env(env) .timeline_list(tenant_id)? 
.into_iter() @@ -462,7 +462,7 @@ fn get_timeline_infos( } // Helper function to parse --tenant_id option, or get the default from config file -fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { +fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { tenant_id_from_arguments } else if let Some(default_id) = env.default_tenant_id { @@ -472,18 +472,18 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } } -fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { +fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .value_of("tenant-id") - .map(ZTenantId::from_str) + .map(TenantId::from_str) .transpose() .context("Failed to parse tenant id from the argument string") } -fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { +fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .value_of("timeline-id") - .map(ZTimelineId::from_str) + .map(TimelineId::from_str) .transpose() .context("Failed to parse timeline id from the argument string") } @@ -504,9 +504,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { let mut env = LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; env.init().context("Failed to initialize neon repository")?; - - // default_tenantid was generated by the `env.init()` call above - let initial_tenant_id = env.default_tenant_id.unwrap(); + let initial_tenant_id = env + .default_tenant_id + .expect("default_tenant_id should be generated by the `env.init()` call above"); // Initialize pageserver, create initial tenant and timeline. 
let pageserver = PageServerNode::from_env(&env); @@ -759,7 +759,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }; let branch_name = timeline_name_mappings - .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) + .get(&TenantTimelineId::new(tenant_id, node.timeline_id)) .map(|name| name.as_str()) .unwrap_or("?"); @@ -810,7 +810,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane.nodes.get(&(tenant_id, node_name.to_owned())); - let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) { + let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 57b5e1e10a..b678d620df 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -13,9 +13,9 @@ use std::time::Duration; use anyhow::{Context, Result}; use utils::{ connstring::connection_host_port, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - zid::{ZTenantId, ZTimelineId}, }; use crate::local_env::LocalEnv; @@ -28,7 +28,7 @@ use crate::storage::PageServerNode; pub struct ComputeControlPlane { base_port: u16, pageserver: Arc, - pub nodes: BTreeMap<(ZTenantId, String), Arc>, + pub nodes: BTreeMap<(TenantId, String), Arc>, env: LocalEnv, } @@ -76,9 +76,9 @@ impl ComputeControlPlane { pub fn new_node( &mut self, - tenant_id: ZTenantId, + tenant_id: TenantId, name: &str, - timeline_id: ZTimelineId, + timeline_id: TimelineId, lsn: Option, port: Option, ) -> Result> { @@ -114,9 +114,9 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub lsn: Option, // if it's a read-only node. 
None for primary - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, uses_wal_proposer: bool, } @@ -148,8 +148,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?; - let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?; + let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; + let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); // parse recovery_target_lsn, if any @@ -292,7 +292,7 @@ impl PostgresNode { // variable during compute pg startup. It is done this way because // otherwise user will be able to retrieve the value using SHOW // command or pg_settings - let password = if let AuthType::ZenithJWT = auth_type { + let password = if let AuthType::NeonJWT = auth_type { "$ZENITH_AUTH_TOKEN" } else { "" @@ -301,7 +301,7 @@ impl PostgresNode { // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN // We parse this string and build it back with token from env var, and for simplicity rebuild // uses only needed variables namely host, port, user, password. 
- format!("postgresql://no_user:{}@{}:{}", password, host, port) + format!("postgresql://no_user:{password}@{host}:{port}") }; conf.append("shared_preload_libraries", "neon"); conf.append_line(""); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index c4a61dbd7b..7afaad26dc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -14,8 +14,8 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use utils::{ auth::{encode_from_key_file, Claims, Scope}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, postgres_backend::AuthType, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use crate::safekeeper::SafekeeperNode; @@ -48,13 +48,13 @@ pub struct LocalEnv { // Path to pageserver binary. #[serde(default)] - pub zenith_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, - // Default tenant ID to use with the 'zenith' command line utility, when - // --tenantid is not explicitly specified. + // Default tenant ID to use with the 'neon_local' command line utility, when + // --tenant_id is not explicitly specified. #[serde(default)] #[serde_as(as = "Option")] - pub default_tenant_id: Option, + pub default_tenant_id: Option, // used to issue tokens during e.g pg start #[serde(default)] @@ -69,11 +69,11 @@ pub struct LocalEnv { /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. #[serde(default)] - // A `HashMap>` would be more appropriate here, + // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")] - branch_name_mappings: HashMap>, + branch_name_mappings: HashMap>, } /// Etcd broker config for cluster internal communication. 
@@ -204,20 +204,20 @@ impl LocalEnv { } pub fn pageserver_bin(&self) -> anyhow::Result { - Ok(self.zenith_distrib_dir.join("pageserver")) + Ok(self.neon_distrib_dir.join("pageserver")) } pub fn safekeeper_bin(&self) -> anyhow::Result { - Ok(self.zenith_distrib_dir.join("safekeeper")) + Ok(self.neon_distrib_dir.join("safekeeper")) } pub fn pg_data_dirs_path(&self) -> PathBuf { self.base_data_dir.join("pgdatadirs").join("tenants") } - pub fn pg_data_dir(&self, tenantid: &ZTenantId, branch_name: &str) -> PathBuf { + pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf { self.pg_data_dirs_path() - .join(tenantid.to_string()) + .join(tenant_id.to_string()) .join(branch_name) } @@ -233,8 +233,8 @@ impl LocalEnv { pub fn register_branch_mapping( &mut self, branch_name: String, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> anyhow::Result<()> { let existing_values = self .branch_name_mappings @@ -260,22 +260,22 @@ impl LocalEnv { pub fn get_branch_timeline_id( &self, branch_name: &str, - tenant_id: ZTenantId, - ) -> Option { + tenant_id: TenantId, + ) -> Option { self.branch_name_mappings .get(branch_name)? .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) - .map(ZTimelineId::from) + .map(TimelineId::from) } - pub fn timeline_name_mappings(&self) -> HashMap { + pub fn timeline_name_mappings(&self) -> HashMap { self.branch_name_mappings .iter() .flat_map(|(name, tenant_timelines)| { tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { - (ZTenantTimelineId::new(tenant_id, timeline_id), name.clone()) + (TenantTimelineId::new(tenant_id, timeline_id), name.clone()) }) }) .collect() @@ -299,14 +299,14 @@ impl LocalEnv { } } - // Find zenith binaries. - if env.zenith_distrib_dir == Path::new("") { - env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); + // Find neon binaries. 
+ if env.neon_distrib_dir == Path::new("") { + env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } // If no initial tenant ID was given, generate it. if env.default_tenant_id.is_none() { - env.default_tenant_id = Some(ZTenantId::generate()); + env.default_tenant_id = Some(TenantId::generate()); } env.base_data_dir = base_path(); @@ -320,12 +320,12 @@ impl LocalEnv { if !repopath.exists() { bail!( - "Zenith config is not found in {}. You need to run 'neon_local init' first", + "Neon config is not found in {}. You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } - // TODO: check that it looks like a zenith repository + // TODO: check that it looks like a neon repository // load and parse file let config = fs::read_to_string(repopath.join("config"))?; @@ -404,10 +404,10 @@ impl LocalEnv { ); } for binary in ["pageserver", "safekeeper"] { - if !self.zenith_distrib_dir.join(binary).exists() { + if !self.neon_distrib_dir.join(binary).exists() { bail!( - "Can't find binary '{binary}' in zenith distrib dir '{}'", - self.zenith_distrib_dir.display() + "Can't find binary '{binary}' in neon distrib dir '{}'", + self.neon_distrib_dir.display() ); } } diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index a71108da01..34dc769e78 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -2,7 +2,7 @@ /// Module for parsing postgresql.conf file. /// /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just -/// enough to extract a few settings we need in Zenith, assuming you don't do +/// enough to extract a few settings we need in Neon, assuming you don't do /// funny stuff like include-directives or funny escaping. 
use anyhow::{bail, Context, Result}; use once_cell::sync::Lazy; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 2cc1ae7853..600a9ffe05 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -17,7 +17,7 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, - zid::{NodeId, ZTenantId, ZTimelineId}, + id::{NodeId, TenantId, TimelineId}, }; use crate::local_env::{LocalEnv, SafekeeperConf}; @@ -269,7 +269,7 @@ impl SafekeeperNode { fn http_request(&self, method: Method, url: U) -> RequestBuilder { // TODO: authentication - //if self.env.auth_type == AuthType::ZenithJWT { + //if self.env.auth_type == AuthType::NeonJWT { // builder = builder.bearer_auth(&self.env.safekeeper_auth_token) //} self.http_client.request(method, url) @@ -284,8 +284,8 @@ impl SafekeeperNode { pub fn timeline_create( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, peer_ids: Vec, ) -> Result<()> { Ok(self diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 9fdab5f88c..d2cc5e096c 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -21,9 +21,9 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - zid::{ZTenantId, ZTimelineId}, }; use crate::local_env::LocalEnv; @@ -83,7 +83,7 @@ pub struct PageServerNode { impl PageServerNode { pub fn from_env(env: &LocalEnv) -> PageServerNode { - let password = if env.pageserver.auth_type == AuthType::ZenithJWT { + let password = if env.pageserver.auth_type == AuthType::NeonJWT { &env.pageserver.auth_token } else { "" @@ -109,10 +109,10 @@ impl PageServerNode { pub fn initialize( &self, - create_tenant: Option, - initial_timeline_id: Option, + create_tenant: Option, + initial_timeline_id: Option, config_overrides: &[&str], - ) -> 
anyhow::Result { + ) -> anyhow::Result { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = @@ -173,9 +173,9 @@ impl PageServerNode { fn try_init_timeline( &self, - new_tenant_id: Option, - new_timeline_id: Option, - ) -> anyhow::Result { + new_tenant_id: Option, + new_timeline_id: Option, + ) -> anyhow::Result { let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; let initial_timeline_info = self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; @@ -345,7 +345,7 @@ impl PageServerNode { fn http_request(&self, method: Method, url: U) -> RequestBuilder { let mut builder = self.http_client.request(method, url); - if self.env.pageserver.auth_type == AuthType::ZenithJWT { + if self.env.pageserver.auth_type == AuthType::NeonJWT { builder = builder.bearer_auth(&self.env.pageserver.auth_token) } builder @@ -368,9 +368,9 @@ impl PageServerNode { pub fn tenant_create( &self, - new_tenant_id: Option, + new_tenant_id: Option, settings: HashMap<&str, &str>, - ) -> anyhow::Result { + ) -> anyhow::Result { self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { new_tenant_id, @@ -422,7 +422,7 @@ impl PageServerNode { }) } - pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> { + pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> { self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url)) .json(&TenantConfigRequest { tenant_id, @@ -471,7 +471,7 @@ impl PageServerNode { Ok(()) } - pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result> { + pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { let timeline_infos: Vec = self .http_request( Method::GET, @@ -486,10 +486,10 @@ impl PageServerNode { pub fn timeline_create( &self, - tenant_id: ZTenantId, - 
new_timeline_id: Option, + tenant_id: TenantId, + new_timeline_id: Option, ancestor_start_lsn: Option, - ancestor_timeline_id: Option, + ancestor_timeline_id: Option, ) -> anyhow::Result { self.http_request( Method::POST, @@ -524,8 +524,8 @@ impl PageServerNode { /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) pub fn timeline_import( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, ) -> anyhow::Result<()> { diff --git a/docs/authentication.md b/docs/authentication.md index 7200ffc62f..9748a7ab0d 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -2,14 +2,14 @@ ### Overview -Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `zenith init`. Using following openssl commands: +Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `neon_local init`. Using following openssl commands: ```bash openssl genrsa -out private_key.pem 2048 openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem ``` -CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `ZenithJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file. 
+CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `NeonJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file. Currently there is no authentication between compute and safekeepers, because this communication layer is under heavy refactoring. After this refactoring support for authentication will be added there too. Now safekeeper supports "hardcoded" token passed via environment variable to be able to use callmemaybe command in pageserver. diff --git a/docs/multitenancy.md b/docs/multitenancy.md index c697ae93cd..35c69e69a1 100644 --- a/docs/multitenancy.md +++ b/docs/multitenancy.md @@ -2,26 +2,26 @@ ### Overview -Zenith supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via zenith CLI. During page server setup tenant can be created using ```zenith init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. This can be done using the following cli command: ```zenith tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So zenith tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. +Neon supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via neon_local CLI. During page server setup tenant can be created using ```neon_local init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. 
This can be done using the following cli command: ```neon_local tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So neon_local tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. ### Tenants in other commands -By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. +By default during `neon_local init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenant_id=` is provided. So generally tenant_id more frequently appears in internal pageserver interface. Its commands take tenant_id argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. Examples for cli: ```sh -zenith tenant list +neon_local tenant list -zenith tenant create // generates new id +neon_local tenant create // generates new id -zenith tenant create ee6016ec31116c1b7c33dfdfca38892f +neon_local tenant create ee6016ec31116c1b7c33dfdfca38892f -zenith pg create main // default tenant from zenith init +neon_local pg create main // default tenant from neon init -zenith pg create main --tenantid=ee6016ec31116c1b7c33dfdfca38892f +neon_local pg create main --tenant_id=ee6016ec31116c1b7c33dfdfca38892f -zenith branch --tenantid=ee6016ec31116c1b7c33dfdfca38892f +neon_local branch --tenant_id=ee6016ec31116c1b7c33dfdfca38892f ``` ### Data layout @@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. 
Tenant id ### Safety -For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline). +For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenant_id, timeline_id) pair so there can only be one writer for particular (tenant_id, timeline_id). diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index 07a91f543d..fc259c8a5f 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -109,7 +109,7 @@ Repository The repository stores all the page versions, or WAL records needed to reconstruct them. Each tenant has a separate Repository, which is -stored in the .neon/tenants/ directory. +stored in the .neon/tenants/ directory. Repository is an abstract trait, defined in `repository.rs`. It is implemented by the LayeredRepository object in diff --git a/docs/pageserver-storage.md b/docs/pageserver-storage.md index 8d03e68ac7..77e7ff35bc 100644 --- a/docs/pageserver-storage.md +++ b/docs/pageserver-storage.md @@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and a range of LSNs (or a single LSN, in case of image layers). You can think of it as a rectangle in the two-dimensional key-LSN space. The layer files for each timeline are stored in the timeline's subdirectory under -`.neon/tenants//timelines`. +`.neon/tenants//timelines`. There are two kind of layer files: images, and delta layers. An image file contains a snapshot of all keys at a particular LSN, whereas a delta file @@ -351,7 +351,7 @@ branch. Note: It doesn't make any difference if the child branch is created when the end of the main branch was at LSN 250, or later when the tip of the main branch had already moved on. The latter case, creating a -branch at a historic LSN, is how we support PITR in Zenith. 
+branch at a historic LSN, is how we support PITR in Neon. # Garbage collection @@ -396,9 +396,9 @@ table: main/orders_200_300 DELETE main/orders_300 STILL NEEDED BY orders_300_400 main/orders_300_400 KEEP, NEWER THAN GC HORIZON - main/orders_400 .. - main/orders_400_500 .. - main/orders_500 .. + main/orders_400 .. + main/orders_400_500 .. + main/orders_500 .. main/customers_100 DELETE main/customers_100_200 DELETE main/customers_200 KEEP, NO NEWER VERSION diff --git a/docs/pageserver-tenant-migration.md b/docs/pageserver-tenant-migration.md index a846213ab2..5fb2097030 100644 --- a/docs/pageserver-tenant-migration.md +++ b/docs/pageserver-tenant-migration.md @@ -9,7 +9,7 @@ This feature allows to migrate a timeline from one pageserver to another by util Pageserver implements two new http handlers: timeline attach and timeline detach. Timeline migration is performed in a following way: 1. Timeline attach is called on a target pageserver. This asks pageserver to download latest checkpoint uploaded to s3. -2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049)) +2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/neondatabase/neon/issues/997)/[#1049](https://github.com/neondatabase/neon/issues/1049)) 3. Replication state can be tracked via timeline detail pageserver call. 4. Compute node should be restarted with new pageserver connection string. 
Issue with multiple compute nodes for one timeline is handled on the safekeeper consensus level. So this is not a problem here.Currently responsibility for rescheduling the compute with updated config lies on external coordinator (console). 5. Timeline is detached from old pageserver. On disk data is removed. @@ -18,5 +18,5 @@ Timeline migration is performed in a following way: ### Implementation details Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code: -* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). +* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/neondatabase/neon/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). * We need to track which pageserver is the primary. This is needed to avoid reconnections to non primary pageservers. Because we shouldn't reconnect to them when they decide to stop their walreceiver. I e this can appear when there is a load on the compute and we are trying to detach timeline from old pageserver. In this case callmemaybe will try to reconnect to it because replication termination condition is not met (page server with active compute could never catch up to the latest lsn, so there is always some wal tail) diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 59833526c5..7e815abf73 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -70,7 +70,7 @@ two options. ...start sending WAL conservatively since the horizon (1.1), and truncate obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is -reached, i.e. 
2.3 transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes. +reached, i.e. 2.3 transferred -- that's what https://github.com/neondatabase/neon/pull/505 proposes. Then the following is possible: diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md index bd4cb9ef32..4ef006d9a6 100644 --- a/docs/rfcs/cluster-size-limits.md +++ b/docs/rfcs/cluster-size-limits.md @@ -15,7 +15,7 @@ The stateless compute node that performs validation is separate from the storage Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future). First of all, this is needed to control our free tier production costs. -Another reason to limit resources is risk management — we haven't (fully) tested and optimized zenith for big clusters, +Another reason to limit resources is risk management — we haven't (fully) tested and optimized neon for big clusters, so we don't want to give users access to the functionality that we don't think is ready. ## Components @@ -43,20 +43,20 @@ Then this size should be reported to compute node. `current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.` -(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037). +(PR about protocol changes https://github.com/neondatabase/neon/pull/1037). This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`. Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable. -And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. +And then every neon_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. 
(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html)) TODO: We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`. It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level. See issues https://github.com/neondatabase/neon/issues/1245 -https://github.com/zenithdb/zenith/issues/1445 +https://github.com/neondatabase/neon/issues/1445 TODO: We should warn users if the limit is soon to be reached. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 339a90e0ba..c1a860f126 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -10,7 +10,7 @@ Intended to be used in integration tests and in CLI tools for local installation `/docs`: -Documentation of the Zenith features and concepts. +Documentation of the Neon features and concepts. Now it is mostly dev documentation. `/monitoring`: @@ -19,7 +19,7 @@ TODO `/pageserver`: -Zenith storage service. +Neon storage service. The pageserver has a few different duties: - Store and manage the data. @@ -54,7 +54,7 @@ PostgreSQL extension that contains functions needed for testing and debugging. `/safekeeper`: -The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. +The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. It acts as a holding area and redistribution center for recently generated WAL. For more detailed info, see [walservice.md](./walservice.md) @@ -64,11 +64,6 @@ The workspace_hack crate exists only to pin down some dependencies. We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation. -`/zenith` - -Main entry point for the 'zenith' CLI utility. -TODO: Doesn't it belong to control_plane? - `/libs`: Unites granular neon helper crates under the hood. 
diff --git a/libs/etcd_broker/src/subscription_key.rs b/libs/etcd_broker/src/subscription_key.rs index 8f8579f4e5..a11d2ab106 100644 --- a/libs/etcd_broker/src/subscription_key.rs +++ b/libs/etcd_broker/src/subscription_key.rs @@ -11,7 +11,7 @@ use std::{fmt::Display, str::FromStr}; use once_cell::sync::Lazy; use regex::{Captures, Regex}; -use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId}; /// The subscription kind to the timeline updates from safekeeper. #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -30,13 +30,13 @@ pub enum SubscriptionKind { /// Get every update in etcd. All, /// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind. - TenantTimelines(ZTenantId), + TenantTimelines(TenantId), /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind. - Timeline(ZTenantTimelineId), + Timeline(TenantTimelineId), /// Get etcd timeline updates, specific to a certain node kind. - Node(ZTenantTimelineId, NodeKind), + Node(TenantTimelineId, NodeKind), /// Get etcd timeline updates for a certain operation on specific nodes. - Operation(ZTenantTimelineId, NodeKind, OperationKind), + Operation(TenantTimelineId, NodeKind, OperationKind), } /// All kinds of nodes, able to write into etcd. @@ -67,7 +67,7 @@ static SUBSCRIPTION_FULL_KEY_REGEX: Lazy = Lazy::new(|| { /// No other etcd keys are considered during system's work. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct SubscriptionFullKey { - pub id: ZTenantTimelineId, + pub id: TenantTimelineId, pub node_kind: NodeKind, pub operation: OperationKind, pub node_id: NodeId, @@ -83,7 +83,7 @@ impl SubscriptionKey { } /// Subscribes to a given timeline info updates from safekeepers. 
- pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self { + pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self { Self { cluster_prefix, kind: SubscriptionKind::Operation( @@ -97,7 +97,7 @@ impl SubscriptionKey { /// Subscribes to all timeine updates during specific operations, running on the corresponding nodes. pub fn operation( cluster_prefix: String, - timeline: ZTenantTimelineId, + timeline: TenantTimelineId, node_kind: NodeKind, operation: OperationKind, ) -> Self { @@ -175,7 +175,7 @@ impl FromStr for SubscriptionFullKey { }; Ok(Self { - id: ZTenantTimelineId::new( + id: TenantTimelineId::new( parse_capture(&key_captures, 1)?, parse_capture(&key_captures, 2)?, ), @@ -247,7 +247,7 @@ impl FromStr for SkOperationKind { #[cfg(test)] mod tests { - use utils::zid::ZTimelineId; + use utils::id::TimelineId; use super::*; @@ -256,9 +256,9 @@ mod tests { let prefix = "neon"; let node_kind = NodeKind::Safekeeper; let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup); - let tenant_id = ZTenantId::generate(); - let timeline_id = ZTimelineId::generate(); - let id = ZTenantTimelineId::new(tenant_id, timeline_id); + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let id = TenantTimelineId::new(tenant_id, timeline_id); let node_id = NodeId(1); let timeline_subscription_keys = [ diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 5b9ecb7394..2b453fa0dc 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } wal_craft = { path = "wal_craft" } 
[build-dependencies] diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 114f08113b..f848ac1273 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -11,6 +11,6 @@ clap = "3.0" env_logger = "0.9" log = "0.4" once_cell = "1.13.0" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres_ffi = { path = "../" } tempfile = "3.2" diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ce55277f29..ef2aa8b305 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -10,8 +10,8 @@ bincode = "1.3" bytes = "1.0.1" hyper = { version = "0.14.7", features = ["full"] } pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } routerify = "3" serde = { version = "1.0", features = ["derive"] } serde_json = "1" diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 0339939934..badcb5774e 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,11 +1,11 @@ #![allow(unused)] use criterion::{criterion_group, criterion_main, Criterion}; -use utils::zid; +use utils::id; pub fn bench_zid_stringify(c: &mut Criterion) { // Can only use public methods. 
- let ztl = zid::ZTenantTimelineId::generate(); + let ztl = id::TenantTimelineId::generate(); c.bench_function("zid.to_string", |b| { b.iter(|| { diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 3bdabacad4..b190b0d1c5 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -14,7 +14,7 @@ use jsonwebtoken::{ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use crate::zid::ZTenantId; +use crate::id::TenantId; const JWT_ALGORITHM: Algorithm = Algorithm::RS256; @@ -30,23 +30,23 @@ pub enum Scope { pub struct Claims { #[serde(default)] #[serde_as(as = "Option")] - pub tenant_id: Option, + pub tenant_id: Option, pub scope: Scope, } impl Claims { - pub fn new(tenant_id: Option, scope: Scope) -> Self { + pub fn new(tenant_id: Option, scope: Scope) -> Self { Self { tenant_id, scope } } } -pub fn check_permission(claims: &Claims, tenantid: Option) -> Result<()> { - match (&claims.scope, tenantid) { +pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<()> { + match (&claims.scope, tenant_id) { (Scope::Tenant, None) => { bail!("Attempt to access management api with tenant scope. Permission denied") } - (Scope::Tenant, Some(tenantid)) => { - if claims.tenant_id.unwrap() != tenantid { + (Scope::Tenant, Some(tenant_id)) => { + if claims.tenant_id.unwrap() != tenant_id { bail!("Tenant id mismatch. 
Permission denied") } Ok(()) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 69bf5ef87a..4066791e2b 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,6 +1,6 @@ use crate::auth::{self, Claims, JwtAuth}; use crate::http::error; -use crate::zid::ZTenantId; +use crate::id::TenantId; use anyhow::anyhow; use hyper::header::AUTHORIZATION; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; @@ -137,9 +137,9 @@ pub fn auth_middleware( }) } -pub fn check_permission(req: &Request, tenantid: Option) -> Result<(), ApiError> { +pub fn check_permission(req: &Request, tenant_id: Option) -> Result<(), ApiError> { match req.context::() { - Some(claims) => Ok(auth::check_permission(&claims, tenantid) + Some(claims) => Ok(auth::check_permission(&claims, tenant_id) .map_err(|err| ApiError::Forbidden(err.to_string()))?), None => Ok(()), // claims is None because auth is disabled } diff --git a/libs/utils/src/http/mod.rs b/libs/utils/src/http/mod.rs index 0bb53ef51d..74ed6bb5b2 100644 --- a/libs/utils/src/http/mod.rs +++ b/libs/utils/src/http/mod.rs @@ -3,6 +3,6 @@ pub mod error; pub mod json; pub mod request; -/// Current fast way to apply simple http routing in various Zenith binaries. +/// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; diff --git a/libs/utils/src/zid.rs b/libs/utils/src/id.rs similarity index 76% rename from libs/utils/src/zid.rs rename to libs/utils/src/id.rs index 6da5355f61..059ce69ca4 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/id.rs @@ -4,7 +4,7 @@ use hex::FromHex; use rand::Rng; use serde::{Deserialize, Serialize}; -/// Zenith ID is a 128-bit random ID. +/// Neon ID is a 128-bit random ID. /// Used to represent various identifiers. 
Provides handy utility methods and impls. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look @@ -13,13 +13,13 @@ use serde::{Deserialize, Serialize}; /// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. /// Check the `serde_with::serde_as` documentation for options for more complex types. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -struct ZId([u8; 16]); +struct Id([u8; 16]); -impl ZId { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId { +impl Id { + pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id { let mut arr = [0u8; 16]; buf.copy_to_slice(&mut arr); - ZId::from(arr) + Id::from(arr) } pub fn as_arr(&self) -> [u8; 16] { @@ -29,7 +29,7 @@ impl ZId { pub fn generate() -> Self { let mut tli_buf = [0u8; 16]; rand::thread_rng().fill(&mut tli_buf); - ZId::from(tli_buf) + Id::from(tli_buf) } fn hex_encode(&self) -> String { @@ -44,54 +44,54 @@ impl ZId { } } -impl FromStr for ZId { +impl FromStr for Id { type Err = hex::FromHexError; - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { Self::from_hex(s) } } -// this is needed for pretty serialization and deserialization of ZId's using serde integration with hex crate -impl FromHex for ZId { +// this is needed for pretty serialization and deserialization of Id's using serde integration with hex crate +impl FromHex for Id { type Error = hex::FromHexError; fn from_hex>(hex: T) -> Result { let mut buf: [u8; 16] = [0u8; 16]; hex::decode_to_slice(hex, &mut buf)?; - Ok(ZId(buf)) + Ok(Id(buf)) } } -impl AsRef<[u8]> for ZId { +impl AsRef<[u8]> for Id { fn as_ref(&self) -> &[u8] { &self.0 } } -impl From<[u8; 16]> for ZId { +impl From<[u8; 16]> for Id { fn from(b: [u8; 16]) -> Self { - ZId(b) + Id(b) } } -impl fmt::Display for ZId { +impl fmt::Display for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) 
} } -impl fmt::Debug for ZId { +impl fmt::Debug for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) } } -macro_rules! zid_newtype { +macro_rules! id_newtype { ($t:ident) => { impl $t { pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t { - $t(ZId::get_from_buf(buf)) + $t(Id::get_from_buf(buf)) } pub fn as_arr(&self) -> [u8; 16] { @@ -99,11 +99,11 @@ macro_rules! zid_newtype { } pub fn generate() -> Self { - $t(ZId::generate()) + $t(Id::generate()) } pub const fn from_array(b: [u8; 16]) -> Self { - $t(ZId(b)) + $t(Id(b)) } } @@ -111,14 +111,14 @@ macro_rules! zid_newtype { type Err = hex::FromHexError; fn from_str(s: &str) -> Result<$t, Self::Err> { - let value = ZId::from_str(s)?; + let value = Id::from_str(s)?; Ok($t(value)) } } impl From<[u8; 16]> for $t { fn from(b: [u8; 16]) -> Self { - $t(ZId::from(b)) + $t(Id::from(b)) } } @@ -126,7 +126,7 @@ macro_rules! zid_newtype { type Error = hex::FromHexError; fn from_hex>(hex: T) -> Result { - Ok($t(ZId::from_hex(hex)?)) + Ok($t(Id::from_hex(hex)?)) } } @@ -150,7 +150,7 @@ macro_rules! zid_newtype { }; } -/// Zenith timeline IDs are different from PostgreSQL timeline +/// Neon timeline IDs are different from PostgreSQL timeline /// IDs. They serve a similar purpose though: they differentiate /// between different "histories" of the same cluster. However, /// PostgreSQL timeline IDs are a bit cumbersome, because they are only @@ -158,7 +158,7 @@ macro_rules! zid_newtype { /// timeline history. Those limitations mean that we cannot generate a /// new PostgreSQL timeline ID by just generating a random number. And /// that in turn is problematic for the "pull/push" workflow, where you -/// have a local copy of a zenith repository, and you periodically sync +/// have a local copy of a Neon repository, and you periodically sync /// the local changes with a remote server. 
When you work "detached" /// from the remote server, you cannot create a PostgreSQL timeline ID /// that's guaranteed to be different from all existing timelines in @@ -168,55 +168,55 @@ macro_rules! zid_newtype { /// branches? If they pick the same one, and later try to push the /// branches to the same remote server, they will get mixed up. /// -/// To avoid those issues, Zenith has its own concept of timelines that +/// To avoid those issues, Neon has its own concept of timelines that /// is separate from PostgreSQL timelines, and doesn't have those -/// limitations. A zenith timeline is identified by a 128-bit ID, which +/// limitations. A Neon timeline is identified by a 128-bit ID, which /// is usually printed out as a hex string. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`ZId`] for alternative ways to serialize it. +/// See [`Id`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] -pub struct ZTimelineId(ZId); +pub struct TimelineId(Id); -zid_newtype!(ZTimelineId); +id_newtype!(TimelineId); -/// Zenith Tenant Id represents identifiar of a particular tenant. +/// Neon Tenant Id represents identifier of a particular tenant. /// Is used for distinguishing requests and data belonging to different users. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`ZId`] for alternative ways to serialize it. +/// See [`Id`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ZTenantId(ZId); +pub struct TenantId(Id); -zid_newtype!(ZTenantId); +id_newtype!(TenantId); -// A pair uniquely identifying Zenith instance.
+// A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub struct ZTenantTimelineId { - pub tenant_id: ZTenantId, - pub timeline_id: ZTimelineId, +pub struct TenantTimelineId { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, } -impl ZTenantTimelineId { - pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self { - ZTenantTimelineId { +impl TenantTimelineId { + pub fn new(tenant_id: TenantId, timeline_id: TimelineId) -> Self { + TenantTimelineId { tenant_id, timeline_id, } } pub fn generate() -> Self { - Self::new(ZTenantId::generate(), ZTimelineId::generate()) + Self::new(TenantId::generate(), TimelineId::generate()) } pub fn empty() -> Self { - Self::new(ZTenantId::from([0u8; 16]), ZTimelineId::from([0u8; 16])) + Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16])) } } -impl fmt::Display for ZTenantTimelineId { +impl fmt::Display for TenantTimelineId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}/{}", self.tenant_id, self.timeline_id) } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index caa7ac6c09..2c80556446 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -29,7 +29,7 @@ pub mod crashsafe_dir; pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. 
-pub mod zid; +pub mod id; // http endpoint utils pub mod http; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 604eb75aaf..0498e0887b 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -63,7 +63,7 @@ pub enum AuthType { Trust, MD5, // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT - ZenithJWT, + NeonJWT, } impl FromStr for AuthType { @@ -73,8 +73,8 @@ impl FromStr for AuthType { match s { "Trust" => Ok(Self::Trust), "MD5" => Ok(Self::MD5), - "ZenithJWT" => Ok(Self::ZenithJWT), - _ => bail!("invalid value \"{}\" for auth type", s), + "NeonJWT" => Ok(Self::NeonJWT), + _ => bail!("invalid value \"{s}\" for auth type"), } } } @@ -84,7 +84,7 @@ impl fmt::Display for AuthType { f.write_str(match self { AuthType::Trust => "Trust", AuthType::MD5 => "MD5", - AuthType::ZenithJWT => "ZenithJWT", + AuthType::NeonJWT => "NeonJWT", }) } } @@ -376,7 +376,7 @@ impl PostgresBackend { ))?; self.state = ProtoState::Authentication; } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; } @@ -403,7 +403,7 @@ impl PostgresBackend { bail!("auth failed: {}", e); } } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index 383ad3742f..87e4478a99 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -346,7 +346,7 @@ impl PostgresBackend { ))?; self.state = ProtoState::Authentication; } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; } @@ -374,7 +374,7 @@ impl PostgresBackend { 
bail!("auth failed: {}", e); } } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e73c73bd9c..11d2d94906 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -27,10 +27,10 @@ clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } tokio-util = { version = "0.7.3", features = ["io", "io-util"] } -postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } anyhow = { version = "1.0", features = ["backtrace"] } crc32c = "0.6.0" thiserror = "1.0" diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index 7e766ce859..f5247ee609 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -12,7 +12,7 @@ use utils::project_git_version; project_git_version!(GIT_VERSION); fn main() -> Result<()> 
{ - let arg_matches = App::new("Zenith dump_layerfile utility") + let arg_matches = App::new("Neon dump_layerfile utility") .about("Dump contents of one layer file, for debugging") .version(GIT_VERSION) .arg( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 679c6f76e7..92d5eab379 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -40,7 +40,7 @@ fn version() -> String { } fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Zenith page server") + let arg_matches = App::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) .arg( @@ -293,7 +293,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // initialize authentication for incoming connections let auth = match &conf.auth_type { AuthType::Trust | AuthType::MD5 => None, - AuthType::ZenithJWT => { + AuthType::NeonJWT => { // unwrap is ok because check is performed when creating config, so path is set and file exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); Some(JwtAuth::from_key_path(key_path)?.into()) diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 3339564b0f..16359c2532 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -11,7 +11,7 @@ use utils::{lsn::Lsn, project_git_version}; project_git_version!(GIT_VERSION); fn main() -> Result<()> { - let arg_matches = App::new("Zenith update metadata utility") + let arg_matches = App::new("Neon update metadata utility") .about("Dump or update metadata file") .version(GIT_VERSION) .arg( diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 56171f46e3..75c71b09d2 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -15,8 +15,8 @@ use toml_edit; use toml_edit::{Document, Item}; use url::Url; use utils::{ + id::{NodeId, TenantId, 
TimelineId}, postgres_backend::AuthType, - zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::tenant::TIMELINES_SEGMENT_NAME; @@ -342,16 +342,16 @@ impl PageServerConf { self.workdir.join("tenants") } - pub fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenants_path().join(tenantid.to_string()) + pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenants_path().join(tenant_id.to_string()) } - pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME) + pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } - pub fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf { - self.timelines_path(tenantid).join(timelineid.to_string()) + pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf { + self.timelines_path(tenant_id).join(timeline_id.to_string()) } // @@ -419,7 +419,7 @@ impl PageServerConf { let mut conf = builder.build().context("invalid config")?; - if conf.auth_type == AuthType::ZenithJWT { + if conf.auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path .get_or_insert_with(|| workdir.join("auth_public_key.pem")); diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 0ccf23776c..c0dc5b9677 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -3,8 +3,8 @@ use std::num::NonZeroU64; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ + id::{NodeId, TenantId, TimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::tenant::TenantState; @@ -14,10 +14,10 @@ use crate::tenant::TenantState; pub struct TimelineCreateRequest { #[serde(default)] #[serde_as(as = "Option")] - pub new_timeline_id: Option, + pub new_timeline_id: Option, #[serde(default)] #[serde_as(as = "Option")] 
- pub ancestor_timeline_id: Option, + pub ancestor_timeline_id: Option, #[serde(default)] #[serde_as(as = "Option")] pub ancestor_start_lsn: Option, @@ -28,7 +28,7 @@ pub struct TimelineCreateRequest { pub struct TenantCreateRequest { #[serde(default)] #[serde_as(as = "Option")] - pub new_tenant_id: Option, + pub new_tenant_id: Option, pub checkpoint_distance: Option, pub checkpoint_timeout: Option, pub compaction_target_size: Option, @@ -46,7 +46,7 @@ pub struct TenantCreateRequest { #[serde_as] #[derive(Serialize, Deserialize)] #[serde(transparent)] -pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId); +pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId); #[derive(Serialize)] pub struct StatusResponse { @@ -54,7 +54,7 @@ pub struct StatusResponse { } impl TenantCreateRequest { - pub fn new(new_tenant_id: Option) -> TenantCreateRequest { + pub fn new(new_tenant_id: Option) -> TenantCreateRequest { TenantCreateRequest { new_tenant_id, ..Default::default() @@ -65,7 +65,7 @@ impl TenantCreateRequest { #[serde_as] #[derive(Serialize, Deserialize)] pub struct TenantConfigRequest { - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, #[serde(default)] #[serde_as(as = "Option")] pub checkpoint_distance: Option, @@ -83,7 +83,7 @@ pub struct TenantConfigRequest { } impl TenantConfigRequest { - pub fn new(tenant_id: ZTenantId) -> TenantConfigRequest { + pub fn new(tenant_id: TenantId) -> TenantConfigRequest { TenantConfigRequest { tenant_id, checkpoint_distance: None, @@ -106,7 +106,7 @@ impl TenantConfigRequest { #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] - pub id: ZTenantId, + pub id: TenantId, pub state: TenantState, pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, @@ -116,7 +116,7 @@ pub struct TenantInfo { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct 
LocalTimelineInfo { #[serde_as(as = "Option")] - pub ancestor_timeline_id: Option, + pub ancestor_timeline_id: Option, #[serde_as(as = "Option")] pub ancestor_lsn: Option, #[serde_as(as = "DisplayFromStr")] @@ -154,9 +154,9 @@ pub struct RemoteTimelineInfo { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { #[serde_as(as = "DisplayFromStr")] - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, #[serde_as(as = "DisplayFromStr")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub local: Option, pub remote: Option, } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 36ba2e9b66..2e49429f38 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -25,8 +25,8 @@ use utils::{ request::parse_request_param, RequestExt, RouterBuilder, }, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; struct State { @@ -128,10 +128,10 @@ fn local_timeline_info_from_timeline( } fn list_local_timelines( - tenant_id: ZTenantId, + tenant_id: TenantId, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, -) -> Result> { +) -> Result> { let tenant = tenant_mgr::get_tenant(tenant_id, true)?; let timelines = tenant.list_timelines(); @@ -156,7 +156,7 @@ async fn status_handler(request: Request) -> Result, ApiErr } async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_id))?; @@ -164,8 +164,8 @@ async fn timeline_create_handler(mut request: Request) -> Result { @@ -193,7 +193,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, 
"tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = @@ -229,7 +229,7 @@ async fn timeline_list_handler(request: Request) -> Result, .remote_index .read() .await - .timeline_entry(&ZTenantTimelineId { + .timeline_entry(&TenantTimelineId { tenant_id, timeline_id, }) @@ -257,8 +257,8 @@ fn query_param_present(request: &Request, param: &str) -> bool { } async fn timeline_detail_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = @@ -289,7 +289,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; info!("Handling tenant attach {tenant_id}"); @@ -402,8 +402,8 @@ async fn tenant_attach_handler(request: Request) -> Result, /// for details see comment to `storage_sync::gather_tenant_timelines_index_parts` async fn gather_tenant_timelines_index_parts( state: &State, - tenant_id: ZTenantId, -) -> anyhow::Result>> { + tenant_id: TenantId, +) -> anyhow::Result>> { let index_parts = match state.remote_storage.as_ref() { Some(storage) => { storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await @@ -425,8 +425,8 @@ async fn gather_tenant_timelines_index_parts( } async fn 
timeline_delete_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; let state = get_state(&request); @@ -436,7 +436,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let state = get_state(&request); @@ -479,7 +479,7 @@ async fn tenant_list_handler(request: Request) -> Result, A } async fn tenant_status(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; // if tenant is in progress of downloading it can be absent in global tenant map @@ -588,8 +588,8 @@ async fn tenant_create_handler(mut request: Request) -> Result(HashMap>); +pub struct TenantTimelineValues(HashMap>); impl TenantTimelineValues { fn new() -> Self { @@ -187,8 +187,8 @@ mod tests { #[test] fn tenant_timeline_value_mapping() { - let first_tenant = ZTenantId::generate(); - let second_tenant = ZTenantId::generate(); + let first_tenant = TenantId::generate(); + let second_tenant = TenantId::generate(); assert_ne!(first_tenant, second_tenant); let mut initial = TenantTimelineValues::new(); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ada0bbd359..2f03943429 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -5,7 +5,7 @@ use metrics::{ IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, 
UIntGaugeVec, }; use once_cell::sync::Lazy; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) that capture the majority of /// latencies in the microsecond range but also extend far enough up to distinguish @@ -327,7 +327,7 @@ pub struct TimelineMetrics { } impl TimelineMetrics { - pub fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); let reconstruct_time_histo = RECONSTRUCT_TIME @@ -414,6 +414,6 @@ impl Drop for TimelineMetrics { } } -pub fn remove_tenant_metrics(tenant_id: &ZTenantId) { +pub fn remove_tenant_metrics(tenant_id: &TenantId) { let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 15c3c22dd6..d2fe06697e 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -49,8 +49,8 @@ use anyhow::Context; use once_cell::sync::OnceCell; use tracing::error; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::repository::Key; @@ -109,8 +109,8 @@ enum CacheKey { #[derive(Debug, PartialEq, Eq, Hash, Clone)] struct MaterializedPageHashKey { - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: Key, } @@ -308,8 +308,8 @@ impl PageCache { /// returned page. 
pub fn lookup_materialized_page( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: &Key, lsn: Lsn, ) -> Option<(Lsn, PageReadGuard)> { @@ -338,8 +338,8 @@ impl PageCache { /// pub fn memorize_materialized_page( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: Key, lsn: Lsn, img: &[u8], diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 388f40f916..b06814c557 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -23,12 +23,12 @@ use tokio_util::io::SyncIoBridge; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, postgres_backend_async::{self, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, simple_rcu::RcuReadGuard, - zid::{ZTenantId, ZTimelineId}, }; use crate::basebackup; @@ -123,7 +123,7 @@ impl PagestreamFeMessage { fn parse(mut body: Bytes) -> anyhow::Result { // TODO these gets can fail - // these correspond to the ZenithMessageTag enum in pagestore_client.h + // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
@@ -370,7 +370,7 @@ struct PageRequestMetrics { } impl PageRequestMetrics { - fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); @@ -415,8 +415,8 @@ impl PageServerHandler { async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> anyhow::Result<()> { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association @@ -452,11 +452,11 @@ impl PageServerHandler { None => break, // client disconnected }; - trace!("query: {:?}", copy_data_bytes); + trace!("query: {copy_data_bytes:?}"); - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let response = match zenith_fe_msg { + let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.get_rel_exists.start_timer(); self.handle_get_rel_exists_request(&timeline, &req).await @@ -494,8 +494,8 @@ impl PageServerHandler { async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, ) -> anyhow::Result<()> { @@ -557,8 +557,8 @@ impl PageServerHandler { async fn handle_import_wal( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, ) -> anyhow::Result<()> { @@ -750,8 +750,8 @@ impl PageServerHandler { async fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, @@ -792,7 +792,7 @@ impl 
PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -815,7 +815,7 @@ impl postgres_backend_async::Handler for PageServerHandler { _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> anyhow::Result<()> { - // this unwrap is never triggered, because check_auth_jwt only called when auth_type is ZenithJWT + // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data = self .auth @@ -853,8 +853,8 @@ impl postgres_backend_async::Handler for PageServerHandler { params.len() == 2, "invalid param number for pagestream command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; self.check_permission(Some(tenant_id))?; @@ -869,8 +869,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for basebackup command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; self.check_permission(Some(tenant_id))?; @@ -895,8 +895,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for get_last_record_rlsn command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; self.check_permission(Some(tenant_id))?; let timeline = 
get_local_timeline(tenant_id, timeline_id)?; @@ -923,8 +923,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for fullbackup command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { @@ -959,8 +959,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; @@ -984,8 +984,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; let start_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; @@ -1035,7 +1035,7 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); - let tenant_id = ZTenantId::from_str(params[0])?; + let tenant_id = TenantId::from_str(params[0])?; let tenant = tenant_mgr::get_tenant(tenant_id, true)?; 
pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), @@ -1087,8 +1087,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let tenant = tenant_mgr::get_tenant(tenant_id, true)?; @@ -1131,8 +1131,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("Invalid compact: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; timeline.compact()?; @@ -1148,8 +1148,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). 
@@ -1166,8 +1166,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; @@ -1192,7 +1192,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } } -fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result> { +fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result> { tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id)) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 2454b6f54f..9d4b438dc4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,7 +10,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::tenant::Timeline; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; use postgres_ffi::v14::pg_constants; @@ -570,7 +570,7 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); @@ -583,7 +583,7 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { self.put( slru_block_to_key(kind, segno, blknum), @@ 
-1401,7 +1401,7 @@ fn is_slru_block_key(key: Key) -> bool { #[cfg(test)] pub fn create_test_timeline( tenant: &crate::tenant::Tenant, - timeline_id: utils::zid::ZTimelineId, + timeline_id: utils::id::TimelineId, ) -> Result> { let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; let mut m = tline.begin_modification(Lsn(8)); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c3b08c93de..f6ea9d8c5d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,4 +1,4 @@ -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use bytes::Bytes; @@ -157,7 +157,7 @@ pub enum Value { /// replayed get the full value. Replaying the WAL record /// might need a previous version of the value (if will_init() /// returns false), or it may be replayed stand-alone (true). - WalRecord(ZenithWalRecord), + WalRecord(NeonWalRecord), } impl Value { diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index c104dba298..9d259bf1e2 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -68,7 +68,7 @@ //! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. //! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. -//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], +//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`TenantId`] and [`TimelineId`], //! 
we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. @@ -183,7 +183,7 @@ use crate::{ TenantTimelineValues, }; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; @@ -227,7 +227,7 @@ pub struct SyncStartupData { struct SyncQueue { max_timelines_per_batch: NonZeroUsize, - queue: Mutex>, + queue: Mutex>, condvar: Condvar, } @@ -241,7 +241,7 @@ impl SyncQueue { } /// Queue a new task - fn push(&self, sync_id: ZTenantTimelineId, new_task: SyncTask) { + fn push(&self, sync_id: TenantTimelineId, new_task: SyncTask) { let mut q = self.queue.lock().unwrap(); q.push_back((sync_id, new_task)); @@ -254,7 +254,7 @@ impl SyncQueue { /// A timeline has to care to not to delete certain layers from the remote storage before the corresponding uploads happen. /// Other than that, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. /// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading). - fn next_task_batch(&self) -> (HashMap, usize) { + fn next_task_batch(&self) -> (HashMap, usize) { // Wait for the first task in blocking fashion let mut q = self.queue.lock().unwrap(); while q.is_empty() { @@ -488,8 +488,8 @@ struct LayersDeletion { /// /// Ensure that the loop is started otherwise the task is never processed. 
pub fn schedule_layer_upload( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, layers_to_upload: HashSet, metadata: Option, ) { @@ -501,7 +501,7 @@ pub fn schedule_layer_upload( } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -519,8 +519,8 @@ pub fn schedule_layer_upload( /// /// Ensure that the loop is started otherwise the task is never processed. pub fn schedule_layer_delete( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, layers_to_delete: HashSet, ) { let sync_queue = match SYNC_QUEUE.get() { @@ -531,7 +531,7 @@ pub fn schedule_layer_delete( } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -551,7 +551,7 @@ pub fn schedule_layer_delete( /// On any failure, the task gets retried, omitting already downloaded layers. /// /// Ensure that the loop is started otherwise the task is never processed. 
-pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { +pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) { debug!("Scheduling layer download for tenant {tenant_id}, timeline {timeline_id}"); let sync_queue = match SYNC_QUEUE.get() { Some(queue) => queue, @@ -561,7 +561,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -604,7 +604,7 @@ pub fn spawn_storage_sync_task( let _ = empty_tenants.0.entry(tenant_id).or_default(); } else { for (timeline_id, timeline_data) in timeline_data { - let id = ZTenantTimelineId::new(tenant_id, timeline_id); + let id = TenantTimelineId::new(tenant_id, timeline_id); keys_for_index_part_downloads.insert(id); timelines_to_sync.insert(id, timeline_data); } @@ -766,9 +766,9 @@ async fn process_batches( max_sync_errors: NonZeroU32, storage: GenericRemoteStorage, index: &RemoteIndex, - batched_tasks: HashMap, + batched_tasks: HashMap, sync_queue: &SyncQueue, -) -> HashSet { +) -> HashSet { let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { @@ -808,7 +808,7 @@ async fn process_sync_task_batch( conf: &'static PageServerConf, (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, batch: SyncTaskBatch, ) -> DownloadStatus { let sync_start = Instant::now(); @@ -949,7 +949,7 @@ async fn download_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, new_download_data: SyncData, sync_start: Instant, task_name: &str, @@ -999,7 +999,7 @@ async fn download_timeline_data( async fn update_local_metadata( conf: &'static PageServerConf, - sync_id: 
ZTenantTimelineId, + sync_id: TenantTimelineId, remote_timeline: Option<&RemoteTimeline>, ) -> anyhow::Result<()> { let remote_metadata = match remote_timeline { @@ -1031,7 +1031,7 @@ async fn update_local_metadata( info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}"); // clone because spawn_blocking requires static lifetime let cloned_metadata = remote_metadata.to_owned(); - let ZTenantTimelineId { + let TenantTimelineId { tenant_id, timeline_id, } = sync_id; @@ -1061,7 +1061,7 @@ async fn update_local_metadata( async fn delete_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, task_name: &str, @@ -1104,7 +1104,7 @@ async fn upload_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, new_upload_data: SyncData, sync_start: Instant, task_name: &str, @@ -1163,7 +1163,7 @@ async fn update_remote_data( conf: &'static PageServerConf, storage: &GenericRemoteStorage, index: &RemoteIndex, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, update: RemoteDataUpdate<'_>, ) -> anyhow::Result<()> { let updated_remote_timeline = { @@ -1261,7 +1261,7 @@ async fn validate_task_retries( fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, - local_timeline_files: HashMap)>, + local_timeline_files: HashMap)>, ) -> TenantTimelineValues { let mut local_timeline_init_statuses = TenantTimelineValues::new(); @@ -1331,8 +1331,8 @@ fn schedule_first_sync_tasks( /// bool in return value stands for awaits_download fn compare_local_and_remote_timeline( - new_sync_tasks: &mut 
VecDeque<(ZTenantTimelineId, SyncTask)>, - sync_id: ZTenantTimelineId, + new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>, + sync_id: TenantTimelineId, local_metadata: TimelineMetadata, local_files: HashSet, remote_entry: &RemoteTimeline, @@ -1377,7 +1377,7 @@ fn compare_local_and_remote_timeline( } fn register_sync_status( - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, sync_start: Instant, sync_name: &str, sync_status: Option, @@ -1409,7 +1409,7 @@ mod test_utils { pub(super) async fn create_local_timeline( harness: &TenantHarness<'_>, - timeline_id: ZTimelineId, + timeline_id: TimelineId, filenames: &[&str], metadata: TimelineMetadata, ) -> anyhow::Result { @@ -1454,8 +1454,8 @@ mod tests { use super::*; - const TEST_SYNC_ID: ZTenantTimelineId = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("11223344556677881122334455667788")), + const TEST_SYNC_ID: TenantTimelineId = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("11223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; @@ -1464,12 +1464,12 @@ mod tests { let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); assert_eq!(sync_queue.len(), 0); - let sync_id_2 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + let sync_id_2 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; - let sync_id_3 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("33223344556677881122334455667788")), + let sync_id_3 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("33223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; assert!(sync_id_2 != TEST_SYNC_ID); @@ -1591,8 +1591,8 @@ mod tests { layers_to_skip: HashSet::from([PathBuf::from("sk4")]), }; - let sync_id_2 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + let sync_id_2 = TenantTimelineId { + tenant_id: 
TenantId::from_array(hex!("22223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; assert!(sync_id_2 != TEST_SYNC_ID); diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 945f5fded8..21a3372e70 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -8,7 +8,7 @@ use tracing::{debug, error, info}; use crate::storage_sync::{SyncQueue, SyncTask}; use remote_storage::GenericRemoteStorage; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use super::{LayersDeletion, SyncData}; @@ -17,7 +17,7 @@ use super::{LayersDeletion, SyncData}; pub(super) async fn delete_timeline_layers( storage: &GenericRemoteStorage, sync_queue: &SyncQueue, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut delete_data: SyncData, ) -> bool { if !delete_data.data.deletion_registered { @@ -123,7 +123,7 @@ mod tests { async fn delete_timeline_negative() -> anyhow::Result<()> { let harness = TenantHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), @@ -157,7 +157,7 @@ mod tests { let harness = TenantHarness::create("delete_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 32f228b447..80d5ca5994 100644 --- a/pageserver/src/storage_sync/download.rs +++ 
b/pageserver/src/storage_sync/download.rs @@ -20,7 +20,7 @@ use crate::{ config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, TEMP_FILE_SUFFIX, }; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::{ index::{IndexPart, RemoteTimeline}, @@ -33,14 +33,14 @@ use super::{ // When data is received succesfully without errors Present variant is used. pub enum TenantIndexParts { Poisoned { - present: HashMap, - missing: HashSet, + present: HashMap, + missing: HashSet, }, - Present(HashMap), + Present(HashMap), } impl TenantIndexParts { - fn add_poisoned(&mut self, timeline_id: ZTimelineId) { + fn add_poisoned(&mut self, timeline_id: TimelineId) { match self { TenantIndexParts::Poisoned { missing, .. } => { missing.insert(timeline_id); @@ -64,9 +64,9 @@ impl Default for TenantIndexParts { pub async fn download_index_parts( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - keys: HashSet, -) -> HashMap { - let mut index_parts: HashMap = HashMap::new(); + keys: HashSet, +) -> HashMap { + let mut index_parts: HashMap = HashMap::new(); let mut part_downloads = keys .into_iter() @@ -112,8 +112,8 @@ pub async fn download_index_parts( pub async fn gather_tenant_timelines_index_parts( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - tenant_id: ZTenantId, -) -> anyhow::Result> { + tenant_id: TenantId, +) -> anyhow::Result> { let tenant_path = conf.timelines_path(&tenant_id); let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id) .await @@ -135,7 +135,7 @@ pub async fn gather_tenant_timelines_index_parts( async fn download_index_part( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, ) -> Result { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); @@ -197,7 +197,7 @@ pub(super) async 
fn download_timeline_layers<'a>( storage: &'a GenericRemoteStorage, sync_queue: &'a SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut download_data: SyncData, ) -> DownloadedTimeline { let remote_timeline = match remote_timeline { @@ -335,7 +335,7 @@ pub(super) async fn download_timeline_layers<'a>( } // fsync timeline directory which is a parent directory for downloaded files - let ZTenantTimelineId { + let TenantTimelineId { tenant_id, timeline_id, } = &sync_id; @@ -366,8 +366,8 @@ pub(super) async fn download_timeline_layers<'a>( async fn get_timeline_sync_ids( storage: &GenericRemoteStorage, tenant_path: &Path, - tenant_id: ZTenantId, -) -> anyhow::Result> { + tenant_id: TenantId, +) -> anyhow::Result> { let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| { format!( "Failed to get tenant storage path for local path '{}'", @@ -395,11 +395,11 @@ async fn get_timeline_sync_ids( anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") })?; - let timeline_id: ZTimelineId = object_name.parse().with_context(|| { + let timeline_id: TimelineId = object_name.parse().with_context(|| { format!("failed to parse object name into timeline id '{object_name}'") })?; - sync_ids.insert(ZTenantTimelineId { + sync_ids.insert(TenantTimelineId { tenant_id, timeline_id, }); @@ -439,7 +439,7 @@ mod tests { let harness = TenantHarness::create("download_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), @@ -539,7 +539,7 @@ mod tests { async fn download_timeline_negatives() -> anyhow::Result<()> { let harness = 
TenantHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), @@ -597,7 +597,7 @@ mod tests { #[tokio::test] async fn test_download_index_part() -> anyhow::Result<()> { let harness = TenantHarness::create("test_download_index_part")?; - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index cff14cde49..13495ffefe 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -17,8 +17,8 @@ use tracing::log::warn; use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata}; use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::download::TenantIndexParts; @@ -49,7 +49,7 @@ impl RelativePath { } #[derive(Debug, Clone, Default)] -pub struct TenantEntry(HashMap); +pub struct TenantEntry(HashMap); impl TenantEntry { pub fn has_in_progress_downloads(&self) -> bool { @@ -59,7 +59,7 @@ impl TenantEntry { } impl Deref for TenantEntry { - type Target = HashMap; + type Target = HashMap; fn deref(&self) -> &Self::Target { &self.0 @@ -72,8 +72,8 @@ impl DerefMut for TenantEntry { } } -impl From> for TenantEntry { - fn from(inner: HashMap) -> Self { +impl From> for TenantEntry { + fn from(inner: HashMap) -> Self { Self(inner) } } @@ -81,7 +81,7 @@ impl From> for TenantEntry { /// An index to track tenant files that exist on the remote storage. 
#[derive(Debug, Clone, Default)] pub struct RemoteTimelineIndex { - entries: HashMap, + entries: HashMap, } /// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. @@ -91,9 +91,9 @@ pub struct RemoteIndex(Arc>); impl RemoteIndex { pub fn from_parts( conf: &'static PageServerConf, - index_parts: HashMap, + index_parts: HashMap, ) -> anyhow::Result { - let mut entries: HashMap = HashMap::new(); + let mut entries: HashMap = HashMap::new(); for (tenant_id, index_parts) in index_parts { match index_parts { @@ -136,30 +136,30 @@ impl Clone for RemoteIndex { impl RemoteTimelineIndex { pub fn timeline_entry( &self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: &ZTenantTimelineId, + }: &TenantTimelineId, ) -> Option<&RemoteTimeline> { self.entries.get(tenant_id)?.get(timeline_id) } pub fn timeline_entry_mut( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: &ZTenantTimelineId, + }: &TenantTimelineId, ) -> Option<&mut RemoteTimeline> { self.entries.get_mut(tenant_id)?.get_mut(timeline_id) } pub fn add_timeline_entry( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, entry: RemoteTimeline, ) { self.entries @@ -170,10 +170,10 @@ impl RemoteTimelineIndex { pub fn remove_timeline_entry( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, ) -> Option { self.entries .entry(tenant_id) @@ -181,25 +181,25 @@ impl RemoteTimelineIndex { .remove(&timeline_id) } - pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> { + pub fn tenant_entry(&self, tenant_id: &TenantId) -> Option<&TenantEntry> { self.entries.get(tenant_id) } - pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> { + pub fn tenant_entry_mut(&mut self, tenant_id: &TenantId) -> Option<&mut 
TenantEntry> { self.entries.get_mut(tenant_id) } - pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry { + pub fn add_tenant_entry(&mut self, tenant_id: TenantId) -> &mut TenantEntry { self.entries.entry(tenant_id).or_default() } - pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option { + pub fn remove_tenant_entry(&mut self, tenant_id: &TenantId) -> Option { self.entries.remove(tenant_id) } pub fn set_awaits_download( &mut self, - id: &ZTenantTimelineId, + id: &TenantTimelineId, awaits_download: bool, ) -> anyhow::Result<()> { self.timeline_entry_mut(id) diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index bd09e6b898..aa5a2232cf 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -8,7 +8,7 @@ use remote_storage::GenericRemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use super::{ index::{IndexPart, RemoteTimeline}, @@ -21,7 +21,7 @@ use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::me pub(super) async fn upload_index_part( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, index_part: IndexPart, ) -> anyhow::Result<()> { let index_part_bytes = serde_json::to_vec(&index_part) @@ -58,7 +58,7 @@ pub(super) async fn upload_timeline_layers<'a>( storage: &'a GenericRemoteStorage, sync_queue: &SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut upload_data: SyncData, ) -> UploadedTimeline { let upload = &mut upload_data.data; @@ -213,7 +213,7 @@ mod tests { async fn regular_layer_upload() -> anyhow::Result<()> { let harness = TenantHarness::create("regular_layer_upload")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = 
ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; let storage = GenericRemoteStorage::new(LocalFs::new( @@ -301,7 +301,7 @@ mod tests { async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { let harness = TenantHarness::create("layer_upload_after_local_fs_update")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; let storage = GenericRemoteStorage::new(LocalFs::new( @@ -395,7 +395,7 @@ mod tests { #[tokio::test] async fn test_upload_index_part() -> anyhow::Result<()> { let harness = TenantHarness::create("test_upload_index_part")?; - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 2aa803d119..dad6e0039d 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -51,7 +51,7 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; use crate::shutdown_pageserver; @@ -210,8 +210,8 @@ pub enum TaskKind { #[derive(Default)] struct MutableTaskState { /// Tenant and timeline that this task is associated with. - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, /// Handle for waiting for the task to exit. It can be None, if the /// the task has already exited. 
@@ -238,8 +238,8 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, name: &str, shutdown_process_on_error: bool, future: F, @@ -371,7 +371,7 @@ async fn task_finish( } // expected to be called from the task of the given id. -pub fn associate_with(tenant_id: Option, timeline_id: Option) { +pub fn associate_with(tenant_id: Option, timeline_id: Option) { CURRENT_TASK.with(|ct| { let mut task_mut = ct.mutable.lock().unwrap(); task_mut.tenant_id = tenant_id; @@ -391,12 +391,12 @@ pub fn associate_with(tenant_id: Option, timeline_id: Option, - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, ) { let mut victim_tasks = Vec::new(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4ef810faba..41fd98ec07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4,7 +4,7 @@ //! The functions here are responsible for locating the correct layer for the //! get/put call, walking back the timeline branching history as needed. //! -//! The files are stored in the .neon/tenants//timelines/ +//! The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. //! In addition to the layer files, there is a metadata file in the same //! directory that contains information about the timeline, in particular its @@ -48,8 +48,8 @@ use crate::CheckpointConfig; use toml_edit; use utils::{ crashsafe_dir, + id::{TenantId, TimelineId}, lsn::{Lsn, RecordLsn}, - zid::{ZTenantId, ZTimelineId}, }; mod blob_io; @@ -80,7 +80,7 @@ pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; -/// Parts of the `.neon/tenants//timelines/` directory prefix. +/// Parts of the `.neon/tenants//timelines/` directory prefix. 
pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// @@ -98,8 +98,8 @@ pub struct Tenant { // This is necessary to allow global config updates. tenant_conf: Arc>, - tenant_id: ZTenantId, - timelines: Mutex>>, + tenant_id: TenantId, + timelines: Mutex>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration (especially with enforced checkpoint) @@ -134,7 +134,7 @@ pub enum TenantState { impl Tenant { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. - pub fn get_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result> { + pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { self.timelines .lock() .unwrap() @@ -151,7 +151,7 @@ impl Tenant { /// Lists timelines the tenant contains. /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. - pub fn list_timelines(&self) -> Vec<(ZTimelineId, Arc)> { + pub fn list_timelines(&self) -> Vec<(TimelineId, Arc)> { self.timelines .lock() .unwrap() @@ -164,7 +164,7 @@ impl Tenant { /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. pub fn create_empty_timeline( &self, - new_timeline_id: ZTimelineId, + new_timeline_id: TimelineId, initdb_lsn: Lsn, ) -> Result> { // XXX: keep the lock to avoid races during timeline creation @@ -207,8 +207,8 @@ impl Tenant { /// Branch a timeline pub fn branch_timeline( &self, - src: ZTimelineId, - dst: ZTimelineId, + src: TimelineId, + dst: TimelineId, start_lsn: Option, ) -> Result> { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn @@ -302,14 +302,14 @@ impl Tenant { /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. 
/// - /// 'timelineid' specifies the timeline to GC, or None for all. + /// 'target_timeline_id' specifies the timeline to GC, or None for all. /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC /// to make tests more deterministic. /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? pub fn gc_iteration( &self, - target_timeline_id: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, @@ -337,13 +337,13 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) + .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone())) .collect::>(); drop(timelines); - for (timelineid, timeline) in &timelines_to_compact { + for (timeline_id, timeline) in &timelines_to_compact { let _entered = - info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered(); + info_span!("compact", timeline = %timeline_id, tenant = %self.tenant_id).entered(); timeline.compact()?; } @@ -362,13 +362,13 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - .map(|(timelineid, timeline)| (*timelineid, Arc::clone(timeline))) + .map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline))) .collect::>(); drop(timelines); - for (timelineid, timeline) in &timelines_to_compact { + for (timeline_id, timeline) in &timelines_to_compact { let _entered = - info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenant_id) + info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id) .entered(); timeline.checkpoint(CheckpointConfig::Flush)?; } @@ -377,7 +377,7 @@ impl Tenant { } /// Removes timeline-related in-memory data - pub fn delete_timeline(&self, timeline_id: ZTimelineId) -> 
anyhow::Result<()> { + pub fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { // in order to be retriable detach needs to be idempotent // (or at least to a point that each time the detach is called it can make progress) let mut timelines = self.timelines.lock().unwrap(); @@ -416,7 +416,7 @@ impl Tenant { pub fn init_attach_timelines( &self, - timelines: HashMap, + timelines: HashMap, ) -> anyhow::Result<()> { let sorted_timelines = if timelines.len() == 1 { timelines.into_iter().collect() @@ -505,13 +505,13 @@ impl Tenant { /// perform a topological sort, so that the parent of each timeline comes /// before the children. fn tree_sort_timelines( - timelines: HashMap, -) -> Result> { + timelines: HashMap, +) -> Result> { let mut result = Vec::with_capacity(timelines.len()); let mut now = Vec::with_capacity(timelines.len()); // (ancestor, children) - let mut later: HashMap> = + let mut later: HashMap> = HashMap::with_capacity(timelines.len()); for (timeline_id, metadata) in timelines { @@ -636,9 +636,9 @@ impl Tenant { fn initialize_new_timeline( &self, - new_timeline_id: ZTimelineId, + new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard>>, + timelines: &mut MutexGuard>>, ) -> anyhow::Result> { let ancestor = match new_metadata.ancestor_timeline() { Some(ancestor_timeline_id) => Some( @@ -680,7 +680,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_conf: TenantConfOpt, walredo_mgr: Arc, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: RemoteIndex, upload_layers: bool, ) -> Tenant { @@ -701,7 +701,7 @@ impl Tenant { /// Locate and load config pub fn load_tenant_config( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result { let target_config_path = TenantConf::path(conf, tenant_id); let target_config_display = target_config_path.display(); @@ -830,7 +830,7 @@ impl Tenant { // we do. 
fn gc_iteration_internal( &self, - target_timeline_id: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, @@ -848,7 +848,7 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); + let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new(); let timeline_ids = { if let Some(target_timeline_id) = target_timeline_id.as_ref() { if timelines.get(target_timeline_id).is_none() { @@ -861,11 +861,11 @@ impl Tenant { .map(|(timeline_id, timeline_entry)| { // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. - // Somewhat related: https://github.com/zenithdb/zenith/issues/999 + // Somewhat related: https://github.com/neondatabase/neon/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timeline_id { - if ancestor_timeline_id == &timelineid { + if let Some(timeline_id) = target_timeline_id { + if ancestor_timeline_id == &timeline_id { all_branchpoints.insert(( *ancestor_timeline_id, timeline_entry.get_ancestor_lsn(), @@ -895,8 +895,8 @@ impl Tenant { .with_context(|| format!("Timeline {timeline_id} was not found"))?; // If target_timeline is specified, ignore all other timelines - if let Some(target_timelineid) = target_timeline_id { - if timeline_id != target_timelineid { + if let Some(target_timeline_id) = target_timeline_id { + if timeline_id != target_timeline_id { continue; } } @@ -952,7 +952,7 @@ impl Tenant { Ok(totals) } - pub fn tenant_id(&self) -> ZTenantId { + pub fn tenant_id(&self) -> TenantId { self.tenant_id } } @@ -998,7 +998,7 @@ pub mod harness { config::PageServerConf, repository::Key, tenant::Tenant, - 
walrecord::ZenithWalRecord, + walrecord::NeonWalRecord, walredo::{WalRedoError, WalRedoManager}, }; @@ -1006,12 +1006,12 @@ pub mod harness { use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::zid::{ZTenantId, ZTimelineId}; + use utils::id::{TenantId, TimelineId}; - pub const TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const NEW_TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + pub const NEW_TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("AA223344556677881122334455667788")); /// Convenience function to create a page image with given string as the only content #[allow(non_snake_case)] @@ -1047,7 +1047,7 @@ pub mod harness { pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, pub lock_guard: ( Option>, @@ -1080,7 +1080,7 @@ pub mod harness { let tenant_conf = TenantConf::dummy_conf(); - let tenant_id = ZTenantId::generate(); + let tenant_id = TenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; fs::create_dir_all(conf.timelines_path(&tenant_id))?; @@ -1113,7 +1113,7 @@ pub mod harness { .expect("should be able to read timelines dir") { let timeline_dir_entry = timeline_dir_entry?; - let timeline_id: ZTimelineId = timeline_dir_entry + let timeline_id: TimelineId = timeline_dir_entry .path() .file_name() .unwrap() @@ -1128,15 +1128,15 @@ pub mod harness { Ok(tenant) } - pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { + pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf { self.conf.timeline_path(timeline_id, &self.tenant_id) } } fn load_metadata( conf: &'static PageServerConf, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, + timeline_id: 
TimelineId, + tenant_id: TenantId, ) -> anyhow::Result { let metadata_path = metadata_path(conf, timeline_id, tenant_id); let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { @@ -1162,7 +1162,7 @@ pub mod harness { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result { let s = format!( "redo for {} to get to {}, with {} and {} records", @@ -1747,7 +1747,7 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { - let new_tline_id = ZTimelineId::generate(); + let new_tline_id = TimelineId::generate(); tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = tenant .get_timeline(new_tline_id) @@ -1808,7 +1808,7 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { - let new_tline_id = ZTimelineId::generate(); + let new_tline_id = TimelineId::generate(); tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = tenant .get_timeline(new_tline_id) diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index ff6d3652f9..892000c20b 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -7,7 +7,7 @@ //! must be page images or WAL records with the 'will_init' flag set, so that //! they can be replayed without referring to an older page version. //! -//! The delta files are stored in timelines/ directory. Currently, +//! The delta files are stored in timelines/ directory. Currently, //! there are no subdirectories, and each delta file is named like this: //! //! 
-__-, lsn_range: Range, @@ -81,8 +81,8 @@ impl From<&DeltaLayer> for Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: layer.tenantid, - timelineid: layer.timelineid, + tenant_id: layer.tenant_id, + timeline_id: layer.timeline_id, key_range: layer.key_range.clone(), lsn_range: layer.lsn_range.clone(), @@ -173,8 +173,8 @@ impl DeltaKey { pub struct DeltaLayer { path_or_conf: PathOrConf, - pub tenantid: ZTenantId, - pub timelineid: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub key_range: Range, pub lsn_range: Range, @@ -194,12 +194,12 @@ pub struct DeltaLayerInner { } impl Layer for DeltaLayer { - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -344,8 +344,8 @@ impl Layer for DeltaLayer { fn dump(&self, verbose: bool) -> Result<()> { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, self.key_range.start, self.key_range.end, self.lsn_range.start, @@ -419,22 +419,22 @@ impl Layer for DeltaLayer { impl DeltaLayer { fn path_for( path_or_conf: &PathOrConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &DeltaFileName, ) -> PathBuf { match path_or_conf { PathOrConf::Path(path) => path.clone(), PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(fname.to_string()), } } fn temp_path_for( conf: &PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: &Range, ) -> PathBuf { @@ -444,7 +444,7 @@ impl DeltaLayer { .map(char::from) .collect(); - conf.timeline_path(&timelineid, 
&tenantid).join(format!( + conf.timeline_path(&timeline_id, &tenant_id).join(format!( "{}-XXX__{:016X}-{:016X}.{}.{}", key_start, u64::from(lsn_range.start), @@ -535,14 +535,14 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, filename: &DeltaFileName, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: filename.key_range.clone(), lsn_range: filename.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { @@ -568,8 +568,8 @@ impl DeltaLayer { Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timelineid: summary.timelineid, - tenantid: summary.tenantid, + timeline_id: summary.timeline_id, + tenant_id: summary.tenant_id, key_range: summary.key_range, lsn_range: summary.lsn_range, inner: RwLock::new(DeltaLayerInner { @@ -592,8 +592,8 @@ impl DeltaLayer { pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &self.layer_name(), ) } @@ -613,8 +613,8 @@ impl DeltaLayer { pub struct DeltaLayerWriter { conf: &'static PageServerConf, path: PathBuf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: Range, @@ -630,8 +630,8 @@ impl DeltaLayerWriter { /// pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: Range, ) -> Result { @@ -641,7 +641,7 @@ impl DeltaLayerWriter { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? 
- let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range); + let path = DeltaLayer::temp_path_for(conf, timeline_id, tenant_id, key_start, &lsn_range); let mut file = VirtualFile::create(&path)?; // make room for the header block @@ -656,8 +656,8 @@ impl DeltaLayerWriter { Ok(DeltaLayerWriter { conf, path, - timelineid, - tenantid, + timeline_id, + tenant_id, key_start, lsn_range, tree: tree_builder, @@ -718,8 +718,8 @@ impl DeltaLayerWriter { let summary = Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), index_start_blk, @@ -733,8 +733,8 @@ impl DeltaLayerWriter { // set inner.file here. The first read will have to re-open it. let layer = DeltaLayer { path_or_conf: PathOrConf::Conf(self.conf), - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { @@ -753,8 +753,8 @@ impl DeltaLayerWriter { // FIXME: throw an error instead? 
let final_path = DeltaLayer::path_for( &PathOrConf::Conf(self.conf), - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &DeltaFileName { key_range: self.key_start..key_end, lsn_range: self.lsn_range, diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index c675e4e778..0774fa42a6 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -17,7 +17,7 @@ use std::ops::DerefMut; use std::path::PathBuf; use std::sync::{Arc, RwLock}; use tracing::*; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; use std::os::unix::fs::FileExt; @@ -39,8 +39,8 @@ pub struct EphemeralFiles { pub struct EphemeralFile { file_id: u64, - _tenantid: ZTenantId, - _timelineid: ZTimelineId, + _tenant_id: TenantId, + _timeline_id: TimelineId, file: Arc, pub size: u64, @@ -49,15 +49,15 @@ pub struct EphemeralFile { impl EphemeralFile { pub fn create( conf: &PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> Result { let mut l = EPHEMERAL_FILES.write().unwrap(); let file_id = l.next_file_id; l.next_file_id += 1; let filename = conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(PathBuf::from(format!("ephemeral-{}", file_id))); let file = VirtualFile::open_with_options( @@ -69,8 +69,8 @@ impl EphemeralFile { Ok(EphemeralFile { file_id, - _tenantid: tenantid, - _timelineid: timelineid, + _tenant_id: tenant_id, + _timeline_id: timeline_id, file: file_rc, size: 0, }) @@ -338,7 +338,7 @@ mod tests { fn harness( test_name: &str, - ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> { + ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); let conf = PageServerConf::dummy_conf(repo_dir); @@ -346,11 +346,11 @@ mod tests 
{ // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap(); - let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?; + let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); + fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; - Ok((conf, tenantid, timelineid)) + Ok((conf, tenant_id, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -368,9 +368,9 @@ mod tests { #[test] fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = harness("ephemeral_files")?; + let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; - let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; + let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -381,7 +381,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(conf, tenantid, timelineid)?; + let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -399,9 +399,9 @@ mod tests { #[test] fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = harness("ephemeral_blobs")?; + let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index 518643241d..92bf022fee 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -4,7 +4,7 @@ //! but does not exist in the layer, does not exist. //! //! An image layer is stored in a file on disk. The file is stored in -//! timelines/ directory. Currently, there are no +//! timelines/ directory. Currently, there are no //! subdirectories, and each image layer file is named like this: //! //! -__ @@ -44,8 +44,8 @@ use tracing::*; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; /// @@ -56,12 +56,12 @@ use utils::{ /// #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct Summary { - /// Magic value to identify this as a zenith image file. Always IMAGE_FILE_MAGIC. + /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC. 
magic: u16, format_version: u16, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key_range: Range, lsn: Lsn, @@ -77,8 +77,8 @@ impl From<&ImageLayer> for Summary { Self { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: layer.tenantid, - timelineid: layer.timelineid, + tenant_id: layer.tenant_id, + timeline_id: layer.timeline_id, key_range: layer.key_range.clone(), lsn: layer.lsn, @@ -97,8 +97,8 @@ impl From<&ImageLayer> for Summary { /// pub struct ImageLayer { path_or_conf: PathOrConf, - pub tenantid: ZTenantId, - pub timelineid: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub key_range: Range, // This entry contains an image of all pages as of this LSN @@ -128,12 +128,12 @@ impl Layer for ImageLayer { Some(self.path()) } - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -202,7 +202,7 @@ impl Layer for ImageLayer { fn dump(&self, verbose: bool) -> Result<()> { println!( "----- image layer for ten {} tli {} key {}-{} at {} ----", - self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn + self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn ); if !verbose { @@ -228,22 +228,22 @@ impl Layer for ImageLayer { impl ImageLayer { fn path_for( path_or_conf: &PathOrConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &ImageFileName, ) -> PathBuf { match path_or_conf { PathOrConf::Path(path) => path.to_path_buf(), PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(fname.to_string()), } } fn temp_path_for( conf: &PageServerConf, - timelineid: ZTimelineId, 
- tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &ImageFileName, ) -> PathBuf { let rand_string: String = rand::thread_rng() @@ -252,7 +252,7 @@ impl ImageLayer { .map(char::from) .collect(); - conf.timeline_path(&timelineid, &tenantid) + conf.timeline_path(&timeline_id, &tenant_id) .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } @@ -336,14 +336,14 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, filename: &ImageFileName, ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: filename.key_range.clone(), lsn: filename.lsn, inner: RwLock::new(ImageLayerInner { @@ -369,8 +369,8 @@ impl ImageLayer { Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timelineid: summary.timelineid, - tenantid: summary.tenantid, + timeline_id: summary.timeline_id, + tenant_id: summary.tenant_id, key_range: summary.key_range, lsn: summary.lsn, inner: RwLock::new(ImageLayerInner { @@ -393,8 +393,8 @@ impl ImageLayer { pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &self.layer_name(), ) } @@ -414,8 +414,8 @@ impl ImageLayer { pub struct ImageLayerWriter { conf: &'static PageServerConf, path: PathBuf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_range: Range, lsn: Lsn, @@ -426,8 +426,8 @@ pub struct ImageLayerWriter { impl ImageLayerWriter { pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_range: &Range, lsn: Lsn, ) -> anyhow::Result { @@ -435,8 +435,8 @@ impl ImageLayerWriter { // We'll atomically rename it to the final 
name when we're done. let path = ImageLayer::temp_path_for( conf, - timelineid, - tenantid, + timeline_id, + tenant_id, &ImageFileName { key_range: key_range.clone(), lsn, @@ -458,8 +458,8 @@ impl ImageLayerWriter { let writer = ImageLayerWriter { conf, path, - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: key_range.clone(), lsn, tree: tree_builder, @@ -502,8 +502,8 @@ impl ImageLayerWriter { let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_range.clone(), lsn: self.lsn, index_start_blk, @@ -517,8 +517,8 @@ impl ImageLayerWriter { // set inner.file here. The first read will have to re-open it. let layer = ImageLayer { path_or_conf: PathOrConf::Conf(self.conf), - timelineid: self.timelineid, - tenantid: self.tenantid, + timeline_id: self.timeline_id, + tenant_id: self.tenant_id, key_range: self.key_range.clone(), lsn: self.lsn, inner: RwLock::new(ImageLayerInner { @@ -538,8 +538,8 @@ impl ImageLayerWriter { // FIXME: throw an error instead? let final_path = ImageLayer::path_for( &PathOrConf::Conf(self.conf), - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &ImageFileName { key_range: self.key_range.clone(), lsn: self.lsn, diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs index 0e7b215b1e..9aa33a72ca 100644 --- a/pageserver/src/tenant/inmemory_layer.rs +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -18,9 +18,9 @@ use std::collections::HashMap; use tracing::*; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, vec_map::VecMap, - zid::{ZTenantId, ZTimelineId}, }; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods @@ -37,8 +37,8 @@ thread_local! 
{ pub struct InMemoryLayer { conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, /// /// This layer contains all the changes from 'start_lsn'. The @@ -94,12 +94,12 @@ impl Layer for InMemoryLayer { None } - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -197,7 +197,7 @@ impl Layer for InMemoryLayer { println!( "----- in-memory layer for tli {} LSNs {}-{} ----", - self.timelineid, self.start_lsn, end_str, + self.timeline_id, self.start_lsn, end_str, ); if !verbose { @@ -251,22 +251,18 @@ impl InMemoryLayer { /// pub fn create( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, start_lsn: Lsn, ) -> Result { - trace!( - "initializing new empty InMemoryLayer for writing on timeline {} at {}", - timelineid, - start_lsn - ); + trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenantid, timelineid)?; + let file = EphemeralFile::create(conf, tenant_id, timeline_id)?; Ok(InMemoryLayer { conf, - timelineid, - tenantid, + timeline_id, + tenant_id, start_lsn, inner: RwLock::new(InMemoryLayerInner { end_lsn: None, @@ -281,7 +277,7 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. 
/// Adds the page version to the in-memory tree pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); @@ -344,8 +340,8 @@ impl InMemoryLayer { let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, Key::MIN, self.start_lsn..inner.end_lsn.unwrap(), )?; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c24e3976fb..8abeebf54c 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -2,7 +2,7 @@ //! The layer map tracks what layers exist in a timeline. //! //! When the timeline is first accessed, the server lists of all layer files -//! in the timelines/ directory, and populates this map with +//! in the timelines/ directory, and populates this map with //! ImageLayer and DeltaLayer structs corresponding to each file. When the first //! new WAL record is received, we create an InMemoryLayer to hold the incoming //! records. 
Now and then, in the checkpoint() function, the in-memory layer is diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 4ea2b7d55b..ace4dc91e9 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -15,8 +15,8 @@ use serde::{Deserialize, Serialize}; use tracing::info_span; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; @@ -63,7 +63,7 @@ struct TimelineMetadataBody { // doing a clean shutdown, so that there is no more WAL beyond // 'disk_consistent_lsn' prev_record_lsn: Option, - ancestor_timeline: Option, + ancestor_timeline: Option, ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, @@ -73,7 +73,7 @@ impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, prev_record_lsn: Option, - ancestor_timeline: Option, + ancestor_timeline: Option, ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, @@ -149,7 +149,7 @@ impl TimelineMetadata { self.body.prev_record_lsn } - pub fn ancestor_timeline(&self) -> Option { + pub fn ancestor_timeline(&self) -> Option { self.body.ancestor_timeline } @@ -170,23 +170,23 @@ impl TimelineMetadata { /// where certain timeline's metadata file should be located. 
pub fn metadata_path( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, ) -> PathBuf { - conf.timeline_path(&timelineid, &tenantid) + conf.timeline_path(&timeline_id, &tenant_id) .join(METADATA_FILE_NAME) } /// Save timeline metadata to file pub fn save_metadata( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, data: &TimelineMetadata, first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timelineid, tenantid); + let path = metadata_path(conf, timeline_id, tenant_id); // use OpenOptions to ensure file presence is consistent with first_save let mut file = VirtualFile::open_with_options( &path, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e10330bdd3..8dafcab124 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -3,15 +3,15 @@ //! 
use crate::repository::{Key, Value}; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; use std::ops::Range; use std::path::PathBuf; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; pub fn range_overlaps(a: &Range, b: &Range) -> bool @@ -50,7 +50,7 @@ where /// #[derive(Debug)] pub struct ValueReconstructState { - pub records: Vec<(Lsn, ZenithWalRecord)>, + pub records: Vec<(Lsn, NeonWalRecord)>, pub img: Option<(Lsn, Bytes)>, } @@ -84,10 +84,10 @@ pub enum ValueReconstructResult { /// LSN /// pub trait Layer: Send + Sync { - fn get_tenant_id(&self) -> ZTenantId; + fn get_tenant_id(&self) -> TenantId; /// Identify the timeline this layer belongs to - fn get_timeline_id(&self) -> ZTimelineId; + fn get_timeline_id(&self) -> TimelineId; /// Range of keys that this layer covers fn get_key_range(&self) -> Range; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c96ad99909..e821ef1b9a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -39,10 +39,10 @@ use crate::tenant_config::TenantConfOpt; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ + id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, simple_rcu::{Rcu, RcuReadGuard}, - zid::{ZTenantId, ZTimelineId}, }; use crate::repository::GcResult; @@ -58,8 +58,8 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - pub tenant_id: ZTenantId, - pub timeline_id: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub layers: RwLock, @@ -312,7 +312,7 @@ impl Timeline { } /// Get the ancestor's timeline id - pub fn get_ancestor_timeline_id(&self) -> Option { + pub fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id) @@ -531,8 +531,8 @@ impl Timeline { tenant_conf: Arc>, metadata: TimelineMetadata, 
ancestor: Option>, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, walredo_mgr: Arc, upload_layers: bool, ) -> Timeline { @@ -1250,7 +1250,7 @@ impl Timeline { None }; - let ancestor_timelineid = self + let ancestor_timeline_id = self .ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id); @@ -1258,7 +1258,7 @@ impl Timeline { let metadata = TimelineMetadata::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timelineid, + ancestor_timeline_id, self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 73bf3636d2..4448ffc456 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; use std::path::PathBuf; use std::time::Duration; -use utils::zid::ZTenantId; +use utils::id::TenantId; pub const TENANT_CONFIG_NAME: &str = "config"; @@ -217,8 +217,8 @@ impl TenantConf { /// Points to a place in pageserver's local directory, /// where certain tenant's tenantconf file should be located. 
- pub fn path(conf: &'static PageServerConf, tenantid: ZTenantId) -> PathBuf { - conf.tenant_path(&tenantid).join(TENANT_CONFIG_NAME) + pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf { + conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) } #[cfg(test)] diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index a8a9926c77..d6fa843305 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -27,7 +27,7 @@ use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; use utils::crashsafe_dir; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; mod tenants_state { use once_cell::sync::Lazy; @@ -35,20 +35,20 @@ mod tenants_state { collections::HashMap, sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; - use utils::zid::ZTenantId; + use utils::id::TenantId; use crate::tenant::Tenant; - static TENANTS: Lazy>>> = + static TENANTS: Lazy>>> = Lazy::new(|| RwLock::new(HashMap::new())); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { + pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { TENANTS .read() .expect("Failed to read() tenants lock, it got poisoned") } - pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { + pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { TENANTS .write() .expect("Failed to write() tenants lock, it got poisoned") @@ -159,7 +159,7 @@ pub fn attach_local_tenants( fn load_local_tenant( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: &RemoteIndex, ) -> Arc { let tenant = Arc::new(Tenant::new( @@ -225,7 +225,7 @@ pub async fn shutdown_all_tenants() { fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { let target_tenant_directory = conf.tenant_path(&tenant_id); anyhow::ensure!( @@ -310,9 
+310,9 @@ fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyho pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result> { match tenants_state::write_tenants().entry(tenant_id) { hash_map::Entry::Occupied(_) => { debug!("tenant {tenant_id} already exists"); @@ -339,7 +339,7 @@ pub fn create_tenant( pub fn update_tenant_config( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); @@ -349,7 +349,7 @@ pub fn update_tenant_config( /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. 
-pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result> { +pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) @@ -361,7 +361,7 @@ pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result anyhow::Result<()> { +pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take locks here, and do not check whether the timeline exists // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join @@ -398,7 +398,7 @@ pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> pub async fn detach_tenant( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { let tenant = match { let mut tenants_accessor = tenants_state::write_tenants(); @@ -565,14 +565,14 @@ fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, ) -> anyhow::Result<( - ZTenantId, - HashMap)>, + TenantId, + HashMap)>, )> { let tenant_id = tenant_path .file_name() .and_then(OsStr::to_str) .unwrap_or_default() - .parse::() + .parse::() .context("Could not parse tenant id out of the tenant dir name")?; let timelines_dir = config.timelines_path(&tenant_id); @@ -644,7 +644,7 @@ fn collect_timelines_for_tenant( // NOTE: ephemeral files are excluded from the list fn collect_timeline_files( timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { +) -> anyhow::Result<(TimelineId, TimelineMetadata, HashSet)> { let mut timeline_files = HashSet::new(); let mut timeline_metadata_path = None; @@ -652,7 +652,7 @@ fn collect_timeline_files( .file_name() .and_then(OsStr::to_str) .unwrap_or_default() - .parse::() + .parse::() .context("Could not parse timeline id out of the timeline 
dir name")?; let timeline_dir_entries = fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 3ef54838af..c543a0ecb1 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -10,9 +10,9 @@ use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::{Tenant, TenantState}; use crate::tenant_mgr; use tracing::*; -use utils::zid::ZTenantId; +use utils::id::TenantId; -pub fn start_background_loops(tenant_id: ZTenantId) { +pub fn start_background_loops(tenant_id: TenantId) { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -42,9 +42,8 @@ pub fn start_background_loops(tenant_id: ZTenantId) { /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: ZTenantId) { +async fn compaction_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting compaction loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { @@ -90,9 +89,8 @@ async fn compaction_loop(tenant_id: ZTenantId) { /// /// GC task's main loop /// -async fn gc_loop(tenant_id: ZTenantId) { +async fn gc_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting gc loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { @@ -138,7 +136,7 @@ async fn gc_loop(tenant_id: ZTenantId) { } async fn wait_for_active_tenant( - tenant_id: ZTenantId, + tenant_id: TenantId, wait: Duration, ) -> ControlFlow<(), Arc> { let tenant = loop { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 69d14babf0..88b26e18f4 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -14,8 +14,8 @@ use tracing::*; use remote_storage::path_with_suffix_extension; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; @@ -61,8 
+61,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // fn bootstrap_timeline( conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, tenant: &Tenant, ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` @@ -115,12 +115,12 @@ fn bootstrap_timeline( /// pub(crate) async fn create_timeline( conf: &'static PageServerConf, - tenant_id: ZTenantId, - new_timeline_id: Option, - ancestor_timeline_id: Option, + tenant_id: TenantId, + new_timeline_id: Option, + ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, ) -> Result>> { - let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); + let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); let tenant = tenant_mgr::get_tenant(tenant_id, true)?; if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 7a2c699b44..896c2603a2 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -53,8 +53,8 @@ pub struct VirtualFile { pub path: PathBuf, open_options: OpenOptions, - tenantid: String, - timelineid: String, + tenant_id: String, + timeline_id: String, } #[derive(Debug, PartialEq, Clone, Copy)] @@ -149,7 +149,7 @@ impl OpenFiles { // old file. // if let Some(old_file) = slot_guard.file.take() { - // We do not have information about tenantid/timelineid of evicted file. + // We do not have information about tenant_id/timeline_id of evicted file. // It is possible to store path together with file or use filepath crate, // but as far as close() is not expected to be fast, it is not so critical to gather // precise per-tenant statistic here. 
@@ -197,18 +197,18 @@ impl VirtualFile { ) -> Result { let path_str = path.to_string_lossy(); let parts = path_str.split('/').collect::>(); - let tenantid; - let timelineid; + let tenant_id; + let timeline_id; if parts.len() > 5 && parts[parts.len() - 5] == "tenants" { - tenantid = parts[parts.len() - 4].to_string(); - timelineid = parts[parts.len() - 2].to_string(); + tenant_id = parts[parts.len() - 4].to_string(); + timeline_id = parts[parts.len() - 2].to_string(); } else { - tenantid = "*".to_string(); - timelineid = "*".to_string(); + tenant_id = "*".to_string(); + timeline_id = "*".to_string(); } let (handle, mut slot_guard) = get_open_files().find_victim_slot(); let file = STORAGE_IO_TIME - .with_label_values(&["open", &tenantid, &timelineid]) + .with_label_values(&["open", &tenant_id, &timeline_id]) .observe_closure_duration(|| open_options.open(path))?; // Strip all options other than read and write. @@ -226,8 +226,8 @@ impl VirtualFile { pos: 0, path: path.to_path_buf(), open_options: reopen_options, - tenantid, - timelineid, + tenant_id, + timeline_id, }; slot_guard.file.replace(file); @@ -267,7 +267,7 @@ impl VirtualFile { // Found a cached file descriptor. slot.recently_used.store(true, Ordering::Relaxed); return Ok(STORAGE_IO_TIME - .with_label_values(&[op, &self.tenantid, &self.timelineid]) + .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(file))); } } @@ -294,7 +294,7 @@ impl VirtualFile { // Open the physical file let file = STORAGE_IO_TIME - .with_label_values(&["open", &self.tenantid, &self.timelineid]) + .with_label_values(&["open", &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| self.open_options.open(&self.path))?; // Perform the requested operation on it @@ -308,7 +308,7 @@ impl VirtualFile { // may deadlock on subsequent read calls. // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. 
let result = STORAGE_IO_TIME - .with_label_values(&[op, &self.tenantid, &self.timelineid]) + .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(&file)); // Store the File in the slot and update the handle in the VirtualFile @@ -333,11 +333,11 @@ impl Drop for VirtualFile { if slot_guard.tag == handle.tag { slot.recently_used.store(false, Ordering::Relaxed); // Unlike files evicted by replacement algorithm, here - // we group close time by tenantid/timelineid. + // we group close time by tenant_id/timeline_id. // At allows to compare number/time of "normal" file closes // with file eviction. STORAGE_IO_TIME - .with_label_values(&["close", &self.tenantid, &self.timelineid]) + .with_label_values(&["close", &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| slot_guard.file.take()); } } @@ -399,7 +399,7 @@ impl FileExt for VirtualFile { let result = self.with_file("read", |file| file.read_at(buf, offset))?; if let Ok(size) = result { STORAGE_IO_SIZE - .with_label_values(&["read", &self.tenantid, &self.timelineid]) + .with_label_values(&["read", &self.tenant_id, &self.timeline_id]) .add(size as i64); } result @@ -409,7 +409,7 @@ impl FileExt for VirtualFile { let result = self.with_file("write", |file| file.write_at(buf, offset))?; if let Ok(size) = result { STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenantid, &self.timelineid]) + .with_label_values(&["write", &self.tenant_id, &self.timeline_id]) .add(size as i64); } result diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 45d0916dec..bede4ac13e 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1,5 +1,5 @@ //! -//! Parse PostgreSQL WAL records and store them in a zenith Timeline. +//! Parse PostgreSQL WAL records and store them in a neon Timeline. //! //! The pipeline for ingesting WAL looks like this: //! @@ -9,7 +9,7 @@ //! and decodes it to individual WAL records. It feeds the WAL records //! 
to WalIngest, which parses them and stores them in the Repository. //! -//! The zenith Repository can store page versions in two formats: as +//! The neon Repository can store page versions in two formats: as //! page images, or a WAL records. WalIngest::ingest_record() extracts //! page images out of some WAL records, but most it stores as WAL //! records. If a WAL record modifies multiple pages, WalIngest @@ -315,7 +315,7 @@ impl<'a> WalIngest<'a> { assert_eq!(image.len(), BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { - let rec = ZenithWalRecord::Postgres { + let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; @@ -428,7 +428,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, new_vm_blk.unwrap(), - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -442,7 +442,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, new_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -454,7 +454,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, old_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno: None, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -642,12 +642,12 @@ impl<'a> WalIngest<'a> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { + NeonWalRecord::ClogSetCommitted { xids: page_xids, timestamp: parsed.xact_time, } } else { - ZenithWalRecord::ClogSetAborted { xids: page_xids } + NeonWalRecord::ClogSetAborted { xids: page_xids } }, )?; page_xids = Vec::new(); @@ -662,12 +662,12 @@ impl<'a> WalIngest<'a> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { + NeonWalRecord::ClogSetCommitted { xids: 
page_xids, timestamp: parsed.xact_time, } } else { - ZenithWalRecord::ClogSetAborted { xids: page_xids } + NeonWalRecord::ClogSetAborted { xids: page_xids } }, )?; @@ -760,7 +760,7 @@ impl<'a> WalIngest<'a> { SlruKind::MultiXactOffsets, segno, rpageno, - ZenithWalRecord::MultixactOffsetCreate { + NeonWalRecord::MultixactOffsetCreate { mid: xlrec.mid, moff: xlrec.moff, }, @@ -794,7 +794,7 @@ impl<'a> WalIngest<'a> { SlruKind::MultiXactMembers, pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, pageno % pg_constants::SLRU_PAGES_PER_SEGMENT, - ZenithWalRecord::MultixactMembersCreate { + NeonWalRecord::MultixactMembersCreate { moff: offset, members: this_page_members, }, @@ -901,7 +901,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { self.handle_rel_extend(modification, rel, blknum)?; modification.put_rel_wal_record(rel, blknum, rec)?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 69e400f291..1e4b4e7d52 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -34,8 +34,8 @@ use crate::{ DEFAULT_MAX_BACKOFF_SECONDS, }; use utils::{ + id::{NodeId, TenantTimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantTimelineId}, }; use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; @@ -101,7 +101,7 @@ async fn connection_manager_loop_step( etcd_client: &mut Client, walreceiver_state: &mut WalreceiverState, ) { - let id = ZTenantTimelineId { + let id = TenantTimelineId { tenant_id: walreceiver_state.timeline.tenant_id, timeline_id: walreceiver_state.timeline.timeline_id, }; @@ -230,7 +230,7 @@ fn cleanup_broker_connection( async fn subscribe_for_timeline_updates( etcd_client: &mut Client, broker_prefix: &str, - id: ZTenantTimelineId, + id: TenantTimelineId, ) -> BrokerSubscription { let mut attempt = 0; loop { @@ 
-266,7 +266,7 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { - id: ZTenantTimelineId, + id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, @@ -331,7 +331,7 @@ impl WalreceiverState { lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, ) -> Self { - let id = ZTenantTimelineId { + let id = TenantTimelineId { tenant_id: timeline.tenant_id, timeline_id: timeline.timeline_id, }; @@ -746,10 +746,10 @@ enum ReconnectReason { } fn wal_stream_connection_string( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, listen_pg_addr_str: &str, ) -> anyhow::Result { let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); @@ -760,7 +760,7 @@ fn wal_stream_connection_string( })?; let (host, port) = utils::connstring::connection_host_port(&me_conf); Ok(format!( - "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + "host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" )) } @@ -1355,7 +1355,7 @@ mod tests { fn dummy_state(harness: &TenantHarness) -> WalreceiverState { WalreceiverState { - id: ZTenantTimelineId { + id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 6f1fbc2c9d..29c4cea882 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -30,7 +30,7 @@ use crate::{ walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; /// Status of 
the connection. @@ -288,7 +288,7 @@ pub async fn handle_walreceiver_connection( .await // here we either do not have this timeline in remote index // or there were no checkpoints for it yet - .timeline_entry(&ZTenantTimelineId { + .timeline_entry(&TenantTimelineId { tenant_id, timeline_id, }) @@ -316,7 +316,7 @@ pub async fn handle_walreceiver_connection( }; *timeline.last_received_wal.lock().unwrap() = Some(last_received_wal); - // Send zenith feedback message. + // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let status_update = ReplicationFeedback { current_timeline_size: timeline @@ -328,7 +328,7 @@ pub async fn handle_walreceiver_connection( ps_replytime: ts, }; - debug!("zenith_status_update {status_update:?}"); + debug!("neon_status_update {status_update:?}"); let mut data = BytesMut::new(); status_update.serialize(&mut data)?; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index c718a4c30c..dbf9bf9d33 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -13,10 +13,10 @@ use serde::{Deserialize, Serialize}; use tracing::*; use utils::bin_ser::DeserializeError; -/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper -/// around a PostgreSQL WAL record, or a custom zenith-specific "record". +/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper +/// around a PostgreSQL WAL record, or a custom neon-specific "record". #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum ZenithWalRecord { +pub enum NeonWalRecord { /// Native PostgreSQL WAL record Postgres { will_init: bool, rec: Bytes }, @@ -45,14 +45,14 @@ pub enum ZenithWalRecord { }, } -impl ZenithWalRecord { +impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? 
pub fn will_init(&self) -> bool { match self { - ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init, + NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - // None of the special zenith record types currently initialize the page + // None of the special neon record types currently initialize the page _ => false, } } @@ -767,9 +767,9 @@ pub fn decode_wal_record( /// Build a human-readable string to describe a WAL record /// /// For debugging purposes -pub fn describe_wal_record(rec: &ZenithWalRecord) -> Result { +pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { match rec { - ZenithWalRecord::Postgres { will_init, rec } => Ok(format!( + NeonWalRecord::Postgres { will_init, rec } => Ok(format!( "will_init: {}, {}", will_init, describe_postgres_wal_record(rec)? diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index dd946659bb..9faabfebda 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -36,7 +36,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; +use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, @@ -44,7 +44,7 @@ use crate::metrics::{ use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, @@ -81,7 +81,7 @@ pub trait WalRedoManager: Send + Sync { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result; } @@ -93,20 +93,20 @@ pub trait 
WalRedoManager: Send + Sync { /// records. /// pub struct PostgresRedoManager { - tenantid: ZTenantId, + tenant_id: TenantId, conf: &'static PageServerConf, process: Mutex>, } -/// Can this request be served by zenith redo functions +/// Can this request be served by neon redo functions /// or we need to pass it to wal-redo postgres process? -fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { +fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { // Currently, we don't have bespoken Rust code to replay any - // Postgres WAL records. But everything else is handled in zenith. + // Postgres WAL records. But everything else is handled in neon. #[allow(clippy::match_like_matches_macro)] match rec { - ZenithWalRecord::Postgres { + NeonWalRecord::Postgres { will_init: _, rec: _, } => false, @@ -143,7 +143,7 @@ impl WalRedoManager for PostgresRedoManager { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result { if records.is_empty() { error!("invalid WAL redo request with no records"); @@ -151,14 +151,14 @@ impl WalRedoManager for PostgresRedoManager { } let mut img: Option = base_img; - let mut batch_zenith = can_apply_in_zenith(&records[0].1); + let mut batch_neon = can_apply_in_neon(&records[0].1); let mut batch_start = 0; for i in 1..records.len() { - let rec_zenith = can_apply_in_zenith(&records[i].1); + let rec_neon = can_apply_in_neon(&records[i].1); - if rec_zenith != batch_zenith { - let result = if batch_zenith { - self.apply_batch_zenith(key, lsn, img, &records[batch_start..i]) + if rec_neon != batch_neon { + let result = if batch_neon { + self.apply_batch_neon(key, lsn, img, &records[batch_start..i]) } else { self.apply_batch_postgres( key, @@ -170,13 +170,13 @@ impl WalRedoManager for PostgresRedoManager { }; img = Some(result?); - batch_zenith = rec_zenith; + batch_neon = rec_neon; batch_start = i; } } // last batch - if batch_zenith { - self.apply_batch_zenith(key, lsn, img, 
&records[batch_start..]) + if batch_neon { + self.apply_batch_neon(key, lsn, img, &records[batch_start..]) } else { self.apply_batch_postgres( key, @@ -193,10 +193,10 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager { + pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { - tenantid, + tenant_id, conf, process: Mutex::new(None), } @@ -210,7 +210,7 @@ impl PostgresRedoManager { key: Key, lsn: Lsn, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; @@ -222,7 +222,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenantid)?; + let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -263,14 +263,14 @@ impl PostgresRedoManager { } /// - /// Process a batch of WAL records using bespoken Zenith code. + /// Process a batch of WAL records using bespoken Neon code. /// - fn apply_batch_zenith( + fn apply_batch_neon( &self, key: Key, lsn: Lsn, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], ) -> Result { let start_time = Instant::now(); @@ -280,13 +280,13 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. 
- error!("invalid zenith WAL redo request with no base image"); + error!("invalid neon WAL redo request with no base image"); return Err(WalRedoError::InvalidRequest); } // Apply all the WAL records in the batch for (record_lsn, record) in records.iter() { - self.apply_record_zenith(key, &mut page, *record_lsn, record)?; + self.apply_record_neon(key, &mut page, *record_lsn, record)?; } // Success! let end_time = Instant::now(); @@ -294,7 +294,7 @@ impl PostgresRedoManager { WAL_REDO_TIME.observe(duration.as_secs_f64()); debug!( - "zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}", + "neon applied {} WAL records in {} ms to reconstruct page image at LSN {}", records.len(), duration.as_micros(), lsn @@ -303,22 +303,22 @@ impl PostgresRedoManager { Ok(page.freeze()) } - fn apply_record_zenith( + fn apply_record_neon( &self, key: Key, page: &mut BytesMut, _record_lsn: Lsn, - record: &ZenithWalRecord, + record: &NeonWalRecord, ) -> Result<(), WalRedoError> { match record { - ZenithWalRecord::Postgres { + NeonWalRecord::Postgres { will_init: _, rec: _, } => { - error!("tried to pass postgres wal record to zenith WAL redo"); + error!("tried to pass postgres wal record to neon WAL redo"); return Err(WalRedoError::InvalidRequest); } - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, flags, @@ -360,7 +360,7 @@ impl PostgresRedoManager { } // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. 
- ZenithWalRecord::ClogSetCommitted { xids, timestamp } => { + NeonWalRecord::ClogSetCommitted { xids, timestamp } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -410,7 +410,7 @@ impl PostgresRedoManager { ); } } - ZenithWalRecord::ClogSetAborted { xids } => { + NeonWalRecord::ClogSetAborted { xids } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -441,7 +441,7 @@ impl PostgresRedoManager { transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); } } - ZenithWalRecord::MultixactOffsetCreate { mid, moff } => { + NeonWalRecord::MultixactOffsetCreate { mid, moff } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -474,7 +474,7 @@ impl PostgresRedoManager { LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); } - ZenithWalRecord::MultixactMembersCreate { moff, members } => { + NeonWalRecord::MultixactMembersCreate { moff, members } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -570,7 +570,7 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenant_id: &ZTenantId) -> Result { + fn launch(conf: &PageServerConf, tenant_id: &TenantId) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. @@ -686,7 +686,7 @@ impl PostgresRedoProcess { &mut self, tag: BufferTag, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { // Serialize all the messages to send the WAL redo process first. 
@@ -700,7 +700,7 @@ impl PostgresRedoProcess { build_push_page_msg(tag, &img, &mut writebuf); } for (lsn, rec) in records.iter() { - if let ZenithWalRecord::Postgres { + if let NeonWalRecord::Postgres { will_init: _, rec: postgres_rec, } = rec @@ -709,7 +709,7 @@ impl PostgresRedoProcess { } else { return Err(Error::new( ErrorKind::Other, - "tried to pass zenith wal record to postgres WAL redo", + "tried to pass neon wal record to postgres WAL redo", )); } } diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c index 4926d759e8..bc0ee352b8 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon/inmem_smgr.c @@ -86,7 +86,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum) } /* - * inmem_create() -- Create a new relation on zenithd storage + * inmem_create() -- Create a new relation on neon storage * * If isRedo is true, it's okay for the relation to exist already. */ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 55285a6345..296865838d 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -30,13 +30,12 @@ #include "walproposer.h" #include "walproposer_utils.h" - #define PageStoreTrace DEBUG5 #define NEON_TAG "[NEON_SMGR] " -#define neon_log(tag, fmt, ...) ereport(tag, \ - (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ - errhidestmt(true), errhidecontext(true))) +#define neon_log(tag, fmt, ...) 
ereport(tag, \ + (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) bool connected = false; PGconn *pageserver_conn = NULL; @@ -65,7 +64,7 @@ pageserver_connect() errdetail_internal("%s", msg))); } - query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); ret = PQsendQuery(pageserver_conn, query); if (ret != 1) { @@ -169,7 +168,7 @@ pageserver_disconnect(void) } static void -pageserver_send(ZenithRequest *request) +pageserver_send(NeonRequest * request) { StringInfoData req_buff; @@ -205,18 +204,18 @@ pageserver_send(ZenithRequest *request) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) request); + char *msg = zm_to_string((NeonMessage *) request); neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); } } -static ZenithResponse * +static NeonResponse * pageserver_receive(void) { StringInfoData resp_buff; - ZenithResponse *resp; + NeonResponse *resp; PG_TRY(); { @@ -236,7 +235,7 @@ pageserver_receive(void) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) resp); + char *msg = zm_to_string((NeonMessage *) resp); neon_log(PageStoreTrace, "got response: %s", msg); pfree(msg); @@ -249,7 +248,7 @@ pageserver_receive(void) } PG_END_TRY(); - return (ZenithResponse *) resp; + return (NeonResponse *) resp; } @@ -265,8 +264,8 @@ pageserver_flush(void) } } -static ZenithResponse * -pageserver_call(ZenithRequest *request) +static NeonResponse * +pageserver_call(NeonRequest * request) { pageserver_send(request); pageserver_flush(); @@ -281,7 +280,7 @@ page_server_api api = { }; static bool -check_zenith_id(char **newval, void **extra, GucSource source) +check_neon_id(char **newval, void **extra, GucSource source) { uint8 zid[16]; @@ -403,22 +402,22 @@ pg_init_libpagestore(void) NULL, NULL, NULL); DefineCustomStringVariable("neon.timeline_id", - "Zenith 
timelineid the server is running on", + "Neon timeline_id the server is running on", NULL, - &zenith_timeline, + &neon_timeline, "", PGC_POSTMASTER, 0, /* no flags required */ - check_zenith_id, NULL, NULL); + check_neon_id, NULL, NULL); DefineCustomStringVariable("neon.tenant_id", - "Neon tenantid the server is running on", + "Neon tenant_id the server is running on", NULL, - &zenith_tenant, + &neon_tenant, "", PGC_POSTMASTER, 0, /* no flags required */ - check_zenith_id, NULL, NULL); + check_neon_id, NULL, NULL); DefineCustomBoolVariable("neon.wal_redo", "start in wal-redo mode", @@ -450,8 +449,8 @@ pg_init_libpagestore(void) page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); /* Is there more correct way to pass CustomGUC to postgres code? */ - zenith_timeline_walproposer = zenith_timeline; - zenith_tenant_walproposer = zenith_tenant; + neon_timeline_walproposer = neon_timeline; + neon_tenant_walproposer = neon_tenant; if (wal_redo) { @@ -462,8 +461,8 @@ pg_init_libpagestore(void) else if (page_server_connstring && page_server_connstring[0]) { neon_log(PageStoreTrace, "set neon_smgr hook"); - smgr_hook = smgr_zenith; - smgr_init_hook = smgr_init_zenith; - dbsize_hook = zenith_dbsize; + smgr_hook = smgr_neon; + smgr_init_hook = smgr_init_neon; + dbsize_hook = neon_dbsize; } } diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 5346680b0b..2a2a163ee8 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -28,7 +28,6 @@ PG_MODULE_MAGIC; void _PG_init(void); - void _PG_init(void) { @@ -56,7 +55,6 @@ pg_cluster_size(PG_FUNCTION_ARGS) PG_RETURN_INT64(size); } - Datum backpressure_lsns(PG_FUNCTION_ARGS) { diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7dc38c13fb..633c7b465c 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -28,31 +28,29 @@ typedef enum { /* pagestore_client -> pagestore */ - T_ZenithExistsRequest = 0, - T_ZenithNblocksRequest, - T_ZenithGetPageRequest, - 
T_ZenithDbSizeRequest, + T_NeonExistsRequest = 0, + T_NeonNblocksRequest, + T_NeonGetPageRequest, + T_NeonDbSizeRequest, /* pagestore -> pagestore_client */ - T_ZenithExistsResponse = 100, - T_ZenithNblocksResponse, - T_ZenithGetPageResponse, - T_ZenithErrorResponse, - T_ZenithDbSizeResponse, -} ZenithMessageTag; - - + T_NeonExistsResponse = 100, + T_NeonNblocksResponse, + T_NeonGetPageResponse, + T_NeonErrorResponse, + T_NeonDbSizeResponse, +} NeonMessageTag; /* base struct for c-style inheritance */ typedef struct { - ZenithMessageTag tag; -} ZenithMessage; + NeonMessageTag tag; +} NeonMessage; -#define messageTag(m) (((const ZenithMessage *)(m))->tag) +#define messageTag(m) (((const NeonMessage *)(m))->tag) /* - * supertype of all the Zenith*Request structs below + * supertype of all the Neon*Request structs below * * If 'latest' is true, we are requesting the latest page version, and 'lsn' * is just a hint to the server that we know there are no versions of the page @@ -60,81 +58,79 @@ typedef struct */ typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; bool latest; /* if true, request latest page version */ XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; +} NeonRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; -} ZenithExistsRequest; +} NeonExistsRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; -} ZenithNblocksRequest; - +} NeonNblocksRequest; typedef struct { - ZenithRequest req; + NeonRequest req; Oid dbNode; -} ZenithDbSizeRequest; - +} NeonDbSizeRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; -} ZenithGetPageRequest; +} NeonGetPageRequest; -/* supertype of all the Zenith*Response structs below */ +/* supertype of all the Neon*Response structs below */ typedef struct { - ZenithMessageTag tag; -} ZenithResponse; + NeonMessageTag tag; +} 
NeonResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; bool exists; -} ZenithExistsResponse; +} NeonExistsResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; uint32 n_blocks; -} ZenithNblocksResponse; +} NeonNblocksResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; char page[FLEXIBLE_ARRAY_MEMBER]; -} ZenithGetPageResponse; +} NeonGetPageResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; int64 db_size; -} ZenithDbSizeResponse; +} NeonDbSizeResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error * message */ -} ZenithErrorResponse; +} NeonErrorResponse; -extern StringInfoData zm_pack_request(ZenithRequest *msg); -extern ZenithResponse *zm_unpack_response(StringInfo s); -extern char *zm_to_string(ZenithMessage *msg); +extern StringInfoData zm_pack_request(NeonRequest * msg); +extern NeonResponse * zm_unpack_response(StringInfo s); +extern char *zm_to_string(NeonMessage * msg); /* * API @@ -142,57 +138,57 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { - ZenithResponse *(*request) (ZenithRequest *request); - void (*send) (ZenithRequest *request); - ZenithResponse *(*receive) (void); + NeonResponse *(*request) (NeonRequest * request); + void (*send) (NeonRequest * request); + NeonResponse *(*receive) (void); void (*flush) (void); } page_server_api; extern page_server_api * page_server; extern char *page_server_connstring; -extern char *zenith_timeline; -extern char *zenith_tenant; +extern char *neon_timeline; +extern char *neon_tenant; extern bool wal_redo; extern int32 max_cluster_size; -extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); -extern void smgr_init_zenith(void); +extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode); +extern void smgr_init_neon(void); extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); extern void 
smgr_init_inmem(void); extern void smgr_shutdown_inmem(void); -/* zenith storage manager functionality */ +/* Neon storage manager functionality */ -extern void zenith_init(void); -extern void zenith_open(SMgrRelation reln); -extern void zenith_close(SMgrRelation reln, ForkNumber forknum); -extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); -extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void zenith_reset_prefetch(SMgrRelation reln); -extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); +extern void neon_init(void); +extern void neon_open(SMgrRelation reln); +extern void neon_close(SMgrRelation reln, ForkNumber forknum); +extern void neon_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool neon_exists(SMgrRelation reln, ForkNumber forknum); +extern void neon_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void neon_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void neon_reset_prefetch(SMgrRelation reln); +extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); -extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +extern void neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); -extern void zenith_write(SMgrRelation reln, ForkNumber forknum, - 
BlockNumber blocknum, char *buffer, bool skipFsync); -extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); -extern int64 zenith_dbsize(Oid dbNode); -extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); +extern void neon_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); +extern int64 neon_dbsize(Oid dbNode); +extern void neon_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum); -/* zenith wal-redo storage manager functionality */ +/* neon wal-redo storage manager functionality */ extern void inmem_init(void); extern void inmem_open(SMgrRelation reln); @@ -215,8 +211,7 @@ extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - -/* utils for zenith relsize cache */ +/* utils for neon relsize cache */ extern void relsize_hash_init(void); extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 504ae60d4a..24adee019f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -96,9 +96,9 @@ page_server_api *page_server; /* GUCs */ char *page_server_connstring; -//with substituted password -char *zenith_timeline; -char *zenith_tenant; +/*with substituted password*/ +char *neon_timeline; +char *neon_tenant; 
bool wal_redo = false; int32 max_cluster_size; @@ -143,7 +143,7 @@ consume_prefetch_responses(void) { for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { - ZenithResponse *resp = page_server->receive(); + NeonResponse *resp = page_server->receive(); pfree(resp); } @@ -151,16 +151,16 @@ consume_prefetch_responses(void) n_prefetch_responses = 0; } -static ZenithResponse * +static NeonResponse * page_server_request(void const *req) { consume_prefetch_responses(); - return page_server->request((ZenithRequest *) req); + return page_server->request((NeonRequest *) req); } StringInfoData -zm_pack_request(ZenithRequest *msg) +zm_pack_request(NeonRequest * msg) { StringInfoData s; @@ -170,9 +170,9 @@ zm_pack_request(ZenithRequest *msg) switch (messageTag(msg)) { /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: + case T_NeonExistsRequest: { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -183,9 +183,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithNblocksRequest: + case T_NeonNblocksRequest: { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -196,9 +196,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithDbSizeRequest: + case T_NeonDbSizeRequest: { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -206,9 +206,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithGetPageRequest: + case T_NeonGetPageRequest: { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; pq_sendbyte(&s, 
msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -222,91 +222,91 @@ zm_pack_request(ZenithRequest *msg) } /* pagestore -> pagestore_client. We never need to create these. */ - case T_ZenithExistsResponse: - case T_ZenithNblocksResponse: - case T_ZenithGetPageResponse: - case T_ZenithErrorResponse: - case T_ZenithDbSizeResponse: + case T_NeonExistsResponse: + case T_NeonNblocksResponse: + case T_NeonGetPageResponse: + case T_NeonErrorResponse: + case T_NeonDbSizeResponse: default: - elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag); break; } return s; } -ZenithResponse * +NeonResponse * zm_unpack_response(StringInfo s) { - ZenithMessageTag tag = pq_getmsgbyte(s); - ZenithResponse *resp = NULL; + NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse *resp = NULL; switch (tag) { /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: + case T_NeonExistsResponse: { - ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); msg_resp->tag = tag; msg_resp->exists = pq_getmsgbyte(s); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithNblocksResponse: + case T_NeonNblocksResponse: { - ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); msg_resp->tag = tag; msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithGetPageResponse: + case T_NeonGetPageResponse: { - ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + NeonGetPageResponse *msg_resp = palloc0(offsetof(NeonGetPageResponse, page) + BLCKSZ); msg_resp->tag = tag; /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), 
BLCKSZ); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithDbSizeResponse: + case T_NeonDbSizeResponse: { - ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); msg_resp->tag = tag; msg_resp->db_size = pq_getmsgint64(s); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithErrorResponse: + case T_NeonErrorResponse: { - ZenithErrorResponse *msg_resp; + NeonErrorResponse *msg_resp; size_t msglen; const char *msgtext; msgtext = pq_getmsgrawstring(s); msglen = strlen(msgtext); - msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); msg_resp->tag = tag; memcpy(msg_resp->message, msgtext, msglen + 1); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } @@ -315,12 +315,12 @@ zm_unpack_response(StringInfo s) * * We create these ourselves, and don't need to decode them. 
*/ - case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithGetPageRequest: - case T_ZenithDbSizeRequest: + case T_NeonExistsRequest: + case T_NeonNblocksRequest: + case T_NeonGetPageRequest: + case T_NeonDbSizeRequest: default: - elog(ERROR, "unexpected zenith message tag 0x%02x", tag); + elog(ERROR, "unexpected neon message tag 0x%02x", tag); break; } @@ -329,7 +329,7 @@ zm_unpack_response(StringInfo s) /* dump to json for debugging / error reporting purposes */ char * -zm_to_string(ZenithMessage *msg) +zm_to_string(NeonMessage * msg) { StringInfoData s; @@ -338,11 +338,11 @@ zm_to_string(ZenithMessage *msg) switch (messageTag(msg)) { /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: + case T_NeonExistsRequest: { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -354,11 +354,11 @@ zm_to_string(ZenithMessage *msg) break; } - case T_ZenithNblocksRequest: + case T_NeonNblocksRequest: { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -370,11 +370,11 @@ zm_to_string(ZenithMessage *msg) break; } - case T_ZenithGetPageRequest: + case T_NeonGetPageRequest: { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); 
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -386,11 +386,11 @@ zm_to_string(ZenithMessage *msg) appendStringInfoChar(&s, '}'); break; } - case T_ZenithDbSizeRequest: + case T_NeonDbSizeRequest: { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); @@ -398,61 +398,57 @@ zm_to_string(ZenithMessage *msg) break; } - /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: + case T_NeonExistsResponse: { - ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists - ); + msg_resp->exists); appendStringInfoChar(&s, '}'); break; } - case T_ZenithNblocksResponse: + case T_NeonNblocksResponse: { - ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks - ); + msg_resp->n_blocks); appendStringInfoChar(&s, '}'); break; } - case T_ZenithGetPageResponse: + case T_NeonGetPageResponse: { #if 0 - ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; + NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; #endif - appendStringInfoString(&s, "{\"type\": 
\"ZenithGetPageResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); appendStringInfo(&s, ", \"page\": \"XXX\"}"); appendStringInfoChar(&s, '}'); break; } - case T_ZenithErrorResponse: + case T_NeonErrorResponse: { - ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); appendStringInfoChar(&s, '}'); break; } - case T_ZenithDbSizeResponse: + case T_NeonDbSizeResponse: { - ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; + NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); appendStringInfo(&s, ", \"db_size\": %ld}", - msg_resp->db_size - ); + msg_resp->db_size); appendStringInfoChar(&s, '}'); break; @@ -494,7 +490,7 @@ PageIsEmptyHeapPage(char *buffer) } static void -zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { XLogRecPtr lsn = PageGetLSN(buffer); @@ -551,8 +547,8 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { /* * When PostgreSQL extends a relation, it calls smgrextend() with an - * all-zeros pages, and we can just ignore that in Zenith. We do need - * to remember the new size, though, so that smgrnblocks() returns the + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the * right answer after the rel has been extended. We rely on the * relsize cache for that. 
* @@ -616,12 +612,11 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum); } - /* - * zenith_init() -- Initialize private state + * neon_init() -- Initialize private state */ void -zenith_init(void) +neon_init(void) { /* noop */ #ifdef DEBUG_COMPARE_LOCAL @@ -658,7 +653,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) +neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr lsn; @@ -666,14 +661,14 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc { *latest = false; lsn = GetXLogReplayRecPtr(NULL); - elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); } else if (am_walsender) { *latest = true; lsn = InvalidXLogRecPtr; - elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); + elog(DEBUG1, "am walsender neon_get_request_lsn lsn 0 "); } else { @@ -687,7 +682,7 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc *latest = true; lsn = GetLastWrittenLSN(rnode, forknum, blkno); Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ", + elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); @@ -717,15 +712,14 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc return lsn; } - /* - * zenith_exists() -- Does the physical file exist? + * neon_exists() -- Does the physical file exist? 
*/ bool -zenith_exists(SMgrRelation reln, ForkNumber forkNum) +neon_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; - ZenithResponse *resp; + NeonResponse *resp; BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -777,26 +771,25 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithExistsRequest request = { - .req.tag = T_ZenithExistsRequest, + NeonExistsRequest request = { + .req.tag = T_NeonExistsRequest, .req.latest = latest, .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, - .forknum = forkNum - }; + .forknum = forkNum}; resp = page_server_request(&request); } switch (resp->tag) { - case T_ZenithExistsResponse: - exists = ((ZenithExistsResponse *) resp)->exists; + case T_NeonExistsResponse: + exists = ((NeonExistsResponse *) resp)->exists; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -806,7 +799,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: @@ -817,12 +810,12 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) } /* - * zenith_create() -- Create a new relation on zenithd storage + * neon_create() -- Create a new relation on neond storage * * If isRedo is true, it's okay for the relation to exist already. 
*/ void -zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { switch (reln->smgr_relpersistence) { @@ -866,7 +859,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) } /* - * zenith_unlink() -- Unlink a relation. + * neon_unlink() -- Unlink a relation. * * Note that we're passed a RelFileNodeBackend --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. @@ -884,7 +877,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * we are usually not in a transaction anymore when this is called. */ void -zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { /* * Might or might not exist locally, depending on whether it's an unlogged @@ -899,7 +892,7 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) } /* - * zenith_extend() -- Add a block to the specified relation. + * neon_extend() -- Add a block to the specified relation. * * The semantics are nearly the same as mdwrite(): write at the * specified position. However, this is to be used for the case of @@ -908,8 +901,8 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * causes intervening file space to become filled with zeroes. 
*/ void -zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer, bool skipFsync) +neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer, bool skipFsync) { XLogRecPtr lsn; @@ -951,7 +944,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, errhint("This limit is defined by neon.max_cluster_size GUC"))); } - zenith_wallog_page(reln, forkNum, blkno, buffer); + neon_wallog_page(reln, forkNum, blkno, buffer); set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); @@ -971,10 +964,10 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } /* - * zenith_open() -- Initialize newly-opened relation. + * neon_open() -- Initialize newly-opened relation. */ void -zenith_open(SMgrRelation reln) +neon_open(SMgrRelation reln) { /* * We don't have anything special to do here. Call mdopen() to let md.c @@ -985,14 +978,14 @@ zenith_open(SMgrRelation reln) mdopen(reln); /* no work */ - elog(SmgrTrace, "[ZENITH_SMGR] open noop"); + elog(SmgrTrace, "[NEON_SMGR] open noop"); } /* - * zenith_close() -- Close the specified relation, if it isn't closed already. + * neon_close() -- Close the specified relation, if it isn't closed already. */ void -zenith_close(SMgrRelation reln, ForkNumber forknum) +neon_close(SMgrRelation reln, ForkNumber forknum) { /* * Let md.c close it, if it had it open. 
Doesn't hurt to do this even for @@ -1003,19 +996,19 @@ zenith_close(SMgrRelation reln, ForkNumber forknum) /* - * zenith_reset_prefetch() -- reoe all previously rgistered prefeth requests + * neon_reset_prefetch() -- reoe all previously rgistered prefeth requests */ void -zenith_reset_prefetch(SMgrRelation reln) +neon_reset_prefetch(SMgrRelation reln) { n_prefetch_requests = 0; } /* - * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ bool -zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { switch (reln->smgr_relpersistence) { @@ -1046,14 +1039,14 @@ zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) } /* - * zenith_writeback() -- Tell the kernel to write pages back to storage. + * neon_writeback() -- Tell the kernel to write pages back to storage. * * This accepts a range of blocks because flushing several pages at once is * considerably more efficient than doing so individually. */ void -zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) +neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) { switch (reln->smgr_relpersistence) { @@ -1075,7 +1068,7 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); + elog(SmgrTrace, "[NEON_SMGR] writeback noop"); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1084,14 +1077,14 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* - * While function is defined in the zenith extension it's used within neon_test_utils directly. + * While function is defined in the neon extension it's used within neon_test_utils directly. 
* To avoid breaking tests in the runtime please keep function signature in sync. */ void -zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) +neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) { - ZenithResponse *resp; + NeonResponse *resp; int i; /* @@ -1103,12 +1096,12 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) { resp = page_server->receive(); - if (resp->tag == T_ZenithGetPageResponse && + if (resp->tag == T_NeonGetPageResponse && RelFileNodeEquals(prefetch_responses[i].rnode, rnode) && prefetch_responses[i].forkNum == forkNum && prefetch_responses[i].blockNum == blkno) { - char *page = ((ZenithGetPageResponse *) resp)->page; + char *page = ((NeonGetPageResponse *) resp)->page; /* * Check if prefetched page is still relevant. 
If it is updated by @@ -1135,8 +1128,8 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, n_prefetch_responses = 0; n_prefetch_misses += 1; { - ZenithGetPageRequest request = { - .req.tag = T_ZenithGetPageRequest, + NeonGetPageRequest request = { + .req.tag = T_NeonGetPageRequest, .req.latest = request_latest, .req.lsn = request_lsn, .rnode = rnode, @@ -1147,14 +1140,14 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, if (n_prefetch_requests > 0) { /* Combine all prefetch requests with primary request */ - page_server->send((ZenithRequest *) &request); + page_server->send((NeonRequest *) & request); for (i = 0; i < n_prefetch_requests; i++) { request.rnode = prefetch_requests[i].rnode; request.forknum = prefetch_requests[i].forkNum; request.blkno = prefetch_requests[i].blockNum; prefetch_responses[i] = prefetch_requests[i]; - page_server->send((ZenithRequest *) &request); + page_server->send((NeonRequest *) & request); } page_server->flush(); n_prefetch_responses = n_prefetch_requests; @@ -1164,16 +1157,16 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, } else { - resp = page_server->request((ZenithRequest *) &request); + resp = page_server->request((NeonRequest *) & request); } } switch (resp->tag) { - case T_ZenithGetPageResponse: - memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -1184,7 +1177,7 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; 
default: @@ -1195,11 +1188,11 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, } /* - * zenith_read() -- Read the specified block from a relation. + * neon_read() -- Read the specified block from a relation. */ void -zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) { bool latest; XLogRecPtr request_lsn; @@ -1221,8 +1214,8 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); - zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); + neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -1328,15 +1321,15 @@ hexdump_page(char *page) #endif /* - * zenith_write() -- Write the supplied block at the appropriate location. + * neon_write() -- Write the supplied block at the appropriate location. * * This is to be used only for updating already-existing blocks of a * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). 
*/ void -zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) +neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) { XLogRecPtr lsn; @@ -1372,7 +1365,7 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - zenith_wallog_page(reln, forknum, blocknum, buffer); + neon_wallog_page(reln, forknum, blocknum, buffer); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", @@ -1389,12 +1382,12 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * zenith_nblocks() -- Get the number of blocks stored in a relation. + * neon_nblocks() -- Get the number of blocks stored in a relation. */ BlockNumber -zenith_nblocks(SMgrRelation reln, ForkNumber forknum) +neon_nblocks(SMgrRelation reln, ForkNumber forknum) { - ZenithResponse *resp; + NeonResponse *resp; BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -1426,10 +1419,10 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithNblocksRequest request = { - .req.tag = T_ZenithNblocksRequest, + NeonNblocksRequest request = { + .req.tag = T_NeonNblocksRequest, .req.latest = latest, .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, @@ -1441,11 +1434,11 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) switch (resp->tag) { - case T_ZenithNblocksResponse: - n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; + case T_NeonNblocksResponse: + n_blocks = ((NeonNblocksResponse *) resp)->n_blocks; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, 
(errcode(ERRCODE_IO_ERROR), errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -1455,7 +1448,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: @@ -1463,7 +1456,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) } update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); - elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, @@ -1476,21 +1469,21 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) } /* - * zenith_db_size() -- Get the size of the database in bytes. + * neon_db_size() -- Get the size of the database in bytes. 
*/ int64 -zenith_dbsize(Oid dbNode) +neon_dbsize(Oid dbNode) { - ZenithResponse *resp; + NeonResponse *resp; int64 db_size; XLogRecPtr request_lsn; bool latest; RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; - request_lsn = zenith_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithDbSizeRequest request = { - .req.tag = T_ZenithDbSizeRequest, + NeonDbSizeRequest request = { + .req.tag = T_NeonDbSizeRequest, .req.latest = latest, .req.lsn = request_lsn, .dbNode = dbNode, @@ -1501,25 +1494,25 @@ zenith_dbsize(Oid dbNode) switch (resp->tag) { - case T_ZenithDbSizeResponse: - db_size = ((ZenithDbSizeResponse *) resp)->db_size; + case T_NeonDbSizeResponse: + db_size = ((NeonDbSizeResponse *) resp)->db_size; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read db size of db %u from page server at lsn %X/%08X", dbNode, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); } - elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", + elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", dbNode, (uint32) (request_lsn >> 32), (uint32) request_lsn, db_size); @@ -1529,10 +1522,10 @@ zenith_dbsize(Oid dbNode) } /* - * zenith_truncate() -- Truncate relation to specified number of blocks. + * neon_truncate() -- Truncate relation to specified number of blocks. 
*/ void -zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { XLogRecPtr lsn; @@ -1591,7 +1584,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } /* - * zenith_immedsync() -- Immediately sync a relation to stable storage. + * neon_immedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows * nothing of dirty buffers that may exist inside the buffer manager. We @@ -1602,7 +1595,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * segment may survive recovery, reintroducing unwanted data into the table. */ void -zenith_immedsync(SMgrRelation reln, ForkNumber forknum) +neon_immedsync(SMgrRelation reln, ForkNumber forknum) { switch (reln->smgr_relpersistence) { @@ -1622,7 +1615,7 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); + elog(SmgrTrace, "[NEON_SMGR] immedsync noop"); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1631,16 +1624,16 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) } /* - * zenith_start_unlogged_build() -- Starting build operation on a rel. + * neon_start_unlogged_build() -- Starting build operation on a rel. * * Some indexes are built in two phases, by first populating the table with * regular inserts, using the shared buffer cache but skipping WAL-logging, - * and WAL-logging the whole relation after it's done. Zenith relies on the + * and WAL-logging the whole relation after it's done. Neon relies on the * WAL to reconstruct pages, so we cannot use the page server in the * first phase when the changes are not logged. 
*/ static void -zenith_start_unlogged_build(SMgrRelation reln) +neon_start_unlogged_build(SMgrRelation reln) { /* * Currently, there can be only one unlogged relation build operation in @@ -1692,13 +1685,13 @@ zenith_start_unlogged_build(SMgrRelation reln) } /* - * zenith_finish_unlogged_build_phase_1() + * neon_finish_unlogged_build_phase_1() * * Call this after you have finished populating a relation in unlogged mode, * before you start WAL-logging it. */ static void -zenith_finish_unlogged_build_phase_1(SMgrRelation reln) +neon_finish_unlogged_build_phase_1(SMgrRelation reln) { Assert(unlogged_build_rel == reln); @@ -1718,7 +1711,7 @@ zenith_finish_unlogged_build_phase_1(SMgrRelation reln) } /* - * zenith_end_unlogged_build() -- Finish an unlogged rel build. + * neon_end_unlogged_build() -- Finish an unlogged rel build. * * Call this after you have finished WAL-logging an relation that was * first populated without WAL-logging. @@ -1727,7 +1720,7 @@ zenith_finish_unlogged_build_phase_1(SMgrRelation reln) * WAL-logged and is present in the page server. 
*/ static void -zenith_end_unlogged_build(SMgrRelation reln) +neon_end_unlogged_build(SMgrRelation reln) { Assert(unlogged_build_rel == reln); @@ -1769,7 +1762,7 @@ zenith_end_unlogged_build(SMgrRelation reln) } static void -AtEOXact_zenith(XactEvent event, void *arg) +AtEOXact_neon(XactEvent event, void *arg) { switch (event) { @@ -1802,47 +1795,46 @@ AtEOXact_zenith(XactEvent event, void *arg) } } -static const struct f_smgr zenith_smgr = +static const struct f_smgr neon_smgr = { - .smgr_init = zenith_init, + .smgr_init = neon_init, .smgr_shutdown = NULL, - .smgr_open = zenith_open, - .smgr_close = zenith_close, - .smgr_create = zenith_create, - .smgr_exists = zenith_exists, - .smgr_unlink = zenith_unlink, - .smgr_extend = zenith_extend, - .smgr_prefetch = zenith_prefetch, - .smgr_reset_prefetch = zenith_reset_prefetch, - .smgr_read = zenith_read, - .smgr_write = zenith_write, - .smgr_writeback = zenith_writeback, - .smgr_nblocks = zenith_nblocks, - .smgr_truncate = zenith_truncate, - .smgr_immedsync = zenith_immedsync, + .smgr_open = neon_open, + .smgr_close = neon_close, + .smgr_create = neon_create, + .smgr_exists = neon_exists, + .smgr_unlink = neon_unlink, + .smgr_extend = neon_extend, + .smgr_prefetch = neon_prefetch, + .smgr_reset_prefetch = neon_reset_prefetch, + .smgr_read = neon_read, + .smgr_write = neon_write, + .smgr_writeback = neon_writeback, + .smgr_nblocks = neon_nblocks, + .smgr_truncate = neon_truncate, + .smgr_immedsync = neon_immedsync, - .smgr_start_unlogged_build = zenith_start_unlogged_build, - .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, - .smgr_end_unlogged_build = zenith_end_unlogged_build, + .smgr_start_unlogged_build = neon_start_unlogged_build, + .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, + .smgr_end_unlogged_build = neon_end_unlogged_build, }; - const f_smgr * -smgr_zenith(BackendId backend, RelFileNode rnode) +smgr_neon(BackendId backend, RelFileNode rnode) { /* Don't 
use page server for temp relations */ if (backend != InvalidBackendId) return smgr_standard(backend, rnode); else - return &zenith_smgr; + return &neon_smgr; } void -smgr_init_zenith(void) +smgr_init_neon(void) { - RegisterXactCallback(AtEOXact_zenith, NULL); + RegisterXactCallback(AtEOXact_neon, NULL); smgr_init_standard(); - zenith_init(); + neon_init(); } diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 31021f3e41..d4262c730a 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -56,7 +56,7 @@ static void relsize_shmem_request(void); #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) static void -zenith_smgr_shmem_startup(void) +neon_smgr_shmem_startup(void) { static HASHCTL info; @@ -174,14 +174,14 @@ relsize_hash_init(void) #endif prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = zenith_smgr_shmem_startup; + shmem_startup_hook = neon_smgr_shmem_startup; } } #if PG_VERSION_NUM >= 150000 /* * shmem_request hook: request additional shared resources. We'll allocate or - * attach to the shared resources in zenith_smgr_shmem_startup(). + * attach to the shared resources in neon_smgr_shmem_startup(). 
*/ static void relsize_shmem_request(void) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 05257ced4c..fc0b660a64 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -71,14 +71,13 @@ #include "walproposer_utils.h" #include "replication/walpropshim.h" - char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; int wal_acceptor_connect_timeout; bool am_wal_proposer; -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; +char *neon_timeline_walproposer = NULL; +char *neon_tenant_walproposer = NULL; /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ WalProposerFunctionsType *WalProposerFunctions = NULL; @@ -89,7 +88,7 @@ static int n_safekeepers = 0; static int quorum = 0; static Safekeeper safekeeper[MAX_SAFEKEEPERS]; static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to* * safekeepers */ static ProposerGreeting greetRequest; static VoteRequest voteRequest; /* Vote request for safekeeper */ @@ -162,7 +161,6 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); - static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void nwp_prepare_shmem(void); @@ -176,7 +174,6 @@ static shmem_request_hook_type prev_shmem_request_hook = NULL; static void walproposer_shmem_request(void); #endif - void pg_init_walproposer(void) { @@ -207,10 +204,9 @@ nwp_register_gucs(void) &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ PGC_POSTMASTER, - GUC_LIST_INPUT, /* extensions can't use + GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ - NULL, NULL, NULL - ); + NULL, NULL, NULL); DefineCustomIntVariable( 
"neon.safekeeper_reconnect_timeout", @@ -220,8 +216,7 @@ nwp_register_gucs(void) 1000, 0, INT_MAX, /* default, min, max */ PGC_SIGHUP, /* context */ GUC_UNIT_MS, /* flags */ - NULL, NULL, NULL - ); + NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_connect_timeout", @@ -231,9 +226,7 @@ nwp_register_gucs(void) 5000, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, - NULL, NULL, NULL - ); - + NULL, NULL, NULL); } /* shmem handling */ @@ -499,19 +492,19 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) greetRequest.pgVersion = PG_VERSION_NUM; pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); greetRequest.systemId = systemId; - if (!zenith_timeline_walproposer) + if (!neon_timeline_walproposer) elog(FATAL, "neon.timeline_id is not provided"); - if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) + if (*neon_timeline_walproposer != '\0' && + !HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer); + if (!neon_tenant_walproposer) elog(FATAL, "neon.tenant_id is not provided"); - if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); + if (*neon_tenant_walproposer != '\0' && + !HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer); #if PG_VERSION_NUM >= 150000 -/* FIXME don't use hardcoded timeline id */ + /* FIXME don't use hardcoded timeline id */ greetRequest.timeline = 1; #else greetRequest.timeline = ThisTimeLineID; @@ -657,8 +650,8 @@ ResetConnection(Safekeeper *sk) int written = 0; written = snprintf((char *) 
&sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer); /* * currently connection string is not that long, but once we pass @@ -1326,8 +1319,7 @@ DetermineEpochStartLsn(void) propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), safekeeper[donor].host, safekeeper[donor].port, - LSN_FORMAT_ARGS(truncateLsn) - ); + LSN_FORMAT_ARGS(truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN @@ -1373,8 +1365,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec WalReceiverConn *wrconn; WalRcvStreamOptions options; - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + safekeeper[donor].host, safekeeper[donor].port, neon_timeline_walproposer, neon_tenant_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { @@ -1544,8 +1536,7 @@ SendProposerElected(Safekeeper *sk) else { XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - sk->voteResponse.flushLsn); + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? 
th->entries[i + 1].lsn : sk->voteResponse.flushLsn); sk->startStreamingAt = Min(propEndLsn, skEndLsn); } @@ -1759,7 +1750,7 @@ SendAppendRequests(Safekeeper *sk) req->beginLsn, req->endLsn - req->beginLsn, #if PG_VERSION_NUM >= 150000 - /* FIXME don't use hardcoded timelineid here */ + /* FIXME don't use hardcoded timeline_id here */ 1, #else ThisTimeLineID, @@ -1784,9 +1775,9 @@ SendAppendRequests(Safekeeper *sk) case PG_ASYNC_WRITE_TRY_FLUSH: /* - * We still need to call PQflush some more to finish the job. - * Caller function will handle this by setting right event - * set. + * * We still need to call PQflush some more to finish the + * job. Caller function will handle this by setting right + * event* set. */ sk->flushWrite = true; return true; @@ -1885,40 +1876,40 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * if (strcmp(key, "current_timeline_size") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->currentClusterSize = pq_getmsgint64(reply_message); + /* read value length */ + rf->currentClusterSize = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", rf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_writelsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_writelsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", LSN_FORMAT_ARGS(rf->ps_writelsn)); } else if (strcmp(key, "ps_flushlsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_flushlsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_flushlsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", LSN_FORMAT_ARGS(rf->ps_flushlsn)); } else if (strcmp(key, "ps_applylsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - 
//read value length - rf->ps_applylsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_applylsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", LSN_FORMAT_ARGS(rf->ps_applylsn)); } else if (strcmp(key, "ps_replytime") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_replytime = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; @@ -1933,13 +1924,13 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * else { len = pq_getmsgint(reply_message, sizeof(int32)); - //read value length + /* read value length */ /* * Skip unknown keys to support backward compatibile protocol * changes */ - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1973,7 +1964,6 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) } } - /* * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the * last WAL record that can be safely discarded. @@ -2009,8 +1999,7 @@ GetAcknowledgedByQuorumWALPosition(void) * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? - safekeeper[i].appendResponse.flushLsn : 0; + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? 
safekeeper[i].appendResponse.flushLsn : 0; } qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); @@ -2058,7 +2047,6 @@ replication_feedback_set(ReplicationFeedback * rf) SpinLockRelease(&walprop_shared->mutex); } - void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { @@ -2069,12 +2057,11 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe SpinLockRelease(&walprop_shared->mutex); } - /* * Get ReplicationFeedback fields from the most advanced safekeeper */ static void -GetLatestZentihFeedback(ReplicationFeedback * rf) +GetLatestNeonFeedback(ReplicationFeedback * rf) { int latest_safekeeper = 0; XLogRecPtr ps_writelsn = InvalidXLogRecPtr; @@ -2094,7 +2081,7 @@ GetLatestZentihFeedback(ReplicationFeedback * rf) rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; - elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", rf->currentClusterSize, LSN_FORMAT_ARGS(rf->ps_writelsn), @@ -2113,14 +2100,13 @@ HandleSafekeeperResponse(void) XLogRecPtr diskConsistentLsn; XLogRecPtr minFlushLsn; - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; if (!syncSafekeepers) { /* Get ReplicationFeedback fields from the most advanced safekeeper */ - GetLatestZentihFeedback(&quorumFeedback.rf); + GetLatestNeonFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } @@ -2139,7 +2125,7 @@ HandleSafekeeperResponse(void) quorumFeedback.flushLsn, /* - * apply_lsn - This is what processed and durably saved at + * apply_lsn - This is what processed and durably saved at* * pageserver. 
*/ quorumFeedback.rf.ps_flushlsn, @@ -2460,7 +2446,7 @@ backpressure_lag_impl(void) XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024*1024) +#define MB ((XLogRecPtr)1024 * 1024) elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), @@ -2468,23 +2454,17 @@ backpressure_lag_impl(void) LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr)); - if ((writePtr != InvalidXLogRecPtr - && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag * MB)) + if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB)) { return (myFlushLsn - writePtr - max_replication_write_lag * MB); } - if ((flushPtr != InvalidXLogRecPtr - && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) + if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) { return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); } - if ((applyPtr != InvalidXLogRecPtr - && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) + if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) { return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 59e70f33bf..051c7c02a6 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,16 +10,16 @@ #include "utils/uuid.h" #include "replication/walreceiver.h" -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 -#define MAX_SAFEKEEPERS 32 -#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single - * WAL message */ -#define XLOG_HDR_SIZE 
(1+8*3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL + * message */ +#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ -#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender +#define XLOG_HDR_END_POS (1 + 8) /* offset of end position in wal sender* * message header */ /* @@ -39,8 +39,8 @@ typedef struct WalProposerConn WalProposerConn; struct WalMessage; typedef struct WalMessage WalMessage; -extern char *zenith_timeline_walproposer; -extern char *zenith_tenant_walproposer; +extern char *neon_timeline_walproposer; +extern char *neon_tenant_walproposer; /* Possible return values from ReadPGAsync */ typedef enum @@ -170,8 +170,8 @@ typedef struct ProposerGreeting uint32 pgVersion; pg_uuid_t proposerId; uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - uint8 ztenantid[16]; + uint8 timeline_id[16]; /* Neon timeline id */ + uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; } ProposerGreeting; @@ -226,7 +226,7 @@ typedef struct VoteResponse * proposer to choose the most advanced one. 
*/ XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ @@ -283,7 +283,6 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; - typedef struct ReplicationFeedback { /* current size of the timeline on pageserver */ @@ -295,7 +294,6 @@ typedef struct ReplicationFeedback TimestampTz ps_replytime; } ReplicationFeedback; - typedef struct WalproposerShmemState { slock_t mutex; @@ -323,7 +321,7 @@ typedef struct AppendResponse XLogRecPtr commitLsn; HotStandbyFeedback hs; /* Feedback recieved from pageserver includes standby_status_update fields */ - /* and custom zenith feedback. */ + /* and custom neon feedback. */ /* This part of the message is extensible. */ ReplicationFeedback rf; } AppendResponse; @@ -332,7 +330,6 @@ typedef struct AppendResponse /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) - /* * Descriptor of safekeeper */ @@ -340,7 +337,7 @@ typedef struct Safekeeper { char const *host; char const *port; - char conninfo[MAXCONNINFO]; /* connection info for + char conninfo[MAXCONNINFO]; /* connection info for* * connecting/reconnecting */ /* @@ -366,12 +363,12 @@ typedef struct Safekeeper */ XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush, + bool flushWrite; /* set to true if we need to call AsyncFlush,* * to flush pending messages */ XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ - int eventPos; /* position in wait event set. Equal to -1 if + int eventPos; /* position in wait event set. 
Equal to -1 if* * no event */ SafekeeperState state; /* safekeeper state machine state */ TimestampTz startedConnAt; /* when connection attempt started */ @@ -380,7 +377,6 @@ typedef struct Safekeeper AppendResponse appendResponse; /* feedback for master */ } Safekeeper; - extern PGDLLIMPORT void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); void WalProposerPoll(void); diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 07bd7bdd28..e0cea4177b 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -36,13 +36,13 @@ PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); /* - * Linkage to functions in zenith module. + * Linkage to functions in neon module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -typedef void (*zenith_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +typedef void (*neon_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); -static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; +static neon_read_at_lsn_type neon_read_at_lsn_ptr; /* * Module initialize function: fetch function pointers for cross-module calls. 
@@ -51,13 +51,13 @@ void _PG_init(void) { /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); - zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) - load_external_function("$libdir/neon", "zenith_read_at_lsn", + AssertVariableIsOfType(&neon_read_at_lsn, neon_read_at_lsn_type); + neon_read_at_lsn_ptr = (neon_read_at_lsn_type) + load_external_function("$libdir/neon", "neon_read_at_lsn", true, NULL); } -#define zenith_read_at_lsn zenith_read_at_lsn_ptr +#define neon_read_at_lsn neon_read_at_lsn_ptr /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. @@ -96,7 +96,7 @@ test_consume_xids(PG_FUNCTION_ARGS) Datum clear_buffer_cache(PG_FUNCTION_ARGS) { - bool save_zenith_test_evict; + bool save_neon_test_evict; /* * Temporarily set the zenith_test_evict GUC, so that when we pin and @@ -104,7 +104,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) * buffers, as there is no explicit "evict this buffer" function in the * buffer manager. 
*/ - save_zenith_test_evict = zenith_test_evict; + save_neon_test_evict = zenith_test_evict; zenith_test_evict = true; PG_TRY(); { @@ -149,14 +149,13 @@ clear_buffer_cache(PG_FUNCTION_ARGS) PG_FINALLY(); { /* restore the GUC */ - zenith_test_evict = save_zenith_test_evict; + zenith_test_evict = save_neon_test_evict; } PG_END_TRY(); PG_RETURN_VOID(); } - /* * Reads the page from page server without buffer cache * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN @@ -232,7 +231,6 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); - forknum = forkname_to_number(text_to_cstring(forkname)); /* Initialize buffer to copy to */ @@ -240,7 +238,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); relation_close(rel, AccessShareLock); @@ -272,8 +270,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) RelFileNode rnode = { .spcNode = PG_GETARG_OID(0), .dbNode = PG_GETARG_OID(1), - .relNode = PG_GETARG_OID(2) - }; + .relNode = PG_GETARG_OID(2)}; ForkNumber forknum = PG_GETARG_UINT32(3); @@ -281,14 +278,13 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) bool request_latest = PG_ARGISNULL(5); uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); - /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5a450793f1..5417f4f2b3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -32,7 +32,7 @@ sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" git-version = "0.3.5" diff --git a/pyproject.toml b/pyproject.toml index ec166ea7cd..9c2aa39c7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "zenith" +name = "neon" version = "0.1.0" description = "" authors = [] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 4ed30413e2..cae095c3c2 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -14,8 +14,8 @@ tracing = "0.1.27" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["macros", "fs"] } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } 
anyhow = "1.0" crc32c = "0.6.0" humantime = "2.1.0" @@ -25,7 +25,7 @@ serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" hex = "0.4.3" const_format = "0.2.21" -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } git-version = "0.3.5" async-trait = "0.1" once_cell = "1.13.0" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 244c793250..d518ac01cc 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -30,8 +30,8 @@ use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use utils::auth::JwtAuth; use utils::{ - http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, - zid::NodeId, + http::endpoint, id::NodeId, logging, project_git_version, shutdown::exit_now, signals, + tcp_listener, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; @@ -39,7 +39,7 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Zenith safekeeper") + let arg_matches = App::new("Neon safekeeper") .about("Store WAL stream to local file system and push it to WAL receivers") .version(GIT_VERSION) .arg( diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index ce66131700..f276fad613 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -22,7 +22,7 @@ use etcd_broker::{ subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, Client, PutOptions, }; -use utils::zid::{NodeId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; @@ -45,7 +45,7 @@ pub fn thread_main(conf: SafeKeeperConf) { /// Key to per timeline per safekeeper data. 
fn timeline_safekeeper_path( broker_etcd_prefix: String, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, sk_id: NodeId, ) -> String { format!( @@ -162,12 +162,12 @@ pub fn get_candiate_name(system_id: NodeId) -> String { } async fn push_sk_info( - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, mut client: Client, key: String, sk_info: SkTimelineInfo, mut lease: Lease, -) -> anyhow::Result<(ZTenantTimelineId, Lease)> { +) -> anyhow::Result<(TenantTimelineId, Lease)> { let put_opts = PutOptions::new().with_lease(lease.id); client .put( @@ -202,7 +202,7 @@ struct Lease { /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; - let mut leases: HashMap = HashMap::new(); + let mut leases: HashMap = HashMap::new(); let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); loop { diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 7fc75246e1..ff23f0360f 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -14,7 +14,7 @@ use tracing::*; use crate::control_file_upgrade::upgrade_control_file; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; -use utils::{bin_ser::LeSer, zid::ZTenantTimelineId}; +use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -55,7 +55,7 @@ pub struct FileStorage { } impl FileStorage { - pub fn restore_new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> Result { + pub fn restore_new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { let timeline_dir = conf.timeline_dir(zttid); let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); @@ -72,7 +72,7 @@ impl FileStorage { } pub fn create_new( - zttid: &ZTenantTimelineId, + zttid: 
&TenantTimelineId, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { @@ -115,7 +115,7 @@ impl FileStorage { // Load control file for given zttid at path specified by conf. pub fn load_control_file_conf( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result { let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); Self::load_control_file(path) @@ -252,7 +252,7 @@ mod test { use crate::{safekeeper::SafeKeeperState, SafeKeeperConf}; use anyhow::Result; use std::fs; - use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + use utils::{id::TenantTimelineId, lsn::Lsn}; fn stub_conf() -> SafeKeeperConf { let workdir = tempfile::tempdir().unwrap().into_path(); @@ -264,7 +264,7 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); Ok(( @@ -275,7 +275,7 @@ mod test { fn create( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); @@ -286,7 +286,7 @@ mod test { #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); - let zttid = ZTenantTimelineId::generate(); + let zttid = TenantTimelineId::generate(); { let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); // change something @@ -301,7 +301,7 @@ mod test { #[test] fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); - let zttid = ZTenantTimelineId::generate(); + let zttid = TenantTimelineId::generate(); { let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state"); diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 91d2f61c10..87204d6b49 100644 --- 
a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -7,9 +7,9 @@ use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ bin_ser::LeSer, + id::{TenantId, TimelineId}, lsn::Lsn, pq_proto::SystemId, - zid::{ZTenantId, ZTimelineId}, }; /// Persistent consensus state of the acceptor. @@ -45,9 +45,8 @@ pub struct ServerInfoV2 { /// Postgres server version pub pg_version: u32, pub system_id: SystemId, - pub tenant_id: ZTenantId, - /// Zenith timelineid - pub ztli: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub wal_seg_size: u32, } @@ -76,10 +75,9 @@ pub struct ServerInfoV3 { pub pg_version: u32, pub system_id: SystemId, #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub wal_seg_size: u32, } @@ -106,10 +104,9 @@ pub struct SafeKeeperStateV3 { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperStateV4 { #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -154,7 +151,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }; return Ok(SafeKeeperState { tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, + timeline_id: oldstate.server.timeline_id, acceptor_state: ac, server: ServerInfo { pg_version: oldstate.server.pg_version, @@ -181,7 +178,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }; return Ok(SafeKeeperState { tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, + timeline_id: oldstate.server.timeline_id, acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, @@ -193,9 +190,9 @@ pub fn 
upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); - // migrate to moving ztenantid/ztli to the top and adding some lsns + // migrate to moving tenant_id/timeline_id to the top and adding some lsns } else if version == 3 { - info!("reading safekeeper control file version {}", version); + info!("reading safekeeper control file version {version}"); let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?; let server = ServerInfo { pg_version: oldstate.server.pg_version, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 3e301259ed..41b9ad66e1 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -14,10 +14,10 @@ use regex::Regex; use std::sync::Arc; use tracing::info; use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, postgres_backend::{self, PostgresBackend}, pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; /// Safekeeper handler of postgres commands @@ -25,8 +25,8 @@ pub struct SafekeeperPostgresHandler { pub conf: SafeKeeperConf, /// assigned application name pub appname: Option, - pub ztenantid: Option, - pub ztimelineid: Option, + pub tenant_id: Option, + pub timeline_id: Option, pub timeline: Option>, } @@ -63,17 +63,17 @@ fn parse_cmd(cmd: &str) -> Result { } impl postgres_backend::Handler for SafekeeperPostgresHandler { - // ztenant id and ztimeline id are passed in connection string params + // tenant_id and timeline_id are passed in connection string params fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { if let FeStartupPacket::StartupMessage { params, .. 
} = sm { if let Some(options) = params.options_raw() { for opt in options { match opt.split_once('=') { - Some(("ztenantid", value)) => { - self.ztenantid = Some(value.parse()?); + Some(("tenant_id", value)) => { + self.tenant_id = Some(value.parse()?); } - Some(("ztimelineid", value)) => { - self.ztimelineid = Some(value.parse()?); + Some(("timeline_id", value)) => { + self.timeline_id = Some(value.parse()?); } _ => continue, } @@ -95,18 +95,18 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { info!( "got query {:?} in timeline {:?}", - query_string, self.ztimelineid + query_string, self.timeline_id ); let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); - let tenantid = self.ztenantid.context("tenantid is required")?; - let timelineid = self.ztimelineid.context("timelineid is required")?; + let tenant_id = self.tenant_id.context("tenant_id is required")?; + let timeline_id = self.timeline_id.context("timeline_id is required")?; if self.timeline.is_none() { self.timeline.set( &self.conf, - ZTenantTimelineId::new(tenantid, timelineid), + TenantTimelineId::new(tenant_id, timeline_id), create, )?; } @@ -121,7 +121,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } - .context(format!("timeline {timelineid}"))?; + .context(format!("timeline {timeline_id}"))?; Ok(()) } @@ -132,8 +132,8 @@ impl SafekeeperPostgresHandler { SafekeeperPostgresHandler { conf, appname: None, - ztenantid: None, - ztimelineid: None, + tenant_id: None, + timeline_id: None, timeline: None, } } diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index 4b3ae7798e..e13ea50eaf 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,8 +1,8 @@ use serde::{Deserialize, 
Serialize}; -use utils::zid::{NodeId, ZTimelineId}; +use utils::id::{NodeId, TimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub peer_ids: Vec, } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 13356c5921..14c9414c09 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -21,8 +21,8 @@ use utils::{ request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::models::TimelineCreateRequest; @@ -68,9 +68,9 @@ struct AcceptorStateStatus { #[derive(Debug, Serialize)] struct TimelineStatus { #[serde(serialize_with = "display_serialize")] - tenant_id: ZTenantId, + tenant_id: TenantId, #[serde(serialize_with = "display_serialize")] - timeline_id: ZTimelineId, + timeline_id: TimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] flush_lsn: Lsn, @@ -90,7 +90,7 @@ struct TimelineStatus { /// Report info about timeline. 
async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); @@ -125,7 +125,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, ApiError> { let request_data: TimelineCreateRequest = json_request(&mut request).await?; - let zttid = ZTenantTimelineId { + let zttid = TenantTimelineId { tenant_id: parse_request_param(&request, "tenant_id")?, timeline_id: request_data.timeline_id, }; @@ -146,7 +146,7 @@ async fn timeline_create_handler(mut request: Request) -> Result, ) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); @@ -181,7 +181,7 @@ async fn tenant_delete_force_handler( /// Used only in tests to hand craft required data. async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 16c1d36131..00fc43521b 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -97,8 +97,8 @@ fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { pg_version: 0, // unknown proposer_id: [0u8; 16], system_id: 0, - ztli: spg.ztimelineid.unwrap(), - tenant_id: spg.ztenantid.unwrap(), + timeline_id: spg.timeline_id.unwrap(), + tenant_id: spg.tenant_id.unwrap(), tli: 0, wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests }); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 0335d61d3f..b466d5aab5 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use 
std::time::Duration; use url::Url; -use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId}; pub mod broker; pub mod control_file; @@ -61,11 +61,11 @@ pub struct SafeKeeperConf { } impl SafeKeeperConf { - pub fn tenant_dir(&self, tenant_id: &ZTenantId) -> PathBuf { + pub fn tenant_dir(&self, tenant_id: &TenantId) -> PathBuf { self.workdir.join(tenant_id.to_string()) } - pub fn timeline_dir(&self, zttid: &ZTenantTimelineId) -> PathBuf { + pub fn timeline_dir(&self, zttid: &TenantTimelineId) -> PathBuf { self.tenant_dir(&zttid.tenant_id) .join(zttid.timeline_id.to_string()) } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index c693035dd3..3fa3916266 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ Gauge, IntGaugeVec, }; use postgres_ffi::XLogSegNo; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, @@ -16,7 +16,7 @@ use crate::{ }; pub struct FullTimelineInfo { - pub zttid: ZTenantTimelineId, + pub zttid: TenantTimelineId, pub replicas: Vec, pub wal_backup_active: bool, pub timeline_is_active: bool, diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index af4cfb6ba4..b0b6a73621 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -53,7 +53,7 @@ impl<'pg> ReceiveWalConn<'pg> { /// Receive WAL from wal_proposer pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let _enter = info_span!("WAL acceptor", timeline = %spg.ztimelineid.unwrap()).entered(); + let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered(); // Notify the libpq client that it's allowed to send `CopyData` messages self.pg_backend diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index a2bdcb55e7..fa045eed90 100644 --- 
a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -19,9 +19,9 @@ use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; use utils::{ bin_ser::LeSer, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, pq_proto::{ReplicationFeedback, SystemId}, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; pub const SK_MAGIC: u32 = 0xcafeceefu32; @@ -166,10 +166,9 @@ pub struct Peers(pub Vec<(NodeId, PeerInfo)>); #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperState { #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -219,7 +218,7 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new(zttid: &TenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { tenant_id: zttid.tenant_id, timeline_id: zttid.timeline_id, @@ -245,7 +244,7 @@ impl SafeKeeperState { #[cfg(test)] pub fn empty() -> Self { - SafeKeeperState::new(&ZTenantTimelineId::empty(), vec![]) + SafeKeeperState::new(&TenantTimelineId::empty(), vec![]) } } @@ -260,9 +259,8 @@ pub struct ProposerGreeting { pub pg_version: u32, pub proposer_id: PgUuid, pub system_id: SystemId, - /// Zenith timelineid - pub ztli: ZTimelineId, - pub tenant_id: ZTenantId, + pub timeline_id: TimelineId, + pub tenant_id: TenantId, pub tli: TimeLineID, pub wal_seg_size: u32, } @@ -507,13 +505,13 @@ where { // constructor pub fn new( - ztli: ZTimelineId, + timeline_id: TimelineId, state: CTRL, mut wal_store: WAL, node_id: NodeId, ) -> Result> { - if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { - bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, 
state.timeline_id); + if state.timeline_id != TimelineId::from([0u8; 16]) && timeline_id != state.timeline_id { + bail!("Calling SafeKeeper::new with inconsistent timeline_id ({}) and SafeKeeperState.server.timeline_id ({})", timeline_id, state.timeline_id); } // initialize wal_store, if state is already initialized @@ -600,10 +598,10 @@ where self.state.tenant_id ); } - if msg.ztli != self.state.timeline_id { + if msg.timeline_id != self.state.timeline_id { bail!( "invalid timeline ID, got {}, expected {}", - msg.ztli, + msg.timeline_id, self.state.timeline_id ); } @@ -982,9 +980,9 @@ mod tests { persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let ztli = ZTimelineId::from([0u8; 16]); + let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1000,7 +998,7 @@ mod tests { persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(timeline_id, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1016,9 +1014,9 @@ mod tests { persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let ztli = ZTimelineId::from([0u8; 16]); + let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 293cf67c57..375b6eea18 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -30,7 +30,7 @@ use utils::{ 
// See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; -// zenith extension of replication protocol +// neon extension of replication protocol const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; @@ -105,7 +105,7 @@ impl ReplicationConn { match &msg { FeMessage::CopyData(m) => { // There's three possible data messages that the client is supposed to send here: - // `HotStandbyFeedback` and `StandbyStatusUpdate` and `ZenithStandbyFeedback`. + // `HotStandbyFeedback` and `StandbyStatusUpdate` and `NeonStandbyFeedback`. match m.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { @@ -165,12 +165,12 @@ impl ReplicationConn { pgb: &mut PostgresBackend, mut start_pos: Lsn, ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap()).entered(); + let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); // spawn the background thread which receives HotStandbyFeedback messages. let bg_timeline = Arc::clone(spg.timeline.get()); let bg_stream_in = self.stream_in.take().unwrap(); - let bg_timeline_id = spg.ztimelineid.unwrap(); + let bg_timeline_id = spg.timeline_id.unwrap(); let state = ReplicaState::new(); // This replica_id is used below to check if it's time to stop replication. 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 8d101e6ff6..cf317c41c3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -21,9 +21,9 @@ use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ + id::{NodeId, TenantId, TenantTimelineId}, lsn::Lsn, pq_proto::ReplicationFeedback, - zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; use crate::control_file; @@ -98,7 +98,7 @@ impl SharedState { /// Initialize timeline state, creating control file fn create( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, peer_ids: Vec, ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); @@ -119,7 +119,7 @@ impl SharedState { /// Restore SharedState from control file. /// If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, zttid: &ZTenantTimelineId) -> Result { + fn restore(conf: &SafeKeeperConf, zttid: &TenantTimelineId) -> Result { let control_store = control_file::FileStorage::restore_new(zttid, conf)?; let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); @@ -143,7 +143,7 @@ impl SharedState { /// Mark timeline active/inactive and return whether s3 offloading requires /// start/stop action. - fn update_status(&mut self, ttid: ZTenantTimelineId) -> bool { + fn update_status(&mut self, ttid: TenantTimelineId) -> bool { let is_active = self.is_active(); if self.active != is_active { info!("timeline {} active={} now", ttid, is_active); @@ -213,7 +213,7 @@ impl SharedState { // // To choose what feedback to use and resend to compute node, // we need to know which pageserver compute node considers to be main. 
- // See https://github.com/zenithdb/zenith/issues/1171 + // See https://github.com/neondatabase/neon/issues/1171 // if let Some(pageserver_feedback) = state.pageserver_feedback { if let Some(acc_feedback) = acc.pageserver_feedback { @@ -227,7 +227,7 @@ impl SharedState { // last lsn received by pageserver // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. - // See https://github.com/zenithdb/zenith/issues/1171 + // See https://github.com/neondatabase/neon/issues/1171 acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); // When at least one pageserver has preserved data up to remote_consistent_lsn, @@ -256,11 +256,11 @@ impl SharedState { /// Database instance (tenant) pub struct Timeline { - pub zttid: ZTenantTimelineId, + pub zttid: TenantTimelineId, /// Sending here asks for wal backup launcher attention (start/stop /// offloading). Sending zttid instead of concrete command allows to do /// sending without timeline lock. - wal_backup_launcher_tx: Sender, + wal_backup_launcher_tx: Sender, commit_lsn_watch_tx: watch::Sender, /// For breeding receivers. 
commit_lsn_watch_rx: watch::Receiver, @@ -269,8 +269,8 @@ pub struct Timeline { impl Timeline { fn new( - zttid: ZTenantTimelineId, - wal_backup_launcher_tx: Sender, + zttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, shared_state: SharedState, ) -> Timeline { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = @@ -539,13 +539,13 @@ impl Timeline { // Utilities needed by various Connection-like objects pub trait TimelineTools { - fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; + fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()>; fn get(&self) -> &Arc; } impl TimelineTools for Option> { - fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()> { + fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()> { *self = Some(GlobalTimelines::get(conf, zttid, create)?); Ok(()) } @@ -556,8 +556,8 @@ impl TimelineTools for Option> { } struct GlobalTimelinesState { - timelines: HashMap>, - wal_backup_launcher_tx: Option>, + timelines: HashMap>, + wal_backup_launcher_tx: Option>, } static TIMELINES_STATE: Lazy> = Lazy::new(|| { @@ -577,7 +577,7 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn init(wal_backup_launcher_tx: Sender) { + pub fn init(wal_backup_launcher_tx: Sender) { let mut state = TIMELINES_STATE.lock().unwrap(); assert!(state.wal_backup_launcher_tx.is_none()); state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); @@ -586,7 +586,7 @@ impl GlobalTimelines { fn create_internal( mut state: MutexGuard, conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, peer_ids: Vec, ) -> Result> { match state.timelines.get(&zttid) { @@ -612,7 +612,7 @@ impl GlobalTimelines { pub fn create( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, peer_ids: Vec, ) -> Result> { let state = 
TIMELINES_STATE.lock().unwrap(); @@ -623,7 +623,7 @@ impl GlobalTimelines { /// If control file doesn't exist and create=false, bails out. pub fn get( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, create: bool, ) -> Result> { let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); @@ -664,13 +664,12 @@ impl GlobalTimelines { } /// Get loaded timeline, if it exists. - pub fn get_loaded(zttid: ZTenantTimelineId) -> Option> { + pub fn get_loaded(zttid: TenantTimelineId) -> Option> { let state = TIMELINES_STATE.lock().unwrap(); state.timelines.get(&zttid).map(Arc::clone) } - /// Get ZTenantTimelineIDs of all active timelines. - pub fn get_active_timelines() -> HashSet { + pub fn get_active_timelines() -> HashSet { let state = TIMELINES_STATE.lock().unwrap(); state .timelines @@ -692,7 +691,7 @@ impl GlobalTimelines { fn delete_force_internal( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, was_active: bool, ) -> Result { match std::fs::remove_dir_all(conf.timeline_dir(zttid)) { @@ -721,7 +720,7 @@ impl GlobalTimelines { /// TODO: ensure all of the above never happens. pub async fn delete_force( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result { info!("deleting timeline {}", zttid); let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); @@ -737,8 +736,8 @@ impl GlobalTimelines { /// There may be a race if new timelines are created simultaneously. 
pub async fn delete_force_all_for_tenant( conf: &SafeKeeperConf, - tenant_id: &ZTenantId, - ) -> Result> { + tenant_id: &TenantId, + ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); let mut to_delete = HashMap::new(); { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 5d946e37a4..85e967e218 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -23,7 +23,7 @@ use tokio::sync::watch; use tokio::time::sleep; use tracing::*; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::broker::{Election, ElectionLeader}; use crate::timeline::{GlobalTimelines, Timeline}; @@ -38,7 +38,7 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; pub fn wal_backup_launcher_thread_main( conf: SafeKeeperConf, - wal_backup_launcher_rx: Receiver, + wal_backup_launcher_rx: Receiver, ) { let rt = Builder::new_multi_thread() .worker_threads(conf.backup_runtime_threads) @@ -53,7 +53,7 @@ pub fn wal_backup_launcher_thread_main( /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. -fn is_wal_backup_required(zttid: ZTenantTimelineId) -> Option> { +fn is_wal_backup_required(zttid: TenantTimelineId) -> Option> { GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend()) } @@ -70,7 +70,7 @@ struct WalBackupTimelineEntry { /// Start per timeline task, if it makes sense for this safekeeper to offload. fn consider_start_task( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, task: &mut WalBackupTimelineEntry, ) { if !task.timeline.can_wal_backup() { @@ -117,7 +117,7 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// panics and separate elections from offloading itself. 
async fn wal_backup_launcher_main_loop( conf: SafeKeeperConf, - mut wal_backup_launcher_rx: Receiver, + mut wal_backup_launcher_rx: Receiver, ) { info!( "WAL backup launcher started, remote config {:?}", @@ -135,7 +135,7 @@ async fn wal_backup_launcher_main_loop( // Presense in this map means launcher is aware s3 offloading is needed for // the timeline, but task is started only if it makes sense for to offload // from this safekeeper. - let mut tasks: HashMap = HashMap::new(); + let mut tasks: HashMap = HashMap::new(); let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); loop { @@ -193,7 +193,7 @@ struct WalBackupTask { /// Offload single timeline. async fn backup_task_main( - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, timeline_dir: PathBuf, mut shutdown_rx: Receiver<()>, election: Election, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 644237a00d..58b69f06e7 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -25,7 +25,7 @@ use std::path::{Path, PathBuf}; use tracing::*; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::safekeeper::SafeKeeperState; @@ -86,7 +86,7 @@ struct WalStorageMetrics { } impl WalStorageMetrics { - fn new(zttid: &ZTenantTimelineId) -> Self { + fn new(zttid: &TenantTimelineId) -> Self { let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); Self { @@ -130,7 +130,7 @@ pub trait Storage { /// When storage is just created, all LSNs are zeroes and there are no segments on disk. 
pub struct PhysicalStorage { metrics: WalStorageMetrics, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, timeline_dir: PathBuf, conf: SafeKeeperConf, @@ -161,7 +161,7 @@ pub struct PhysicalStorage { } impl PhysicalStorage { - pub fn new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { + pub fn new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { let timeline_dir = conf.timeline_dir(zttid); PhysicalStorage { metrics: WalStorageMetrics::new(zttid), diff --git a/scripts/generate_and_push_perf_report.sh b/scripts/generate_and_push_perf_report.sh index df84fa0dd8..9e03302b0f 100755 --- a/scripts/generate_and_push_perf_report.sh +++ b/scripts/generate_and_push_perf_report.sh @@ -5,8 +5,8 @@ set -eux -o pipefail SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -echo "Uploading perf report to zenith pg" -# ingest per test results data into zenith backed postgres running in staging to build grafana reports on that data +echo "Uploading perf report to neon pg" +# ingest per test results data into neon backed postgres running in staging to build grafana reports on that data DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM" # Activate poetry's venv. 
Needed because git upload does not run in a project dir (it uses tmp to store the repository) @@ -16,8 +16,8 @@ DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_ echo "Uploading perf result to zenith-perf-data" scripts/git-upload \ - --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/zenithdb/zenith-perf-data.git \ - --message="add performance test result for $GITHUB_SHA zenith revision" \ + --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \ + --message="add performance test result for $GITHUB_SHA neon revision" \ --branch=master \ copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\ --merge \ diff --git a/scripts/perf_report_template.html b/scripts/perf_report_template.html index 2847e75a00..c86ab37c2d 100644 --- a/scripts/perf_report_template.html +++ b/scripts/perf_report_template.html @@ -19,7 +19,7 @@ } -

Zenith Performance Tests

+

Neon Performance Tests

{% for suit_name, suit_data in context.items() %}

Runs for {{ suit_name }}

@@ -38,7 +38,7 @@ {% for row in suit_data.rows %} - {{ row.revision[:6] }} + {{ row.revision[:6] }} {% for column_value in row.values %} {{ column_value.value }}{{column_value.ratio}} {% endfor %} diff --git a/test_runner/README.md b/test_runner/README.md index c7ec361d65..44751944b3 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -60,7 +60,7 @@ Useful environment variables: `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -`ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as +`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as `--pageserver-config-override=${value}` parameter values when neon_local cli is invoked `RUST_LOG`: logging configuration to pass into Neon CLI diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index b9cdfdebc4..b5565dab0f 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -16,7 +16,7 @@ from typing import Iterator, Optional import pytest from _pytest.config import Config from _pytest.terminal import TerminalReporter -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks. 
@@ -365,11 +365,11 @@ class NeonBenchmarker: assert matches, f"metric {metric_name} not found" return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: Path, tenantid: ZTenantId, timelineid: ZTimelineId): + def get_timeline_size(self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId): """ Calculate the on-disk size of a timeline """ - path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid) + path = f"{repo_dir}/tenants/{tenant_id}/timelines/{timeline_id}" totalbytes = 0 for root, dirs, files in os.walk(path): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 69c6d31315..0c03429f95 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -29,7 +29,7 @@ import pytest import requests from cached_property import cached_property from fixtures.log_helper import log -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -754,7 +754,7 @@ class NeonEnv: # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
- self.initial_tenant = ZTenantId.generate() + self.initial_tenant = TenantId.generate() # Create a config file corresponding to the options toml = textwrap.dedent( @@ -776,7 +776,7 @@ class NeonEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - pageserver_auth_type = "ZenithJWT" if config.auth_enabled else "Trust" + pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust" toml += textwrap.dedent( f""" @@ -841,7 +841,7 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers]) - def timeline_dir(self, tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Path: + def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -971,7 +971,7 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, new_tenant_id: Optional[ZTenantId] = None) -> ZTenantId: + def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ @@ -983,24 +983,24 @@ class NeonPageserverHttpClient(requests.Session): raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") new_tenant_id = res.json() assert isinstance(new_tenant_id, str) - return ZTenantId(new_tenant_id) + return TenantId(new_tenant_id) - def tenant_attach(self, tenant_id: ZTenantId): + def tenant_attach(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") self.verbose_error(res) - def tenant_detach(self, tenant_id: ZTenantId): + def tenant_detach(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") self.verbose_error(res) - 
def tenant_status(self, tenant_id: ZTenantId) -> Dict[Any, Any]: + def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) return res_json - def timeline_list(self, tenant_id: ZTenantId) -> List[Dict[str, Any]]: + def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") self.verbose_error(res) res_json = res.json() @@ -1009,9 +1009,9 @@ class NeonPageserverHttpClient(requests.Session): def timeline_create( self, - tenant_id: ZTenantId, - new_timeline_id: Optional[ZTimelineId] = None, - ancestor_timeline_id: Optional[ZTimelineId] = None, + tenant_id: TenantId, + new_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, ) -> Dict[Any, Any]: res = self.post( @@ -1032,8 +1032,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_detail( self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_non_incremental_physical_size: bool = False, ) -> Dict[Any, Any]: @@ -1052,7 +1052,7 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_delete(self, tenant_id: ZTenantId, timeline_id: ZTimelineId): + def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) @@ -1174,17 +1174,17 @@ class NeonCli(AbstractNeonCli): def create_tenant( self, - tenant_id: Optional[ZTenantId] = None, - timeline_id: Optional[ZTimelineId] = None, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, conf: Optional[Dict[str, str]] = None, - ) -> 
Tuple[ZTenantId, ZTimelineId]: + ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. """ if tenant_id is None: - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() if timeline_id is None: - timeline_id = ZTimelineId.generate() + timeline_id = TimelineId.generate() if conf is None: res = self.raw_cli( [ @@ -1211,7 +1211,7 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return tenant_id, timeline_id - def config_tenant(self, tenant_id: ZTenantId, conf: Dict[str, str]): + def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]): """ Update tenant config. """ @@ -1230,8 +1230,8 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[ZTenantId] = None - ) -> ZTimelineId: + self, new_branch_name: str, tenant_id: Optional[TenantId] = None + ) -> TimelineId: cmd = [ "timeline", "create", @@ -1250,9 +1250,9 @@ class NeonCli(AbstractNeonCli): if matches is not None: created_timeline_id = matches.group("timeline_id") - return ZTimelineId(str(created_timeline_id)) + return TimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[ZTenantId] = None): + def create_root_branch(self, branch_name: str, tenant_id: Optional[TenantId] = None): cmd = [ "timeline", "create", @@ -1274,15 +1274,15 @@ class NeonCli(AbstractNeonCli): if created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return ZTimelineId(created_timeline_id) + return TimelineId(created_timeline_id) def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, ancestor_branch_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, ancestor_start_lsn: Optional[Lsn] = None, - ) -> ZTimelineId: + ) -> TimelineId: cmd = [ "timeline", "branch", @@ -1308,11 +1308,9 @@ class NeonCli(AbstractNeonCli): if 
created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return ZTimelineId(str(created_timeline_id)) + return TimelineId(str(created_timeline_id)) - def list_timelines( - self, tenant_id: Optional[ZTenantId] = None - ) -> List[Tuple[str, ZTimelineId]]: + def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]: """ Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. """ @@ -1324,14 +1322,14 @@ class NeonCli(AbstractNeonCli): ) timelines_cli = sorted( map( - lambda branch_and_id: (branch_and_id[0], ZTimelineId(branch_and_id[1])), + lambda branch_and_id: (branch_and_id[0], TimelineId(branch_and_id[1])), TIMELINE_DATA_EXTRACTOR.findall(res.stdout), ) ) return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[ZTimelineId] = None + self, config_toml: str, initial_timeline_id: Optional[TimelineId] = None ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1410,7 +1408,7 @@ class NeonCli(AbstractNeonCli): self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": @@ -1436,7 +1434,7 @@ class NeonCli(AbstractNeonCli): def pg_start( self, node_name: str, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": @@ -1460,7 +1458,7 @@ class NeonCli(AbstractNeonCli): def pg_stop( self, node_name: str, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, destroy=False, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": @@ -1558,7 +1556,7 @@ def append_pageserver_param_overrides( 
f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" ) - env_overrides = os.getenv("ZENITH_PAGESERVER_OVERRIDES") + env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") if env_overrides is not None: params_to_update += [ f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") @@ -1867,7 +1865,7 @@ class Postgres(PgProtocol): """An object representing a running postgres daemon.""" def __init__( - self, env: NeonEnv, tenant_id: ZTenantId, port: int, check_stop_result: bool = True + self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True ): super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env @@ -2057,7 +2055,7 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2081,7 +2079,7 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2157,7 +2155,7 @@ class Safekeeper: return self def append_logical_message( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId, request: Dict[str, Any] + self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: """ Send JSON_CTRL query to append LogicalMessage to WAL and modify @@ -2167,7 +2165,7 @@ class Safekeeper: # "replication=0" hacks psycopg not to send additional queries # on startup, see https://github.com/psycopg/psycopg2/pull/482 - connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + connstr = f"host=localhost port={self.port.pg} replication=0 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" with 
closing(psycopg2.connect(connstr)) as conn: # server doesn't support transactions @@ -2202,8 +2200,8 @@ class SafekeeperTimelineStatus: class SafekeeperMetrics: # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) + flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) class SafekeeperHttpClient(requests.Session): @@ -2221,7 +2219,7 @@ class SafekeeperHttpClient(requests.Session): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() def timeline_status( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId + self, tenant_id: TenantId, timeline_id: TimelineId ) -> SafekeeperTimelineStatus: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() @@ -2234,16 +2232,14 @@ class SafekeeperHttpClient(requests.Session): remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), ) - def record_safekeeper_info(self, tenant_id: ZTenantId, timeline_id: ZTimelineId, body): + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", json=body, ) res.raise_for_status() - def timeline_delete_force( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId - ) -> Dict[Any, Any]: + def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) @@ -2252,7 +2248,7 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def 
tenant_delete_force(self, tenant_id: ZTenantId) -> Dict[Any, Any]: + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") res.raise_for_status() res_json = res.json() @@ -2273,16 +2269,16 @@ class SafekeeperHttpClient(requests.Session): all_metrics_text, re.MULTILINE, ): - metrics.flush_lsn_inexact[ - (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) - ] = int(match.group(3)) + metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( + match.group(3) + ) for match in re.finditer( r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', all_metrics_text, re.MULTILINE, ): metrics.commit_lsn_inexact[ - (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) + (TenantId(match.group(1)), TimelineId(match.group(2))) ] = int(match.group(3)) return metrics @@ -2456,7 +2452,7 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = ZTimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) + timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) # stop postgres to ensure that files won't change pg.stop() @@ -2540,7 +2536,7 @@ def wait_until(number_of_iterations: int, interval: float, func): def assert_timeline_local( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ): timeline_detail = pageserver_http_client.timeline_detail( tenant, @@ -2554,14 +2550,14 @@ def assert_timeline_local( def assert_no_in_progress_downloads_for_tenant( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, + tenant: TenantId, ): tenant_status = pageserver_http_client.tenant_status(tenant) assert tenant_status["has_in_progress_downloads"] is False, tenant_status def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2578,8 +2574,8 @@ def remote_consistent_lsn( def wait_for_upload( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, - timeline: ZTimelineId, + tenant: TenantId, + timeline: TimelineId, lsn: Lsn, ): """waits for local timeline upload up to specified lsn""" @@ -2601,7 +2597,7 @@ def wait_for_upload( def last_record_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2612,8 +2608,8 @@ def last_record_lsn( def wait_for_last_record_lsn( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, - timeline: ZTimelineId, + 
tenant: TenantId, + timeline: TimelineId, lsn: Lsn, ): """waits for pageserver to catch up to a certain lsn""" @@ -2632,7 +2628,7 @@ def wait_for_last_record_lsn( ) -def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: ZTenantId, timeline: ZTimelineId): +def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId): """Wait for pageserver to catch up the latest flush LSN""" last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) @@ -2643,8 +2639,8 @@ def fork_at_current_lsn( pg: Postgres, new_branch_name: str, ancestor_branch_name: str, - tenant_id: Optional[ZTenantId] = None, -) -> ZTimelineId: + tenant_id: Optional[TenantId] = None, +) -> TimelineId: """ Create new branch at the last LSN of an existing branch. The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index bdf675a785..de2e131b79 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -46,11 +46,11 @@ class Lsn: @total_ordering -class ZId: +class Id: """ Datatype for a Neon tenant and timeline IDs. Internally it's a 16-byte array, and - the string representation is in hex. This corresponds to the ZId / ZTenantId / - ZTimelineIds in the Rust code. + the string representation is in hex. This corresponds to the Id / TenantId / + TimelineIds in the Rust code. 
""" def __init__(self, x: str): @@ -79,11 +79,11 @@ class ZId: return cls(random.randbytes(16).hex()) -class ZTenantId(ZId): +class TenantId(Id): def __repr__(self): - return f'ZTenantId("{self.id.hex()}")' + return f'TenantId("{self.id.hex()}")' -class ZTimelineId(ZId): +class TimelineId(Id): def __repr__(self): - return f'ZTimelineId("{self.id.hex()}")' + return f'TimelineId("{self.id.hex()}")' diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 8bac8080db..21e48cf899 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -20,4 +20,4 @@ All tests run only once. Usually to obtain more consistent performance numbers, Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. -There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with zenith-local-ci and zenith-staging variants. I.e. some tests under zenith-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[zenith]` which is highly confusing. +There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. 
But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with neon-local-ci and neon-staging variants. I.e. some tests under neon-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]` which is highly confusing. diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index b8e81824b0..cb2621ff02 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,6 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import query_scalar @@ -27,7 +27,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_timeline = ZTimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) + branch0_timeline = TimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) log.info(f"b0 timeline {branch0_timeline}") # Create table, and insert 100k rows. 
@@ -51,7 +51,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_timeline = ZTimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) + branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) log.info(f"b1 timeline {branch1_timeline}") branch1_lsn = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") @@ -74,7 +74,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_timeline = ZTimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) + branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) log.info(f"b2 timeline {branch2_timeline}") branch2_lsn = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 08e38e1461..d9082efada 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -2,7 +2,7 @@ from contextlib import closing import pytest from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException -from fixtures.types import ZTenantId +from fixtures.types import TenantId def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -13,7 +13,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant) tenant_http_client = env.pageserver.http_client(tenant_token) - invalid_tenant_token = env.auth_keys.generate_tenant_token(ZTenantId.generate()) + invalid_tenant_token = env.auth_keys.generate_tenant_token(TenantId.generate()) invalid_tenant_http_client = env.pageserver.http_client(invalid_tenant_token) management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/regress/test_branch_behind.py 
b/test_runner/regress/test_branch_behind.py index 5bd6368bfc..cfb9649867 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -2,7 +2,7 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import Lsn, ZTimelineId +from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -28,7 +28,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): main_cur = pgmain.connect().cursor() - timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows main_cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index ce3a74930e..fd81981b2b 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -5,7 +5,7 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep @@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_timelines: List[Tuple[ZTenantId, ZTimelineId, Postgres]] = [] + tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = [] for n in range(4): tenant_id, timeline_id = env.neon_cli.create_tenant() diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index af94865549..8de2687c9b 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -8,7 +8,7 @@ from 
fixtures.neon_fixtures import ( VanillaPostgres, pg_distrib_dir, ) -from fixtures.types import Lsn, ZTimelineId +from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 @@ -27,7 +27,7 @@ def test_fullbackup( log.info("postgres is running on 'test_fullbackup' branch") with pgmain.cursor() as cur: - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 67ce8871cd..88d4ad8a6e 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -3,7 +3,7 @@ import random from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import query_scalar # Test configuration @@ -29,7 +29,7 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon -async def gc(env: NeonEnv, timeline: ZTimelineId): +async def gc(env: NeonEnv, timeline: TimelineId): psconn = await env.pageserver.connect_async() while updates_performed < updates_to_perform: @@ -37,7 +37,7 @@ async def gc(env: NeonEnv, timeline: ZTimelineId): # At the same time, run UPDATEs and GC -async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: ZTimelineId): +async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId): workers = [] for worker_id in range(num_connections): workers.append(asyncio.create_task(update_table(pg))) @@ -62,7 +62,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on test_gc_aggressive branch") with pg.cursor() as cur: - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) 
+ timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (id int, counter int, t text)") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index fc9f41bda0..60cc0551ab 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import subprocess_capture @@ -69,8 +69,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] node_name = "import_from_vanilla" - tenant = ZTenantId.generate() - timeline = ZTimelineId.generate() + tenant = TenantId.generate() + timeline = TimelineId.generate() # Set up pageserver for import neon_env_builder.enable_local_fs_remote_storage() @@ -195,7 +195,7 @@ def _generate_data(num_rows: int, pg: Postgres) -> Lsn: def _import( - expected_num_rows: int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: ZTimelineId + expected_num_rows: int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: TimelineId ) -> str: """Test importing backup data to the pageserver. @@ -228,9 +228,9 @@ def _import( # start the pageserver again env.pageserver.start() - # Import using another tenantid, because we use the same pageserver. + # Import using another tenant_id, because we use the same pageserver. # TODO Create another pageserver to make test more realistic. 
- tenant = ZTenantId.generate() + tenant = TenantId.generate() # Import to pageserver node_name = "import_from_pageserver" diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index b2342e5ee8..a9dc63dd50 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -7,11 +7,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, NeonPageserverHttpClient, ) -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( - pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: ZTenantId + pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: TenantId ): """ Compare timelines list returned by CLI and directly via API. @@ -20,7 +20,7 @@ def helper_compare_timeline_list( timelines_api = sorted( map( - lambda t: ZTimelineId(t["timeline_id"]), + lambda t: TimelineId(t["timeline_id"]), pageserver_http_client.timeline_list(initial_tenant), ) ) @@ -85,7 +85,7 @@ def test_cli_tenant_list(neon_simple_env: NeonEnv): helper_compare_tenant_list(pageserver_http_client, env) res = env.neon_cli.list_tenants() - tenants = sorted(map(lambda t: ZTenantId(t.split()[0]), res.stdout.splitlines())) + tenants = sorted(map(lambda t: TenantId(t.split()[0]), res.stdout.splitlines())) assert env.initial_tenant in tenants assert tenant1 in tenants diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 2b5e2edb5f..c99e13f45f 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,7 +1,7 @@ import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -27,7 +27,7 @@ def test_old_request_lsn(neon_env_builder: 
NeonEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. We need it for the 'do_gc' command - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) psconn = env.pageserver.connect() pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index a7b7189824..def6bd5b33 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -11,7 +11,7 @@ from fixtures.neon_fixtures import ( pg_distrib_dir, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId # test that we cannot override node id after init @@ -60,39 +60,39 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv): assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(client: NeonPageserverHttpClient, initial_tenant: ZTenantId): +def check_client(client: NeonPageserverHttpClient, initial_tenant: TenantId): client.check_status() # check initial tenant is there - assert initial_tenant in {ZTenantId(t["id"]) for t in client.tenant_list()} + assert initial_tenant in {TenantId(t["id"]) for t in client.tenant_list()} # create new tenant and check it is also there - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() client.tenant_create(tenant_id) - assert tenant_id in {ZTenantId(t["id"]) for t in client.tenant_list()} + assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) assert len(timelines) == 0, "initial tenant should not have any timelines" # create timeline - timeline_id = ZTimelineId.generate() + timeline_id = TimelineId.generate() client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) timelines = client.timeline_list(tenant_id) assert len(timelines) > 
0 # check it is there - assert timeline_id in {ZTimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} + assert timeline_id in {TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} for timeline in timelines: - timeline_id = ZTimelineId(timeline["timeline_id"]) + timeline_id = TimelineId(timeline["timeline_id"]) timeline_details = client.timeline_detail( tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True, ) - assert ZTenantId(timeline_details["tenant_id"]) == tenant_id - assert ZTimelineId(timeline_details["timeline_id"]) == timeline_id + assert TenantId(timeline_details["tenant_id"]) == tenant_id + assert TimelineId(timeline_details["timeline_id"]) == timeline_id assert timeline_details.get("local") is not None @@ -118,8 +118,8 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): def expect_updated_msg_lsn( client: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, prev_msg_lsn: Optional[Lsn], ) -> Lsn: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 329f4b7d24..786266b70e 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -3,7 +3,7 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -25,7 +25,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table main_cur.execute("CREATE TABLE 
foo (t text)") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 04baef6ba0..cbe74cad5c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -1,5 +1,5 @@ # It's possible to run any regular test with the local fs remote storage via -# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... +# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... import os import shutil @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar @@ -61,8 +61,8 @@ def test_remote_storage_backup_and_restore( client = env.pageserver.http_client() - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) checkpoint_numbers = range(1, 3) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 147e22b38f..e3c9a091f9 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -4,10 +4,10 @@ import psycopg2 import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId -def do_gc_target(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): +def do_gc_target(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: env.pageserver.safe_psql(f"do_gc {tenant_id} 
{timeline_id} 0") @@ -20,7 +20,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() # first check for non existing tenant - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() with pytest.raises( expected_exception=NeonPageserverApiException, match=f"Tenant not found for id {tenant_id}", @@ -46,7 +46,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): with pytest.raises( expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" ): - bogus_timeline_id = ZTimelineId.generate() + bogus_timeline_id = TimelineId.generate() env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") # try to concurrently run gc and detach diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 56563ebe87..aa7d92f1fd 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -24,7 +24,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, subprocess_capture @@ -113,15 +113,15 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def populate_branch( pg: Postgres, - tenant_id: ZTenantId, + tenant_id: TenantId, ps_http: NeonPageserverHttpClient, create_table: bool, expected_sum: Optional[int], -) -> Tuple[ZTimelineId, Lsn]: +) -> Tuple[TimelineId, Lsn]: # insert some data with pg_cur(pg) as cur: cur.execute("SHOW neon.timeline_id") - timeline_id = ZTimelineId(cur.fetchone()[0]) + timeline_id = TimelineId(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline_id) log.info( @@ -149,8 +149,8 @@ def populate_branch( def ensure_checkpoint( pageserver_cur, pageserver_http: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + 
tenant_id: TenantId, + timeline_id: TimelineId, current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage @@ -162,8 +162,8 @@ def ensure_checkpoint( def check_timeline_attached( new_pageserver_http_client: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, old_timeline_detail: Dict[str, Any], old_current_lsn: Lsn, ): @@ -187,8 +187,8 @@ def switch_pg_to_new_pageserver( env: NeonEnv, pg: Postgres, new_pageserver_port: int, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> pathlib.Path: pg.stop() @@ -265,7 +265,7 @@ def test_tenant_relocation( pageserver_http = env.pageserver.http_client() tenant_id, initial_timeline_id = env.neon_cli.create_tenant( - ZTenantId("74ee8b079a0e437eb0afea7d26a07209") + TenantId("74ee8b079a0e437eb0afea7d26a07209") ) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 1214d703d0..97a13bbcb0 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,6 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_until -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def get_only_element(l): # noqa: E741 @@ -23,7 +23,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): def get_state(tenant): all_states = client.tenant_list() - matching = [t for t in all_states if ZTenantId(t["id"]) == tenant] + matching = [t for t in all_states if TenantId(t["id"]) == tenant] return get_only_element(matching)["state"] def get_metric_value(name): @@ -35,8 +35,8 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): value = line.lstrip(name).strip() return int(value) - def delete_all_timelines(tenant: 
ZTenantId): - timelines = [ZTimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] + def delete_all_timelines(tenant: TenantId): + timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) @@ -56,7 +56,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Delete all timelines on all tenants for tenant_info in client.tenant_list(): - tenant_id = ZTenantId(tenant_info["id"]) + tenant_id = TenantId(tenant_info["id"]) delete_all_timelines(tenant_id) wait_until(10, 0.2, lambda: assert_active_without_jobs(tenant_id)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index bd53aae25c..4e7610a96f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -8,7 +8,7 @@ import pytest from fixtures.log_helper import log from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, ZTenantId +from fixtures.types import Lsn, TenantId from prometheus_client.samples import Sample @@ -188,7 +188,7 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (5000050000,) - def get_ps_metric_samples_for_tenant(tenant_id: ZTenantId) -> List[Sample]: + def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") samples = [] for metric_name in ps_metrics.metrics: diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 70b474c9a9..85f371c845 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -19,7 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, 
wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId async def tenant_workload(env: NeonEnv, pg: Postgres): @@ -58,7 +58,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem env = neon_env_builder.init_start() - tenants_pgs: List[Tuple[ZTenantId, Postgres]] = [] + tenants_pgs: List[Tuple[TenantId, Postgres]] = [] for _ in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly @@ -83,8 +83,8 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem res = pg.safe_psql_many( ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] ) - tenant_id = ZTenantId(res[0][0][0]) - timeline_id = ZTimelineId(res[1][0][0]) + tenant_id = TenantId(res[0][0][0]) + timeline_id = TimelineId(res[1][0][0]) current_lsn = Lsn(res[2][0][0]) # wait until pageserver receives all the data diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 5a20dbd232..2eea8dd3cc 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,6 +1,6 @@ import pytest from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def test_timeline_delete(neon_simple_env: NeonEnv): @@ -10,12 +10,12 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # first try to delete non existing timeline # for existing tenant: - invalid_timeline_id = ZTimelineId.generate() + invalid_timeline_id = TimelineId.generate() with pytest.raises(NeonPageserverApiException, match="timeline not found"): ps_http.timeline_delete(tenant_id=env.initial_tenant, timeline_id=invalid_timeline_id) # for non existing tenant: - invalid_tenant_id = ZTenantId.generate() + invalid_tenant_id = TenantId.generate() with pytest.raises( 
NeonPageserverApiException, match=f"Tenant {invalid_tenant_id} not found in the local state", diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 6fbc430e80..83018f46f5 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -15,7 +15,7 @@ from fixtures.neon_fixtures import ( assert_timeline_local, wait_for_last_flush_lsn, ) -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size @@ -386,7 +386,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: ZTimelineId): + def get_timeline_physical_size(timeline: TimelineId): res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) return res["local"]["current_physical_size_non_incremental"] @@ -415,7 +415,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): assert tenant_physical_size == timeline_total_size -def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): +def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): """Check the current physical size returned from timeline API matches the total physical size of the timeline on disk""" client = env.pageserver.http_client() @@ -431,7 +431,7 @@ def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimel # Timeline logical size initialization is an asynchronous background task that runs once, # try a few times to ensure it's activated properly def wait_for_timeline_size_init( - client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ): for i in range(10): timeline_details = assert_timeline_local(client, tenant, timeline) diff --git 
a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cd370e60c0..8c5b4c8c30 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -32,13 +32,13 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar def wait_lsn_force_checkpoint( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, pg: Postgres, ps: NeonPageserver, pageserver_conn_options={}, @@ -74,7 +74,7 @@ def wait_lsn_force_checkpoint( @dataclass class TimelineMetrics: - timeline_id: ZTimelineId + timeline_id: TimelineId last_record_lsn: Lsn # One entry per each Safekeeper, order is the same flush_lsns: List[Lsn] = field(default_factory=list) @@ -126,7 +126,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): timeline_metrics = [] for timeline_detail in timeline_details: - timeline_id = ZTimelineId(timeline_detail["timeline_id"]) + timeline_id = TimelineId(timeline_detail["timeline_id"]) local_timeline_detail = timeline_detail.get("local") if local_timeline_detail is None: @@ -273,8 +273,8 @@ def test_broker(neon_env_builder: NeonEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -325,8 +325,8 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ] ) - tenant_id = ZTenantId(pg.safe_psql("show 
neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # force checkpoint to advance remote_consistent_lsn pageserver_conn_options = {} @@ -348,7 +348,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(tenant_id) ) http_cli_other = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) + auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) http_cli_noauth = env.safekeepers[0].http_client() @@ -438,8 +438,8 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot pg = env.postgres.create_start("test_safekeepers_wal_backup") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) pg_conn = pg.connect() cur = pg_conn.cursor() @@ -493,8 +493,8 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re pg = env.postgres.create_start("test_s3_wal_replay") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) expected_sum = 0 @@ -584,8 +584,8 @@ class ProposerPostgres(PgProtocol): self, pgdata_dir: str, pg_bin, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, listen_addr: str, port: int, ): @@ -593,8 +593,8 @@ class ProposerPostgres(PgProtocol): 
self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin - self.tenant_id: ZTenantId = tenant_id - self.timeline_id: ZTimelineId = timeline_id + self.tenant_id: TenantId = tenant_id + self.timeline_id: TimelineId = timeline_id self.listen_addr: str = listen_addr self.port: int = port @@ -672,8 +672,8 @@ def test_sync_safekeepers( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_id = ZTenantId.generate() - timeline_id = ZTimelineId.generate() + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() # write config for proposer pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") @@ -725,8 +725,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa = env.safekeepers[0] # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) if not auth_enabled: wa_http_cli = wa.http_client() @@ -735,7 +735,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) wa_http_cli.check_status() wa_http_cli_bad = wa.http_client( - auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) + auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) wa_http_cli_bad.check_status() wa_http_cli_noauth = wa.http_client() @@ -785,15 +785,15 @@ class SafekeeperEnv: self.bin_safekeeper = os.path.join(str(neon_binpath), "safekeeper") self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None - self.tenant_id: Optional[ZTenantId] = None - self.timeline_id: Optional[ZTimelineId] = None + self.tenant_id: Optional[TenantId] = None + self.timeline_id: 
Optional[TimelineId] = None def init(self) -> "SafekeeperEnv": assert self.postgres is None, "postgres is already initialized" assert self.safekeepers is None, "safekeepers are already initialized" - self.tenant_id = ZTenantId.generate() - self.timeline_id = ZTimelineId.generate() + self.tenant_id = TenantId.generate() + self.timeline_id = TimelineId.generate() self.repo_dir.mkdir(exist_ok=True) # Create config and a Safekeeper object for each safekeeper @@ -912,9 +912,7 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): sum_after = query_scalar(cur, "SELECT SUM(key) FROM t") assert sum_after == sum_before + 5000050000 - def show_statuses( - safekeepers: List[Safekeeper], tenant_id: ZTenantId, timeline_id: ZTimelineId - ): + def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): for sk in safekeepers: http_cli = sk.http_client() try: @@ -935,8 +933,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): pg.start() # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -1134,7 +1132,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove non-existing branch, should succeed - assert sk_http.timeline_delete_force(tenant_id, ZTimelineId("00" * 16)) == { + assert sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16)) == { "dir_existed": False, "was_active": False, } diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index e36d3cf94b..9d2008296a 100644 --- 
a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -7,7 +7,7 @@ from typing import List, Optional import asyncpg from fixtures.log_helper import getLogger from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") @@ -103,8 +103,8 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou async def wait_for_lsn( safekeeper: Safekeeper, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, wait_lsn: Lsn, polling_interval=1, timeout=60, @@ -155,8 +155,8 @@ async def run_restarts_under_load( test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = ZTenantId(await pg_conn.fetchval("show neon.tenant_id")) - timeline_id = ZTimelineId(await pg_conn.fetchval("show neon.timeline_id")) + tenant_id = TenantId(await pg_conn.fetchval("show neon.tenant_id")) + timeline_id = TimelineId(await pg_conn.fetchval("show neon.timeline_id")) bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 6fd509c4d1..21921a3bc2 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -9,7 +9,7 @@ from fixtures.neon_fixtures import ( base_dir, pg_distrib_dir, ) -from fixtures.types import ZTenantId +from fixtures.types import TenantId def test_wal_restore( @@ -22,7 +22,7 @@ def test_wal_restore( env.neon_cli.create_branch("test_wal_restore") pg = env.postgres.create_start("test_wal_restore") pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show 
neon.tenant_id")[0][0]) env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" From 6db6e7ddda3c67a3d48387955859452e93f7d751 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 22:51:28 +0300 Subject: [PATCH 23/33] Use backward-compatible safekeeper code --- safekeeper/src/handler.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 41b9ad66e1..ad2c0ec8bf 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -68,11 +68,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { if let FeStartupPacket::StartupMessage { params, .. } = sm { if let Some(options) = params.options_raw() { for opt in options { + // FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy, + // remove these after the PR gets deployed: + // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 match opt.split_once('=') { - Some(("tenant_id", value)) => { + Some(("ztenantid", value)) | Some(("tenant_id", value)) => { self.tenant_id = Some(value.parse()?); } - Some(("timeline_id", value)) => { + Some(("ztimelineid", value)) | Some(("timeline_id", value)) => { self.timeline_id = Some(value.parse()?); } _ => continue, From c3096532f9ceee8fad82b4c741b0108bd143cc06 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 14 Sep 2022 09:23:51 +0300 Subject: [PATCH 24/33] Fix vendor/postgres-v15 to point to correct v15 branch. Commit f44afbaf62 updated vendor/postgres-v15 to point to a commit that was built on top of PostgreSQL 14 rather than 15. So we accidentally had two copies of PostgreSQL v14 in the repository. Oops. This updates it to point to the correct version. 
--- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index b1dbd93e2b..cf4db95b84 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit b1dbd93e2b1691e93860f7e59b9e1fe5a6e79786 +Subproject commit cf4db95b8480e08425e52ef46f78cb5a234baa0e From d87c9e62d64c8a4628096a4ce5c8307fc1daa2e6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Sep 2022 11:53:34 +0100 Subject: [PATCH 25/33] Nightly Benchmarks: perform tests on both pre-created and fresh projects (#2443) --- .github/workflows/benchmarking.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 49fbc74dd6..fab0a9aa04 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -144,7 +144,9 @@ jobs: strategy: fail-fast: false matrix: - platform: [ neon-captest, rds-aurora ] + # neon-captest: Run pgbench, reusing existing project + # neon-captest-new: Same, but on a freshly created project + platform: [ neon-captest, neon-captest-new, rds-aurora ] runs-on: dev container: @@ -162,7 +164,7 @@ jobs: sudo apt install -y postgresql-14 - name: Create Neon Project - if: matrix.platform == 'neon-captest' + if: matrix.platform == 'neon-captest-new' id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -174,13 +176,16 @@ jobs: run: | case "${PLATFORM}" in neon-captest) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + ;; + neon-captest-new) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest' or 'rds-aurora'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'" exit 1 ;; esac @@ -240,7 +245,7 @@ jobs: build_type: ${{ env.BUILD_TYPE }} - name: Delete Neon Project - if: ${{ matrix.platform == 'neon-captest' && always() }} + if: ${{ matrix.platform == 'neon-captest-new' && always() }} uses: ./.github/actions/neon-project-delete with: environment: dev @@ -252,6 +257,6 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} From f86ea09323ac0d6f2904dcf603652044cea50664 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 14 Sep 2022 09:53:06 +0300 Subject: [PATCH 26/33] Avoid recompiling postgres_ffi every time you run "make". Running "make" at the top level calls "make install" to install the PostgreSQL headers into the pg_install/ directory. That always updated the modification time of the headers even if there were no changes, triggering recompilation of the postgres_ffi bindings. To avoid that, use 'install -C', to install the PostgreSQL headers. However, there was an upstream PostgreSQL issue that the src/include/Makefile didn't respect the INSTALL configure option. That was just fixed in upstream PostgreSQL, so cherry-pick that fix to our vendor/postgres repositories. Fixes https://github.com/neondatabase/neon/issues/1873. 
--- Makefile | 6 ++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4d7b1bee07..4ac51ed174 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,12 @@ ifeq ($(UNAME_S),Darwin) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib endif +# Use -C option so that when PostgreSQL "make install" installs the +# headers, the mtime of the headers are not changed when there have +# been no changes to the files. Changing the mtime triggers an +# unnecessary rebuild of 'postgres_ffi'. +PG_CONFIGURE_OPTS += INSTALL='install -C' + # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) # Fix for a corner case when make doesn't pass a jobserver diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 114676d2ed..ce723ee499 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 114676d2edd5307226d9448ec467821fdb77467d +Subproject commit ce723ee499450cb108aede464a35a17f3d75cf84 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index cf4db95b84..0858387047 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit cf4db95b8480e08425e52ef46f78cb5a234baa0e +Subproject commit 08583870479e30c64aeb5a97d6fee9cf470f05fb From 87bf7be5370cc2a621cd51d5a4cb3b1ed76e4633 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 14 Sep 2022 21:27:47 +0300 Subject: [PATCH 27/33] [proxy] Drop support for legacy cloud API (#2448) Apparently, it no longer exists in the cloud. 
--- proxy/src/auth.rs | 5 - proxy/src/auth/backend.rs | 22 +-- proxy/src/auth/backend/legacy_console.rs | 208 ----------------------- proxy/src/config.rs | 19 +-- proxy/src/main.rs | 31 +++- 5 files changed, 30 insertions(+), 255 deletions(-) delete mode 100644 proxy/src/auth/backend/legacy_console.rs diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index d09470d15e..a50d23e351 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -22,10 +22,6 @@ pub type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] pub enum AuthErrorImpl { - // This will be dropped in the future. - #[error(transparent)] - Legacy(#[from] backend::LegacyAuthError), - #[error(transparent)] Link(#[from] backend::LinkAuthError), @@ -78,7 +74,6 @@ impl UserFacingError for AuthError { fn to_string_client(&self) -> String { use AuthErrorImpl::*; match self.0.as_ref() { - Legacy(e) => e.to_string_client(), Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), WakeCompute(e) => e.to_string_client(), diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 9c43620ffb..de0719a196 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -6,9 +6,6 @@ pub use link::LinkAuthError; mod console; pub use console::{GetAuthInfoError, WakeComputeError}; -mod legacy_console; -pub use legacy_console::LegacyAuthError; - use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, config, mgmt, @@ -56,7 +53,7 @@ impl std::fmt::Debug for DatabaseInfo { fmt.debug_struct("DatabaseInfo") .field("host", &self.host) .field("port", &self.port) - .finish() + .finish_non_exhaustive() } } @@ -88,8 +85,6 @@ impl From for tokio_postgres::Config { /// backends which require them for the authentication process. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BackendType { - /// Legacy Cloud API (V1) + link auth. - LegacyConsole(T), /// Current Cloud API (V2). Console(T), /// Local mock of Cloud API (V2). 
@@ -105,7 +100,6 @@ impl BackendType { pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { use BackendType::*; match self { - LegacyConsole(x) => LegacyConsole(f(x)), Console(x) => Console(f(x)), Postgres(x) => Postgres(f(x)), Link => Link, @@ -119,7 +113,6 @@ impl BackendType> { pub fn transpose(self) -> Result, E> { use BackendType::*; match self { - LegacyConsole(x) => x.map(LegacyConsole), Console(x) => x.map(Console), Postgres(x) => x.map(Postgres), Link => Ok(Link), @@ -176,15 +169,6 @@ impl BackendType> { } match self { - LegacyConsole(creds) => { - legacy_console::handle_user( - &urls.auth_endpoint, - &urls.auth_link_uri, - &creds, - client, - ) - .await - } Console(creds) => { console::Api::new(&urls.auth_endpoint, &creds) .handle_user(client) @@ -208,7 +192,6 @@ mod tests { #[test] fn test_backend_type_map() { let values = [ - BackendType::LegacyConsole(0), BackendType::Console(0), BackendType::Postgres(0), BackendType::Link, @@ -222,8 +205,7 @@ mod tests { #[test] fn test_backend_type_transpose() { let values = [ - BackendType::LegacyConsole(Ok::<_, ()>(0)), - BackendType::Console(Ok(0)), + BackendType::Console(Ok::<_, ()>(0)), BackendType::Postgres(Ok(0)), BackendType::Link, ]; diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs deleted file mode 100644 index b99a004dcd..0000000000 --- a/proxy/src/auth/backend/legacy_console.rs +++ /dev/null @@ -1,208 +0,0 @@ -//! Cloud API V1. - -use super::DatabaseInfo; -use crate::{ - auth::{self, ClientCredentials}, - compute, - error::UserFacingError, - stream::PqStream, - waiters, -}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::BeMessage as Be; - -#[derive(Debug, Error)] -pub enum LegacyAuthError { - /// Authentication error reported by the console. - #[error("Authentication failed: {0}")] - AuthFailed(String), - - /// HTTP status (other than 200) returned by the console. 
- #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error("Console responded with a malformed JSON: {0}")] - BadResponse(#[from] serde_json::Error), - - #[error(transparent)] - Transport(#[from] reqwest::Error), - - #[error(transparent)] - WaiterRegister(#[from] waiters::RegisterError), - - #[error(transparent)] - WaiterWait(#[from] waiters::WaitError), -} - -impl UserFacingError for LegacyAuthError { - fn to_string_client(&self) -> String { - use LegacyAuthError::*; - match self { - AuthFailed(_) | HttpStatus(_) => self.to_string(), - _ => "Internal error".to_string(), - } - } -} - -// NOTE: the order of constructors is important. -// https://serde.rs/enum-representations.html#untagged -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum ProxyAuthResponse { - Ready { conn_info: DatabaseInfo }, - Error { error: String }, - NotReady { ready: bool }, // TODO: get rid of `ready` -} - -impl ClientCredentials<'_> { - fn is_existing_user(&self) -> bool { - self.user.ends_with("@zenith") - } -} - -async fn authenticate_proxy_client( - auth_endpoint: &reqwest::Url, - creds: &ClientCredentials<'_>, - md5_response: &str, - salt: &[u8; 4], - psql_session_id: &str, -) -> Result { - let mut url = auth_endpoint.clone(); - url.query_pairs_mut() - .append_pair("login", creds.user) - .append_pair("database", creds.dbname) - .append_pair("md5response", md5_response) - .append_pair("salt", &hex::encode(salt)) - .append_pair("psql_session_id", psql_session_id); - - super::with_waiter(psql_session_id, |waiter| async { - println!("cloud request: {}", url); - // TODO: leverage `reqwest::Client` to reuse connections - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(LegacyAuthError::HttpStatus(resp.status())); - } - - let auth_info = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: {:?}", auth_info); - - use ProxyAuthResponse::*; - let db_info = match auth_info { 
- Ready { conn_info } => conn_info, - Error { error } => return Err(LegacyAuthError::AuthFailed(error)), - NotReady { .. } => waiter.await?.map_err(LegacyAuthError::AuthFailed)?, - }; - - Ok(db_info) - }) - .await -} - -async fn handle_existing_user( - auth_endpoint: &reqwest::Url, - client: &mut PqStream, - creds: &ClientCredentials<'_>, -) -> auth::Result { - let psql_session_id = super::link::new_psql_session_id(); - let md5_salt = rand::random(); - - client - .write_message(&Be::AuthenticationMD5Password(md5_salt)) - .await?; - - // Read client's password hash - let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword( - "the password should be a valid null-terminated utf-8 string", - ))?; - - let db_info = authenticate_proxy_client( - auth_endpoint, - creds, - md5_response, - &md5_salt, - &psql_session_id, - ) - .await?; - - Ok(compute::NodeInfo { - reported_auth_ok: false, - config: db_info.into(), - }) -} - -pub async fn handle_user( - auth_endpoint: &reqwest::Url, - auth_link_uri: &reqwest::Url, - creds: &ClientCredentials<'_>, - client: &mut PqStream, -) -> auth::Result { - if creds.is_existing_user() { - handle_existing_user(auth_endpoint, client, creds).await - } else { - super::link::handle_user(auth_link_uri, client).await - } -} - -fn parse_password(bytes: &[u8]) -> Option<&str> { - std::str::from_utf8(bytes).ok()?.strip_suffix('\0') -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_proxy_auth_response() { - // Ready - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": true, - "conn_info": DatabaseInfo::default(), - })) - .unwrap(); - assert!(matches!( - auth, - ProxyAuthResponse::Ready { - conn_info: DatabaseInfo { .. 
} - } - )); - - // Error - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - "error": "too bad, so sad", - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::Error { .. })); - - // NotReady - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::NotReady { .. })); - } - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1f01c25734..8835d660d5 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,21 +1,6 @@ use crate::{auth, url::ApiUrl}; -use anyhow::{bail, ensure, Context}; -use std::{str::FromStr, sync::Arc}; - -impl FromStr for auth::BackendType<()> { - type Err = anyhow::Error; - - fn from_str(s: &str) -> anyhow::Result { - use auth::BackendType::*; - Ok(match s { - "legacy" => LegacyConsole(()), - "console" => Console(()), - "postgres" => Postgres(()), - "link" => Link, - _ => bail!("Invalid option `{s}` for auth method"), - }) - } -} +use anyhow::{ensure, Context}; +use std::sync::Arc; pub struct ProxyConfig { pub tls_config: Option, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 2521f2af21..efe45f6386 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -20,7 +20,7 @@ mod url; mod waiters; use anyhow::{bail, Context}; -use clap::{App, Arg}; +use clap::{self, Arg}; use config::ProxyConfig; use futures::FutureExt; use std::{future::Future, net::SocketAddr}; @@ -36,9 +36,26 @@ async fn flatten_err( f.map(|r| r.context("join error").and_then(|x| x)).await } +/// A proper parser for auth backend parameter. 
+impl clap::ValueEnum for auth::BackendType<()> { + fn value_variants<'a>() -> &'a [Self] { + use auth::BackendType::*; + &[Console(()), Postgres(()), Link] + } + + fn to_possible_value<'a>(&self) -> Option<clap::PossibleValue<'a>> { + use auth::BackendType::*; + Some(clap::PossibleValue::new(match self { + Console(_) => "console", + Postgres(_) => "postgres", + Link => "link", + })) + } +} + #[tokio::main] async fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Neon proxy/router") + let arg_matches = clap::App::new("Neon proxy/router") .version(GIT_VERSION) .arg( Arg::new("proxy") @@ -52,8 +69,8 @@ async fn main() -> anyhow::Result<()> { Arg::new("auth-backend") .long("auth-backend") .takes_value(true) - .help("Possible values: legacy | console | postgres | link") - .default_value("legacy"), + .value_parser(clap::builder::EnumValueParser::<auth::BackendType<()>>::new()) + .default_value("link"), ) .arg( Arg::new("mgmt") @@ -118,6 +135,10 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; + let auth_backend = *arg_matches + .try_get_one::<auth::BackendType<()>>("auth-backend")? 
+ .unwrap(); + let auth_urls = config::AuthUrls { auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, @@ -125,7 +146,7 @@ async fn main() -> anyhow::Result<()> { let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, - auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?, + auth_backend, auth_urls, })); From 757e2147c12a4d63cfecf84018b5453cbec474bd Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 15 Sep 2022 14:21:22 +0200 Subject: [PATCH 28/33] Follow-up for neondatabase/neon#2448 (#2452) * remove `legacy` mode from the proxy readme * explicitly specify `authBackend` in the link auth proxy helm-values for all envs --- .github/helm-values/neon-stress.proxy.yaml | 1 + .github/helm-values/production.proxy.yaml | 1 + .github/helm-values/staging.proxy.yaml | 1 + proxy/README.md | 17 +++++++---------- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml index 8236f9873a..ce432ca23c 100644 --- a/.github/helm-values/neon-stress.proxy.yaml +++ b/.github/helm-values/neon-stress.proxy.yaml @@ -1,6 +1,7 @@ fullnameOverride: "neon-stress-proxy" settings: + authBackend: "link" authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/" uri: "https://console.dev.neon.tech/psql_session/" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml index 87c61c90cf..c26a6258be 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/production.proxy.yaml @@ -1,4 +1,5 @@ settings: + authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml index 34ba972b64..25842429a5 100644 --- a/.github/helm-values/staging.proxy.yaml 
+++ b/.github/helm-values/staging.proxy.yaml @@ -5,6 +5,7 @@ image: repository: neondatabase/neon settings: + authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" diff --git a/proxy/README.md b/proxy/README.md index 458a7d9bbf..4ead098b73 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -2,10 +2,8 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: -* legacy - old method, when username ends with `@zenith` it uses md5 auth dbname as the cluster name; otherwise, it sends a login link and waits for the console to call back * console - new SCRAM-based console API; uses SNI info to select the destination cluster + new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) * postgres uses postgres to select auth secrets of existing roles. Useful for local testing * link @@ -13,21 +11,20 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a ## Using SNI-based routing on localhost -Now proxy determines cluster name from the subdomain, request to the `my-cluster-42.somedomain.tld` will be routed to the cluster named `my-cluster-42`. Unfortunately `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: +Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. 
Now we can create self-signed certificate and play with proxy: -``` +```sh openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" - ``` -now you can start proxy: +start proxy -``` +```sh ./target/debug/proxy -c server.crt -k server.key ``` -and connect to it: +and connect to it -``` +```sh PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full' ``` From a8d97325291b207d3481ed9578246398c6576ec2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 15 Sep 2022 03:28:24 +0000 Subject: [PATCH 29/33] Bump axum-core from 0.2.7 to 0.2.8 Bumps [axum-core](https://github.com/tokio-rs/axum) from 0.2.7 to 0.2.8. - [Release notes](https://github.com/tokio-rs/axum/releases) - [Changelog](https://github.com/tokio-rs/axum/blob/main/CHANGELOG.md) - [Commits](https://github.com/tokio-rs/axum/compare/axum-core-v0.2.7...axum-core-v0.2.8) --- updated-dependencies: - dependency-name: axum-core dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Cargo.lock | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4234d2b00..a258fab5f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -183,9 +183,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4f44a0e6200e9d11a1cdc989e4b358f6e3d354fbf48478f345a17f4e43f8635" +checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b" dependencies = [ "async-trait", "bytes", @@ -193,6 +193,8 @@ dependencies = [ "http", "http-body", "mime", + "tower-layer", + "tower-service", ] [[package]] From 1062e57feeae80fa9771ad42dc66cd10ffcf5e36 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 15 Sep 2022 16:33:42 +0300 Subject: [PATCH 30/33] Don't run codestyle checks separately for Postgres v14 and v15. 
Previously, we compiled neon separately for Postgres v14 and v15, for the codestyle checks. But that was bogus; we actually just ran "make postgres", which always compiled both versions. The version really only affected the caching. Fix that, by copying the build steps from the main build_and_test.yml workflow. --- .github/workflows/codestyle.yml | 53 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 53d0f9c5d8..237cf81205 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -30,8 +30,6 @@ jobs: # this is all we need to install our toolchain later via rust-toolchain.toml # so don't install any toolchain explicitly. os: [ubuntu-latest, macos-latest] - # To support several Postgres versions, add them here. - postgres_version: [v14, v15] timeout-minutes: 60 name: check codestyle rust and postgres runs-on: ${{ matrix.os }} @@ -56,17 +54,29 @@ jobs: if: matrix.os == 'macos-latest' run: brew install flex bison openssl - - name: Set pg revision for caching - id: pg_ver - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-${{matrix.postgres_version}}) + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) + shell: bash -euxo pipefail {0} - - name: Cache postgres ${{matrix.postgres_version}} build - id: cache_pg + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15) + shell: bash -euxo pipefail {0} + + - name: Cache postgres v14 build + id: cache_pg_14 uses: actions/cache@v3 with: - path: | - pg_install/${{matrix.postgres_version}} - key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + 
- name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS if: matrix.os == 'macos-latest' @@ -74,24 +84,19 @@ jobs: echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - name: Build postgres - if: steps.cache_pg.outputs.cache-hit != 'true' - run: make postgres + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: make postgres-v14 + shell: bash -euxo pipefail {0} + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: make postgres-v15 + shell: bash -euxo pipefail {0} - name: Build neon extensions run: make neon-pg-ext - # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' - # and the real cause will be inside config.log - - name: Print configure logs in case of failure - if: failure() - continue-on-error: true - run: | - echo '' && echo '=== Postgres ${{matrix.postgres_version}} config.log ===' && echo '' - cat pg_install/build/${{matrix.postgres_version}}/config.log - echo '' && echo '=== Postgres ${{matrix.postgres_version}} configure.log ===' && echo '' - cat pg_install/build/${{matrix.postgres_version}}/configure.log - - name: Cache cargo deps id: cache_cargo uses: actions/cache@v3 From 9d9d8e951947b9cbaca4ab11937bda8d681dc24c Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 15 Sep 2022 19:16:07 +0200 Subject: [PATCH 31/33] docs/sourcetree: update CLion set up instructions (#2454) After #2325 the old method no longer works as our Makefile does not print compilation commands when run with --dry-run, see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325 This method is much slower but is hopefully robust. Add some more notes while we're here. 
--- docs/sourcetree.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index c1a860f126..8043450a55 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -147,8 +147,16 @@ C code requires some extra care, as it's built via Make, not CMake. Some of our ```bash # Install a `compiledb` tool which can parse make's output and generate the compilation database. poetry add -D compiledb - # Run Make without actually compiling code so we can generate the compilation database. It still may take a few minutes. - make --dry-run --print-directory --keep-going --assume-new=* postgres neon-pg-ext | poetry run compiledb --verbose --no-build + # Clean the build tree so we can rebuild from scratch. + # Unfortunately, our and Postgres Makefiles do not work well with either --dry-run or --assume-new, + # so we don't know a way to generate the compilation database without recompiling everything, + # see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325 + make distclean + # Rebuild the Postgres parts from scratch and save the compilation commands to the compilation database. + # You can alter the -j parameter to your liking. + # Note that we only build for a specific version of Postgres. The extension code is shared, but headers are + # different, so we set up CLion to only use a specific version of the headers. + make -j$(nproc) --print-directory postgres-v15 neon-pg-ext-v15 | poetry run compiledb --verbose --no-build # Uninstall the tool poetry remove -D compiledb # Make sure the compile_commands.json file is not committed. @@ -157,7 +165,8 @@ C code requires some extra care, as it's built via Make, not CMake. Some of our 3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. 
_Do not_ open the directory as a project, open the file. 4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain). 5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code. -7. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. +6. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. +7. Set up correct code indentation in CLion's settings: Editor > Code Style > C/C++, choose the "Project" scheme on the top, and tick the "Use tab character" on the "Tabs and Indents" tab. Ensure that "Tab size" is 4. You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter. @@ -168,3 +177,4 @@ Known issues (fixes and suggestions are welcome): * Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead. * CLion does not support non-local Python interpreters, unlike PyCharm. E.g. if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited. * Cargo Clippy diagnostics in CLion may take a lot of resources. +* `poetry add -D` updates some packages and changes `poetry.lock` drastically even when followed by `poetry remove -D`. Feel free to `git checkout poetry.lock` and `./scripts/pysync` to revert these changes. From e968b5e5025616f2a7d03cd7307c54a49185925c Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 15 Sep 2022 20:43:51 +0200 Subject: [PATCH 32/33] tests: do not set num_safekeepers = 1, it's the default (#2457) Also get rid of `with_safekeepers` parameter in tests.
Its meaning has changed: `False` meant "no safekeepers" which is not supported anymore, so we assume it's always `True`. See #1648 --- test_runner/performance/test_perf_pgbench.py | 1 - test_runner/regress/test_auth.py | 8 +++----- test_runner/regress/test_branch_behind.py | 7 ------- test_runner/regress/test_crafted_wal_end.py | 1 - test_runner/regress/test_fullbackup.py | 2 -- test_runner/regress/test_import.py | 2 -- test_runner/regress/test_lsn_mapping.py | 1 - test_runner/regress/test_pitr_gc.py | 2 -- test_runner/regress/test_recovery.py | 1 - test_runner/regress/test_tenants.py | 18 ++++++------------ test_runner/regress/test_wal_acceptor.py | 1 - 11 files changed, 9 insertions(+), 35 deletions(-) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 934642d095..2a2213b783 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -173,7 +173,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): - neon_env_builder.num_safekeepers = 1 neon_env_builder.pageserver_config_override = """ profiling="page_requests" """ diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index d9082efada..ce4a8ffa9e 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -56,14 +56,12 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): tenant_http_client.tenant_create() -@pytest.mark.parametrize("with_safekeepers", [False, True]) -def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): +def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True - if with_safekeepers: - 
neon_env_builder.num_safekeepers = 3 + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - branch = f"test_compute_auth_to_pageserver{with_safekeepers}" + branch = "test_compute_auth_to_pageserver" env.neon_cli.create_branch(branch) pg = env.postgres.create_start(branch) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index cfb9649867..b0d0737172 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -10,13 +10,6 @@ from fixtures.utils import print_gc_result, query_scalar # Create a couple of branches off the main branch, at a historical point in time. # def test_branch_behind(neon_env_builder: NeonEnvBuilder): - - # Use safekeeper in this test to avoid a subtle race condition. - # Without safekeeper, walreceiver reconnection can stuck - # because of IO deadlock. - # - # See https://github.com/neondatabase/neon/issues/1068 - neon_env_builder.num_safekeepers = 1 # Disable pitr, because here we want to test branch creation after GC neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 32e5366945..e94c9a2bd0 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -17,7 +17,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft ], ) def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 8de2687c9b..0048e7b580 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -18,8 +18,6 @@ num_rows = 1000 def test_fullbackup( 
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor ): - - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_fullbackup") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 60cc0551ab..7b61b03b97 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -122,7 +122,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build @pytest.mark.timeout(600) def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() @@ -140,7 +139,6 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu # @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255") def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 9d1efec2c1..ef99954a76 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -9,7 +9,6 @@ from fixtures.utils import query_scalar # Test pageserver get_lsn_by_timestamp API # def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 786266b70e..57b2ee1c04 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -12,8 +12,6 @@ from 
fixtures.utils import print_gc_result, query_scalar # Insert some data, run GC and create a branch in the past. # def test_pitr_gc(neon_env_builder: NeonEnvBuilder): - - neon_env_builder.num_safekeepers = 1 # Set pitr interval such that we need to keep the data neon_env_builder.pageserver_config_override = ( "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 6aa8b4e9be..08c15d8f09 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -10,7 +10,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Test pageserver recovery after crash # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 # Override default checkpointer settings to run it more often neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4e7610a96f..4500395c8f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -50,29 +50,23 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ), "pageserver should clean its temp tenant dirs on restart" -@pytest.mark.parametrize("with_safekeepers", [False, True]) -def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): - if with_safekeepers: - neon_env_builder.num_safekeepers = 3 +def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() """Tests tenants with and without wal acceptors""" tenant_1, _ = env.neon_cli.create_tenant() tenant_2, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_1 - ) - env.neon_cli.create_timeline( - 
f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_2 - ) + env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_1) + env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", + "test_tenants_normal_work", tenant_id=tenant_1, ) pg_tenant2 = env.postgres.create_start( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", + "test_tenants_normal_work", tenant_id=tenant_2, ) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8c5b4c8c30..089ed91c98 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1037,7 +1037,6 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): - neon_env_builder.num_safekeepers = 1 neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() From 96e867642fbe730a3fe13c572383d68b393ca567 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 15 Sep 2022 18:20:23 -0400 Subject: [PATCH 33/33] Validate tenant create options (#2450) Co-authored-by: Kirill Bulatov --- control_plane/src/storage.rs | 79 ++++++++++++++----------- test_runner/regress/test_tenant_conf.py | 16 ++++- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2cc5e096c..3bbbdc5865 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -371,43 +371,50 @@ impl PageServerNode { new_tenant_id: Option, settings: HashMap<&str, &str>, ) -> anyhow::Result { + let mut settings = settings.clone(); + let request = TenantCreateRequest { + new_tenant_id, + checkpoint_distance: settings + .remove("checkpoint_distance") + .map(|x| 
x.parse::()) + .transpose()?, + checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + compaction_target_size: settings + .remove("compaction_target_size") + .map(|x| x.parse::()) + .transpose()?, + compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_threshold: settings + .remove("compaction_threshold") + .map(|x| x.parse::()) + .transpose()?, + gc_horizon: settings + .remove("gc_horizon") + .map(|x| x.parse::()) + .transpose()?, + gc_period: settings.remove("gc_period").map(|x| x.to_string()), + image_creation_threshold: settings + .remove("image_creation_threshold") + .map(|x| x.parse::()) + .transpose()?, + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .remove("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings + .remove("lagging_wal_timeout") + .map(|x| x.to_string()), + max_lsn_wal_lag: settings + .remove("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, + }; + if !settings.is_empty() { + bail!("Unrecognized tenant settings: {settings:?}") + } self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) - .json(&TenantCreateRequest { - new_tenant_id, - checkpoint_distance: settings - .get("checkpoint_distance") - .map(|x| x.parse::()) - .transpose()?, - checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()), - compaction_target_size: settings - .get("compaction_target_size") - .map(|x| x.parse::()) - .transpose()?, - compaction_period: settings.get("compaction_period").map(|x| x.to_string()), - compaction_threshold: settings - .get("compaction_threshold") - .map(|x| x.parse::()) - .transpose()?, - gc_horizon: settings - .get("gc_horizon") - .map(|x| x.parse::()) - .transpose()?, - gc_period: settings.get("gc_period").map(|x| x.to_string()), - image_creation_threshold: 
settings - .get("image_creation_threshold") - .map(|x| x.parse::()) - .transpose()?, - pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), - walreceiver_connect_timeout: settings - .get("walreceiver_connect_timeout") - .map(|x| x.to_string()), - lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), - max_lsn_wal_lag: settings - .get("max_lsn_wal_lag") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - }) + .json(&request) .send()? .error_from_body()? .json::>() diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 51a8101b11..c6cf416d12 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -6,6 +6,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_tenant_config(neon_env_builder: NeonEnvBuilder): + """Test per tenant configuration""" # set some non-default global config neon_env_builder.pageserver_config_override = """ page_cache_size=444; @@ -13,7 +14,20 @@ wait_lsn_timeout='111 s'; tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" env = neon_env_builder.init_start() - """Test per tenant configuration""" + + # Check that we raise on misspelled configs + invalid_conf_key = "some_invalid_setting_name_blah_blah_123" + try: + env.neon_cli.create_tenant( + conf={ + invalid_conf_key: "20000", + } + ) + except Exception as e: + assert invalid_conf_key in str(e) + else: + raise AssertionError("Expected validation error") + tenant, _ = env.neon_cli.create_tenant( conf={ "checkpoint_distance": "20000",