Conflicts:
libs/pageserver_api/src/models.rs
pageserver/src/lib.rs
pageserver/src/tenant_mgr.rs
There was a merge conflict following attach_tenant(), and I didn't
understand why Git flagged it as a conflict.
I went through the changes in `origin/main` since the last
merge done by Heikki, but couldn't find anything that would
conflict there.
Original git diff right after `git merge` follows:
diff --cc libs/pageserver_api/src/models.rs
index 750585b58,aefd79336..000000000
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@@ -15,17 -15,13 +15,27 @@@ use bytes::{BufMut, Bytes, BytesMut}
/// A state of a tenant in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TenantState {
++<<<<<<< HEAD
+ // This tenant is being loaded from local disk
+ Loading,
+ // This tenant is being downloaded from cloud storage.
+ Attaching,
+ /// Tenant is fully operational
+ Active,
+ /// A tenant is recognized by pageserver, but it is being detached or the system is being
+ /// shut down.
+ Paused,
+ /// A tenant is recognized by the pageserver, but can no longer used for any operations,
+ /// because it failed to get activated.
++=======
+ /// Tenant is fully operational, its background jobs might be running or not.
+ Active { background_jobs_running: bool },
+ /// A tenant is recognized by pageserver, but it is being detached or the
+ /// system is being shut down.
+ Paused,
+ /// A tenant is recognized by the pageserver, but can no longer be used for
+ /// any operations, because it failed to be activated.
++>>>>>>> origin/main
Broken,
}
diff --cc pageserver/src/lib.rs
index 2d5b66f57,e3112223e..000000000
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@@ -22,7 -23,11 +23,13 @@@ pub mod walreceiver
pub mod walrecord;
pub mod walredo;
++<<<<<<< HEAD
++=======
+ use std::collections::HashMap;
+ use std::path::Path;
+
++>>>>>>> origin/main
use tracing::info;
-use utils::id::{TenantId, TimelineId};
use crate::task_mgr::TaskKind;
@@@ -103,14 -108,51 +110,64 @@@ fn exponential_backoff_duration_seconds
}
}
++<<<<<<< HEAD
+/// A suffix to be used during file sync from the remote storage,
+/// to ensure that we do not leave corrupted files that pretend to be layers.
+const TEMP_FILE_SUFFIX: &str = "___temp";
++=======
+ /// A newtype to store arbitrary data grouped by tenant and timeline ids.
+ /// One could use [`utils::id::TenantTimelineId`] for grouping, but that would
+ /// not include the cases where a certain tenant has zero timelines.
+ /// This is sometimes important: a tenant could be registered during initial load from FS,
+ /// even if he has no timelines on disk.
+ #[derive(Debug)]
+ pub struct TenantTimelineValues<T>(HashMap<TenantId, HashMap<TimelineId, T>>);
+
+ impl<T> TenantTimelineValues<T> {
+ fn new() -> Self {
+ Self(HashMap::new())
+ }
+ }
+
+ /// The name of the metadata file pageserver creates per timeline.
+ /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
+ pub const METADATA_FILE_NAME: &str = "metadata";
+
+ /// Per-tenant configuration file.
+ /// Full path: `tenants/<tenant_id>/config`.
+ pub const TENANT_CONFIG_NAME: &str = "config";
+
+ /// A suffix used for various temporary files. Any temporary files found in the
+ /// data directory at pageserver startup can be automatically removed.
+ pub const TEMP_FILE_SUFFIX: &str = "___temp";
+
+ /// A marker file to mark that a timeline directory was not fully initialized.
+ /// If a timeline directory with this marker is encountered at pageserver startup,
+ /// the timeline directory and the marker file are both removed.
+ /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
+ pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
+
+ pub fn is_temporary(path: &Path) -> bool {
+ match path.file_name() {
+ Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
+ None => false,
+ }
+ }
+
+ pub fn is_uninit_mark(path: &Path) -> bool {
+ match path.file_name() {
+ Some(name) => name
+ .to_string_lossy()
+ .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
+ None => false,
++ }
++}
++>>>>>>> origin/main
+
+pub fn is_temporary(path: &std::path::Path) -> bool {
+ match path.file_name() {
+ Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
+ None => false,
}
}
diff --cc pageserver/src/tenant_mgr.rs
index 73593bc48,061d7fa19..000000000
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@@ -13,11 -13,18 +13,22 @@@ use tracing::*
use remote_storage::GenericRemoteStorage;
use crate::config::PageServerConf;
++<<<<<<< HEAD
++=======
+ use crate::http::models::TenantInfo;
+ use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
+ use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
++>>>>>>> origin/main
use crate::task_mgr::{self, TaskKind};
-use crate::tenant::{
- ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState,
-};
+use crate::tenant::{Tenant, TenantState};
use crate::tenant_config::TenantConfOpt;
++<<<<<<< HEAD
++=======
+ use crate::walredo::PostgresRedoManager;
+ use crate::{is_temporary, is_uninit_mark, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
++>>>>>>> origin/main
-use utils::crashsafe::{self, path_with_suffix_extension};
+use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
mod tenants_state {
@@@ -341,87 -521,334 +352,247 @@@ pub fn list_tenants() -> Vec<(TenantId
.collect()
}
-#[derive(Debug)]
-pub enum TenantAttachData {
- Ready(HashMap<TimelineId, TimelineLocalFiles>),
- Broken(anyhow::Error),
-}
-/// Attempts to collect information about all tenant and timelines, existing on the local FS.
-/// If finds any, deletes all temporary files and directories, created before. Also removes empty directories,
-/// that may appear due to such removals.
-/// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities.
-fn local_tenant_timeline_files(
- config: &'static PageServerConf,
-) -> anyhow::Result<HashMap<TenantId, TenantAttachData>> {
- let _entered = info_span!("local_tenant_timeline_files").entered();
-
- let mut local_tenant_timeline_files = HashMap::new();
- let tenants_dir = config.tenants_path();
- for tenants_dir_entry in fs::read_dir(&tenants_dir)
- .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
- {
- match &tenants_dir_entry {
- Ok(tenants_dir_entry) => {
- let tenant_dir_path = tenants_dir_entry.path();
- if is_temporary(&tenant_dir_path) {
- info!(
- "Found temporary tenant directory, removing: {}",
- tenant_dir_path.display()
- );
- if let Err(e) = fs::remove_dir_all(&tenant_dir_path) {
- error!(
- "Failed to remove temporary directory '{}': {:?}",
- tenant_dir_path.display(),
- e
- );
- }
- } else {
- match collect_timelines_for_tenant(config, &tenant_dir_path) {
- Ok((tenant_id, TenantAttachData::Broken(e))) => {
- local_tenant_timeline_files.entry(tenant_id).or_insert(TenantAttachData::Broken(e));
- },
- Ok((tenant_id, TenantAttachData::Ready(collected_files))) => {
- if collected_files.is_empty() {
- match remove_if_empty(&tenant_dir_path) {
- Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()),
- Ok(false) => {
- // insert empty timeline entry: it has some non-temporary files inside that we cannot remove
- // so make obvious for HTTP API callers, that something exists there and try to load the tenant
- let _ = local_tenant_timeline_files.entry(tenant_id).or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
- },
- Err(e) => error!("Failed to remove empty tenant directory: {e:?}"),
- }
- } else {
- match local_tenant_timeline_files.entry(tenant_id) {
- hash_map::Entry::Vacant(entry) => {
- entry.insert(TenantAttachData::Ready(collected_files));
- }
- hash_map::Entry::Occupied(entry) =>{
- if let TenantAttachData::Ready(old_timelines) = entry.into_mut() {
- old_timelines.extend(collected_files);
- }
- },
- }
- }
- },
- Err(e) => error!(
- "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
- tenants_dir.display(),
- tenants_dir_entry,
- e
- ),
- }
+/// Execute Attach mgmt API command.
+///
+/// Downloading all the tenant data is performed in the background, this merely
+/// spawns the background task and returns quickly.
+pub async fn attach_tenant(
+ conf: &'static PageServerConf,
+ tenant_id: TenantId,
+ remote_storage: &GenericRemoteStorage,
+) -> anyhow::Result<()> {
+ match tenants_state::write_tenants().entry(tenant_id) {
+ hash_map::Entry::Occupied(e) => {
+ // Cannot attach a tenant that already exists. The error message depends on
+ // the state it's in.
+ match e.get().current_state() {
+ TenantState::Attaching => {
+ anyhow::bail!("tenant {tenant_id} attach is already in progress")
}
++<<<<<<< HEAD
+ current_state => {
+ anyhow::bail!("tenant already exists, current state: {current_state:?}")
++=======
+ }
+ Err(e) => error!(
+ "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
+ tenants_dir_entry,
+ tenants_dir.display(),
+ e
+ ),
+ }
+ }
+
+ info!(
+ "Collected files for {} tenants",
+ local_tenant_timeline_files.len(),
+ );
+ Ok(local_tenant_timeline_files)
+ }
+
+ fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result<bool> {
+ let directory_is_empty = tenant_dir_path
+ .read_dir()
+ .with_context(|| {
+ format!(
+ "Failed to read directory '{}' contents",
+ tenant_dir_path.display()
+ )
+ })?
+ .next()
+ .is_none();
+
+ if directory_is_empty {
+ fs::remove_dir_all(&tenant_dir_path).with_context(|| {
+ format!(
+ "Failed to remove empty directory '{}'",
+ tenant_dir_path.display(),
+ )
+ })?;
+
+ Ok(true)
+ } else {
+ Ok(false)
+ }
+ }
+
+ fn collect_timelines_for_tenant(
+ config: &'static PageServerConf,
+ tenant_path: &Path,
+ ) -> anyhow::Result<(TenantId, TenantAttachData)> {
+ let tenant_id = tenant_path
+ .file_name()
+ .and_then(OsStr::to_str)
+ .unwrap_or_default()
+ .parse::<TenantId>()
+ .context("Could not parse tenant id out of the tenant dir name")?;
+ let timelines_dir = config.timelines_path(&tenant_id);
+
+ if !timelines_dir.as_path().is_dir() {
+ return Ok((
+ tenant_id,
+ TenantAttachData::Broken(anyhow::anyhow!(
+ "Tenant {} has no timelines directory at {}",
+ tenant_id,
+ timelines_dir.display()
+ )),
+ ));
+ }
+
+ let mut tenant_timelines = HashMap::new();
+ for timelines_dir_entry in fs::read_dir(&timelines_dir)
+ .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))?
+ {
+ match timelines_dir_entry {
+ Ok(timelines_dir_entry) => {
+ let timeline_dir = timelines_dir_entry.path();
+ if is_temporary(&timeline_dir) {
+ info!(
+ "Found temporary timeline directory, removing: {}",
+ timeline_dir.display()
+ );
+ if let Err(e) = fs::remove_dir_all(&timeline_dir) {
+ error!(
+ "Failed to remove temporary directory '{}': {:?}",
+ timeline_dir.display(),
+ e
+ );
+ }
+ } else if is_uninit_mark(&timeline_dir) {
+ let timeline_uninit_mark_file = &timeline_dir;
+ info!(
+ "Found an uninit mark file {}, removing the timeline and its uninit mark",
+ timeline_uninit_mark_file.display()
+ );
+ let timeline_id = timeline_uninit_mark_file
+ .file_stem()
+ .and_then(OsStr::to_str)
+ .unwrap_or_default()
+ .parse::<TimelineId>()
+ .with_context(|| {
+ format!(
+ "Could not parse timeline id out of the timeline uninit mark name {}",
+ timeline_uninit_mark_file.display()
+ )
+ })?;
+ let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
+ if let Err(e) =
+ remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
+ {
+ error!("Failed to clean up uninit marked timeline: {e:?}");
+ }
+ } else {
+ let timeline_id = timeline_dir
+ .file_name()
+ .and_then(OsStr::to_str)
+ .unwrap_or_default()
+ .parse::<TimelineId>()
+ .with_context(|| {
+ format!(
+ "Could not parse timeline id out of the timeline dir name {}",
+ timeline_dir.display()
+ )
+ })?;
+ let timeline_uninit_mark_file =
+ config.timeline_uninit_mark_file_path(tenant_id, timeline_id);
+ if timeline_uninit_mark_file.exists() {
+ info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark");
+ if let Err(e) = remove_timeline_and_uninit_mark(
+ &timeline_dir,
+ &timeline_uninit_mark_file,
+ ) {
+ error!("Failed to clean up uninit marked timeline: {e:?}");
+ }
+ } else {
+ match collect_timeline_files(&timeline_dir) {
+ Ok((metadata, timeline_files)) => {
+ tenant_timelines.insert(
+ timeline_id,
+ TimelineLocalFiles::collected(metadata, timeline_files),
+ );
+ }
+ Err(e) => {
+ error!(
+ "Failed to process timeline dir contents at '{}', reason: {:?}",
+ timeline_dir.display(),
+ e
+ );
+ match remove_if_empty(&timeline_dir) {
+ Ok(true) => info!(
+ "Removed empty timeline directory {}",
+ timeline_dir.display()
+ ),
+ Ok(false) => (),
+ Err(e) => {
+ error!("Failed to remove empty timeline directory: {e:?}")
+ }
+ }
+ }
+ }
+ }
++>>>>>>> origin/main
}
}
- Err(e) => {
- error!("Failed to list timelines for entry tenant {tenant_id}, reason: {e:?}")
- }
+ }
+ hash_map::Entry::Vacant(v) => {
+ let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage)?;
+ v.insert(tenant);
+ Ok(())
}
}
-
- if tenant_timelines.is_empty() {
- // this is normal, we've removed all broken, empty and temporary timeline dirs
- // but should allow the tenant to stay functional and allow creating new timelines
- // on a restart, we require tenants to have the timelines dir, so leave it on disk
- debug!("Tenant {tenant_id} has no timelines loaded");
- }
-
- Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
}
-fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
- fs::remove_dir_all(&timeline_dir)
- .or_else(|e| {
- if e.kind() == std::io::ErrorKind::NotFound {
- // we can leave the uninit mark without a timeline dir,
- // just remove the mark then
- Ok(())
- } else {
- Err(e)
- }
- })
- .with_context(|| {
- format!(
- "Failed to remove unit marked timeline directory {}",
- timeline_dir.display()
- )
- })?;
- fs::remove_file(&uninit_mark).with_context(|| {
- format!(
- "Failed to remove timeline uninit mark file {}",
- uninit_mark.display()
- )
- })?;
+#[cfg(feature = "testing")]
+use {
+ crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
+ utils::http::error::ApiError,
+};
- Ok(())
-}
+#[cfg(feature = "testing")]
+pub fn immediate_gc(
+ tenant_id: TenantId,
+ timeline_id: TimelineId,
+ gc_req: TimelineGcRequest,
+) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
+ let guard = tenants_state::read_tenants();
-// discover timeline files and extract timeline metadata
-// NOTE: ephemeral files are excluded from the list
-fn collect_timeline_files(
- timeline_dir: &Path,
-) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> {
- let mut timeline_files = HashMap::new();
- let mut timeline_metadata_path = None;
-
- let timeline_dir_entries =
- fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
- for entry in timeline_dir_entries {
- let entry_path = entry.context("Failed to list timeline dir entry")?.path();
- let metadata = entry_path.metadata()?;
-
- if metadata.is_file() {
- if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
- timeline_metadata_path = Some(entry_path);
- } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
- debug!("skipping ephemeral file {}", entry_path.display());
- continue;
- } else if is_temporary(&entry_path) {
- info!("removing temp timeline file at {}", entry_path.display());
- fs::remove_file(&entry_path).with_context(|| {
- format!(
- "failed to remove temp download file at {}",
- entry_path.display()
- )
- })?;
- } else {
- let layer_metadata = LayerFileMetadata::new(metadata.len());
- timeline_files.insert(entry_path, layer_metadata);
+ let tenant = guard
+ .get(&tenant_id)
+ .map(Arc::clone)
+ .with_context(|| format!("Tenant {tenant_id} not found"))
+ .map_err(ApiError::NotFound)?;
+
+ let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
+ // Use tenant's pitr setting
+ let pitr = tenant.get_pitr_interval();
+
+ // Run in task_mgr to avoid race with detach operation
+ let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+ task_mgr::spawn(
+ &tokio::runtime::Handle::current(),
+ TaskKind::GarbageCollector,
+ Some(tenant_id),
+ Some(timeline_id),
+ &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
+ false,
+ async move {
+ fail::fail_point!("immediate_gc_task_pre");
+ let result = tenant
+ .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
+ .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
+ .await;
+ // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
+ // better once the types support it.
+ match task_done.send(result) {
+ Ok(_) => (),
+ Err(result) => error!("failed to send gc result: {result:?}"),
}
+ Ok(())
}
- }
-
- // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed
- // then attach is lost. There would be no retries for that,
- // initial collect will fail because there is no metadata.
- // We either need to start download if we see empty dir after restart or attach caller should
- // be aware of that and retry attach if awaits_download for timeline switched from true to false
- // but timelinne didn't appear locally.
- // Check what happens with remote index in that case.
- let timeline_metadata_path = match timeline_metadata_path {
- Some(path) => path,
- None => anyhow::bail!("No metadata file found in the timeline directory"),
- };
- let metadata = TimelineMetadata::from_bytes(
- &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
- )
- .context("Failed to parse timeline metadata file bytes")?;
-
- anyhow::ensure!(
- metadata.ancestor_timeline().is_some() || !timeline_files.is_empty(),
- "Timeline has no ancestor and no layer files"
);
- Ok((metadata, timeline_files))
+ // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
+ drop(guard);
+
+ Ok(wait_task_done)
}
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index da50d99db..360ff1c63 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit da50d99db54848f7a3e910f920aaad7dc6915d36
+Subproject commit 360ff1c637a57d351a7a5a391d8e8afd8fde8c3a
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 780c3f8e3..d31b3f7c6 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 780c3f8e3524c2e32a2e28884c7b647fcebf71d7
+Subproject commit d31b3f7c6d108e52c8bb11e812ce4e266501ea3d
* Fix https://github.com/neondatabase/neon/issues/1854
* Never log Safekeeper::conninfo in walproposer as it now contains a secret token
* control_panel, test_runner: generate and pass JWT tokens for Safekeeper to compute and pageserver
* Compute: load JWT token for Safekeeper from the environment variable. Do not reuse the token from
pageserver_connstring because it's embedded in there weirdly.
* Pageserver: load JWT token for Safekeeper from the environment variable.
* Rewrite docs/authentication.md
There will be different scopes for those two, so the authorization code should be different.
The `check_permission` function is no longer in the shared library. Its implementation
is very similar to the one which will be added for Safekeeper. In fact, we may reuse
the same existing root-like 'PageServerApi' scope, but I would prefer to have separate
root-like scopes for services.
Also, generate_management_token in tests is generate_pageserver_token now.
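As a minimal sketch of the environment-variable loading, with a
hypothetical variable name (the log only says "the environment variable"):
```
use std::env;

fn load_safekeeper_jwt_token() -> Option<String> {
    // Hypothetical variable name; absent or non-unicode values are
    // treated as "no token configured".
    env::var("SAFEKEEPER_AUTH_TOKEN").ok()
}
```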
The new broker binary ought to replace etcd. This patch only adds the binary and adjusts the
Dockerfile to include it; subsequent ones will add deployment of the helm chart and the
actual replacement.
It is a simple and fast pub-sub message bus. In this patch only the safekeeper
message is supported, but others can easily be added.
Compilation now requires protoc to be installed. Installing the protobuf-compiler
package is fine for Debian/Ubuntu.
ref:
https://github.com/neondatabase/neon/pull/2733
https://github.com/neondatabase/neon/issues/2394
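As a hedged illustration of why protoc is needed at build time: a build.rs
along these lines (assuming prost-build and a hypothetical proto path)
shells out to protoc to generate the Rust message types:
```
// Hypothetical build.rs sketch; the real broker's proto paths may differ.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // prost-build invokes protoc, hence the protobuf-compiler requirement.
    prost_build::compile_protos(&["proto/broker.proto"], &["proto/"])?;
    Ok(())
}
```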
This change introduces a marker file
$repo/tenants/$tenant_id/attaching
that is present while a tenant is in Attaching state.
When pageserver restarts, we use it to resume the tenant attach operation.
Before this change, a crash during tenant attach would result in one of
the following:
1. crash upon restart due to missing metadata file (IIRC)
2. "successful" loading of the tenant with a subset of timelines
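A minimal sketch of the marker-file protocol, with hypothetical helper
names; only the $repo/tenants/$tenant_id/attaching path comes from the
change itself:
```
use std::fs;
use std::path::{Path, PathBuf};

fn attaching_marker_path(tenants_dir: &Path, tenant_id: &str) -> PathBuf {
    tenants_dir.join(tenant_id).join("attaching")
}

// Created before the attach starts, removed once all data is downloaded.
fn mark_attach_started(tenants_dir: &Path, tenant_id: &str) -> std::io::Result<()> {
    fs::File::create(attaching_marker_path(tenants_dir, tenant_id)).map(|_| ())
}

fn mark_attach_finished(tenants_dir: &Path, tenant_id: &str) -> std::io::Result<()> {
    fs::remove_file(attaching_marker_path(tenants_dir, tenant_id))
}

// On restart, a surviving marker means the attach must be resumed.
fn should_resume_attach(tenants_dir: &Path, tenant_id: &str) -> bool {
    attaching_marker_path(tenants_dir, tenant_id).exists()
}
```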
Tenant size information is gathered by using existing parts of
`Tenant::gc_iteration` which are now separated as
`Tenant::refresh_gc_info`. `Tenant::refresh_gc_info` collects branch
points, and invokes `Timeline::update_gc_info`; nothing was supposed to
be changed there. The gathered branch points (through Timeline's
`GcInfo::retain_lsns`), `GcInfo::horizon_cutoff`, and
`GcInfo::pitr_cutoff` are used to build up a Vec of updates fed into the
`libs/tenant_size_model` to calculate the history size.
The gathered information is now exposed using `GET
/v1/tenant/{tenant_id}/size`, which will respond with the actual
calculated size. Initially the idea was to have this delivered as tenant
background task and exported via metric, but it might be too
computationally expensive to run it periodically as we don't yet know if
the returned values are any good.
Adds one new metric:
- pageserver_storage_operations_seconds with label `logical_size`
- separated from the original `init_logical_size`
Adds a pageserver-wide configuration variable:
- `concurrent_tenant_size_logical_size_queries` with default 1
This leaves a lot of TODO's, tracked on issue #2748.
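A minimal sketch of how a knob like
`concurrent_tenant_size_logical_size_queries` can bound the expensive size
calculations, assuming a tokio semaphore; everything except the knob's name
and its default of 1 is hypothetical:
```
use std::future::Future;
use std::sync::Arc;
use tokio::sync::Semaphore;

struct SizeQueryLimiter {
    permits: Arc<Semaphore>,
}

impl SizeQueryLimiter {
    // Sized from concurrent_tenant_size_logical_size_queries (default 1).
    fn new(max_concurrent: usize) -> Self {
        Self { permits: Arc::new(Semaphore::new(max_concurrent)) }
    }

    // Size queries past the limit wait here instead of piling up.
    async fn run<T>(&self, query: impl Future<Output = T>) -> T {
        let _permit = self.permits.acquire().await.expect("semaphore closed");
        query.await
    }
}
```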
* Support configuring the log format as json or plain.
Test the json and plain loggers separately. They would be competing on the
same global subscriber otherwise.
* Implement log_format for pageserver config
* Implement configurable log format for safekeeper.
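A minimal sketch of such a switch, assuming tracing-subscriber with its
"json" feature; the per-service config plumbing differs:
```
fn init_logging(log_format: &str) {
    // .init() installs the *global* subscriber, which is why the json and
    // plain variants cannot be exercised in the same test process.
    match log_format {
        "json" => tracing_subscriber::fmt().json().init(),
        _ => tracing_subscriber::fmt().init(),
    }
}
```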
This API is rather pointless: a sane choice requires knowledge of peer
status anyway, and leader lifetimes can intersect in any case, which is fine
for us -- so manual elections are straightforward. Here, we deterministically
choose among the reasonably caught-up safekeepers, shifting by timeline id to
spread the load.
A step towards custom broker https://github.com/neondatabase/neon/issues/2394
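A minimal sketch of that deterministic choice, with hypothetical stand-in
types:
```
// Pick among the caught-up safekeepers, offset by timeline id so that
// different timelines land on different peers.
fn choose_offloader(caught_up_safekeepers: &[u64], timeline_id: u128) -> Option<u64> {
    if caught_up_safekeepers.is_empty() {
        return None;
    }
    let idx = (timeline_id % caught_up_safekeepers.len() as u128) as usize;
    caught_up_safekeepers.get(idx).copied()
}
```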
Part of https://github.com/neondatabase/neon/pull/2239
Regular, from-scratch timeline creation involves running initdb in a separate directory, importing the data from that directory into the pageserver and, finally, starting the timeline-related background tasks.
This PR ensures we don't leave behind any directories that are not marked as temporary, and that the pageserver removes such directories on restart, allowing timeline creation to be retried with the same IDs, if needed.
It would be good to later rewrite the logic to use a temporary directory, similar to what tenant creation does.
Yet currently that's harder than this change, so it's not done here.
* etcd-client is not updated, since we plan to replace it with another client, and the new version fails with a missing prost library error
* clap has released another major update that requires changing every CLI declaration again; that deserves a separate PR
With the ability to pass commit_lsn. This allows performing project WAL recovery
through a different (from the original) set of safekeepers (or under a different
ttid) by
1) moving WAL files to s3 under the proper ttid;
2) explicitly creating the timeline on safekeepers, setting commit_lsn to the
latest point;
3) putting the latest .partial file into the timeline directory on safekeepers, if
desired.
Extend test_s3_wal_replay to exercise this behaviour.
Also extend the timeline_status endpoint to return postgres information.
We had a problem where almost all of the threads were waiting on a futex syscall. More specifically:
- `/metrics` handler was inside `TimelineCollector::collect()`, waiting on a mutex for a single Timeline
- This exact timeline was inside `control_file::FileStorage::persist()`, waiting on a mutex for Lazy initialization of `PERSIST_CONTROL_FILE_SECONDS`
- `PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram>` was blocked on `prometheus::register`
- `prometheus::register` calls `DEFAULT_REGISTRY.write().register()` to take a write lock on Registry and add a new metric
- `DEFAULT_REGISTRY` lock was already taken inside `DEFAULT_REGISTRY.gather()`, which was called by `/metrics` handler to collect all metrics
This commit creates another Registry with a separate lock, to avoid deadlock in the case where `TimelineCollector` triggers registration of new metrics inside the default registry.
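A minimal sketch of that shape, assuming the prometheus and once_cell
crates; only PERSIST_CONTROL_FILE_SECONDS is a name from the incident above:
```
use once_cell::sync::Lazy;
use prometheus::{Histogram, HistogramOpts, Registry};

// A dedicated registry with its own lock, independent of
// prometheus::default_registry().
static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);

static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    let h = Histogram::with_opts(HistogramOpts::new(
        "persist_control_file_seconds",
        "Time spent persisting the control file",
    ))
    .expect("failed to create histogram");
    // Registration takes INTERNAL_REGISTRY's lock, not the default
    // registry's, so it cannot deadlock against a concurrent gather().
    INTERNAL_REGISTRY
        .register(Box::new(h.clone()))
        .expect("failed to register histogram");
    h
});
```
The `/metrics` handler would then presumably gather both registries and
concatenate the output.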
Part of the general work on improving pageserver logs.
Brief summary of changes:
* Remove `ApiError::from_err`
* Remove `impl From<anyhow::Error> for ApiError`
* Convert `ApiError::{BadRequest, NotFound}` to use `anyhow::Error`
* Note: `NotFound` has more verbose formatting because it's more
likely to have useful information for the receiving "user"
* Explicitly convert from `tokio::task::JoinError`s into
`InternalServerError`s where appropriate
Also note: many of the places where errors were implicitly converted to
500s have now been updated to return a more appropriate error. Some
places where it's not yet possible to distinguish the error types have
been left as 500s.
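A minimal sketch of the resulting error shape; the real enum has more
variants and response-conversion plumbing:
```
use tokio::task::JoinError;

#[derive(Debug)]
pub enum ApiError {
    // BadRequest and NotFound now carry anyhow::Error for context;
    // NotFound formats more verbosely since it is user-facing.
    BadRequest(anyhow::Error),
    NotFound(anyhow::Error),
    InternalServerError(anyhow::Error),
}

// With the blanket `From<anyhow::Error>` gone, conversions are explicit;
// JoinErrors from spawned tasks become 500s where appropriate.
fn join_error_to_api_error(e: JoinError) -> ApiError {
    ApiError::InternalServerError(anyhow::Error::new(e))
}
```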
Follow-up to PR #2433 (b8eb908a). There are still a few more unresolved
locations that have been left as-is for the same compatibility reasons
as in the original PR.
Instead of spawning helper threads, we now use Tokio tasks. There
are multiple Tokio runtimes, for different kinds of tasks. One for
serving libpq client connections, another for background operations
like GC and compaction, and so on. That's not strictly required -- we
could use just one runtime -- but with this you can still get an
overview of what's happening with "top -H".
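A minimal sketch of that layout, assuming tokio and once_cell; the runtime
and thread names here are hypothetical:
```
use once_cell::sync::Lazy;
use tokio::runtime::{Builder, Runtime};

// One runtime per kind of work; the thread names show up in "top -H".
pub static LIBPQ_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    Builder::new_multi_thread()
        .thread_name("libpq worker")
        .enable_all()
        .build()
        .expect("failed to create libpq runtime")
});

pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    Builder::new_multi_thread()
        .thread_name("background op worker")
        .enable_all()
        .build()
        .expect("failed to create background runtime")
});
```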
There's one subtle change of behavior in how TenantState is updated. Before this
patch, if you deleted all timelines from a tenant, its GC and
compaction loops were stopped, and the tenant went back to Idle
state. We no longer do that. The empty tenant stays Active. The
changes to test_tenant_tasks.py are related to that.
There's still plenty of synchronous code and blocking. For example, we
still use blocking std::io functions for all file I/O, and the
communication with WAL redo processes is still uses low-level unix
poll(). We might want to rewrite those later, but this will do for
now. The model is that local file I/O is considered to be fast enough
that blocking - and preventing other tasks running in the same thread -
is acceptable.
Previously, proxy didn't forward the auxiliary `options` parameter
and other such parameters to the client's compute node, e.g.
```
$ psql "user=john host=localhost dbname=postgres options='-cgeqo=off'"
postgres=# show geqo;
┌──────┐
│ geqo │
├──────┤
│ on │
└──────┘
(1 row)
```
With this patch we now forward `options`, `application_name` and `replication`.
Further reading: https://www.postgresql.org/docs/current/libpq-connect.html
Fixes #1287.
`latest_gc_cutoff_lsn` tracks the cutoff point where GC has been
performed. Anything older than the cutoff might already have been GC'd
away, and cannot be queried by get_page_at_lsn requests. It's
protected by an RWLock. Whenever a get_page_at_lsn request comes in,
it first grabs the lock and reads the current `latest_gc_cutoff`, and
holds the lock until the request has been served. The lock ensures
that GC doesn't start concurrently and remove page versions that we
still need to satisfy the request.
With the lock, a get_page_at_lsn request could potentially be blocked
for a long time. GC only holds the lock in exclusive mode for a short
duration, but depending on whether the RWLock is "fair", a read
request might be queued behind the GC's exclusive request, which in
turn might be queued behind a long-running read operation, like a
basebackup. If the lock implementation is not fair, i.e. if a reader
can always jump the queue if the lock is already held in read mode,
then another problem arises: GC might be starved if a constant stream
of GetPage requests comes in.
To avoid the long wait or starvation, introduce a Read-Copy-Update
mechanism to replace the lock on `latest_gc_cutoff_lsn`. With the RCU,
a reader can always read the latest value without blocking (except for a
very short duration if the lock protecting the RCU is contended;
that's comparable to a spinlock). And a writer can always write a new
value without waiting for readers to finish using the old value. The
old readers will continue to see the old value through their guard
object, while new readers will see the new value.
This is purely theoretical ATM; we don't have any reports of either
starvation or blocking behind GC happening in practice. But it's
simple to fix, so let's nip that problem in the bud.
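A minimal RCU sketch over std, matching the behavior described above:
readers never block for long, writers publish immediately, and old readers
keep the old value alive through their own handle:
```
use std::sync::{Arc, Mutex};

pub struct Rcu<T> {
    current: Mutex<Arc<T>>,
}

impl<T> Rcu<T> {
    pub fn new(value: T) -> Self {
        Self { current: Mutex::new(Arc::new(value)) }
    }

    /// Never blocks for long: only the short-lived mutex around the
    /// pointer is taken, comparable to a spinlock.
    pub fn read(&self) -> Arc<T> {
        Arc::clone(&self.current.lock().unwrap())
    }

    /// Publish a new value without waiting for readers; existing readers
    /// keep seeing the old value through the Arc they already hold.
    pub fn update(&self, value: T) {
        *self.current.lock().unwrap() = Arc::new(value);
    }
}
```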
`///` is used for doc comments on the code that *follows*, so the comment
actually applied to the `use std::collections::BTreeMap;` line after it.
rustfmt complained about that:
error: an inner attribute is not permitted following an outer doc comment
--> /home/heikki/git-sandbox/neon/libs/utils/src/seqwait_async.rs:7:1
|
5 | ///
| --- previous doc comment
6 |
7 | #![warn(missing_docs)]
| ^^^^^^^^^^^^^^^^^^^^^^ not permitted following an outer attribute
8 |
9 | use std::collections::BTreeMap;
| ------------------------------- the inner attribute doesn't annotate this `use` import
|
= note: inner attributes, like `#![no_std]`, annotate the item enclosing them, and are usually found at the beginning of source files
help: to annotate the `use` import, change the attribute from inner to outer style
|
7 - #![warn(missing_docs)]
7 + #[warn(missing_docs)]
|
`//!` is the correct syntax for comments that apply to the whole file.
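A minimal sketch of the corrected layout:
```
//! File-level docs use `//!`, so they may be followed by inner attributes.

#![warn(missing_docs)]

use std::collections::BTreeMap;

/// `///` documents the *next* item: this struct, not the file.
pub struct Example {
    /// Keys kept in sorted order.
    pub map: BTreeMap<u64, String>,
}
```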
[proxy] Add the `password hack` authentication flow
This lets us authenticate users which can use neither
SNI (due to old libpq) nor connection string `options`
(due to restrictions in other client libraries).
Note: `PasswordHack` will accept passwords which are not
encoded in base64 via the "password" field. The assumption
is that most user passwords will be valid utf-8 strings,
and the rest may still be passed via "password_".
* Added project option in case SNI data is missing. Resolves issue #1745.
* Added invariant checking for project name: if both sni_data and project_name are available then they should match.
- Uncomment the accidentally commented `self.keep_alive.abort()` line; because of this,
the task never finished, which blocked the launcher.
- Rework initialization one more time, to fix the offloader trying to back up
segment 0. Now we initialize all required LSNs in handle_elected,
where we learn the start LSN for the first time.
- Fix a blind attempt to provide the safekeeper service file with remote storage
params.
A separate task is launched for each timeline and stopped when the timeline doesn't
need offloading. The decision of who offloads is made through etcd leader election;
currently there is no precondition for participating; that's a TODO.
neon_local and test infrastructure for remote storage in safekeepers were added,
along with the test itself.
ref #1009
Co-authored-by: Anton Shyrabokau <ahtoxa@Antons-MacBook-Pro.local>
- Enabled process exporter for storage services
- Changed zenith_proxy prefix to just proxy
- Removed old `monitoring` directory
- Removed the common prefix for metrics; now our common metrics have the `libmetrics_` prefix, for example `libmetrics_serve_metrics_count`
- Added `test_metrics_normal_work`