From 15c5f3e6cfe86774c55ced328bd507266d464f0f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 30 Aug 2022 22:18:01 +0300 Subject: [PATCH 001/166] Fix misc typos in comments and variable names. --- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage.rs | 2 +- pageserver/src/layered_repository/timeline.rs | 2 +- pageserver/src/storage_sync/download.rs | 24 +++++++++---------- safekeeper/src/bin/safekeeper.rs | 2 +- test_runner/regress/test_tenant_relocation.py | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 652736058a..2cc1ae7853 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -46,7 +46,7 @@ impl ResponseErrorMessageExt for Response { return Ok(self); } - // reqwest do not export it's error construction utility functions, so lets craft the message ourselves + // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = self.url().to_owned(); Err(SafekeeperHttpError::Response( match self.json::() { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index aab29628e3..9fdab5f88c 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -57,7 +57,7 @@ impl ResponseErrorMessageExt for Response { return Ok(self); } - // reqwest do not export it's error construction utility functions, so lets craft the message ourselves + // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = self.url().to_owned(); Err(PageserverHttpError::Response( match self.json::() { diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 1a941affe5..81bc975272 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -379,7 +379,7 @@ pub struct Timeline { // It is needed in 
checks when we want to error on some operations // when they are requested for pre-initdb lsn. // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", - // though lets keep them both for better error visibility. + // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 8e6aa47c88..ded4c042c4 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -234,11 +234,11 @@ pub(super) async fn download_timeline_layers<'a>( let mut download_tasks = layers_to_download .into_iter() - .map(|layer_desination_path| async move { - if layer_desination_path.exists() { + .map(|layer_destination_path| async move { + if layer_destination_path.exists() { debug!( "Layer already exists locally, skipping download: {}", - layer_desination_path.display() + layer_destination_path.display() ); } else { // Perform a rename inspired by durable_rename from file_utils.c. @@ -252,7 +252,7 @@ pub(super) async fn download_timeline_layers<'a>( // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = - path_with_suffix_extension(&layer_desination_path, TEMP_DOWNLOAD_EXTENSION); + path_with_suffix_extension(&layer_destination_path, TEMP_DOWNLOAD_EXTENSION); let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { @@ -262,7 +262,7 @@ pub(super) async fn download_timeline_layers<'a>( ) })?; - let mut layer_download = download_storage_object(storage, &layer_desination_path) + let mut layer_download = download_storage_object(storage, &layer_destination_path) .await .with_context(|| { format!( @@ -284,9 +284,9 @@ pub(super) async fn download_timeline_layers<'a>( // that have not yet completed. 
To ensure that a file is closed immediately when it is dropped, // you should call flush before dropping it. // - // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because - // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. - // But for additional safety lets check/wait for any pending operations. + // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because + // we assume that `destination_file` file is fully written. I.e. there are no pending .write(...).await operations. + // But for additional safety let's check/wait for any pending operations. destination_file.flush().await.with_context(|| { format!( "failed to flush source file at {}", @@ -307,16 +307,16 @@ pub(super) async fn download_timeline_layers<'a>( anyhow::bail!("remote-storage-download-pre-rename failpoint triggered") }); - fs::rename(&temp_file_path, &layer_desination_path).await?; + fs::rename(&temp_file_path, &layer_destination_path).await?; - fsync_path(&layer_desination_path).await.with_context(|| { + fsync_path(&layer_destination_path).await.with_context(|| { format!( "Cannot fsync layer destination path {}", - layer_desination_path.display(), + layer_destination_path.display(), ) })?; } - Ok::<_, anyhow::Error>(layer_desination_path) + Ok::<_, anyhow::Error>(layer_destination_path) }) .collect::>(); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6c9c59c76b..244c793250 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -70,7 +70,7 @@ fn main() -> anyhow::Result<()> { .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), ) // FIXME this argument is no longer needed since pageserver address is forwarded from compute. 
- // However because this argument is in use by console's e2e tests lets keep it for now and remove separately. + // However because this argument is in use by console's e2e tests let's keep it for now and remove separately. // So currently it is a noop. .arg( Arg::new("pageserver") diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 4d949e0c13..19b0ec05a7 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -170,7 +170,7 @@ def check_timeline_attached( new_timeline_detail = assert_timeline_local(new_pageserver_http_client, tenant_id, timeline_id) # when load is active these checks can break because lsns are not static - # so lets check with some margin + # so let's check with some margin assert_abs_margin_ratio( lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), lsn_from_hex(old_timeline_detail["local"]["disk_consistent_lsn"]), From 40813adba2515372eb6f3a612c5a453a0ab84d03 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 1 Sep 2022 21:51:48 +0300 Subject: [PATCH 002/166] Prevent creation of empty layers with duplicates (#2327) * Prevent creation of empty layers with duplicates * Add comments --- pageserver/src/layered_repository/timeline.rs | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 81bc975272..a624a3ccf5 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -1795,47 +1795,54 @@ impl Timeline { if !same_key { dup_end_lsn = Lsn::INVALID; } - // Determine size occupied by this key. We stop at next key, or when size becomes larger than target_file_size + // Determine size occupied by this key. 
We stop at next key or when size becomes larger than target_file_size for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { next_key_size = next_size; if key != next_key { if dup_end_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; + // We are writing a segment with duplicates: + // place all remaining values of this key in separate segment + dup_start_lsn = dup_end_lsn; // new segment starts where the old one stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range } break; } key_values_total_size += next_size; - if key_values_total_size > target_file_size { - // split key between multiple layers: such layer can contain only single key + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. + if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn + dup_end_lsn // new segment with duplicates starts where old one stops } else { - lsn + lsn // start with the first LSN for this key }; - dup_end_lsn = next_lsn; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive break; } } - // handle case when loop reaches last key + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { dup_start_lsn = dup_end_lsn; dup_end_lsn = lsn_range.end; } if writer.is_some() { let written_size = writer.as_mut().unwrap().size(); - // check if key cause layer overflow + // check if key causes layer overflow... if is_dup_layer || dup_end_lsn.is_valid() || written_size + key_values_total_size > target_file_size { + // ... 
if so, flush previous layer and prepare to write new one new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); writer = None; } } + // Remember size of key value because at next iteration we will access next item key_values_total_size = next_key_size; } if writer.is_none() { + // Create writer if not initialized yet writer = Some(DeltaLayerWriter::new( self.conf, self.timeline_id, From f0a0d7bb7ad470e72cba404a7e857e20a8b6de55 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Sep 2022 00:34:37 +0300 Subject: [PATCH 003/166] Split RcuWriteGuard::store() into two stages: store and wait. This makes it easier to explain which stages allow concurrent readers and writers. Expand the comments with examples, too. --- libs/utils/src/simple_rcu.rs | 146 +++++++++++++----- pageserver/src/layered_repository/timeline.rs | 4 +- 2 files changed, 111 insertions(+), 39 deletions(-) diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 24423815ab..177a839d75 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -9,6 +9,36 @@ //! This implementation isn't wait-free; it uses an RwLock that is held for a //! short duration when the value is read or updated. //! +//! # Examples +//! +//! Read a value and do things with it while holding the guard: +//! +//! ``` +//! # let rcu = utils::simple_rcu::Rcu::new(1); +//! { +//! let read = rcu.read(); +//! println!("the current value is {}", *read); +//! // exiting the scope drops the read-guard, and allows concurrent writers +//! // to finish. +//! } +//! ``` +//! +//! Increment the value by one, and wait for old readers to finish: +//! +//! ``` +//! # let rcu = utils::simple_rcu::Rcu::new(1); +//! let write_guard = rcu.lock_for_write(); +//! +//! // NB: holding `write_guard` blocks new readers and writers. Keep this section short! +//! let new_value = *write_guard + 1; +//! +//! 
let waitlist = write_guard.store_and_unlock(new_value); // consumes `write_guard` +//! +//! // Concurrent reads and writes are now possible again. Wait for all the readers +//! // that still observe the old value to finish. +//! waitlist.wait(); +//! ``` +//! #![warn(missing_docs)] use std::ops::Deref; @@ -84,9 +114,10 @@ impl Rcu { /// used to read the current value, and to store a new value. /// /// Note: holding the write-guard blocks concurrent readers, so you should - /// finish the update and drop the guard quickly! + /// finish the update and drop the guard quickly! Multiple writers can be + /// waiting on the `RcuWaitList::wait` step at the same time, however. /// - pub fn write(&self) -> RcuWriteGuard<'_, V> { + pub fn lock_for_write(&self) -> RcuWriteGuard<'_, V> { let inner = self.inner.write().unwrap(); RcuWriteGuard { inner } } @@ -108,7 +139,13 @@ impl Deref for RcuReadGuard { } /// -/// Read guard returned by `read` +/// Write guard returned by `lock_for_write` +/// +/// NB: Holding this guard blocks all concurrent `read` and `lock_for_write` calls, so +/// it should only be held for a short duration! +/// +/// Calling `store_and_unlock` consumes the guard, making new reads and new writes possible +/// again. /// pub struct RcuWriteGuard<'a, V> { inner: RwLockWriteGuard<'a, RcuInner>, @@ -126,13 +163,11 @@ impl<'a, V> RcuWriteGuard<'a, V> { /// /// Store a new value. The new value will be written to the Rcu immediately, /// and will be immediately seen by any `read` calls that start afterwards. - /// But if there are any readers still holding onto the old value, or any - /// even older values, this will await until they have been released. /// - /// This will drop the write-guard before it starts waiting for the reads to - /// finish, so a new write operation can begin before this functio returns. + /// Returns a list of readers that can see old values. You can call `wait()` + /// on it to wait for them to finish. 
/// - pub fn store(mut self, new_val: V) { + pub fn store_and_unlock(mut self, new_val: V) -> RcuWaitList { let new_cell = Arc::new(RcuCell::new(new_val)); let mut watches = Vec::new(); @@ -151,11 +186,23 @@ impl<'a, V> RcuWriteGuard<'a, V> { } }); } - drop(self); + RcuWaitList(watches) + } +} +/// +/// List of readers who can still see old values. +/// +pub struct RcuWaitList(Vec>); + +impl RcuWaitList { + /// + /// Wait for old readers to finish. + /// + pub fn wait(mut self) { // after all the old_cells are no longer in use, we're done - for w in watches.iter_mut() { - // This will block until the Receiver is closed. That happens then + for w in self.0.iter_mut() { + // This will block until the Receiver is closed. That happens when // the RcuCell is dropped. #[allow(clippy::single_match)] match w.send(()) { @@ -177,41 +224,66 @@ mod tests { use std::time::Duration; #[test] - fn basic() { - let rcu = Arc::new(Rcu::new(1)); + fn two_writers() { + let rcu = Rcu::new(1); + + let read1 = rcu.read(); + assert_eq!(*read1, 1); + + let write2 = rcu.lock_for_write(); + assert_eq!(*write2, 1); + let wait2 = write2.store_and_unlock(2); + + let read2 = rcu.read(); + assert_eq!(*read2, 2); + + let write3 = rcu.lock_for_write(); + assert_eq!(*write3, 2); + let wait3 = write3.store_and_unlock(3); + + // new reader can see the new value, and old readers continue to see the old values. + let read3 = rcu.read(); + assert_eq!(*read3, 3); + assert_eq!(*read2, 2); + assert_eq!(*read1, 1); + let log = Arc::new(Mutex::new(Vec::new())); - - let a = rcu.read(); - assert_eq!(*a, 1); - log.lock().unwrap().push("one"); - - let (rcu_clone, log_clone) = (Arc::clone(&rcu), Arc::clone(&log)); - let thread = spawn(move || { - log_clone.lock().unwrap().push("store two start"); - let write_guard = rcu_clone.write(); - assert_eq!(*write_guard, 1); - write_guard.store(2); - log_clone.lock().unwrap().push("store two done"); + // Wait for the old readers to finish in separate threads. 
+ let log_clone = Arc::clone(&log); + let thread2 = spawn(move || { + wait2.wait(); + log_clone.lock().unwrap().push("wait2 done"); }); + let log_clone = Arc::clone(&log); + let thread3 = spawn(move || { + wait3.wait(); + log_clone.lock().unwrap().push("wait3 done"); + }); + // without this sleep the test can pass on accident if the writer is slow - sleep(Duration::from_secs(1)); + sleep(Duration::from_millis(500)); - // new read should see the new value - let b = rcu.read(); - assert_eq!(*b, 2); + // Release first reader. This allows first write to finish, but calling + // wait() on the second one would still block. + log.lock().unwrap().push("dropping read1"); + drop(read1); + thread2.join().unwrap(); - // old guard still sees the old value - assert_eq!(*a, 1); + sleep(Duration::from_millis(500)); - // Release the old guard. This lets the store in the thread to finish. - log.lock().unwrap().push("release a"); - drop(a); - - thread.join().unwrap(); + // Release second reader, and finish second writer. + log.lock().unwrap().push("dropping read2"); + drop(read2); + thread3.join().unwrap(); assert_eq!( log.lock().unwrap().as_slice(), - &["one", "store two start", "release a", "store two done",] + &[ + "dropping read1", + "wait2 done", + "dropping read2", + "wait3 done" + ] ); } } diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index a624a3ccf5..8b90cc4e6b 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -2046,14 +2046,14 @@ impl Timeline { // // The GC cutoff should only ever move forwards. 
{ - let write_guard = self.latest_gc_cutoff_lsn.write(); + let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); ensure!( *write_guard <= new_gc_cutoff, "Cannot move GC cutoff LSN backwards (was {}, new {})", *write_guard, new_gc_cutoff ); - write_guard.store(new_gc_cutoff); + write_guard.store_and_unlock(new_gc_cutoff).wait(); } info!("GC starting"); From 47bd307cb8c2941bf66405b4580a11099f4dfe3f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Sep 2022 10:16:47 +0300 Subject: [PATCH 004/166] Add python types to represent LSNs, tenant IDs and timeline IDs. (#2351) For better ergonomics. I always found it weird that we used UUID to actually mean a tenant or timeline ID. It worked because it happened to have the same length, 16 bytes, but it was hacky. --- test_runner/fixtures/benchmark_fixture.py | 6 +- test_runner/fixtures/neon_fixtures.py | 272 +++++++++--------- test_runner/fixtures/types.py | 89 ++++++ test_runner/fixtures/utils.py | 11 - .../performance/test_wal_backpressure.py | 10 +- test_runner/regress/test_ancestor_branch.py | 9 +- test_runner/regress/test_auth.py | 6 +- test_runner/regress/test_branch_and_gc.py | 15 +- test_runner/regress/test_branch_behind.py | 19 +- test_runner/regress/test_broken_timeline.py | 14 +- test_runner/regress/test_fullbackup.py | 7 +- test_runner/regress/test_gc_aggressive.py | 9 +- test_runner/regress/test_import.py | 42 +-- test_runner/regress/test_lsn_mapping.py | 6 +- test_runner/regress/test_neon_cli.py | 18 +- test_runner/regress/test_old_request_lsn.py | 9 +- test_runner/regress/test_pageserver_api.py | 33 ++- test_runner/regress/test_pitr_gc.py | 7 +- test_runner/regress/test_readonly_node.py | 9 +- test_runner/regress/test_remote_storage.py | 28 +- test_runner/regress/test_tenant_conf.py | 10 +- test_runner/regress/test_tenant_detach.py | 22 +- test_runner/regress/test_tenant_relocation.py | 55 ++-- test_runner/regress/test_tenant_tasks.py | 11 +- test_runner/regress/test_tenants.py | 18 +- 
.../test_tenants_with_remote_storage.py | 18 +- test_runner/regress/test_timeline_delete.py | 17 +- test_runner/regress/test_timeline_size.py | 30 +- test_runner/regress/test_wal_acceptor.py | 262 ++++++++--------- .../regress/test_wal_acceptor_async.py | 28 +- test_runner/regress/test_wal_restore.py | 3 +- 31 files changed, 599 insertions(+), 494 deletions(-) create mode 100644 test_runner/fixtures/types.py diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 655ffed90d..338cc47ea2 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -5,7 +5,6 @@ import json import os import re import timeit -import uuid import warnings from contextlib import contextmanager from datetime import datetime @@ -17,6 +16,7 @@ from typing import Iterator, Optional import pytest from _pytest.config import Config from _pytest.terminal import TerminalReporter +from fixtures.types import ZTenantId, ZTimelineId """ This file contains fixtures for micro-benchmarks. 
@@ -365,11 +365,11 @@ class NeonBenchmarker: assert matches return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: Path, tenantid: uuid.UUID, timelineid: str): + def get_timeline_size(self, repo_dir: Path, tenantid: ZTenantId, timelineid: ZTimelineId): """ Calculate the on-disk size of a timeline """ - path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid.hex, timelineid) + path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid) totalbytes = 0 for root, dirs, files in os.walk(path): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index bbc35736bc..9ad9c0cd2f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -29,20 +29,14 @@ import pytest import requests from cached_property import cached_property from fixtures.log_helper import log +from fixtures.types import Lsn, ZTenantId, ZTimelineId # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal -from .utils import ( - allure_attach_from_dir, - etcd_path, - get_self_dir, - lsn_from_hex, - lsn_to_hex, - subprocess_capture, -) +from .utils import allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -378,7 +372,7 @@ class AuthKeys: def generate_tenant_token(self, tenant_id): token = jwt.encode( - {"scope": "tenant", "tenant_id": tenant_id}, self.priv, algorithm="RS256" + {"scope": "tenant", "tenant_id": str(tenant_id)}, self.priv, algorithm="RS256" ) if isinstance(token, bytes): @@ -759,12 +753,12 @@ class NeonEnv: # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
- self.initial_tenant = uuid.uuid4() + self.initial_tenant = ZTenantId.generate() # Create a config file corresponding to the options toml = textwrap.dedent( f""" - default_tenant_id = '{self.initial_tenant.hex}' + default_tenant_id = '{self.initial_tenant}' """ ) @@ -846,9 +840,9 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers]) - def timeline_dir(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Path: + def timeline_dir(self, tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" - return self.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @cached_property def auth_keys(self) -> AuthKeys: @@ -976,11 +970,11 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + def tenant_create(self, new_tenant_id: Optional[ZTenantId] = None) -> ZTenantId: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - "new_tenant_id": new_tenant_id.hex if new_tenant_id else None, + "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, }, ) self.verbose_error(res) @@ -988,25 +982,25 @@ class NeonPageserverHttpClient(requests.Session): raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") new_tenant_id = res.json() assert isinstance(new_tenant_id, str) - return uuid.UUID(new_tenant_id) + return ZTenantId(new_tenant_id) - def tenant_attach(self, tenant_id: uuid.UUID): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/attach") + def tenant_attach(self, tenant_id: ZTenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") self.verbose_error(res) 
- def tenant_detach(self, tenant_id: uuid.UUID): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/detach") + def tenant_detach(self, tenant_id: ZTenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") self.verbose_error(res) - def tenant_status(self, tenant_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}") + def tenant_status(self, tenant_id: ZTenantId) -> Dict[Any, Any]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) return res_json - def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[str, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") + def timeline_list(self, tenant_id: ZTenantId) -> List[Dict[str, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -1014,17 +1008,17 @@ class NeonPageserverHttpClient(requests.Session): def timeline_create( self, - tenant_id: uuid.UUID, - new_timeline_id: Optional[uuid.UUID] = None, - ancestor_timeline_id: Optional[uuid.UUID] = None, - ancestor_start_lsn: Optional[str] = None, + tenant_id: ZTenantId, + new_timeline_id: Optional[ZTimelineId] = None, + ancestor_timeline_id: Optional[ZTimelineId] = None, + ancestor_start_lsn: Optional[Lsn] = None, ) -> Dict[Any, Any]: res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json={ - "new_timeline_id": new_timeline_id.hex if new_timeline_id else None, - "ancestor_start_lsn": ancestor_start_lsn, - "ancestor_timeline_id": ancestor_timeline_id.hex if ancestor_timeline_id else None, + "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "ancestor_start_lsn": str(ancestor_start_lsn) if 
ancestor_start_lsn else None, + "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, }, ) self.verbose_error(res) @@ -1037,8 +1031,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_detail( self, - tenant_id: uuid.UUID, - timeline_id: uuid.UUID, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, include_non_incremental_logical_size: bool = False, include_non_incremental_physical_size: bool = False, ) -> Dict[Any, Any]: @@ -1049,7 +1043,7 @@ class NeonPageserverHttpClient(requests.Session): params["include-non-incremental-physical-size"] = "yes" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params=params, ) self.verbose_error(res) @@ -1057,9 +1051,9 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_delete(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): + def timeline_delete(self, tenant_id: ZTenantId, timeline_id: ZTimelineId): res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) self.verbose_error(res) res_json = res.json() @@ -1179,38 +1173,52 @@ class NeonCli(AbstractNeonCli): def create_tenant( self, - tenant_id: Optional[uuid.UUID] = None, - timeline_id: Optional[uuid.UUID] = None, + tenant_id: Optional[ZTenantId] = None, + timeline_id: Optional[ZTimelineId] = None, conf: Optional[Dict[str, str]] = None, - ) -> Tuple[uuid.UUID, uuid.UUID]: + ) -> Tuple[ZTenantId, ZTimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. 
""" if tenant_id is None: - tenant_id = uuid.uuid4() + tenant_id = ZTenantId.generate() if timeline_id is None: - timeline_id = uuid.uuid4() + timeline_id = ZTimelineId.generate() if conf is None: res = self.raw_cli( - ["tenant", "create", "--tenant-id", tenant_id.hex, "--timeline-id", timeline_id.hex] + [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + ] ) else: res = self.raw_cli( - ["tenant", "create", "--tenant-id", tenant_id.hex, "--timeline-id", timeline_id.hex] + [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) res.check_returncode() return tenant_id, timeline_id - def config_tenant(self, tenant_id: uuid.UUID, conf: Dict[str, str]): + def config_tenant(self, tenant_id: ZTenantId, conf: Dict[str, str]): """ Update tenant config. """ if conf is None: - res = self.raw_cli(["tenant", "config", "--tenant-id", tenant_id.hex]) + res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)]) else: res = self.raw_cli( - ["tenant", "config", "--tenant-id", tenant_id.hex] + ["tenant", "config", "--tenant-id", str(tenant_id)] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) res.check_returncode() @@ -1221,15 +1229,15 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[uuid.UUID] = None - ) -> uuid.UUID: + self, new_branch_name: str, tenant_id: Optional[ZTenantId] = None + ) -> ZTimelineId: cmd = [ "timeline", "create", "--branch-name", new_branch_name, "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] res = self.raw_cli(cmd) @@ -1241,16 +1249,16 @@ class NeonCli(AbstractNeonCli): if matches is not None: created_timeline_id = matches.group("timeline_id") - return uuid.UUID(created_timeline_id) + return 
ZTimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[uuid.UUID] = None): + def create_root_branch(self, branch_name: str, tenant_id: Optional[ZTenantId] = None): cmd = [ "timeline", "create", "--branch-name", branch_name, "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] res = self.raw_cli(cmd) @@ -1265,27 +1273,27 @@ class NeonCli(AbstractNeonCli): if created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return uuid.UUID(created_timeline_id) + return ZTimelineId(created_timeline_id) def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, ancestor_branch_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - ancestor_start_lsn: Optional[str] = None, - ) -> uuid.UUID: + tenant_id: Optional[ZTenantId] = None, + ancestor_start_lsn: Optional[Lsn] = None, + ) -> ZTimelineId: cmd = [ "timeline", "branch", "--branch-name", new_branch_name, "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] if ancestor_branch_name is not None: cmd.extend(["--ancestor-branch-name", ancestor_branch_name]) if ancestor_start_lsn is not None: - cmd.extend(["--ancestor-start-lsn", ancestor_start_lsn]) + cmd.extend(["--ancestor-start-lsn", str(ancestor_start_lsn)]) res = self.raw_cli(cmd) res.check_returncode() @@ -1299,9 +1307,11 @@ class NeonCli(AbstractNeonCli): if created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return uuid.UUID(created_timeline_id) + return ZTimelineId(str(created_timeline_id)) - def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[Tuple[str, str]]: + def list_timelines( + self, tenant_id: Optional[ZTenantId] = None + ) -> List[Tuple[str, ZTimelineId]]: """ Returns a list of (branch_name, timeline_id) tuples out of parsed 
`neon timeline list` CLI output. """ @@ -1309,18 +1319,18 @@ class NeonCli(AbstractNeonCli): # (L) main [b49f7954224a0ad25cc0013ea107b54b] # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] res = self.raw_cli( - ["timeline", "list", "--tenant-id", (tenant_id or self.env.initial_tenant).hex] + ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)] ) timelines_cli = sorted( map( - lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), + lambda branch_and_id: (branch_and_id[0], ZTimelineId(branch_and_id[1])), TIMELINE_DATA_EXTRACTOR.findall(res.stdout), ) ) return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[uuid.UUID] = None + self, config_toml: str, initial_timeline_id: Optional[ZTimelineId] = None ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1328,7 +1338,7 @@ class NeonCli(AbstractNeonCli): cmd = ["init", f"--config={tmp.name}"] if initial_timeline_id: - cmd.extend(["--timeline-id", initial_timeline_id.hex]) + cmd.extend(["--timeline-id", str(initial_timeline_id)]) append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, @@ -1399,20 +1409,20 @@ class NeonCli(AbstractNeonCli): self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, + tenant_id: Optional[ZTenantId] = None, + lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "pg", "create", "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), "--branch-name", branch_name, ] if lsn is not None: - args.extend(["--lsn", lsn]) + args.extend(["--lsn", str(lsn)]) if port is not None: args.extend(["--port", str(port)]) if node_name is not None: @@ -1425,15 +1435,15 @@ class NeonCli(AbstractNeonCli): def pg_start( self, node_name: str, - 
tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, + tenant_id: Optional[ZTenantId] = None, + lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "pg", "start", "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] if lsn is not None: args.append(f"--lsn={lsn}") @@ -1449,7 +1459,7 @@ class NeonCli(AbstractNeonCli): def pg_stop( self, node_name: str, - tenant_id: Optional[uuid.UUID] = None, + tenant_id: Optional[ZTenantId] = None, destroy=False, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": @@ -1457,7 +1467,7 @@ class NeonCli(AbstractNeonCli): "pg", "stop", "--tenant-id", - (tenant_id or self.env.initial_tenant).hex, + str(tenant_id or self.env.initial_tenant), ] if destroy: args.append("--destroy") @@ -1856,7 +1866,7 @@ class Postgres(PgProtocol): """An object representing a running postgres daemon.""" def __init__( - self, env: NeonEnv, tenant_id: uuid.UUID, port: int, check_stop_result: bool = True + self, env: NeonEnv, tenant_id: ZTenantId, port: int, check_stop_result: bool = True ): super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env @@ -1872,7 +1882,7 @@ class Postgres(PgProtocol): self, branch_name: str, node_name: Optional[str] = None, - lsn: Optional[str] = None, + lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> "Postgres": """ @@ -1887,7 +1897,7 @@ class Postgres(PgProtocol): self.env.neon_cli.pg_create( branch_name, node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port ) - path = Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name + path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: @@ -1918,7 +1928,7 @@ class Postgres(PgProtocol): def pg_data_dir_path(self) -> str: """Path to data directory""" 
assert self.node_name - path = Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name + path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name return os.path.join(self.env.repo_dir, path) def pg_xact_dir_path(self) -> str: @@ -2005,7 +2015,7 @@ class Postgres(PgProtocol): self, branch_name: str, node_name: Optional[str] = None, - lsn: Optional[str] = None, + lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> "Postgres": """ @@ -2046,8 +2056,8 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, + tenant_id: Optional[ZTenantId] = None, + lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2070,8 +2080,8 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, + tenant_id: Optional[ZTenantId] = None, + lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2146,7 +2156,7 @@ class Safekeeper: return self def append_logical_message( - self, tenant_id: uuid.UUID, timeline_id: uuid.UUID, request: Dict[str, Any] + self, tenant_id: ZTenantId, timeline_id: ZTimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: """ Send JSON_CTRL query to append LogicalMessage to WAL and modify @@ -2156,7 +2166,7 @@ class Safekeeper: # "replication=0" hacks psycopg not to send additional queries # on startup, see https://github.com/psycopg/psycopg2/pull/482 - connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id.hex} ztenantid={tenant_id.hex}'" + connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" with closing(psycopg2.connect(connstr)) as conn: # server doesn't support transactions @@ -2181,18 +2191,18 @@ class Safekeeper: @dataclass class 
SafekeeperTimelineStatus: acceptor_epoch: int - flush_lsn: str - timeline_start_lsn: str - backup_lsn: str - remote_consistent_lsn: str + flush_lsn: Lsn + timeline_start_lsn: Lsn + backup_lsn: Lsn + remote_consistent_lsn: Lsn @dataclass class SafekeeperMetrics: # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[str, str], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[str, str], int] = field(default_factory=dict) + flush_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) class SafekeeperHttpClient(requests.Session): @@ -2209,26 +2219,30 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - def timeline_status(self, tenant_id: str, timeline_id: str) -> SafekeeperTimelineStatus: + def timeline_status( + self, tenant_id: ZTenantId, timeline_id: ZTimelineId + ) -> SafekeeperTimelineStatus: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() return SafekeeperTimelineStatus( acceptor_epoch=resj["acceptor_state"]["epoch"], - flush_lsn=resj["flush_lsn"], - timeline_start_lsn=resj["timeline_start_lsn"], - backup_lsn=resj["backup_lsn"], - remote_consistent_lsn=resj["remote_consistent_lsn"], + flush_lsn=Lsn(resj["flush_lsn"]), + timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), + backup_lsn=Lsn(resj["backup_lsn"]), + remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), ) - def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): + def record_safekeeper_info(self, tenant_id: ZTenantId, timeline_id: ZTimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", 
json=body, ) res.raise_for_status() - def timeline_delete_force(self, tenant_id: str, timeline_id: str) -> Dict[Any, Any]: + def timeline_delete_force( + self, tenant_id: ZTenantId, timeline_id: ZTimelineId + ) -> Dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) @@ -2237,7 +2251,7 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def tenant_delete_force(self, tenant_id: str) -> Dict[Any, Any]: + def tenant_delete_force(self, tenant_id: ZTenantId) -> Dict[Any, Any]: res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") res.raise_for_status() res_json = res.json() @@ -2258,13 +2272,17 @@ class SafekeeperHttpClient(requests.Session): all_metrics_text, re.MULTILINE, ): - metrics.flush_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) + metrics.flush_lsn_inexact[ + (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) + ] = int(match.group(3)) for match in re.finditer( r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', all_metrics_text, re.MULTILINE, ): - metrics.commit_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) + metrics.commit_lsn_inexact[ + (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) + ] = int(match.group(3)) return metrics @@ -2437,7 +2455,7 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = pg.safe_psql("SHOW neon.timeline_id")[0][0] + timeline = ZTimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) # stop postgres to ensure that files won't change pg.stop() @@ -2453,7 +2471,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post {psql_path} \ --no-psqlrc \ postgres://localhost:{env.pageserver.service_port.pg} \ - -c 'basebackup {pg.tenant_id.hex} {timeline}' \ + -c 'basebackup {pg.tenant_id} {timeline}' \ | tar -x -C {restored_dir_path} """ @@ -2521,7 +2539,7 @@ def wait_until(number_of_iterations: int, interval: float, func): def assert_timeline_local( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID + pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId ): timeline_detail = pageserver_http_client.timeline_detail( tenant, @@ -2535,33 +2553,33 @@ def assert_timeline_local( def assert_no_in_progress_downloads_for_tenant( pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, + tenant: ZTenantId, ): tenant_status = pageserver_http_client.tenant_status(tenant) assert tenant_status["has_in_progress_downloads"] is False, tenant_status def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: + pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId +) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) if detail["remote"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. 
- return 0 + return Lsn(0) else: lsn_str = detail["remote"]["remote_consistent_lsn"] assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + return Lsn(lsn_str) def wait_for_upload( pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, + tenant: ZTenantId, + timeline: ZTimelineId, + lsn: Lsn, ): """waits for local timeline upload up to specified lsn""" for i in range(20): @@ -2570,32 +2588,32 @@ def wait_for_upload( return log.info( "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + lsn, current_lsn, i + 1 ) ) time.sleep(1) raise Exception( "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) + lsn, current_lsn ) ) def last_record_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: + pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId +) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail["local"]["last_record_lsn"] assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + return Lsn(lsn_str) def wait_for_last_record_lsn( pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, + tenant: ZTenantId, + timeline: ZTimelineId, + lsn: Lsn, ): """waits for pageserver to catch up to a certain lsn""" for i in range(10): @@ -2604,20 +2622,18 @@ def wait_for_last_record_lsn( return log.info( "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + lsn, current_lsn, i + 1 ) ) time.sleep(1) raise Exception( - "timed out while waiting for last_record_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) - ) + "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) 
) -def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID): +def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: ZTenantId, timeline: ZTimelineId): """Wait for pageserver to catch up the latest flush LSN""" - last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) @@ -2626,8 +2642,8 @@ def fork_at_current_lsn( pg: Postgres, new_branch_name: str, ancestor_branch_name: str, - tenant_id: Optional[uuid.UUID] = None, -) -> uuid.UUID: + tenant_id: Optional[ZTenantId] = None, +) -> ZTimelineId: """ Create new branch at the last LSN of an existing branch. The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py new file mode 100644 index 0000000000..d5cb200080 --- /dev/null +++ b/test_runner/fixtures/types.py @@ -0,0 +1,89 @@ +import random +from functools import total_ordering +from typing import Union + + +@total_ordering +class Lsn: + """ + Datatype for an LSN. Internally it is a 64-bit integer, but the string + representation is like "1/123abcd". 
See also pg_lsn datatype in Postgres + """ + + def __init__(self, x: Union[int, str]): + if isinstance(x, int): + self.lsn_int = x + else: + """Convert lsn from hex notation to int.""" + l, r = x.split("/") + self.lsn_int = (int(l, 16) << 32) + int(r, 16) + # FIXME: error if it doesn't look like a valid LSN + + def __str__(self): + """Convert lsn from int to standard hex notation.""" + return "{:X}/{:X}".format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF) + + def __repr__(self): + return 'Lsn("{:X}/{:X}")'.format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF) + + def __int__(self): + return self.lsn_int + + def __lt__(self, other: "Lsn") -> bool: + return self.lsn_int < other.lsn_int + + def __eq__(self, other) -> bool: + if not isinstance(other, Lsn): + return NotImplemented + return self.lsn_int == other.lsn_int + + # Returns the difference between two Lsns, in bytes + def __sub__(self, other: "Lsn") -> int: + return self.lsn_int - other.lsn_int + + def __hash__(self): + return hash(self.lsn_int) + + +@total_ordering +class ZId: + """ + Datatype for a Neon tenant and timeline IDs. Internally it's a 16-byte array, and + the string representation is in hex. This corresponds to the ZId / ZTenantId / + ZTimelineIds in in the Rust code. 
+ """ + + def __init__(self, x: str): + self.id = bytearray.fromhex(x) + assert len(self.id) == 16 + + def __str__(self): + return self.id.hex() + + def __lt__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self.id < other.id + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self.id == other.id + + def __hash__(self): + return hash(str(self.id)) + + @classmethod + def generate(cls): + """Generate a random ID""" + return cls(random.randbytes(16).hex()) + + +class ZTenantId(ZId): + def __repr__(self): + return f'ZTenantId("{self.id.hex()}")' + + +class ZTimelineId(ZId): + def __repr__(self): + return f'ZTimelineId("{self.id.hex()}")' diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 88bf6d634d..726116e53c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -61,17 +61,6 @@ def global_counter() -> int: return _global_counter -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) - - -def lsn_from_hex(lsn_hex: str) -> int: - """Convert lsn from hex notation to int.""" - l, r = lsn_hex.split("/") - return (int(l, 16) << 32) + int(r, 16) - - def print_gc_result(row): log.info("GC duration {elapsed} ms".format_map(row)) log.info( diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 03d5ba208a..47e2435052 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -9,7 +9,7 @@ from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin -from fixtures.utils import lsn_from_hex +from fixtures.types 
import Lsn from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @@ -198,8 +198,8 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte return lsn_write_lags = [] - last_received_lsn = 0 - last_pg_flush_lsn = 0 + last_received_lsn = Lsn(0) + last_pg_flush_lsn = Lsn(0) with env.pg.connect().cursor() as cur: cur.execute("CREATE EXTENSION neon") @@ -218,11 +218,11 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte res = cur.fetchone() lsn_write_lags.append(res[0]) - curr_received_lsn = lsn_from_hex(res[3]) + curr_received_lsn = Lsn(res[3]) lsn_process_speed = (curr_received_lsn - last_received_lsn) / (1024**2) last_received_lsn = curr_received_lsn - curr_pg_flush_lsn = lsn_from_hex(res[2]) + curr_pg_flush_lsn = Lsn(res[2]) lsn_produce_speed = (curr_pg_flush_lsn - last_pg_flush_lsn) / (1024**2) last_pg_flush_lsn = curr_pg_flush_lsn diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 96612a8aef..b8e81824b0 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,5 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import ZTimelineId from fixtures.utils import query_scalar @@ -26,7 +27,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_timeline = query_scalar(branch0_cur, "SHOW neon.timeline_id") + branch0_timeline = ZTimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) log.info(f"b0 timeline {branch0_timeline}") # Create table, and insert 100k rows. 
@@ -50,7 +51,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_timeline = query_scalar(branch1_cur, "SHOW neon.timeline_id") + branch1_timeline = ZTimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) log.info(f"b1 timeline {branch1_timeline}") branch1_lsn = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") @@ -73,7 +74,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_timeline = query_scalar(branch2_cur, "SHOW neon.timeline_id") + branch2_timeline = ZTimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) log.info(f"b2 timeline {branch2_timeline}") branch2_lsn = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") @@ -91,7 +92,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 300k rows: {lsn_300}") # Run compaction on branch1. 
- compact = f"compact {tenant.hex} {branch1_timeline} {lsn_200}" + compact = f"compact {tenant} {branch1_timeline} {lsn_200}" log.info(compact) env.pageserver.safe_psql(compact) diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 16d6ae45c3..08e38e1461 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,8 +1,8 @@ from contextlib import closing -from uuid import uuid4 import pytest from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException +from fixtures.types import ZTenantId def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -11,9 +11,9 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps = env.pageserver - tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant.hex) + tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant) tenant_http_client = env.pageserver.http_client(tenant_token) - invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex) + invalid_tenant_token = env.auth_keys.generate_tenant_token(ZTenantId.generate()) invalid_tenant_http_client = env.pageserver.http_client(invalid_tenant_token) management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index deb041b5d1..c8c5929066 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -4,7 +4,8 @@ import time import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import lsn_from_hex, query_scalar +from fixtures.types import Lsn +from fixtures.utils import query_scalar # Test the GC implementation when running with branching. 
@@ -74,18 +75,16 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" ) main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") - lsn1 = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn1 = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN1: {lsn1}") main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") - lsn2 = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn2 = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN2: {lsn2}") # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. - env.pageserver.safe_psql( - f"do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}" - ) + env.pageserver.safe_psql(f"do_gc {tenant} {timeline_main} {lsn2 - lsn1 + 1024}") env.neon_cli.create_branch( "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 @@ -143,7 +142,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): "INSERT INTO t SELECT FROM generate_series(1, 100000)", ] ) - lsn = res[2][0][0] + lsn = Lsn(res[2][0][0]) # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* @@ -151,7 +150,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)") def do_gc(): - env.pageserver.safe_psql(f"do_gc {tenant.hex} {b0.hex} 0") + env.pageserver.safe_psql(f"do_gc {tenant} {b0} 0") thread = threading.Thread(target=do_gc, daemon=True) thread.start() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 51946380d2..5bd6368bfc 100644 --- a/test_runner/regress/test_branch_behind.py +++ 
b/test_runner/regress/test_branch_behind.py @@ -2,6 +2,7 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import Lsn, ZTimelineId from fixtures.utils import print_gc_result, query_scalar @@ -27,13 +28,13 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): main_cur = pgmain.connect().cursor() - timeline = query_scalar(main_cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows main_cur.execute("CREATE TABLE foo (t text)") # keep some early lsn to test branch creation on out of date lsn - gced_lsn = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + gced_lsn = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) main_cur.execute( """ @@ -42,7 +43,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): FROM generate_series(1, 100) g """ ) - lsn_a = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn_a = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN after 100 rows: {lsn_a}") # Insert some more rows. (This generates enough WAL to fill a few segments.) 
@@ -53,7 +54,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): FROM generate_series(1, 200000) g """ ) - lsn_b = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn_b = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN after 200100 rows: {lsn_b}") # Branch at the point where only 100 rows were inserted @@ -69,7 +70,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): FROM generate_series(1, 200000) g """ ) - lsn_c = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + lsn_c = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"LSN after 400100 rows: {lsn_c}") @@ -96,25 +97,25 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # branch at segment boundary env.neon_cli.create_branch( - "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn="0/3000000" + "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn=Lsn("0/3000000") ) pg = env.postgres.create_start("test_branch_segment_boundary") assert pg.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn="0/42") + env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): env.neon_cli.create_branch( - "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn="0/42" + "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn=Lsn("0/42") ) # check that we cannot create branch based on garbage collected data with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: # call gc to advace latest_gc_cutoff_lsn - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") row = pscur.fetchone() print_gc_result(row) diff 
--git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index c4b23c24b8..bf44dfd949 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -5,7 +5,7 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.utils import query_scalar +from fixtures.types import ZTenantId, ZTimelineId # Test restarting page server, while safekeeper and compute node keep @@ -15,19 +15,15 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_timelines: List[Tuple[str, str, Postgres]] = [] + tenant_timelines: List[Tuple[ZTenantId, ZTimelineId, Postgres]] = [] for n in range(4): - tenant_id_uuid, timeline_id_uuid = env.neon_cli.create_tenant() - tenant_id = tenant_id_uuid.hex - timeline_id = timeline_id_uuid.hex + tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id_uuid) + pg = env.postgres.create_start("main", tenant_id=tenant_id) with pg.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") - - timeline_id = query_scalar(cur, "SHOW neon.timeline_id") pg.stop() tenant_timelines.append((tenant_id, timeline_id, pg)) @@ -109,5 +105,5 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env.neon_cli.pageserver_start() # Check that tenant with "broken" timeline is not loaded. 
- with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id.hex}"): + with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id}"): env.neon_cli.list_timelines(tenant_id) diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 8155f52060..af94865549 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -8,6 +8,7 @@ from fixtures.neon_fixtures import ( VanillaPostgres, pg_distrib_dir, ) +from fixtures.types import Lsn, ZTimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 @@ -26,7 +27,7 @@ def test_fullbackup( log.info("postgres is running on 'test_fullbackup' branch") with pgmain.cursor() as cur: - timeline = query_scalar(cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") @@ -36,7 +37,7 @@ def test_fullbackup( ) cur.execute("CHECKPOINT") - lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"start_backup_lsn = {lsn}") # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. 
@@ -46,7 +47,7 @@ def test_fullbackup( # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) - query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}" + query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 90824f882a..67ce8871cd 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -3,6 +3,7 @@ import random from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres +from fixtures.types import ZTimelineId from fixtures.utils import query_scalar # Test configuration @@ -28,15 +29,15 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon -async def gc(env: NeonEnv, timeline: str): +async def gc(env: NeonEnv, timeline: ZTimelineId): psconn = await env.pageserver.connect_async() while updates_performed < updates_to_perform: - await psconn.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + await psconn.execute(f"do_gc {env.initial_tenant} {timeline} 0") # At the same time, run UPDATEs and GC -async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: str): +async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: ZTimelineId): workers = [] for worker_id in range(num_connections): workers.append(asyncio.create_task(update_table(pg))) @@ -61,7 +62,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on test_gc_aggressive branch") with pg.cursor() as cur: - timeline = query_scalar(cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows 
cur.execute("CREATE TABLE foo (id int, counter int, t text)") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index a2671727f7..fc9f41bda0 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -5,7 +5,6 @@ import shutil import tarfile from contextlib import closing from pathlib import Path -from uuid import UUID, uuid4 import pytest from fixtures.log_helper import log @@ -18,7 +17,8 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.utils import lsn_from_hex, subprocess_capture +from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.utils import subprocess_capture @pytest.mark.timeout(600) @@ -69,8 +69,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] node_name = "import_from_vanilla" - tenant = uuid4() - timeline = uuid4() + tenant = ZTenantId.generate() + timeline = ZTimelineId.generate() # Set up pageserver for import neon_env_builder.enable_local_fs_remote_storage() @@ -83,9 +83,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build "timeline", "import", "--tenant-id", - tenant.hex, + str(tenant), "--timeline-id", - timeline.hex, + str(timeline), "--node-name", node_name, "--base-lsn", @@ -112,8 +112,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build import_tar(base_tar, wal_tar) # Wait for data to land in s3 - wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(end_lsn)) - wait_for_upload(client, tenant, timeline, lsn_from_hex(end_lsn)) + wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) + wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) @@ -173,7 +173,7 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne assert cnt_seg_files > 0 -def 
_generate_data(num_rows: int, pg: Postgres) -> str: +def _generate_data(num_rows: int, pg: Postgres) -> Lsn: """Generate a table with `num_rows` rows. Returns: @@ -191,10 +191,12 @@ def _generate_data(num_rows: int, pg: Postgres) -> str: cur.execute("SELECT pg_current_wal_insert_lsn()") res = cur.fetchone() assert res is not None and isinstance(res[0], str) - return res[0] + return Lsn(res[0]) -def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timeline: UUID) -> str: +def _import( + expected_num_rows: int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: ZTimelineId +) -> str: """Test importing backup data to the pageserver. Args: @@ -210,7 +212,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} # Get a fullbackup from pageserver - query = f"fullbackup { env.initial_tenant.hex} {timeline.hex} {lsn}" + query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" @@ -228,7 +230,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel # Import using another tenantid, because we use the same pageserver. # TODO Create another pageserver to make test more realistic. 
- tenant = uuid4() + tenant = ZTenantId.generate() # Import to pageserver node_name = "import_from_pageserver" @@ -239,28 +241,28 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel "timeline", "import", "--tenant-id", - tenant.hex, + str(tenant), "--timeline-id", - timeline.hex, + str(timeline), "--node-name", node_name, "--base-lsn", - lsn, + str(lsn), "--base-tarfile", os.path.join(tar_output_file), ] ) # Wait for data to land in s3 - wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(lsn)) - wait_for_upload(client, tenant, timeline, lsn_from_hex(lsn)) + wait_for_last_record_lsn(client, tenant, timeline, lsn) + wait_for_upload(client, tenant, timeline, lsn) # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) assert pg.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup - query = f"fullbackup { tenant.hex} {timeline.hex} {lsn}" + query = f"fullbackup { tenant} {timeline} {lsn}" cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) new_tar_output_file = result_basepath + ".stdout" @@ -272,6 +274,6 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel # Check that gc works psconn = env.pageserver.connect() pscur = psconn.cursor() - pscur.execute(f"do_gc {tenant.hex} {timeline.hex} 0") + pscur.execute(f"do_gc {tenant} {timeline} 0") return tar_output_file diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 0c1d3648f2..f6ca7000dd 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -41,7 +41,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", + f"get_lsn_by_timestamp 
{env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", ) assert result == "future" @@ -49,7 +49,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): probe_timestamp = tbl[0][1] - timedelta(hours=10) result = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", + f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", ) assert result == "past" @@ -60,7 +60,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Call get_lsn_by_timestamp to get the LSN lsn = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", + f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", ) # Launch a new read-only node at that LSN, and check that only the rows diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 1acfa72127..b2342e5ee8 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -1,4 +1,3 @@ -import uuid from typing import cast import requests @@ -8,10 +7,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, NeonPageserverHttpClient, ) +from fixtures.types import ZTenantId, ZTimelineId def helper_compare_timeline_list( - pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: uuid.UUID + pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: ZTenantId ): """ Compare timelines list returned by CLI and directly via API. 
@@ -20,7 +20,7 @@ def helper_compare_timeline_list( timelines_api = sorted( map( - lambda t: cast(str, t["timeline_id"]), + lambda t: ZTimelineId(t["timeline_id"]), pageserver_http_client.timeline_list(initial_tenant), ) ) @@ -52,8 +52,8 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): # Check that all new branches are visible via CLI timelines_cli = [timeline_id for (_, timeline_id) in env.neon_cli.list_timelines()] - assert main_timeline_id.hex in timelines_cli - assert nested_timeline_id.hex in timelines_cli + assert main_timeline_id in timelines_cli + assert nested_timeline_id in timelines_cli def helper_compare_tenant_list(pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv): @@ -85,11 +85,11 @@ def test_cli_tenant_list(neon_simple_env: NeonEnv): helper_compare_tenant_list(pageserver_http_client, env) res = env.neon_cli.list_tenants() - tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) + tenants = sorted(map(lambda t: ZTenantId(t.split()[0]), res.stdout.splitlines())) - assert env.initial_tenant.hex in tenants - assert tenant1.hex in tenants - assert tenant2.hex in tenants + assert env.initial_tenant in tenants + assert tenant1 in tenants + assert tenant2 in tenants def test_cli_tenant_create(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 257913ef3f..2b5e2edb5f 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,6 +1,7 @@ import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import ZTimelineId from fixtures.utils import print_gc_result, query_scalar @@ -26,7 +27,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command - timeline = query_scalar(cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) psconn = env.pageserver.connect() pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) @@ -60,9 +61,9 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. for i in range(10): - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") + gcrow = pscur.fetchone() + print_gc_result(gcrow) for j in range(100): cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 869f53ac0a..8ee38fcf4f 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,7 +1,6 @@ import pathlib import subprocess from typing import Optional -from uuid import UUID, uuid4 from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -12,7 +11,7 @@ from fixtures.neon_fixtures import ( pg_distrib_dir, wait_until, ) -from fixtures.utils import lsn_from_hex +from fixtures.types import Lsn, ZTenantId, ZTimelineId # test that we cannot override node id after init @@ -61,39 +60,39 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv): assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): +def check_client(client: NeonPageserverHttpClient, initial_tenant: ZTenantId): client.check_status() # check initial tenant is there - assert initial_tenant.hex in {t["id"] for t in client.tenant_list()} + assert initial_tenant in {ZTenantId(t["id"]) for t in client.tenant_list()} # create new tenant and check it is also 
there - tenant_id = uuid4() + tenant_id = ZTenantId.generate() client.tenant_create(tenant_id) - assert tenant_id.hex in {t["id"] for t in client.tenant_list()} + assert tenant_id in {ZTenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) assert len(timelines) == 0, "initial tenant should not have any timelines" # create timeline - timeline_id = uuid4() + timeline_id = ZTimelineId.generate() client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 # check it is there - assert timeline_id.hex in {b["timeline_id"] for b in client.timeline_list(tenant_id)} + assert timeline_id in {ZTimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} for timeline in timelines: - timeline_id_str = str(timeline["timeline_id"]) + timeline_id = ZTimelineId(timeline["timeline_id"]) timeline_details = client.timeline_detail( tenant_id=tenant_id, - timeline_id=UUID(timeline_id_str), + timeline_id=timeline_id, include_non_incremental_logical_size=True, ) - assert timeline_details["tenant_id"] == tenant_id.hex - assert timeline_details["timeline_id"] == timeline_id_str + assert ZTenantId(timeline_details["tenant_id"]) == tenant_id + assert ZTimelineId(timeline_details["timeline_id"]) == timeline_id local_timeline_details = timeline_details.get("local") assert local_timeline_details is not None @@ -122,10 +121,10 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): def expect_updated_msg_lsn( client: NeonPageserverHttpClient, - tenant_id: UUID, - timeline_id: UUID, - prev_msg_lsn: Optional[int], -) -> int: + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + prev_msg_lsn: Optional[Lsn], +) -> Lsn: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) # a successful `timeline_details` response must contain the below fields @@ -138,7 +137,7 @@ def expect_updated_msg_lsn( 
local_timeline_details["last_received_msg_lsn"] is not None ), "the last received message's LSN is empty" - last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"]) + last_msg_lsn = Lsn(local_timeline_details["last_received_msg_lsn"]) assert ( prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 1fc18ebbc4..329f4b7d24 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -3,6 +3,7 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import ZTimelineId from fixtures.utils import print_gc_result, query_scalar @@ -24,7 +25,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - timeline = query_scalar(main_cur, "SHOW neon.timeline_id") + timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table main_cur.execute("CREATE TABLE foo (t text)") @@ -57,9 +58,9 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # run GC with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute(f"compact {env.initial_tenant.hex} {timeline}") + pscur.execute(f"compact {env.initial_tenant} {timeline}") # perform aggressive GC. Data still should be kept because of the PITR setting. 
- pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") row = pscur.fetchone() print_gc_result(row) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 0bd78c62a3..fac9d97a42 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -84,7 +85,9 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Check creating a node at segment boundary pg = env.postgres.create_start( - branch_name="test_readonly_node", node_name="test_branch_segment_boundary", lsn="0/3000000" + branch_name="test_readonly_node", + node_name="test_branch_segment_boundary", + lsn=Lsn("0/3000000"), ) cur = pg.connect().cursor() cur.execute("SELECT 1") @@ -94,5 +97,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail env.postgres.create_start( - branch_name="test_readonly_node", node_name="test_readonly_node_preinitdb", lsn="0/42" + branch_name="test_readonly_node", + node_name="test_readonly_node_preinitdb", + lsn=Lsn("0/42"), ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0015c75670..04baef6ba0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -5,7 +5,6 @@ import os import shutil import time from pathlib import Path -from uuid import UUID import pytest from fixtures.log_helper import log @@ -18,7 +17,8 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.utils import lsn_from_hex, query_scalar +from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.utils import query_scalar 
# @@ -61,8 +61,8 @@ def test_remote_storage_backup_and_restore( client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) checkpoint_numbers = range(1, 3) @@ -74,17 +74,17 @@ def test_remote_storage_backup_and_restore( INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); """ ) - current_lsn = lsn_from_hex(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # wait until pageserver receives that data - wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn) + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to be sure that data landed in remote storage env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") log.info(f"waiting for checkpoint {checkpoint_number} upload") # wait until pageserver successfully uploaded a checkpoint to remote storage - wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn) + wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info(f"upload of checkpoint {checkpoint_number} is done") ##### Stop the first pageserver instance, erase all its data @@ -101,16 +101,16 @@ def test_remote_storage_backup_and_restore( # Introduce failpoint in download env.pageserver.safe_psql("failpoints remote-storage-download-pre-rename=return") - client.tenant_attach(UUID(tenant_id)) + client.tenant_attach(tenant_id) # is there a better way to assert that failpoint triggered? 
time.sleep(10) # assert cannot attach timeline that is scheduled for download with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"): - client.tenant_attach(UUID(tenant_id)) + client.tenant_attach(tenant_id) - detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) + detail = client.timeline_detail(tenant_id, timeline_id) log.info("Timeline detail with active failpoint: %s", detail) assert detail["local"] is None assert detail["remote"]["awaits_download"] @@ -119,20 +119,20 @@ def test_remote_storage_backup_and_restore( env.pageserver.stop() env.pageserver.start() - client.tenant_attach(UUID(tenant_id)) + client.tenant_attach(tenant_id) log.info("waiting for timeline redownload") wait_until( number_of_iterations=20, interval=1, - func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id)), + func=lambda: assert_timeline_local(client, tenant_id, timeline_id), ) - detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) + detail = client.timeline_detail(tenant_id, timeline_id) assert detail["local"] is not None log.info("Timeline detail after attach completed: %s", detail) assert ( - lsn_from_hex(detail["local"]["last_record_lsn"]) >= current_lsn + Lsn(detail["local"]["last_record_lsn"]) >= current_lsn ), "current db Lsn should should not be less than the one stored on remote storage" assert not detail["remote"]["awaits_download"] diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index d496edd6dc..51a8101b11 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -32,8 +32,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" # it should match global configuration with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - log.info(f"show {env.initial_tenant.hex}") - pscur.execute(f"show 
{env.initial_tenant.hex}") + log.info(f"show {env.initial_tenant}") + pscur.execute(f"show {env.initial_tenant}") res = pscur.fetchone() assert all( i in res.items() @@ -52,7 +52,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant.hex}") + pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"res: {res}") assert all( @@ -80,7 +80,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant.hex}") + pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"after config res: {res}") assert all( @@ -103,7 +103,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant.hex}") + pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"after restart res: {res}") assert all( diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index f1b30429bf..147e22b38f 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,17 +1,16 @@ -import uuid from threading import Thread -from uuid import uuid4 import psycopg2 import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException +from fixtures.types import ZTenantId, ZTimelineId -def do_gc_target(env: NeonEnv, tenant_id: uuid.UUID, timeline_id: uuid.UUID): +def do_gc_target(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): """Hack to 
unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: - env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {timeline_id.hex} 0") + env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") except Exception as e: log.error("do_gc failed: %s", e) @@ -21,10 +20,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() # first check for non existing tenant - tenant_id = uuid4() + tenant_id = ZTenantId.generate() with pytest.raises( expected_exception=NeonPageserverApiException, - match=f"Tenant not found for id {tenant_id.hex}", + match=f"Tenant not found for id {tenant_id}", ): pageserver_http.tenant_detach(tenant_id) @@ -32,7 +31,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): tenant_id, timeline_id = env.neon_cli.create_tenant() # assert tenant exists on disk - assert (env.repo_dir / "tenants" / tenant_id.hex).exists() + assert (env.repo_dir / "tenants" / str(tenant_id)).exists() pg = env.postgres.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement @@ -47,7 +46,8 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): with pytest.raises( expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" ): - env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {uuid4().hex} 0") + bogus_timeline_id = ZTimelineId.generate() + env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") # try to concurrently run gc and detach gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) @@ -70,9 +70,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): gc_thread.join(timeout=10) # check that nothing is left on disk for deleted tenant - assert not (env.repo_dir / "tenants" / tenant_id.hex).exists() + assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id.hex} not found" + 
expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id} not found" ): - env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {timeline_id.hex} 0") + env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 19b0ec05a7..56563ebe87 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -5,7 +5,6 @@ import subprocess import threading from contextlib import closing, contextmanager from typing import Any, Dict, Optional, Tuple -from uuid import UUID import pytest from fixtures.log_helper import log @@ -25,7 +24,8 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.utils import lsn_from_hex, lsn_to_hex, subprocess_capture +from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.utils import query_scalar, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -113,19 +113,21 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def populate_branch( pg: Postgres, - tenant_id: UUID, + tenant_id: ZTenantId, ps_http: NeonPageserverHttpClient, create_table: bool, expected_sum: Optional[int], -) -> Tuple[UUID, int]: +) -> Tuple[ZTimelineId, Lsn]: # insert some data with pg_cur(pg) as cur: cur.execute("SHOW neon.timeline_id") - timeline_id = UUID(cur.fetchone()[0]) - log.info("timeline to relocate %s", timeline_id.hex) + timeline_id = ZTimelineId(cur.fetchone()[0]) + log.info("timeline to relocate %s", timeline_id) - cur.execute("SELECT pg_current_wal_flush_lsn()") - log.info("pg_current_wal_flush_lsn() %s", lsn_from_hex(cur.fetchone()[0])) + log.info( + "pg_current_wal_flush_lsn(): %s", + Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")), + ) log.info( "timeline detail %s", ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id), @@ -139,21 +141,20 @@ def 
populate_branch( if expected_sum is not None: cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (expected_sum,) - cur.execute("SELECT pg_current_wal_flush_lsn()") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - current_lsn = lsn_from_hex(cur.fetchone()[0]) return timeline_id, current_lsn def ensure_checkpoint( pageserver_cur, pageserver_http: NeonPageserverHttpClient, - tenant_id: UUID, - timeline_id: UUID, - current_lsn: int, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage - pageserver_cur.execute(f"checkpoint {tenant_id.hex} {timeline_id.hex}") + pageserver_cur.execute(f"checkpoint {tenant_id} {timeline_id}") # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -161,10 +162,10 @@ def ensure_checkpoint( def check_timeline_attached( new_pageserver_http_client: NeonPageserverHttpClient, - tenant_id: UUID, - timeline_id: UUID, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, old_timeline_detail: Dict[str, Any], - old_current_lsn: int, + old_current_lsn: Lsn, ): # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint new_timeline_detail = assert_timeline_local(new_pageserver_http_client, tenant_id, timeline_id) @@ -172,18 +173,22 @@ def check_timeline_attached( # when load is active these checks can break because lsns are not static # so let's check with some margin assert_abs_margin_ratio( - lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), - lsn_from_hex(old_timeline_detail["local"]["disk_consistent_lsn"]), + int(Lsn(new_timeline_detail["local"]["disk_consistent_lsn"])), + int(Lsn(old_timeline_detail["local"]["disk_consistent_lsn"])), 0.03, ) assert_abs_margin_ratio( - 
lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), old_current_lsn, 0.03 + int(Lsn(new_timeline_detail["local"]["disk_consistent_lsn"])), int(old_current_lsn), 0.03 ) def switch_pg_to_new_pageserver( - env: NeonEnv, pg: Postgres, new_pageserver_port: int, tenant_id: UUID, timeline_id: UUID + env: NeonEnv, + pg: Postgres, + new_pageserver_port: int, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, ) -> pathlib.Path: pg.stop() @@ -195,7 +200,7 @@ def switch_pg_to_new_pageserver( pg.start() timeline_to_detach_local_path = ( - env.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) ) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( @@ -260,7 +265,7 @@ def test_tenant_relocation( pageserver_http = env.pageserver.http_client() tenant_id, initial_timeline_id = env.neon_cli.create_tenant( - UUID("74ee8b079a0e437eb0afea7d26a07209") + ZTenantId("74ee8b079a0e437eb0afea7d26a07209") ) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) @@ -280,7 +285,7 @@ def test_tenant_relocation( env.neon_cli.create_branch( new_branch_name="test_tenant_relocation_second", ancestor_branch_name="test_tenant_relocation_main", - ancestor_start_lsn=lsn_to_hex(current_lsn_main), + ancestor_start_lsn=current_lsn_main, tenant_id=tenant_id, ) pg_second = env.postgres.create_start( @@ -365,7 +370,7 @@ def test_tenant_relocation( "python", os.path.join(base_dir, "scripts/export_import_between_pageservers.py"), "--tenant-id", - tenant_id.hex, + str(tenant_id), "--from-host", "localhost", "--from-http-port", diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 8617bc8ea9..befa4616be 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,6 +1,5 @@ -from uuid import UUID - from fixtures.neon_fixtures import NeonEnvBuilder, wait_until 
+from fixtures.types import ZTenantId, ZTimelineId def get_only_element(l): # noqa: E741 @@ -23,7 +22,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): def get_state(tenant): all_states = client.tenant_list() - matching = [t for t in all_states if t["id"] == tenant.hex] + matching = [t for t in all_states if ZTenantId(t["id"]) == tenant] return get_only_element(matching)["state"] def get_metric_value(name): @@ -35,8 +34,8 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): value = line.lstrip(name).strip() return int(value) - def delete_all_timelines(tenant): - timelines = [UUID(t["timeline_id"]) for t in client.timeline_list(tenant)] + def delete_all_timelines(tenant: ZTenantId): + timelines = [ZTimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) @@ -55,7 +54,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Detach all tenants and wait for them to go idle # TODO they should be already idle since there are no active computes for tenant_info in client.tenant_list(): - tenant_id = UUID(tenant_info["id"]) + tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) wait_until(10, 0.2, lambda: assert_idle(tenant_id)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0e0cd44471..8bbf45205a 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -6,7 +6,7 @@ import pytest from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.utils import lsn_to_hex +from fixtures.types import Lsn @pytest.mark.parametrize("with_safekeepers", [False, True]) @@ -84,22 +84,24 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): sk_metrics = all_metrics[1:] ttids = [ - {"tenant_id": tenant_1.hex, "timeline_id": timeline_1.hex}, - {"tenant_id": tenant_2.hex, "timeline_id": timeline_2.hex}, + 
{"tenant_id": str(tenant_1), "timeline_id": str(timeline_1)}, + {"tenant_id": str(tenant_2), "timeline_id": str(timeline_2)}, ] # Test metrics per timeline for tt in ttids: log.info(f"Checking metrics for {tt}") - ps_lsn = int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value) - sk_lsns = [int(sk.query_one("safekeeper_commit_lsn", filter=tt).value) for sk in sk_metrics] + ps_lsn = Lsn(int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value)) + sk_lsns = [ + Lsn(int(sk.query_one("safekeeper_commit_lsn", filter=tt).value)) for sk in sk_metrics + ] - log.info(f"ps_lsn: {lsn_to_hex(ps_lsn)}") - log.info(f"sk_lsns: {list(map(lsn_to_hex, sk_lsns))}") + log.info(f"ps_lsn: {ps_lsn}") + log.info(f"sk_lsns: {sk_lsns}") assert ps_lsn <= max(sk_lsns) - assert ps_lsn > 0 + assert ps_lsn > Lsn(0) # Test common metrics for metrics in all_metrics: diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 083150e12a..70b474c9a9 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -8,7 +8,6 @@ import asyncio from typing import List, Tuple -from uuid import UUID import pytest from fixtures.neon_fixtures import ( @@ -20,7 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.utils import lsn_from_hex +from fixtures.types import Lsn, ZTenantId, ZTimelineId async def tenant_workload(env: NeonEnv, pg: Postgres): @@ -28,9 +27,6 @@ async def tenant_workload(env: NeonEnv, pg: Postgres): pg_conn = await pg.connect_async() - await pg_conn.fetchval("show neon.tenant_id") - await pg_conn.fetchval("show neon.timeline_id") - await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): await pg_conn.execute( @@ -62,7 +58,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem env = 
neon_env_builder.init_start() - tenants_pgs: List[Tuple[UUID, Postgres]] = [] + tenants_pgs: List[Tuple[ZTenantId, Postgres]] = [] for _ in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly @@ -87,13 +83,13 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem res = pg.safe_psql_many( ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] ) - tenant_id = res[0][0][0] - timeline_id = res[1][0][0] - current_lsn = lsn_from_hex(res[2][0][0]) + tenant_id = ZTenantId(res[0][0][0]) + timeline_id = ZTimelineId(res[1][0][0]) + current_lsn = Lsn(res[2][0][0]) # wait until pageserver receives all the data - wait_for_last_record_lsn(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) # run final checkpoint manually to flush all the data to remote storage env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") - wait_for_upload(pageserver_http, UUID(tenant_id), UUID(timeline_id), current_lsn) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 7a55ffb769..a5dadc535b 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,7 +1,6 @@ -from uuid import uuid4 - import pytest from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until +from fixtures.types import ZTenantId, ZTimelineId def test_timeline_delete(neon_simple_env: NeonEnv): @@ -11,15 +10,15 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # first try to delete non existing timeline # for existing tenant: - invalid_timeline_id = uuid4() + invalid_timeline_id = ZTimelineId.generate() with pytest.raises(NeonPageserverApiException, match="timeline not found"): ps_http.timeline_delete(tenant_id=env.initial_tenant, 
timeline_id=invalid_timeline_id) # for non existing tenant: - invalid_tenant_id = uuid4() + invalid_tenant_id = ZTenantId.generate() with pytest.raises( NeonPageserverApiException, - match=f"Tenant {invalid_tenant_id.hex} not found in local tenant state", + match=f"Tenant {invalid_tenant_id} not found in local tenant state", ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) @@ -37,7 +36,11 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ): timeline_path = ( - env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / parent_timeline_id.hex + env.repo_dir + / "tenants" + / str(env.initial_tenant) + / "timelines" + / str(parent_timeline_id) ) assert timeline_path.exists() @@ -46,7 +49,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert not timeline_path.exists() timeline_path = ( - env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex + env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id) ) assert timeline_path.exists() diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f6b665ec8c..aba8567541 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -3,7 +3,6 @@ import random import re import time from contextlib import closing -from uuid import UUID import psycopg2.errors import psycopg2.extras @@ -15,6 +14,7 @@ from fixtures.neon_fixtures import ( assert_timeline_local, wait_for_last_flush_lsn, ) +from fixtures.types import ZTenantId, ZTimelineId from fixtures.utils import get_timeline_dir_size @@ -34,8 +34,6 @@ def test_timeline_size(neon_simple_env: NeonEnv): with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW neon.timeline_id") - cur.execute("CREATE TABLE foo (t text)") cur.execute( """ @@ -77,8 +75,6 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): with closing(pgmain.connect()) 
as conn: with conn.cursor() as cur: - cur.execute("SHOW neon.timeline_id") - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) local_details = res["local"] assert ( @@ -254,7 +250,7 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -281,8 +277,8 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") - env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + env.pageserver.safe_psql(f"compact {env.initial_tenant} {new_timeline_id}") assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -307,7 +303,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") pg.safe_psql( """ @@ -318,9 +314,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") - env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0") + env.pageserver.safe_psql(f"do_gc {env.initial_tenant} 
{new_timeline_id} 0") assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -343,12 +339,12 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") + env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() matches = re.search( - f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', metrics, re.MULTILINE, ) @@ -361,7 +357,7 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): # Check that the logical size metric is sane, and matches matches = re.search( - f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', metrics, re.MULTILINE, ) @@ -389,7 +385,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: UUID): + def get_timeline_physical_size(timeline: ZTimelineId): res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) return res["local"]["current_physical_size_non_incremental"] @@ -408,7 +404,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, tenant, timeline) - env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}") + env.pageserver.safe_psql(f"checkpoint {tenant} {timeline}") timeline_total_size += get_timeline_physical_size(timeline) @@ -418,7 +414,7 @@ def 
test_tenant_physical_size(neon_simple_env: NeonEnv): assert tenant_physical_size == timeline_total_size -def assert_physical_size(env: NeonEnv, tenant_id: UUID, timeline_id: UUID): +def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): """Check the current physical size returned from timeline API matches the total physical size of the timeline on disk""" client = env.pageserver.http_client() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 28daeb18ed..cd370e60c0 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -7,12 +7,10 @@ import subprocess import sys import threading import time -import uuid from contextlib import closing from dataclasses import dataclass, field from pathlib import Path from typing import Any, List, Optional -from uuid import uuid4 import pytest from fixtures.log_helper import log @@ -34,14 +32,19 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.utils import get_dir_size, lsn_from_hex, lsn_to_hex, query_scalar +from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.utils import get_dir_size, query_scalar def wait_lsn_force_checkpoint( - tenant_id: str, timeline_id: str, pg: Postgres, ps: NeonPageserver, pageserver_conn_options={} + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + pg: Postgres, + ps: NeonPageserver, + pageserver_conn_options={}, ): - lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - log.info(f"pg_current_wal_flush_lsn is {lsn_to_hex(lsn)}, waiting for it on pageserver") + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") auth_token = None if "password" in pageserver_conn_options: @@ -50,8 +53,8 @@ def wait_lsn_force_checkpoint( # wait for the pageserver to catch up wait_for_last_record_lsn( 
ps.http_client(auth_token=auth_token), - uuid.UUID(hex=tenant_id), - uuid.UUID(hex=timeline_id), + tenant_id, + timeline_id, lsn, ) @@ -63,19 +66,19 @@ def wait_lsn_force_checkpoint( # ensure that remote_consistent_lsn is advanced wait_for_upload( ps.http_client(auth_token=auth_token), - uuid.UUID(hex=tenant_id), - uuid.UUID(hex=timeline_id), + tenant_id, + timeline_id, lsn, ) @dataclass class TimelineMetrics: - timeline_id: str - last_record_lsn: int + timeline_id: ZTimelineId + last_record_lsn: Lsn # One entry per each Safekeeper, order is the same - flush_lsns: List[int] = field(default_factory=list) - commit_lsns: List[int] = field(default_factory=list) + flush_lsns: List[Lsn] = field(default_factory=list) + commit_lsns: List[Lsn] = field(default_factory=list) # Run page server and multiple acceptors, and multiple compute nodes running @@ -123,7 +126,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): timeline_metrics = [] for timeline_detail in timeline_details: - timeline_id: str = timeline_detail["timeline_id"] + timeline_id = ZTimelineId(timeline_detail["timeline_id"]) local_timeline_detail = timeline_detail.get("local") if local_timeline_detail is None: @@ -132,11 +135,11 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): m = TimelineMetrics( timeline_id=timeline_id, - last_record_lsn=lsn_from_hex(local_timeline_detail["last_record_lsn"]), + last_record_lsn=Lsn(local_timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: - m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) - m.commit_lsns.append(sk_m.commit_lsn_inexact[(tenant_id.hex, timeline_id)]) + m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) + m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. 
@@ -216,7 +219,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): final_m = collect_metrics("after SELECT") # Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly. # Also assume that safekeepers will not be significantly out of sync in this test. - middle_lsn = (init_m[0].last_record_lsn + final_m[0].last_record_lsn) // 2 + middle_lsn = Lsn((int(init_m[0].last_record_lsn) + int(final_m[0].last_record_lsn)) // 2) assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns) assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns) assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns) @@ -270,8 +273,8 @@ def test_broker(neon_env_builder: NeonEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -288,8 +291,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder): while True: stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] if all( - lsn_from_hex(s_after.remote_consistent_lsn) - > lsn_from_hex(s_before.remote_consistent_lsn) + s_after.remote_consistent_lsn > s_before.remote_consistent_lsn for s_after, s_before in zip(stat_after, stat_before) ): break @@ -323,8 +325,8 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ] ) - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show 
neon.timeline_id")[0][0]) # force checkpoint to advance remote_consistent_lsn pageserver_conn_options = {} @@ -334,7 +336,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # We will wait for first segment removal. Make sure they exist for starter. first_segments = [ - os.path.join(sk.data_dir(), tenant_id, timeline_id, "000000010000000000000001") + os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id), "000000010000000000000001") for sk in env.safekeepers ] assert all(os.path.exists(p) for p in first_segments) @@ -346,7 +348,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(tenant_id) ) http_cli_other = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(uuid4().hex) + auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) ) http_cli_noauth = env.safekeepers[0].http_client() @@ -367,7 +369,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ) http_cli.record_safekeeper_info(tenant_id, timeline_id, {"backup_lsn": "FFFFFFFF/FEFFFFFF"}) assert ( - "FFFFFFFF/FEFFFFFF" + Lsn("FFFFFFFF/FEFFFFFF") == http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id).backup_lsn ) @@ -382,14 +384,14 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): time.sleep(0.5) -def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): +def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end: Lsn): started_at = time.time() http_cli = live_sk.http_client() while True: tli_status = http_cli.timeline_status(tenant_id, timeline_id) log.info(f"live sk status is {tli_status}") - if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): + if tli_status.backup_lsn >= seg_end: break elapsed = time.time() - started_at if elapsed > 30: @@ -399,23 +401,22 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): time.sleep(0.5) -def 
wait_wal_trim(tenant_id, timeline_id, sk, target_size): +def wait_wal_trim(tenant_id, timeline_id, sk, target_size_mb): started_at = time.time() http_cli = sk.http_client() while True: tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = ( - get_dir_size(os.path.join(sk.data_dir(), tenant_id, timeline_id)) / 1024 / 1024 - ) - log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size:.2f}MB status={tli_status}") + sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id))) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") - if sk_wal_size <= target_size: + if sk_wal_size_mb <= target_size_mb: break elapsed = time.time() - started_at if elapsed > 20: raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for sk_id={sk.id} to trim WAL to {target_size:.2f}MB, current size is {sk_wal_size:.2f}MB" + f"timed out waiting {elapsed:.0f}s for sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB, current size is {sk_wal_size_mb:.2f}MB" ) time.sleep(0.5) @@ -437,8 +438,8 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot pg = env.postgres.create_start("test_safekeepers_wal_backup") # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) pg_conn = pg.connect() cur = pg_conn.cursor() @@ -446,7 +447,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot # Shut down subsequently each of safekeepers and fill a segment while sk is # down; ensure segment gets offloaded by others. 
- offloaded_seg_end = ["0/2000000", "0/3000000", "0/4000000"] + offloaded_seg_end = [Lsn("0/2000000"), Lsn("0/3000000"), Lsn("0/4000000")] for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): victim.stop() # roughly fills one segment @@ -465,7 +466,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("insert into t select generate_series(1,250000), 'payload'") - wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], "0/5000000") + wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], Lsn("0/5000000")) @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @@ -492,8 +493,8 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re pg = env.postgres.create_start("test_s3_wal_replay") # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) expected_sum = 0 @@ -503,7 +504,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re cur.execute("insert into t values (1, 'payload')") expected_sum += 1 - offloaded_seg_end = ["0/3000000"] + offloaded_seg_end = [Lsn("0/3000000")] for seg_end in offloaded_seg_end: # roughly fills two segments cur.execute("insert into t select generate_series(1,500000), 'payload'") @@ -517,7 +518,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re # advance remote_consistent_lsn to trigger WAL trimming # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates env.safekeepers[0].http_client().record_safekeeper_info( - tenant_id, timeline_id, {"remote_consistent_lsn": offloaded_seg_end[-1]} + tenant_id, timeline_id, 
{"remote_consistent_lsn": str(offloaded_seg_end[-1])} ) for sk in env.safekeepers: @@ -526,10 +527,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re last_lsn = query_scalar(cur, "SELECT pg_current_wal_flush_lsn()") - pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)) - )["local"]["last_record_lsn"] - lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) + pageserver_lsn = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["local"][ + "last_record_lsn" + ] + lag = Lsn(last_lsn) - Lsn(pageserver_lsn) log.info( f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) @@ -554,10 +555,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re if elapsed > wait_lsn_timeout: raise RuntimeError("Timed out waiting for WAL redo") - pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)) - )["local"]["last_record_lsn"] - lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) + pageserver_lsn = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ + "local" + ]["last_record_lsn"] + lag = Lsn(last_lsn) - Lsn(pageserver_lsn) if time.time() > last_debug_print + 10 or lag <= 0: last_debug_print = time.time() @@ -583,8 +584,8 @@ class ProposerPostgres(PgProtocol): self, pgdata_dir: str, pg_bin, - timeline_id: uuid.UUID, - tenant_id: uuid.UUID, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, listen_addr: str, port: int, ): @@ -592,8 +593,8 @@ class ProposerPostgres(PgProtocol): self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin - self.timeline_id: uuid.UUID = timeline_id - self.tenant_id: uuid.UUID = tenant_id + self.tenant_id: ZTenantId = tenant_id + self.timeline_id: ZTimelineId = timeline_id self.listen_addr: str = listen_addr self.port: int = port @@ -613,8 +614,8 @@ class 
ProposerPostgres(PgProtocol): cfg = [ "synchronous_standby_names = 'walproposer'\n", "shared_preload_libraries = 'neon'\n", - f"neon.timeline_id = '{self.timeline_id.hex}'\n", - f"neon.tenant_id = '{self.tenant_id.hex}'\n", + f"neon.timeline_id = '{self.timeline_id}'\n", + f"neon.tenant_id = '{self.tenant_id}'\n", "neon.pageserver_connstring = ''\n", f"neon.safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", @@ -623,7 +624,7 @@ class ProposerPostgres(PgProtocol): f.writelines(cfg) - def sync_safekeepers(self) -> str: + def sync_safekeepers(self) -> Lsn: """ Run 'postgres --sync-safekeepers'. Returns execution result, which is commit_lsn after sync. @@ -639,7 +640,7 @@ class ProposerPostgres(PgProtocol): with open(stdout_filename, "r") as stdout_f: stdout = stdout_f.read() - return stdout.strip("\n ") + return Lsn(stdout.strip("\n ")) def initdb(self): """Run initdb""" @@ -671,18 +672,18 @@ def test_sync_safekeepers( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - timeline_id = uuid.uuid4() - tenant_id = uuid.uuid4() + tenant_id = ZTenantId.generate() + timeline_id = ZTimelineId.generate() # write config for proposer pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") pg = ProposerPostgres( - pgdata_dir, pg_bin, timeline_id, tenant_id, "127.0.0.1", port_distributor.get_port() + pgdata_dir, pg_bin, tenant_id, timeline_id, "127.0.0.1", port_distributor.get_port() ) pg.create_dir_config(env.get_safekeeper_connstrs()) # valid lsn, which is not in the segment start, nor in zero segment - epoch_start_lsn = 0x16B9188 # 0/16B9188 + epoch_start_lsn = Lsn("0/16B9188") begin_lsn = epoch_start_lsn # append and commit WAL @@ -697,14 +698,14 @@ def test_sync_safekeepers( "set_commit_lsn": True, "send_proposer_elected": True, "term": 2, - "begin_lsn": begin_lsn, - "epoch_start_lsn": epoch_start_lsn, - "truncate_lsn": epoch_start_lsn, + "begin_lsn": int(begin_lsn), + "epoch_start_lsn": int(epoch_start_lsn), + 
"truncate_lsn": int(epoch_start_lsn), }, ) - lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"]) - lsn_after_append.append(lsn_hex) - log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}") + lsn = Lsn(res["inserted_wal"]["end_lsn"]) + lsn_after_append.append(lsn) + log.info(f"safekeeper[{i}] lsn after append: {lsn}") # run sync safekeepers lsn_after_sync = pg.sync_safekeepers() @@ -724,8 +725,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa = env.safekeepers[0] # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) if not auth_enabled: wa_http_cli = wa.http_client() @@ -734,7 +735,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) wa_http_cli.check_status() wa_http_cli_bad = wa.http_client( - auth_token=env.auth_keys.generate_tenant_token(uuid4().hex) + auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) ) wa_http_cli_bad.check_status() wa_http_cli_noauth = wa.http_client() @@ -784,15 +785,15 @@ class SafekeeperEnv: self.bin_safekeeper = os.path.join(str(neon_binpath), "safekeeper") self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None - self.tenant_id: Optional[uuid.UUID] = None - self.timeline_id: Optional[uuid.UUID] = None + self.tenant_id: Optional[ZTenantId] = None + self.timeline_id: Optional[ZTimelineId] = None def init(self) -> "SafekeeperEnv": assert self.postgres is None, "postgres is already initialized" assert self.safekeepers is None, "safekeepers are already initialized" - self.timeline_id = uuid.uuid4() - self.tenant_id = uuid.uuid4() + self.tenant_id = ZTenantId.generate() 
+ self.timeline_id = ZTimelineId.generate() self.repo_dir.mkdir(exist_ok=True) # Create config and a Safekeeper object for each safekeeper @@ -841,8 +842,8 @@ class SafekeeperEnv: pg = ProposerPostgres( pgdata_dir, self.pg_bin, - self.timeline_id, self.tenant_id, + self.timeline_id, "127.0.0.1", self.port_distributor.get_port(), ) @@ -911,7 +912,9 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): sum_after = query_scalar(cur, "SELECT SUM(key) FROM t") assert sum_after == sum_before + 5000050000 - def show_statuses(safekeepers: List[Safekeeper], tenant_id: str, timeline_id: str): + def show_statuses( + safekeepers: List[Safekeeper], tenant_id: ZTenantId, timeline_id: ZTimelineId + ): for sk in safekeepers: http_cli = sk.http_client() try: @@ -932,8 +935,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): pg.start() # learn neon timeline from compute - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -985,20 +988,21 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): # of WAL segments. 
def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): # used to calculate delta in collect_stats - last_lsn = 0.0 + last_lsn = Lsn(0) - # returns LSN and pg_wal size, all in MB + # returns pg_wal size in MB def collect_stats(pg: Postgres, cur, enable_logs=True): nonlocal last_lsn assert pg.pgdata_dir is not None log.info("executing INSERT to generate WAL") - current_lsn = lsn_from_hex(query_scalar(cur, "select pg_current_wal_lsn()")) / 1024 / 1024 - pg_wal_size = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024 + current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + pg_wal_size_mb = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024 if enable_logs: - log.info(f"LSN delta: {current_lsn - last_lsn} MB, current WAL size: {pg_wal_size} MB") + lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 + log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") last_lsn = current_lsn - return current_lsn, pg_wal_size + return pg_wal_size_mb # generates about ~20MB of WAL, to create at least one new segment def generate_wal(cur): @@ -1027,7 +1031,7 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): log.info("executing checkpoint") cur.execute("CHECKPOINT") - wal_size_after_checkpoint = collect_stats(pg, cur)[1] + wal_size_after_checkpoint = collect_stats(pg, cur) # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) assert wal_size_after_checkpoint < 16 * 2.5 @@ -1040,22 +1044,20 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env = neon_env_builder.init_start() # Create two tenants: one will be deleted, other should be preserved. 
- tenant_id = env.initial_tenant.hex - timeline_id_1 = env.neon_cli.create_branch("br1").hex # Active, delete explicitly - timeline_id_2 = env.neon_cli.create_branch("br2").hex # Inactive, delete explicitly - timeline_id_3 = env.neon_cli.create_branch("br3").hex # Active, delete with the tenant - timeline_id_4 = env.neon_cli.create_branch("br4").hex # Inactive, delete with the tenant + tenant_id = env.initial_tenant + timeline_id_1 = env.neon_cli.create_branch("br1") # Active, delete explicitly + timeline_id_2 = env.neon_cli.create_branch("br2") # Inactive, delete explicitly + timeline_id_3 = env.neon_cli.create_branch("br3") # Active, delete with the tenant + timeline_id_4 = env.neon_cli.create_branch("br4") # Inactive, delete with the tenant - tenant_id_other_uuid, timeline_id_other_uuid = env.neon_cli.create_tenant() - tenant_id_other = tenant_id_other_uuid.hex - timeline_id_other = timeline_id_other_uuid.hex + tenant_id_other, timeline_id_other = env.neon_cli.create_tenant() # Populate branches pg_1 = env.postgres.create_start("br1") pg_2 = env.postgres.create_start("br2") pg_3 = env.postgres.create_start("br3") pg_4 = env.postgres.create_start("br4") - pg_other = env.postgres.create_start("main", tenant_id=uuid.UUID(hex=tenant_id_other)) + pg_other = env.postgres.create_start("main", tenant_id=tenant_id_other) for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]: with closing(pg.connect()) as conn: with conn.cursor() as cur: @@ -1071,11 +1073,11 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) ) sk_http_noauth = sk.http_client() - assert (sk_data_dir / tenant_id / timeline_id_1).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert (sk_data_dir / str(tenant_id) / 
str(timeline_id_1)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. pg_2.stop_and_destroy() @@ -1094,22 +1096,22 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): "dir_existed": True, "was_active": True, } - assert not (sk_data_dir / tenant_id / timeline_id_1).exists() - assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Ensure repeated deletion succeeds assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { "dir_existed": False, "was_active": False, } - assert not (sk_data_dir / tenant_id / timeline_id_1).exists() - assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / 
str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() if auth_enabled: # Ensure we cannot delete the other tenant @@ -1118,44 +1120,44 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other) with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): assert sk_h.tenant_delete_force(tenant_id_other) - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove initial tenant's br2 (inactive) assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == { "dir_existed": True, "was_active": False, } - assert not (sk_data_dir / tenant_id / timeline_id_1).exists() - assert not (sk_data_dir / tenant_id / timeline_id_2).exists() - assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove non-existing branch, should succeed - assert sk_http.timeline_delete_force(tenant_id, "00" * 16) == { + assert sk_http.timeline_delete_force(tenant_id, ZTimelineId("00" * 16)) == { "dir_existed": False, "was_active": False, } - assert not (sk_data_dir / tenant_id / timeline_id_1).exists() - assert not (sk_data_dir / tenant_id / timeline_id_2).exists() - assert (sk_data_dir / tenant_id / timeline_id_3).exists() - assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() 
+ assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove initial tenant fully (two branches are active) response = sk_http.tenant_delete_force(tenant_id) - assert response[timeline_id_3] == { + assert response[str(timeline_id_3)] == { "dir_existed": True, "was_active": True, } - assert not (sk_data_dir / tenant_id).exists() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove initial tenant again. response = sk_http.tenant_delete_force(tenant_id) assert response == {} - assert not (sk_data_dir / tenant_id).exists() - assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Ensure the other tenant still works sk_http_other.timeline_status(tenant_id_other, timeline_id_other) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 83285e0cbe..e36d3cf94b 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -1,14 +1,13 @@ import asyncio import random import time -import uuid from dataclasses import dataclass from typing import List, Optional import asyncpg from fixtures.log_helper import getLogger from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper -from fixtures.utils import lsn_from_hex, lsn_to_hex +from fixtures.types import Lsn, ZTenantId, ZTimelineId log = getLogger("root.safekeeper_async") @@ -104,9 
+103,9 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou async def wait_for_lsn( safekeeper: Safekeeper, - tenant_id: str, - timeline_id: str, - wait_lsn: str, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + wait_lsn: Lsn, polling_interval=1, timeout=60, ): @@ -124,7 +123,7 @@ async def wait_for_lsn( f"Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}" ) - while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn): + while wait_lsn > flush_lsn: elapsed = time.time() - started_at if elapsed > timeout: raise RuntimeError( @@ -156,8 +155,8 @@ async def run_restarts_under_load( test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show neon.tenant_id") - timeline_id = await pg_conn.fetchval("show neon.timeline_id") + tenant_id = ZTenantId(await pg_conn.fetchval("show neon.tenant_id")) + timeline_id = ZTimelineId(await pg_conn.fetchval("show neon.timeline_id")) bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances @@ -176,14 +175,15 @@ async def run_restarts_under_load( victim = acceptors[victim_idx] victim.stop() - flush_lsn = await pg_conn.fetchval("SELECT pg_current_wal_flush_lsn()") - flush_lsn = lsn_to_hex(flush_lsn) + flush_lsn = Lsn(await pg_conn.fetchval("SELECT pg_current_wal_flush_lsn()")) log.info(f"Postgres flush_lsn {flush_lsn}") - pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)) - )["local"]["last_record_lsn"] - sk_ps_lag = lsn_from_hex(flush_lsn) - lsn_from_hex(pageserver_lsn) + pageserver_lsn = Lsn( + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["local"][ + "last_record_lsn" + ] + ) + sk_ps_lag = flush_lsn - pageserver_lsn log.info(f"Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb") # Wait until alive safekeepers catch up with postgres diff --git 
a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 0847b5a505..6fd509c4d1 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -9,6 +9,7 @@ from fixtures.neon_fixtures import ( base_dir, pg_distrib_dir, ) +from fixtures.types import ZTenantId def test_wal_restore( @@ -21,7 +22,7 @@ def test_wal_restore( env.neon_cli.create_branch("test_wal_restore") pg = env.postgres.create_start("test_wal_restore") pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" From 8a7333438a566a32a99f41bc238b5d596eab2cda Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 2 Sep 2022 11:58:28 +0300 Subject: [PATCH 005/166] Extract common remote storage operations into GenericRemoteStorage (#2373) --- libs/remote_storage/src/lib.rs | 96 +++++++++++++++++++++++++ pageserver/src/storage_sync/download.rs | 39 ++-------- pageserver/src/storage_sync/upload.rs | 58 ++------------- safekeeper/src/wal_backup.rs | 78 ++++++-------------- 4 files changed, 128 insertions(+), 143 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index d5ad2f8633..8a10e098a1 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -164,6 +164,102 @@ impl GenericRemoteStorage { _ => None, } } + + /// Takes storage object contents and its size and uploads to remote storage, + /// mapping `from_path` to the corresponding remote object id in the storage. + /// + /// The storage object does not have to be present on the `from_path`, + /// this path is used for the remote object id conversion only. 
+ pub async fn upload_storage_object( + &self, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_bytes: usize, + from_path: &Path, + ) -> anyhow::Result<()> { + async fn do_upload_storage_object( + storage: &S, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_bytes: usize, + from_path: &Path, + ) -> anyhow::Result<()> + where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, + { + let target_storage_path = storage.remote_object_id(from_path).with_context(|| { + format!( + "Failed to get the storage path for source local path '{}'", + from_path.display() + ) + })?; + + storage + .upload(from, from_size_bytes, &target_storage_path, None) + .await + .with_context(|| { + format!( + "Failed to upload from '{}' to storage path '{:?}'", + from_path.display(), + target_storage_path + ) + }) + } + + match self { + GenericRemoteStorage::Local(storage) => { + do_upload_storage_object(storage, from, from_size_bytes, from_path).await + } + GenericRemoteStorage::S3(storage) => { + do_upload_storage_object(storage, from, from_size_bytes, from_path).await + } + } + } + + /// Downloads the storage object into the `to_path` provided. + /// `byte_range` could be specified to download only a part of the file, if needed. 
+ pub async fn download_storage_object( + &self, + byte_range: Option<(u64, Option)>, + to_path: &Path, + ) -> Result { + async fn do_download_storage_object( + storage: &S, + byte_range: Option<(u64, Option)>, + to_path: &Path, + ) -> Result + where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, + { + let remote_object_path = storage + .remote_object_id(to_path) + .with_context(|| { + format!( + "Failed to get the storage path for target local path '{}'", + to_path.display() + ) + }) + .map_err(DownloadError::BadInput)?; + + match byte_range { + Some((start, end)) => { + storage + .download_byte_range(&remote_object_path, start, end) + .await + } + None => storage.download(&remote_object_path).await, + } + } + + match self { + GenericRemoteStorage::Local(storage) => { + do_download_storage_object(storage, byte_range, to_path).await + } + GenericRemoteStorage::S3(storage) => { + do_download_storage_object(storage, byte_range, to_path).await + } + } + } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. 
diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index ded4c042c4..ebc9a252b7 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -10,7 +10,7 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use remote_storage::{ - path_with_suffix_extension, Download, DownloadError, GenericRemoteStorage, RemoteStorage, + path_with_suffix_extension, DownloadError, GenericRemoteStorage, RemoteStorage, }; use tokio::{ fs, @@ -143,7 +143,9 @@ async fn download_index_part( let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let mut index_part_download = download_storage_object(storage, &index_part_path).await?; + let mut index_part_download = storage + .download_storage_object(None, &index_part_path) + .await?; let mut index_part_bytes = Vec::new(); io::copy( @@ -262,7 +264,7 @@ pub(super) async fn download_timeline_layers<'a>( ) })?; - let mut layer_download = download_storage_object(storage, &layer_destination_path) + let mut layer_download = storage.download_storage_object(None, &layer_destination_path) .await .with_context(|| { format!( @@ -365,37 +367,6 @@ pub(super) async fn download_timeline_layers<'a>( } } -async fn download_storage_object( - storage: &GenericRemoteStorage, - to_path: &Path, -) -> Result { - async fn do_download_storage_object( - storage: &S, - to_path: &Path, - ) -> Result - where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, - { - let remote_object_path = storage - .remote_object_id(to_path) - .with_context(|| { - format!( - "Failed to get the storage path for target local path '{}'", - to_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; - - storage.download(&remote_object_path).await - } - - match storage { - GenericRemoteStorage::Local(storage) => 
do_download_storage_object(storage, to_path).await, - GenericRemoteStorage::S3(storage) => do_download_storage_object(storage, to_path).await, - } -} - async fn get_timeline_sync_ids( storage: &GenericRemoteStorage, tenant_path: &Path, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index a8c768e0ae..7ef775e690 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -1,14 +1,11 @@ //! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -use std::{ - fmt::Debug, - path::{Path, PathBuf}, -}; +use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use once_cell::sync::Lazy; -use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use remote_storage::GenericRemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -47,7 +44,8 @@ pub(super) async fn upload_index_part( let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - upload_storage_object(storage, index_part_bytes, index_part_size, &index_part_path) + storage + .upload_storage_object(index_part_bytes, index_part_size, &index_part_path) .await .with_context(|| format!("Failed to upload index part for '{sync_id}'")) } @@ -131,7 +129,8 @@ pub(super) async fn upload_timeline_layers<'a>( .map_err(UploadError::Other)? 
.len() as usize; - match upload_storage_object(storage, source_file, source_size, &source_path) + match storage + .upload_storage_object(source_file, source_size, &source_path) .await .with_context(|| format!("Failed to upload layer file for {sync_id}")) { @@ -193,51 +192,6 @@ pub(super) async fn upload_timeline_layers<'a>( } } -async fn upload_storage_object( - storage: &GenericRemoteStorage, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, - from_size_bytes: usize, - from_path: &Path, -) -> anyhow::Result<()> { - async fn do_upload_storage_object( - storage: &S, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, - from_size_bytes: usize, - from_path: &Path, - ) -> anyhow::Result<()> - where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, - { - let target_storage_path = storage.remote_object_id(from_path).with_context(|| { - format!( - "Failed to get the storage path for source local path '{}'", - from_path.display() - ) - })?; - - storage - .upload(from, from_size_bytes, &target_storage_path, None) - .await - .with_context(|| { - format!( - "Failed to upload from '{}' to storage path '{:?}'", - from_path.display(), - target_storage_path - ) - }) - } - - match storage { - GenericRemoteStorage::Local(storage) => { - do_upload_storage_object(storage, from, from_size_bytes, from_path).await - } - GenericRemoteStorage::S3(storage) => { - do_upload_storage_object(storage, from, from_size_bytes, from_path).await - } - } -} - enum UploadError { MissingLocalFile(PathBuf, anyhow::Error), Other(anyhow::Error), diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 3552452470..a15ba02863 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -13,7 +13,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr}; use postgres_ffi::PG_TLI; -use remote_storage::{GenericRemoteStorage, RemoteStorage}; 
+use remote_storage::GenericRemoteStorage; use tokio::fs::File; use tokio::runtime::Builder; @@ -419,73 +419,37 @@ static REMOTE_STORAGE: OnceCell> = OnceCell::new(); async fn backup_object(source_file: &Path, size: usize) -> Result<()> { let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); - let file = File::open(&source_file).await?; + let file = tokio::io::BufReader::new(File::open(&source_file).await.with_context(|| { + format!( + "Failed to open file {} for wal backup", + source_file.display() + ) + })?); - // Storage is initialized by launcher at this point. - match storage.as_ref().unwrap() { - GenericRemoteStorage::Local(local_storage) => { - let destination = local_storage.remote_object_id(source_file)?; - - debug!( - "local upload about to start from {} to {}", - source_file.display(), - destination.display() - ); - local_storage.upload(file, size, &destination, None).await - } - GenericRemoteStorage::S3(s3_storage) => { - let s3key = s3_storage.remote_object_id(source_file)?; - - debug!( - "S3 upload about to start from {} to {:?}", - source_file.display(), - s3key - ); - s3_storage.upload(file, size, &s3key, None).await - } - }?; - - Ok(()) + storage + .as_ref() + .expect("Storage should be initialized by launcher at this point.") + .upload_storage_object(file, size, source_file) + .await } pub async fn read_object( file_path: PathBuf, offset: u64, ) -> anyhow::Result>> { - let download = match REMOTE_STORAGE + let download = REMOTE_STORAGE .get() .context("Failed to get remote storage")? .as_ref() .context("No remote storage configured")? 
- { - GenericRemoteStorage::Local(local_storage) => { - let source = local_storage.remote_object_id(&file_path)?; - - info!( - "local download about to start from {} at offset {}", - source.display(), - offset - ); - local_storage - .download_byte_range(&source, offset, None) - .await - } - GenericRemoteStorage::S3(s3_storage) => { - let s3key = s3_storage.remote_object_id(&file_path)?; - - info!( - "S3 download about to start from {:?} at offset {}", - s3key, offset - ); - s3_storage.download_byte_range(&s3key, offset, None).await - } - } - .with_context(|| { - format!( - "Failed to open WAL segment download stream for local storage path {}", - file_path.display() - ) - })?; + .download_storage_object(Some((offset, None)), &file_path) + .await + .with_context(|| { + format!( + "Failed to open WAL segment download stream for local storage path {}", + file_path.display() + ) + })?; Ok(download.download_stream) } From f78a542cbad53d3cb12b2655ec71abfb51ebc22a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 23 Aug 2022 23:58:49 +0300 Subject: [PATCH 006/166] Calculate timeline initial logical size in the background Start the calculation on the first size request, return partially calculated size during calculation, retry if failed. 
Remove "fast" size init through the ancestor: the current approach is fast enough for now and there are better ways to optimize the calculation via incremental ancestor size computation --- pageserver/src/http/routes.rs | 10 +- pageserver/src/layered_repository.rs | 48 ++- pageserver/src/layered_repository/timeline.rs | 323 ++++++++++++------ pageserver/src/pgdatadir_mapping.rs | 6 +- pageserver/src/tenant_mgr.rs | 30 +- .../src/walreceiver/walreceiver_connection.rs | 10 +- test_runner/regress/test_broken_timeline.py | 18 +- test_runner/regress/test_timeline_size.py | 54 ++- 8 files changed, 324 insertions(+), 175 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ef18129504..710014de98 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -75,7 +75,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline fn local_timeline_info_from_loaded_timeline( - timeline: &Timeline, + timeline: &Arc, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { @@ -106,7 +106,11 @@ fn local_timeline_info_from_loaded_timeline( prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), timeline_state: LocalTimelineState::Loaded, - current_logical_size: Some(timeline.get_current_logical_size()), + current_logical_size: Some( + timeline + .get_current_logical_size() + .context("Timeline info creation failed to get current logical size")?, + ), current_physical_size: Some(timeline.get_physical_size()), current_logical_size_non_incremental: if include_non_incremental_logical_size { Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) @@ -212,7 +216,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. 
- let local_info = local_timeline_info_from_loaded_timeline(new_timeline.as_ref(), false, false)?; + let local_info = local_timeline_info_from_loaded_timeline(&new_timeline, false, false)?; Ok(Some(TimelineInfo { tenant_id, timeline_id: new_timeline_id, diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 73c30b51b8..9d405b0033 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -136,14 +136,11 @@ impl Repository { } /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. - pub fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { + pub fn get_timeline_load(&self, timeline_id: ZTimelineId) -> Result> { let mut timelines = self.timelines.lock().unwrap(); - match self.get_timeline_load_internal(timelineid, &mut timelines)? { + match self.get_timeline_load_internal(timeline_id, &mut timelines)? { Some(local_loaded_timeline) => Ok(local_loaded_timeline), - None => anyhow::bail!( - "cannot get local timeline: unknown timeline id: {}", - timelineid - ), + None => anyhow::bail!("cannot get local timeline, unknown timeline id: {timeline_id}"), } } @@ -559,33 +556,34 @@ impl Repository { timeline_id: ZTimelineId, timelines: &mut HashMap, ) -> anyhow::Result>> { - match timelines.get(&timeline_id) { + Ok(match timelines.get(&timeline_id) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { debug!("timeline {timeline_id} found loaded into memory"); - return Ok(Some(Arc::clone(local_timeline))); + Some(Arc::clone(local_timeline)) + } + LayeredTimelineEntry::Unloaded { .. 
} => { + debug!( + "timeline {timeline_id} found on a local disk, but not loaded into the memory, loading" + ); + let timeline = self.load_local_timeline(timeline_id, timelines)?; + let was_loaded = timelines.insert( + timeline_id, + LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), + ); + ensure!( + was_loaded.is_none() + || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. })), + "assertion failure, inserted wrong timeline in an incorrect state" + ); + Some(timeline) } - LayeredTimelineEntry::Unloaded { .. } => {} }, None => { debug!("timeline {timeline_id} not found"); - return Ok(None); + None } - }; - debug!( - "timeline {timeline_id} found on a local disk, but not loaded into the memory, loading" - ); - let timeline = self.load_local_timeline(timeline_id, timelines)?; - let was_loaded = timelines.insert( - timeline_id, - LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), - ); - ensure!( - was_loaded.is_none() - || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. 
})), - "assertion failure, inserted wrong timeline in an incorrect state" - ); - Ok(Some(timeline)) + }) } fn load_local_timeline( diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 8b90cc4e6b..fd719812a3 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -5,17 +5,17 @@ use bytes::Bytes; use fail::fail_point; use itertools::Itertools; use metrics::core::{AtomicU64, GenericCounter}; -use once_cell::sync::Lazy; +use once_cell::sync::{Lazy, OnceCell}; use tracing::*; use std::cmp::{max, min, Ordering}; use std::collections::{HashMap, HashSet}; -use std::fs; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; +use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; +use std::{fs, thread}; use metrics::{ register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, @@ -137,13 +137,13 @@ static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( +static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( "pageserver_current_logical_size", "Current logical size grouped by timeline", &["tenant_id", "timeline_id"] ) - .expect("failed to define a metric") + .expect("failed to define current logical size metric") }); // Metrics for cloud upload. 
These metrics reflect data uploaded to cloud storage, @@ -242,7 +242,7 @@ struct TimelineMetrics { pub wait_lsn_time_histo: Histogram, pub current_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size - pub current_logical_size_gauge: IntGauge, + pub current_logical_size_gauge: UIntGauge, } impl TimelineMetrics { @@ -389,6 +389,37 @@ pub struct Timeline { repartition_threshold: u64, /// Current logical size of the "datadir", at the last LSN. + current_logical_size: LogicalSize, + // TODO task management should be done outside timeline, managed along with other tasks. + #[allow(clippy::type_complexity)] + initial_size_computation_task: + Mutex>, mpsc::Receiver<()>)>>, + + /// Information about the last processed message by the WAL receiver, + /// or None if WAL receiver has not received anything for this timeline + /// yet. + pub last_received_wal: Mutex>, + + /// Relation size cache + pub rel_size_cache: RwLock>, +} + +/// Internal structure to hold all data needed for logical size calculation. +/// Calculation consists of two parts: +/// 1. Initial size calculation. That might take a long time, because it requires +/// reading all layers containing relation sizes up to the `initial_part_end`. +/// 2. Collecting an incremental part and adding that to the initial size. +/// Increments are appended on walreceiver writing new timeline data, +/// which result in increase or decrease of the logical size. +struct LogicalSize { + /// Size, potentially slow to compute, derived from all layers located locally on this node's FS. + /// Might require reading multiple layers, and even ancestor's layers, to collect the size. + /// + /// NOTE: initial size is not a constant and will change between restarts. + initial_logical_size: OnceCell, + /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. + initial_part_end: Option, + /// All other size changes after startup, combined together. 
/// /// Size shouldn't ever be negative, but this is signed for two reasons: /// @@ -407,22 +438,82 @@ pub struct Timeline { /// /// Note that we also expose a copy of this value as a prometheus metric, /// see `current_logical_size_gauge`. Use the `update_current_logical_size` - /// and `set_current_logical_size` functions to modify this, they will - /// also keep the prometheus metric in sync. - current_logical_size: AtomicI64, - // TODO we don't have a good, API to ensure on a compilation level - // that the timeline passes all initialization. - // Hence we ensure that we init at least once for every timeline - // and keep this flag to avoid potentually long recomputes. - logical_size_initialized: AtomicBool, + /// to modify this, it will also keep the prometheus metric in sync. + size_added_after_initial: AtomicI64, +} - /// Information about the last processed message by the WAL receiver, - /// or None if WAL receiver has not received anything for this timeline - /// yet. - pub last_received_wal: Mutex>, +/// Normalized current size, that the data in pageserver occupies. +#[derive(Debug, Clone, Copy)] +enum CurrentLogicalSize { + /// The size is not yet calculated to the end, this is an intermediate result, + /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative, + /// yet total logical size cannot be below 0. + Approximate(u64), + // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are + // available for observation without any calculations. 
+ Exact(u64), +} - /// Relation size cache - pub rel_size_cache: RwLock>, +impl CurrentLogicalSize { + fn size(&self) -> u64 { + *match self { + Self::Approximate(size) => size, + Self::Exact(size) => size, + } + } +} + +impl LogicalSize { + fn empty_initial() -> Self { + Self { + initial_logical_size: OnceCell::with_value(0), + initial_part_end: None, + size_added_after_initial: AtomicI64::new(0), + } + } + + fn deferred_initial(compute_to: Lsn) -> Self { + Self { + initial_logical_size: OnceCell::new(), + initial_part_end: Some(compute_to), + size_added_after_initial: AtomicI64::new(0), + } + } + + fn current_size(&self) -> anyhow::Result { + let size_increment = self.size_added_after_initial.load(AtomicOrdering::Acquire); + match self.initial_logical_size.get() { + Some(initial_size) => { + let absolute_size_increment = u64::try_from( + size_increment + .checked_abs() + .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?, + ).with_context(|| format!("Failed to convert size increment {size_increment} to u64"))?; + + if size_increment < 0 { + initial_size.checked_sub(absolute_size_increment) + } else { + initial_size.checked_add(absolute_size_increment) + }.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) + .map(CurrentLogicalSize::Exact) + } + None => { + let non_negative_size_increment = size_increment.max(0); + u64::try_from(non_negative_size_increment) + .with_context(|| { + format!( + "Failed to convert size increment {non_negative_size_increment} to u64" + ) + }) + .map(CurrentLogicalSize::Approximate) + } + } + } + + fn increment_size(&self, delta: i64) { + self.size_added_after_initial + .fetch_add(delta, AtomicOrdering::SeqCst); + } } pub struct WalReceiverInfo { @@ -491,7 +582,9 @@ impl Timeline { /// the Repository implementation may incorrectly return a value from an ancestor /// branch, for example, or waste a lot of 
cycles chasing the non-existing key. /// - pub fn get(&self, key: Key, lsn: Lsn) -> Result { + pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result { + anyhow::ensure!(lsn.is_valid(), "Invalid LSN"); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -694,6 +787,8 @@ impl Timeline { walredo_mgr: Arc, upload_layers: bool, ) -> Timeline { + let disk_consistent_lsn = metadata.disk_consistent_lsn(); + let mut result = Timeline { conf, tenant_conf, @@ -705,12 +800,12 @@ impl Timeline { // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. last_record_lsn: SeqWait::new(RecordLsn { - last: metadata.disk_consistent_lsn(), + last: disk_consistent_lsn, prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), }), - disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), + disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), - last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), ancestor_timeline: ancestor, @@ -733,8 +828,16 @@ impl Timeline { latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), - current_logical_size: AtomicI64::new(0), - logical_size_initialized: AtomicBool::new(false), + current_logical_size: if disk_consistent_lsn.is_valid() { + // we're creating timeline data with some layer files existing locally, + // need to recalculate timeline's logical size based on data in the layers. + LogicalSize::deferred_initial(disk_consistent_lsn) + } else { + // we're creating timeline data without any layers existing locally, + // initial logical size is 0. 
+ LogicalSize::empty_initial() + }, + initial_size_computation_task: Mutex::new(None), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -835,92 +938,114 @@ impl Timeline { Ok(()) } - /// (Re-)calculate the logical size of the database at the latest LSN. + /// Retrieve current logical size of the timeline. /// - /// This can be a slow operation. - pub fn init_logical_size(&self) -> Result<()> { - if self.logical_size_initialized.load(AtomicOrdering::Acquire) { - return Ok(()); - } + /// The size could be lagging behind the actual number, in case + /// the initial size calculation has not been run (gets triggered on the first size access). + pub fn get_current_logical_size(self: &Arc) -> anyhow::Result { + let current_size = self.current_logical_size.current_size()?; + debug!("Current size: {current_size:?}"); - // Try a fast-path first: - // Copy logical size from ancestor timeline if there has been no changes on this - // branch, and no changes on the ancestor branch since the branch point. - if self.get_ancestor_lsn() == self.get_last_record_lsn() && self.ancestor_timeline.is_some() + let size = current_size.size(); + if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = + (current_size, self.current_logical_size.initial_part_end) { - let ancestor = self.get_ancestor_timeline()?; - let ancestor_logical_size = ancestor.get_current_logical_size(); - // Check LSN after getting logical size to exclude race condition - // when ancestor timeline is concurrently updated. - // - // Logical size 0 means that it was not initialized, so don't believe that. 
- if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn { - self.set_current_logical_size(ancestor_logical_size); - debug!( - "logical size copied from ancestor: {}", - ancestor_logical_size - ); - return Ok(()); - } + self.try_spawn_size_init_task(init_lsn); } - let timer = self.metrics.init_logical_size_histo.start_timer(); - - // Have to calculate it the hard way - let last_lsn = self.get_last_record_lsn(); - let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; - self.set_current_logical_size(logical_size); - debug!("calculated logical size the hard way: {}", logical_size); - - timer.stop_and_record(); - Ok(()) + Ok(size) } - /// Retrieve current logical size of the timeline - /// - /// NOTE: counted incrementally, includes ancestors. - pub fn get_current_logical_size(&self) -> u64 { - let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire); - match u64::try_from(current_logical_size) { - Ok(sz) => sz, + fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { + let timeline_id = self.timeline_id; + + let mut task_guard = match self.initial_size_computation_task.try_lock() { + Ok(guard) => guard, Err(_) => { - error!( - "current_logical_size is out of range: {}", - current_logical_size - ); - 0 + debug!("Skipping timeline logical size init: task lock is taken already"); + return; + } + }; + + if let Some((old_task, task_finish_signal)) = task_guard.take() { + // TODO rust 1.61 would allow to remove `task_finish_signal` entirely and call `old_task.is_finished()` instead + match task_finish_signal.try_recv() { + // task has either signaled successfully that it finished or panicked and dropped the sender part without signalling + Ok(()) | Err(mpsc::TryRecvError::Disconnected) => { + match old_task.join() { + // we're here due to OnceCell::get not returning the value + Ok(Ok(())) => { + error!("Timeline {timeline_id} size init task finished, yet the size was not updated, rescheduling 
the computation") + } + Ok(Err(task_error)) => { + error!("Error during timeline {timeline_id} size init: {task_error:?}") + } + Err(e) => error!("Timeline {timeline_id} size init task panicked: {e:?}"), + } + } + // task had not yet finished: no signal was sent and the sender channel is not dropped + Err(mpsc::TryRecvError::Empty) => { + // let the task finish + *task_guard = Some((old_task, task_finish_signal)); + return; + } } } + + if task_guard.is_none() { + let thread_timeline = Arc::clone(self); + let (finish_sender, finish_receiver) = mpsc::channel(); + + match thread::Builder::new() + .name(format!( + "Timeline {timeline_id} initial logical size calculation" + )) + .spawn(move || { + let _enter = info_span!("initial_logical_size_calculation", timeline = %timeline_id).entered(); + let calculated_size = thread_timeline.calculate_logical_size(init_lsn)?; + match thread_timeline.current_logical_size.initial_logical_size.set(calculated_size) { + Ok(()) => info!("Successfully calculated initial logical size"), + Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), + } + + finish_sender.send(()).ok(); + Ok(()) + }) { + Ok(guard) => *task_guard = Some((guard, finish_receiver)), + Err(e) => error!("Failed to spawn timeline {timeline_id} size init task: {e}"), + } + } + } + + /// Calculate the logical size of the database at the latest LSN. + /// + /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. 
+ fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result { + info!("Calculating logical size for timeline {}", self.timeline_id); + let timer = self.metrics.init_logical_size_histo.start_timer(); + let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?; + debug!("calculated logical size: {logical_size}"); + timer.stop_and_record(); + Ok(logical_size) } /// Update current logical size, adding `delta' to the old value. fn update_current_logical_size(&self, delta: i64) { - let new_size = self - .current_logical_size - .fetch_add(delta, AtomicOrdering::SeqCst); + let logical_size = &self.current_logical_size; + logical_size.increment_size(delta); // Also set the value in the prometheus gauge. Note that // there is a race condition here: if this is is called by two // threads concurrently, the prometheus gauge might be set to // one value while current_logical_size is set to the - // other. Currently, only initialization and the WAL receiver - // updates the logical size, and they don't run concurrently, - // so it cannot happen. And even if it did, it wouldn't be - // very serious, the metrics would just be slightly off until - // the next update. - self.metrics.current_logical_size_gauge.set(new_size); - } - - /// Set current logical size. - fn set_current_logical_size(&self, new_size: u64) { - self.current_logical_size - .store(new_size as i64, AtomicOrdering::SeqCst); - self.logical_size_initialized - .store(true, AtomicOrdering::SeqCst); - - // Also set the value in the prometheus gauge. Same race condition - // here as in `update_current_logical_size`. - self.metrics.current_logical_size_gauge.set(new_size as i64); + // other. 
+ match logical_size.current_size() { + Ok(new_current_size) => self + .metrics + .current_logical_size_gauge + .set(new_current_size.size()), + Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), + } } /// @@ -1446,7 +1571,15 @@ impl Timeline { Ok(new_delta_path) } - pub fn compact(&self) -> Result<()> { + pub fn compact(&self) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + + // Last record Lsn could be zero in case the timeline was just created + if !last_record_lsn.is_valid() { + warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); + return Ok(()); + } + // // High level strategy for compaction / image creation: // diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0f0bb1ed53..24002a36e5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -936,7 +936,7 @@ impl<'a> DatadirModification<'a> { result?; if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64); + writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); self.pending_nblocks = 0; } @@ -948,7 +948,7 @@ impl<'a> DatadirModification<'a> { /// underlying timeline. /// All the modifications in this atomic update are stamped by the specified LSN.
/// - pub fn commit(&mut self) -> Result<()> { + pub fn commit(&mut self) -> anyhow::Result<()> { let writer = self.tline.writer(); let lsn = self.lsn; let pending_nblocks = self.pending_nblocks; @@ -964,7 +964,7 @@ impl<'a> DatadirModification<'a> { writer.finish_write(lsn); if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64); + writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); } Ok(()) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 4a907ac0e1..fec8a80b9b 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -4,7 +4,6 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; use crate::layered_repository::{load_metadata, Repository, Timeline}; -use crate::repository::RepositoryTimeline; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; @@ -378,15 +377,7 @@ pub fn get_local_timeline_with_load( tenant_id: ZTenantId, timeline_id: ZTimelineId, ) -> anyhow::Result> { - let repository = get_repository_for_tenant(tenant_id)?; - match repository.get_timeline(timeline_id) { - Some(RepositoryTimeline::Loaded(loaded_timeline)) => { - loaded_timeline.init_logical_size()?; - Ok(loaded_timeline) - } - _ => load_local_timeline(&repository, timeline_id) - .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}")), - } + get_repository_for_tenant(tenant_id)?.get_timeline_load(timeline_id) } pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { @@ -470,17 +461,6 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any Ok(()) } -fn load_local_timeline( - repo: &Repository, - timeline_id: ZTimelineId, -) -> anyhow::Result> { - let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { - format!("Inmem 
timeline {timeline_id} not found in tenant's repository") - })?; - inmem_timeline.init_logical_size()?; - Ok(inmem_timeline) -} - /// /// Get list of tenants, for the mgmt API /// @@ -489,9 +469,11 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { .iter() .map(|(id, tenant)| { let has_in_progress_downloads = remote_index - .tenant_entry(id) - .map(|entry| entry.has_in_progress_downloads()); + .tenant_entry(id) + .map(|entry| entry.has_in_progress_downloads()); + // TODO this is not correct when we might have remote storage sync disabled: + // we keep `RemoteTimelineIndex` in memory anyway for simplicity and this error message is printed still if has_in_progress_downloads.is_none() { error!("timeline is not found in remote index while it is present in the tenants registry") } @@ -581,7 +563,7 @@ fn attach_downloaded_tenant( // and then load its layers in memory for timeline_id in downloaded_timelines { - let _ = load_local_timeline(repo, timeline_id).with_context(|| { + repo.get_timeline_load(timeline_id).with_context(|| { format!( "Failed to register add local timeline for tenant {}", repo.tenant_id(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index f816198eda..2c29a56ad2 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -315,18 +315,20 @@ pub async fn handle_walreceiver_connection( // Send zenith feedback message. // Regular standby_status_update fields are put into this message. 
- let zenith_status_update = ReplicationFeedback { - current_timeline_size: timeline.get_current_logical_size() as u64, + let status_update = ReplicationFeedback { + current_timeline_size: timeline + .get_current_logical_size() + .context("Status update creation failed to get current logical size")?, ps_writelsn: write_lsn, ps_flushlsn: flush_lsn, ps_applylsn: apply_lsn, ps_replytime: ts, }; - debug!("zenith_status_update {zenith_status_update:?}"); + debug!("zenith_status_update {status_update:?}"); let mut data = BytesMut::new(); - zenith_status_update.serialize(&mut data)?; + status_update.serialize(&mut data)?; physical_stream .as_mut() .zenith_status_update(data.len() as u64, &data) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index bf44dfd949..31b54f827b 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -67,11 +67,21 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 # But all others are broken - for n in range(1, 4): - (tenant, timeline, pg) = tenant_timelines[n] - with pytest.raises(Exception, match="Cannot load local timeline") as err: + + # First timeline would fail instantly due to corrupt metadata file + (_tenant, _timeline, pg) = tenant_timelines[1] + with pytest.raises(Exception, match="Cannot load local timeline") as err: + pg.start() + log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") + + # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline + for n in range(2, 4): + (_tenant, _timeline, pg) = tenant_timelines[n] + with pytest.raises(Exception, match="extracting base backup failed") as err: pg.start() - log.info(f"compute startup failed as expected: {err}") + log.info( + f"compute startup failed lazily for timeline with 
corrupt layers, during basebackup preparation: {err}" + ) def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index aba8567541..6fbc430e80 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -10,6 +10,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserverHttpClient, Postgres, assert_timeline_local, wait_for_last_flush_lsn, @@ -23,11 +24,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() - timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert ( - timeline_details["local"]["current_logical_size"] - == timeline_details["local"]["current_logical_size_non_incremental"] - ) + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) pgmain = env.postgres.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") @@ -61,17 +58,14 @@ def test_timeline_size(neon_simple_env: NeonEnv): def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") client = env.pageserver.http_client() + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert ( - timeline_details["local"]["current_logical_size"] - == timeline_details["local"]["current_logical_size_non_incremental"] - ) - pgmain = env.postgres.create_start("test_timeline_size") - log.info("postgres is running on 'test_timeline_size' branch") + pgmain = 
env.postgres.create_start("test_timeline_size_createdropdb") + log.info("postgres is running on 'test_timeline_size_createdropdb' branch") with closing(pgmain.connect()) as conn: with conn.cursor() as cur: @@ -81,6 +75,10 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): local_details["current_logical_size"] == local_details["current_logical_size_non_incremental"] ) + assert ( + timeline_details["local"]["current_logical_size_non_incremental"] + == local_details["current_logical_size_non_incremental"] + ), "no writes should not change the incremental logical size" cur.execute("CREATE DATABASE foodb") with closing(pgmain.connect(dbname="foodb")) as conn: @@ -140,13 +138,10 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + client = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") - client = env.pageserver.http_client() - res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert ( - res["local"]["current_logical_size"] == res["local"]["current_logical_size_non_incremental"] - ) + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) pgmain = env.postgres.create_start( "test_timeline_size_quota", @@ -211,6 +206,12 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): pg_cluster_size = cur.fetchone() log.info(f"pg_cluster_size = {pg_cluster_size}") + new_res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) + assert ( + new_res["local"]["current_logical_size"] + == new_res["local"]["current_logical_size_non_incremental"] + ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" + def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env = neon_simple_env @@ -425,3 +426,22 @@ def 
assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimel == res["local"]["current_physical_size_non_incremental"] ) assert res["local"]["current_physical_size"] == get_timeline_dir_size(timeline_path) + + +# Timeline logical size initialization is an asynchronous background task that runs once, +# try a few times to ensure it's activated properly +def wait_for_timeline_size_init( + client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId +): + for i in range(10): + timeline_details = assert_timeline_local(client, tenant, timeline) + if ( + timeline_details["local"]["current_logical_size"] + == timeline_details["local"]["current_logical_size_non_incremental"] + ): + return + log.info(f"waiting for current_logical_size of a timeline to be calculated, iteration {i}") + time.sleep(1) + raise Exception( + f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" + ) From 2db20e55871fdbbe38c2ae7a28b0692a67be4838 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 1 Sep 2022 16:22:22 +0300 Subject: [PATCH 007/166] Remove [Un]Loaded timeline code (#2359) --- pageserver/src/http/models.rs | 2 - pageserver/src/http/routes.rs | 59 +-- pageserver/src/layered_repository.rs | 392 +++++++++--------- pageserver/src/layered_repository/timeline.rs | 111 +---- pageserver/src/page_service.rs | 69 +-- pageserver/src/repository.rs | 26 -- pageserver/src/storage_sync.rs | 17 +- pageserver/src/tenant_mgr.rs | 60 +-- pageserver/src/timelines.rs | 17 +- .../src/walreceiver/walreceiver_connection.rs | 2 +- test_runner/regress/test_broken_timeline.py | 6 +- test_runner/regress/test_pageserver_api.py | 5 +- 12 files changed, 290 insertions(+), 476 deletions(-) diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 654f45a95d..7c7d7f7b0c 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -8,7 +8,6 @@ use utils::{ }; // These 
enums are used in the API response fields. -use crate::repository::LocalTimelineState; use crate::tenant_mgr::TenantState; #[serde_as] @@ -133,7 +132,6 @@ pub struct LocalTimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, pub current_physical_size_non_incremental: Option, - pub timeline_state: LocalTimelineState, pub wal_source_connstr: Option, #[serde_as(as = "Option")] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 710014de98..f1033eeb2a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,8 +11,7 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::{metadata::TimelineMetadata, Timeline}; -use crate::repository::{LocalTimelineState, RepositoryTimeline}; +use crate::layered_repository::Timeline; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; @@ -74,7 +73,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline -fn local_timeline_info_from_loaded_timeline( +fn local_timeline_info_from_timeline( timeline: &Arc, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, @@ -105,7 +104,6 @@ fn local_timeline_info_from_loaded_timeline( last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), - timeline_state: LocalTimelineState::Loaded, current_logical_size: Some( timeline .get_current_logical_size() @@ -129,61 +127,20 @@ fn local_timeline_info_from_loaded_timeline( Ok(info) } -fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> LocalTimelineInfo { - LocalTimelineInfo { - ancestor_timeline_id: 
metadata.ancestor_timeline(), - ancestor_lsn: { - match metadata.ancestor_lsn() { - Lsn(0) => None, - lsn @ Lsn(_) => Some(lsn), - } - }, - disk_consistent_lsn: metadata.disk_consistent_lsn(), - last_record_lsn: metadata.disk_consistent_lsn(), - prev_record_lsn: metadata.prev_record_lsn(), - latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(), - timeline_state: LocalTimelineState::Unloaded, - current_logical_size: None, - current_physical_size: None, - current_logical_size_non_incremental: None, - current_physical_size_non_incremental: None, - wal_source_connstr: None, - last_received_msg_lsn: None, - last_received_msg_ts: None, - } -} - -fn local_timeline_info_from_repo_timeline( - repo_timeline: &RepositoryTimeline, - include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, -) -> anyhow::Result { - match repo_timeline { - RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline( - timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ), - RepositoryTimeline::Unloaded { metadata } => { - Ok(local_timeline_info_from_unloaded_timeline(metadata)) - } - } -} - fn list_local_timelines( tenant_id: ZTenantId, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> Result> { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?; + .with_context(|| format!("Failed to get repo for tenant {tenant_id}"))?; let repo_timelines = repo.list_timelines(); let mut local_timeline_info = Vec::with_capacity(repo_timelines.len()); for (timeline_id, repository_timeline) in repo_timelines { local_timeline_info.push(( timeline_id, - local_timeline_info_from_repo_timeline( + local_timeline_info_from_timeline( &repository_timeline, include_non_incremental_logical_size, include_non_incremental_physical_size, @@ -214,12 +171,12 @@ async fn timeline_create_handler(mut request: 
Request) -> Result { + Ok(Some(new_timeline)) => { // Created. Construct a TimelineInfo for it. - let local_info = local_timeline_info_from_loaded_timeline(&new_timeline, false, false)?; + let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; Ok(Some(TimelineInfo { tenant_id, - timeline_id: new_timeline_id, + timeline_id: new_timeline.timeline_id, local: Some(local_info), remote: None, })) @@ -311,7 +268,7 @@ async fn timeline_detail_handler(request: Request) -> Result>, tenant_id: ZTenantId, - timelines: Mutex>, + timelines: Mutex>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration (especially with enforced checkpoint) @@ -126,37 +128,18 @@ pub struct Repository { impl Repository { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. - pub fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { - self.timelines - .lock() - .unwrap() - .get(&timelineid) - .cloned() - .map(RepositoryTimeline::from) - } - - /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. - pub fn get_timeline_load(&self, timeline_id: ZTimelineId) -> Result> { - let mut timelines = self.timelines.lock().unwrap(); - match self.get_timeline_load_internal(timeline_id, &mut timelines)? { - Some(local_loaded_timeline) => Ok(local_loaded_timeline), - None => anyhow::bail!("cannot get local timeline, unknown timeline id: {timeline_id}"), - } + pub fn get_timeline(&self, timeline_id: ZTimelineId) -> Option> { + self.timelines.lock().unwrap().get(&timeline_id).cloned() } /// Lists timelines the repository contains. /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. 
- pub fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { + pub fn list_timelines(&self) -> Vec<(ZTimelineId, Arc)> { self.timelines .lock() .unwrap() .iter() - .map(|(timeline_id, timeline_entry)| { - ( - *timeline_id, - RepositoryTimeline::from(timeline_entry.clone()), - ) - }) + .map(|(timeline_id, timeline_entry)| (*timeline_id, Arc::clone(timeline_entry))) .collect() } @@ -164,16 +147,18 @@ impl Repository { /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. pub fn create_empty_timeline( &self, - timeline_id: ZTimelineId, + new_timeline_id: ZTimelineId, initdb_lsn: Lsn, ) -> Result> { + // XXX: keep the lock to avoid races during timeline creation let mut timelines = self.timelines.lock().unwrap(); - let vacant_timeline_entry = match timelines.entry(timeline_id) { - Entry::Occupied(_) => bail!("Timeline already exists"), - Entry::Vacant(vacant_entry) => vacant_entry, - }; - let timeline_path = self.conf.timeline_path(&timeline_id, &self.tenant_id); + anyhow::ensure!( + timelines.get(&new_timeline_id).is_none(), + "Timeline {new_timeline_id} already exists" + ); + + let timeline_path = self.conf.timeline_path(&new_timeline_id, &self.tenant_id); if timeline_path.exists() { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } @@ -181,31 +166,25 @@ impl Repository { // Create the timeline directory, and write initial metadata to file. 
crashsafe_dir::create_dir_all(timeline_path)?; - let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; - - let timeline = Timeline::new( + let new_metadata = + TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); + save_metadata( self.conf, - Arc::clone(&self.tenant_conf), - metadata, - None, - timeline_id, + new_timeline_id, self.tenant_id, - Arc::clone(&self.walredo_mgr), - self.upload_layers, - ); - timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); + &new_metadata, + true, + )?; - // Insert if not exists - let timeline = Arc::new(timeline); - vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline))); + let new_timeline = + self.initialize_new_timeline(new_timeline_id, new_metadata, &mut timelines)?; + new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), - timeline: Arc::clone(&timeline), - }); + if let hash_map::Entry::Vacant(v) = timelines.entry(new_timeline_id) { + v.insert(Arc::clone(&new_timeline)); + } - Ok(timeline) + Ok(new_timeline) } /// Branch a timeline @@ -214,7 +193,7 @@ impl Repository { src: ZTimelineId, dst: ZTimelineId, start_lsn: Option, - ) -> Result<()> { + ) -> Result> { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. @@ -229,12 +208,12 @@ impl Repository { // Step 2 is to avoid initializing the new branch using data removed by past GC iterations // or in-queue GC iterations. 
+ // XXX: keep the lock to avoid races during timeline creation let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = self - .get_timeline_load_internal(src, &mut timelines) + let src_timeline = timelines + .get(&src) // message about timeline being remote is one .context up in the stack - .context("failed to load timeline for branching")? - .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?; + .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); @@ -252,7 +231,7 @@ impl Repository { .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn + *latest_gc_cutoff_lsn, ))?; { let gc_info = src_timeline.gc_info.read().unwrap(); @@ -293,11 +272,13 @@ impl Repository { ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); - info!("branched timeline {} from {} at {}", dst, src, start_lsn); + let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; + timelines.insert(dst, Arc::clone(&new_timeline)); - Ok(()) + info!("branched timeline {dst} from {src} at {start_lsn}"); + + Ok(new_timeline) } /// perform one garbage collection iteration, removing old data files from disk. @@ -346,14 +327,7 @@ impl Repository { for (timelineid, timeline) in &timelines_to_compact { let _entered = info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered(); - match timeline { - LayeredTimelineEntry::Loaded(timeline) => { - timeline.compact()?; - } - LayeredTimelineEntry::Unloaded { .. 
} => { - debug!("Cannot compact remote timeline {}", timelineid) - } - } + timeline.compact()?; } Ok(()) @@ -371,15 +345,7 @@ impl Repository { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - // filter to get only loaded timelines - .filter_map(|(timelineid, entry)| match entry { - LayeredTimelineEntry::Loaded(timeline) => Some((timelineid, timeline)), - LayeredTimelineEntry::Unloaded { .. } => { - debug!("Skipping checkpoint for unloaded timeline {}", timelineid); - None - } - }) - .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) + .map(|(timelineid, timeline)| (*timelineid, Arc::clone(timeline))) .collect::>(); drop(timelines); @@ -403,7 +369,7 @@ impl Repository { // because detach removes files, which will break child branches let children_exist = timelines .iter() - .any(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)); + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); ensure!( !children_exist, @@ -431,19 +397,36 @@ impl Repository { Ok(()) } - /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. - /// See [`crate::remote_storage`] for more details about the synchronization. - pub fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { - debug!("attach timeline_id: {}", timeline_id,); - match self.timelines.lock().unwrap().entry(timeline_id) { - Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. 
This is a bug."), - Entry::Vacant(entry) => { - // we need to get metadata of a timeline, another option is to pass it along with Downloaded status - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; - // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata }) - }, + pub fn init_attach_timelines( + &self, + timelines: Vec<(ZTimelineId, TimelineMetadata)>, + ) -> anyhow::Result<()> { + let sorted_timelines = if timelines.len() == 1 { + timelines + } else if !timelines.is_empty() { + tree_sort_timelines(timelines)? + } else { + warn!("No timelines to attach received"); + return Ok(()); }; + + let mut timelines_accessor = self.timelines.lock().unwrap(); + for (timeline_id, metadata) in sorted_timelines { + let timeline = self + .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) + .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; + + match timelines_accessor.entry(timeline.timeline_id) { + hash_map::Entry::Occupied(_) => anyhow::bail!( + "Found freshly initialized timeline {} in the tenant map", + timeline.timeline_id + ), + hash_map::Entry::Vacant(v) => { + v.insert(timeline); + } + } + } + Ok(()) } @@ -453,6 +436,49 @@ impl Repository { } } +/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), +/// perform a topological sort, so that the parent of each timeline comes +/// before the children. 
+fn tree_sort_timelines( + timelines: Vec<(ZTimelineId, TimelineMetadata)>, +) -> Result> { + let mut result = Vec::with_capacity(timelines.len()); + + let mut now = Vec::with_capacity(timelines.len()); + // (ancestor, children) + let mut later: HashMap> = + HashMap::with_capacity(timelines.len()); + + for (timeline_id, metadata) in timelines { + if let Some(ancestor_id) = metadata.ancestor_timeline() { + let children = later.entry(ancestor_id).or_default(); + children.push((timeline_id, metadata)); + } else { + now.push((timeline_id, metadata)); + } + } + + while let Some((timeline_id, metadata)) = now.pop() { + result.push((timeline_id, metadata)); + // All children of this can be loaded now + if let Some(mut children) = later.remove(&timeline_id) { + now.append(&mut children); + } + } + + // All timelines should be visited now. Unless there were timelines with missing ancestors. + if !later.is_empty() { + for (missing_id, orphan_ids) in later { + for (orphan_id, _) in orphan_ids { + error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"); + } + } + bail!("could not load tenant because some timelines are missing ancestors"); + } + + Ok(result) +} + /// Private functions impl Repository { pub fn get_checkpoint_distance(&self) -> u64 { @@ -548,87 +574,49 @@ impl Repository { Ok(()) } - // Implementation of the public `get_timeline_load` function. - // Differences from the public: - // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. - fn get_timeline_load_internal( + fn initialize_new_timeline( &self, - timeline_id: ZTimelineId, - timelines: &mut HashMap, - ) -> anyhow::Result>> { - Ok(match timelines.get(&timeline_id) { - Some(entry) => match entry { - LayeredTimelineEntry::Loaded(local_timeline) => { - debug!("timeline {timeline_id} found loaded into memory"); - Some(Arc::clone(local_timeline)) - } - LayeredTimelineEntry::Unloaded { .. 
} => { - debug!( - "timeline {timeline_id} found on a local disk, but not loaded into the memory, loading" - ); - let timeline = self.load_local_timeline(timeline_id, timelines)?; - let was_loaded = timelines.insert( - timeline_id, - LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), - ); - ensure!( - was_loaded.is_none() - || matches!(was_loaded, Some(LayeredTimelineEntry::Unloaded { .. })), - "assertion failure, inserted wrong timeline in an incorrect state" - ); - Some(timeline) - } - }, - None => { - debug!("timeline {timeline_id} not found"); - None - } - }) - } - - fn load_local_timeline( - &self, - timeline_id: ZTimelineId, - timelines: &mut HashMap, + new_timeline_id: ZTimelineId, + new_metadata: TimelineMetadata, + timelines: &mut MutexGuard>>, ) -> anyhow::Result> { - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) - .context("failed to load metadata")?; - let disk_consistent_lsn = metadata.disk_consistent_lsn(); + let ancestor = match new_metadata.ancestor_timeline() { + Some(ancestor_timeline_id) => Some( + timelines + .get(&ancestor_timeline_id) + .cloned() + .with_context(|| { + format!( + "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" + ) + })?, + ), + None => None, + }; - let ancestor = metadata - .ancestor_timeline() - .map(|ancestor_timeline_id| { - trace!("loading {timeline_id}'s ancestor {}", &ancestor_timeline_id); - self.get_timeline_load_internal(ancestor_timeline_id, timelines) - }) - .transpose() - .context("cannot load ancestor timeline")? 
- .flatten() - .map(LayeredTimelineEntry::Loaded); - let _enter = info_span!("loading local timeline").entered(); + let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); - let timeline = Timeline::new( + let new_timeline = Arc::new(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), - metadata, + new_metadata, ancestor, - timeline_id, + new_timeline_id, self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, - ); - timeline - .load_layer_map(disk_consistent_lsn) + )); + + new_timeline + .load_layer_map(new_disk_consistent_lsn) .context("failed to load layermap")?; - let timeline = Arc::new(timeline); - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), - timeline: Arc::clone(&timeline), + id: ZTenantTimelineId::new(self.tenant_id(), new_timeline_id), + timeline: Arc::clone(&new_timeline), }); - Ok(timeline) + Ok(new_timeline) } pub fn new( @@ -775,18 +763,20 @@ impl Repository { // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. 
// Somewhat related: https://github.com/zenithdb/zenith/issues/999 - if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { + if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children if let Some(timelineid) = target_timeline_id { if ancestor_timeline_id == &timelineid { - all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + all_branchpoints.insert(( + *ancestor_timeline_id, + timeline_entry.get_ancestor_lsn(), + )); } } // Collect branchpoints for all timelines else { all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); + .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn())); } } @@ -801,7 +791,9 @@ impl Repository { let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); for timeline_id in timeline_ids { // Timeline is known to be local and loaded. - let timeline = self.get_timeline_load(timeline_id)?; + let timeline = self + .get_timeline(timeline_id) + .with_context(|| format!("Timeline {timeline_id} was not found"))?; // If target_timeline is specified, ignore all other timelines if let Some(target_timelineid) = target_timeline_id { @@ -1031,20 +1023,21 @@ pub mod repo_harness { false, ); // populate repo with locally available timelines + let mut timelines_to_load = Vec::new(); for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines dir") { - let timeline_dir_entry = timeline_dir_entry.unwrap(); + let timeline_dir_entry = timeline_dir_entry?; let timeline_id: ZTimelineId = timeline_dir_entry .path() .file_name() .unwrap() .to_string_lossy() - .parse() - .unwrap(); - - repo.attach_timeline(timeline_id)?; + .parse()?; + let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; + timelines_to_load.push((timeline_id, timeline_metadata)); } + 
repo.init_attach_timelines(timelines_to_load)?; Ok(repo) } @@ -1127,7 +1120,10 @@ mod tests { match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { Ok(_) => panic!("duplicate timeline creation should fail"), - Err(e) => assert_eq!(e.to_string(), "Timeline already exists"), + Err(e) => assert_eq!( + e.to_string(), + format!("Timeline {TIMELINE_ID} already exists") + ), } Ok(()) @@ -1170,7 +1166,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); let new_writer = newtline.writer(); new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; @@ -1318,7 +1314,7 @@ mod tests { repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; @@ -1334,7 +1330,7 @@ mod tests { repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; @@ -1363,17 +1359,8 @@ mod tests { } let repo = harness.load(); - let tline = repo - .get_timeline(TIMELINE_ID) + repo.get_timeline(TIMELINE_ID) .expect("cannot load timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. 
})); - - assert!(repo.get_timeline_load(TIMELINE_ID).is_ok()); - - let tline = repo - .get_timeline(TIMELINE_ID) - .expect("cannot load timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); Ok(()) } @@ -1393,7 +1380,7 @@ mod tests { repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) + .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; @@ -1402,28 +1389,15 @@ mod tests { // check that both of them are initially unloaded let repo = harness.load(); - { - let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - - let tline = repo - .get_timeline(NEW_TIMELINE_ID) - .expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - } - // load only child timeline - let _ = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("cannot load timeline"); // check that both, child and ancestor are loaded - let tline = repo + let _child_tline = repo .get_timeline(NEW_TIMELINE_ID) - .expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + .expect("cannot get child timeline loaded"); - let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + let _ancestor_tline = repo + .get_timeline(TIMELINE_ID) + .expect("cannot get ancestor timeline loaded"); Ok(()) } @@ -1447,7 +1421,9 @@ mod tests { std::fs::write(metadata_path, metadata_bytes)?; let err = harness.try_load().err().expect("should fail"); - assert_eq!(err.to_string(), "failed to load local metadata"); + assert!(err + .to_string() + .starts_with("Failed to parse metadata bytes from path")); let mut found_error_message = false; let mut err_source = err.source(); @@ -1663,7 +1639,9 @@ mod tests { for _ in 0..50 { let new_tline_id = 
ZTimelineId::generate(); repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo.get_timeline_load(new_tline_id)?; + tline = repo + .get_timeline(new_tline_id) + .expect("Should have the branched timeline"); tline_id = new_tline_id; for _ in 0..NUM_KEYS { @@ -1722,7 +1700,9 @@ mod tests { for idx in 0..NUM_TLINES { let new_tline_id = ZTimelineId::generate(); repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo.get_timeline_load(new_tline_id)?; + tline = repo + .get_timeline(new_tline_id) + .expect("Should have the branched timeline"); tline_id = new_tline_id; for _ in 0..NUM_KEYS { @@ -1749,11 +1729,11 @@ mod tests { if lsn.0 == 0 { continue; } - println!("chekcking [{}][{}] at {}", idx, blknum, lsn); + println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, *lsn)?, - TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn)) + TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } } diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index fd719812a3..821995fad1 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -50,7 +50,7 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::repository::{GcResult, RepositoryTimeline}; +use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::thread_mgr; use crate::walreceiver::IS_WAL_RECEIVER; @@ -164,72 +164,6 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -#[derive(Clone)] -pub enum LayeredTimelineEntry { - Loaded(Arc), - Unloaded { - id: ZTimelineId, - metadata: TimelineMetadata, - }, -} - -impl LayeredTimelineEntry { - fn timeline_id(&self) -> ZTimelineId { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id, - LayeredTimelineEntry::Unloaded { id, .. 
} => *id, - } - } - - pub fn ancestor_timeline_id(&self) -> Option { - match self { - LayeredTimelineEntry::Loaded(timeline) => { - timeline.ancestor_timeline.as_ref().map(|t| t.timeline_id()) - } - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_timeline(), - } - } - - pub fn ancestor_lsn(&self) -> Lsn { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline.ancestor_lsn, - LayeredTimelineEntry::Unloaded { metadata, .. } => metadata.ancestor_lsn(), - } - } - - fn ensure_loaded(&self) -> anyhow::Result<&Arc> { - match self { - LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), - LayeredTimelineEntry::Unloaded { .. } => { - anyhow::bail!("timeline is unloaded") - } - } - } - - pub fn layer_removal_guard(&self) -> Result>, anyhow::Error> { - match self { - LayeredTimelineEntry::Loaded(timeline) => timeline - .layer_removal_cs - .try_lock() - .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) - .map(Some), - - LayeredTimelineEntry::Unloaded { .. } => Ok(None), - } - } -} - -impl From for RepositoryTimeline { - fn from(entry: LayeredTimelineEntry) -> Self { - match entry { - LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), - LayeredTimelineEntry::Unloaded { metadata, .. } => { - RepositoryTimeline::Unloaded { metadata } - } - } - } -} - struct TimelineMetrics { pub reconstruct_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, @@ -342,7 +276,7 @@ pub struct Timeline { // Parent timeline that this timeline was branched from, and the LSN // of the branch point. 
- ancestor_timeline: Option, + ancestor_timeline: Option>, ancestor_lsn: Lsn, // Metrics @@ -566,7 +500,7 @@ impl Timeline { pub fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() - .map(LayeredTimelineEntry::timeline_id) + .map(|ancestor| ancestor.timeline_id) } /// Lock and get timeline's GC cuttof @@ -781,7 +715,7 @@ impl Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, metadata: TimelineMetadata, - ancestor: Option, + ancestor: Option>, timeline_id: ZTimelineId, tenant_id: ZTenantId, walredo_mgr: Arc, @@ -938,6 +872,12 @@ impl Timeline { Ok(()) } + pub fn layer_removal_guard(&self) -> Result, anyhow::Error> { + self.layer_removal_cs + .try_lock() + .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + } + /// Retrieve current logical size of the timeline. /// /// The size could be lagging behind the actual number, in case @@ -1204,24 +1144,13 @@ impl Timeline { } fn get_ancestor_timeline(&self) -> Result> { - let ancestor = self - .ancestor_timeline - .as_ref() - .with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })? - .ensure_loaded() - .with_context(|| { - format!( - "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; + let ancestor = self.ancestor_timeline.as_ref().with_context(|| { + format!( + "Ancestor is missing. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) + })?; Ok(Arc::clone(ancestor)) } @@ -1251,7 +1180,9 @@ impl Timeline { layer = Arc::clone(open_layer); } else { // No writeable layer yet. Create one. 
- let start_lsn = layers.next_open_layer_at.unwrap(); + let start_lsn = layers + .next_open_layer_at + .context("No next open layer found")?; trace!( "creating layer for write at {}/{} for record at {}", @@ -1496,7 +1427,7 @@ impl Timeline { let ancestor_timelineid = self .ancestor_timeline .as_ref() - .map(LayeredTimelineEntry::timeline_id); + .map(|ancestor| ancestor.timeline_id); let metadata = TimelineMetadata::new( disk_consistent_lsn, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d59a82d488..7f7fa3c22b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -457,18 +457,18 @@ impl PageServerHandler { fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, ) -> anyhow::Result<()> { - let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered(); + let _enter = + info_span!("pagestream", timeline = %timeline_id, tenant = %tenant_id).entered(); // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association - thread_mgr::associate_with(Some(tenantid), Some(timelineid)); + thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Check that the timeline exists - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; /* switch client to COPYBOTH */ pgb.write_message(&BeMessage::CopyBothResponse)?; @@ -488,8 +488,8 @@ impl PageServerHandler { }; let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_id = tenantid.to_string(); - let timeline_id = timelineid.to_string(); + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); let response = match zenith_fe_msg { PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME @@ -599,7 +599,9 @@ impl 
PageServerHandler { info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered(); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo.get_timeline_load(timeline_id)?; + let timeline = repo + .get_timeline(timeline_id) + .with_context(|| format!("Timeline {timeline_id} was not found"))?; ensure!(timeline.get_last_record_lsn() == start_lsn); // TODO leave clean state on error. For now you can use detach to clean @@ -762,19 +764,18 @@ impl PageServerHandler { fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, lsn: Option, prev_lsn: Option, - tenantid: ZTenantId, + tenant_id: ZTenantId, full_backup: bool, ) -> anyhow::Result<()> { - let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty); + let span = info_span!("basebackup", timeline = %timeline_id, tenant = %tenant_id, lsn = field::Empty); let _enter = span.enter(); info!("starting"); // check that the timeline exists - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { timeline @@ -906,12 +907,11 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for get_last_record_rlsn command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + self.check_permission(Some(tenant_id))?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; let end_of_timeline = timeline.get_last_record_rlsn(); @@ 
-1134,10 +1134,9 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("Invalid compact: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Couldn't load timeline")?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; timeline.compact()?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? @@ -1152,11 +1151,9 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). 
timeline.checkpoint(CheckpointConfig::Forced)?; @@ -1172,10 +1169,9 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); @@ -1201,6 +1197,15 @@ impl postgres_backend::Handler for PageServerHandler { } } +fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result> { + tenant_mgr::get_repository_for_tenant(tenant_id) + .and_then(|repo| { + repo.get_timeline(timeline_id) + .context("No timeline in tenant's repository") + }) + .with_context(|| format!("Could not get timeline {timeline_id} in tenant {tenant_id}")) +} + /// /// A std::io::Write implementation that wraps all data written to it in CopyData /// messages. 
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index e46a39436d..c3b08c93de 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,4 +1,3 @@ -use crate::layered_repository::metadata::TimelineMetadata; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; @@ -6,7 +5,6 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::fmt; use std::ops::{AddAssign, Range}; -use std::sync::Arc; use std::time::Duration; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] @@ -175,30 +173,6 @@ impl Value { } } -/// A timeline, that belongs to the current repository. -pub enum RepositoryTimeline { - /// Timeline, with its files present locally in pageserver's working directory. - /// Loaded into pageserver's memory and ready to be used. - Loaded(Arc), - - /// All the data is available locally, but not loaded into memory, so loading have to be done before actually using the timeline - Unloaded { - // It is ok to keep metadata here, because it is not changed when timeline is unloaded. - // FIXME can s3 sync actually change it? It can change it when timeline is in awaiting download state. - // but we currently do not download something for the timeline once it is local (even if there are new checkpoints) is it correct? - // also it is not that good to keep TimelineMetadata here, because it is layered repo implementation detail - metadata: TimelineMetadata, - }, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum LocalTimelineState { - // timeline is loaded into memory (with layer map and all the bits), - Loaded, - // timeline is on disk locally and ready to be loaded into memory. 
- Unloaded, -} - /// /// Result of performing GC /// diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index a52cde7286..0bdc30a73f 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -903,8 +903,10 @@ fn storage_sync_loop( "Sync loop step completed, {} new tenant state update(s)", updated_tenants.len() ); - let mut sync_status_updates: HashMap> = - HashMap::new(); + let mut timelines_to_attach: HashMap< + ZTenantId, + Vec<(ZTimelineId, TimelineMetadata)>, + > = HashMap::new(); let index_accessor = runtime.block_on(index.read()); for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { @@ -930,13 +932,18 @@ fn storage_sync_loop( // and register them all at once in a repository for download // to be submitted in a single operation to repository // so it can apply them at once to internal timeline map. - sync_status_updates - .insert(tenant_id, tenant_entry.keys().copied().collect()); + timelines_to_attach.insert( + tenant_id, + tenant_entry + .iter() + .map(|(&id, entry)| (id, entry.metadata.clone())) + .collect(), + ); } } drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
- attach_downloaded_tenants(conf, &index, sync_status_updates); + attach_downloaded_tenants(conf, &index, timelines_to_attach); } } ControlFlow::Break(()) => { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index fec8a80b9b..cbf9f2094a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,6 +3,7 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; +use crate::layered_repository::metadata::TimelineMetadata; use crate::layered_repository::{load_metadata, Repository, Timeline}; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; @@ -14,7 +15,7 @@ use anyhow::Context; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fmt; use std::sync::Arc; use tokio::sync::mpsc; @@ -192,7 +193,7 @@ impl std::fmt::Debug for LocalTimelineUpdate { pub fn attach_downloaded_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, - sync_status_updates: HashMap>, + sync_status_updates: HashMap>, ) { if sync_status_updates.is_empty() { debug!("No sync status updates to apply"); @@ -212,11 +213,9 @@ pub fn attach_downloaded_tenants( continue; } }; - match attach_downloaded_tenant(&repo, downloaded_timelines) { - Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), - Err(e) => error!( - "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" - ), + match repo.init_attach_timelines(downloaded_timelines) { + Ok(()) => info!("successfully loaded local timelines for tenant {tenant_id}"), + Err(e) => error!("Failed to load local timelines for tenant {tenant_id}: {e:?}"), } } } @@ -371,15 +370,6 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { - 
get_repository_for_tenant(tenant_id)?.get_timeline_load(timeline_id) -} - pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take locks here, and do not check whether the timeline exists @@ -499,7 +489,7 @@ fn check_broken_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId, -) -> anyhow::Result<()> { +) -> anyhow::Result { let metadata = load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?; @@ -509,7 +499,7 @@ fn check_broken_timeline( anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN."); } - Ok(()) + Ok(metadata) } /// Note: all timelines are attached at once if and only if all of them are locally complete @@ -519,14 +509,14 @@ fn init_local_repository( local_timeline_init_statuses: HashMap, remote_index: &RemoteIndex, ) -> anyhow::Result<(), anyhow::Error> { - let mut timelines_to_attach = HashSet::new(); + let mut timelines_to_attach = Vec::new(); for (timeline_id, init_status) in local_timeline_init_statuses { match init_status { LocalTimelineInitStatus::LocallyComplete => { debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - check_broken_timeline(conf, tenant_id, timeline_id) + let metadata = check_broken_timeline(conf, tenant_id, timeline_id) .context("found broken timeline")?; - timelines_to_attach.insert(timeline_id); + timelines_to_attach.push((timeline_id, metadata)); } LocalTimelineInitStatus::NeedsSync => { debug!( @@ -545,32 +535,8 @@ fn init_local_repository( // Lets fail here loudly to be on the safe side. // XXX: It may be a better api to actually distinguish between repository startup // and processing of newly downloaded timelines. 
- attach_downloaded_tenant(&repo, timelines_to_attach) - .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?; - Ok(()) -} - -fn attach_downloaded_tenant( - repo: &Repository, - downloaded_timelines: HashSet, -) -> anyhow::Result<()> { - // first, register timeline metadata to ensure ancestors will be found later during layer load - for &timeline_id in &downloaded_timelines { - repo.attach_timeline(timeline_id).with_context(|| { - format!("Failed to load timeline {timeline_id} into in-memory repository") - })?; - } - - // and then load its layers in memory - for timeline_id in downloaded_timelines { - repo.get_timeline_load(timeline_id).with_context(|| { - format!( - "Failed to register add local timeline for tenant {}", - repo.tenant_id(), - ) - })?; - } - + repo.init_attach_timelines(timelines_to_attach) + .with_context(|| format!("Failed to init local timelines for tenant {tenant_id}"))?; Ok(()) } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 4f760751db..936699c2ec 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -108,7 +108,7 @@ fn bootstrap_timeline( tenantid: ZTenantId, tli: ZTimelineId, repo: &Repository, -) -> Result<()> { +) -> Result> { let initdb_path = conf .tenant_path(&tenantid) .join(format!("tmp-timeline-{}", tli)); @@ -141,7 +141,7 @@ fn bootstrap_timeline( // Remove temp dir. 
We don't need it anymore fs::remove_dir_all(pgdata_path)?; - Ok(()) + Ok(timeline) } /// @@ -159,7 +159,7 @@ pub(crate) fn create_timeline( new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, -) -> Result)>> { +) -> Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -168,11 +168,11 @@ pub(crate) fn create_timeline( return Ok(None); } - match ancestor_timeline_id { + let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = repo - .get_timeline_load(ancestor_timeline_id) - .context("Cannot branch off the timeline that's not present locally")?; + .get_timeline(ancestor_timeline_id) + .context("Cannot branch off the timeline that's not present in pageserver")?; if let Some(lsn) = ancestor_start_lsn.as_mut() { // Wait for the WAL to arrive and be processed on the parent branch up @@ -201,8 +201,5 @@ pub(crate) fn create_timeline( None => bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?, }; - // load the timeline into memory - let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; - - Ok(Some((new_timeline_id, loaded_timeline))) + Ok(Some(loaded_timeline)) } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 2c29a56ad2..d441bbb4ab 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -132,7 +132,7 @@ pub async fn handle_walreceiver_connection( let (repo, timeline) = tokio::task::spawn_blocking(move || { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id) + let timeline = repo.get_timeline(timeline_id) .with_context(|| { 
format!("local timeline {timeline_id} not found for tenant {tenant_id}") })?; diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 31b54f827b..4aba2494e9 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -68,9 +68,11 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # But all others are broken - # First timeline would fail instantly due to corrupt metadata file + # First timeline would not get loaded into pageserver due to corrupt metadata file (_tenant, _timeline, pg) = tenant_timelines[1] - with pytest.raises(Exception, match="Cannot load local timeline") as err: + with pytest.raises( + Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}" + ) as err: pg.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 8ee38fcf4f..a7b7189824 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -93,10 +93,7 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: ZTenantId): assert ZTenantId(timeline_details["tenant_id"]) == tenant_id assert ZTimelineId(timeline_details["timeline_id"]) == timeline_id - - local_timeline_details = timeline_details.get("local") - assert local_timeline_details is not None - assert local_timeline_details["timeline_state"] == "Loaded" + assert timeline_details.get("local") is not None def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): From 827c3013bde272660d2577883cbc07c770c5a4e6 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 2 Sep 2022 13:20:31 +0300 Subject: [PATCH 008/166] Adjust benchmark code to Ids --- test_runner/fixtures/compare_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 6bca5be335..ceeeffc785 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -112,10 +112,10 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): - self.pscur.execute(f"do_gc {self.env.initial_tenant.hex} {self.timeline} 0") + self.pscur.execute(f"do_gc {self.env.initial_tenant} {self.timeline} 0") def compact(self): - self.pscur.execute(f"compact {self.env.initial_tenant.hex} {self.timeline}") + self.pscur.execute(f"compact {self.env.initial_tenant} {self.timeline}") def report_peak_memory_use(self) -> None: self.zenbenchmark.record( From 8b28adb6a63c493e099f8f9e6a81e1b48b3caa70 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 2 Sep 2022 13:24:00 +0300 Subject: [PATCH 009/166] Merge file name and extension for index part files --- pageserver/src/storage_sync/download.rs | 6 ++---- pageserver/src/storage_sync/index.rs | 3 +-- pageserver/src/storage_sync/upload.rs | 12 ++---------- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index ebc9a252b7..e11a863dcc 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -141,8 +141,7 @@ async fn download_index_part( sync_id: ZTenantTimelineId, ) -> Result { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME) - .with_extension(IndexPart::FILE_EXTENSION); + .with_file_name(IndexPart::FILE_NAME); let mut index_part_download = storage .download_storage_object(None, &index_part_path) .await?; @@ -663,8 +662,7 @@ mod tests { let local_index_part_path = metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME) - .with_extension(IndexPart::FILE_EXTENSION); + .with_file_name(IndexPart::FILE_NAME); let 
storage_path = local_storage.remote_object_id(&local_index_part_path)?; fs::create_dir_all(storage_path.parent().unwrap()).await?; fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 7e644da412..b17bb40da4 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -278,8 +278,7 @@ pub struct IndexPart { } impl IndexPart { - pub const FILE_NAME: &'static str = "index_part"; - pub const FILE_EXTENSION: &'static str = "json"; + pub const FILE_NAME: &'static str = "index_part.json"; #[cfg(test)] pub fn new( diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 7ef775e690..38bad73d3b 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -42,8 +42,7 @@ pub(super) async fn upload_index_part( let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME) - .with_extension(IndexPart::FILE_EXTENSION); + .with_file_name(IndexPart::FILE_NAME); storage .upload_storage_object(index_part_bytes, index_part_size, &index_part_path) .await @@ -442,17 +441,10 @@ mod tests { let index_part_path = storage_files.first().unwrap(); assert_eq!( - index_part_path.file_stem().and_then(|name| name.to_str()), + index_part_path.file_name().and_then(|name| name.to_str()), Some(IndexPart::FILE_NAME), "Remote index part should have the correct name" ); - assert_eq!( - index_part_path - .extension() - .and_then(|extension| extension.to_str()), - Some(IndexPart::FILE_EXTENSION), - "Remote index part should have the correct extension" - ); let remote_index_part: IndexPart = serde_json::from_slice(&fs::read(&index_part_path).await?)?; From 73f926c39a606ab119e080d542bc103d5c402c56 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov 
Date: Fri, 2 Sep 2022 13:13:42 +0300 Subject: [PATCH 010/166] Return safekeeper remote storage logging during downloads --- safekeeper/src/wal_backup.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index a15ba02863..5c6991c196 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -437,16 +437,23 @@ pub async fn read_object( file_path: PathBuf, offset: u64, ) -> anyhow::Result>> { - let download = REMOTE_STORAGE + let storage = REMOTE_STORAGE .get() .context("Failed to get remote storage")? .as_ref() - .context("No remote storage configured")? + .context("No remote storage configured")?; + + info!( + "segment download about to start for local path {} at offset {}", + file_path.display(), + offset + ); + let download = storage .download_storage_object(Some((offset, None)), &file_path) .await .with_context(|| { format!( - "Failed to open WAL segment download stream for local storage path {}", + "Failed to open WAL segment download stream for local path {}", file_path.display() ) })?; From a463749f59c3fe020065c4cacc91df8fa11ffb99 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 2 Sep 2022 14:34:40 +0200 Subject: [PATCH 011/166] Slim down compute-node images (#2346) Slim down compute-node images: - Optimize compute_ctl build for size, not performance & debug-ability - Don't run unused stages. Saves time in not building the PLV8 extension. - Do not include static libraries in clean postgres - Do the installation and finishing touches in the final layer in one job This allows docker (and kaniko) to only register one change to the files, removing potentially duplicate changed files. 
- The runtime library for libreadline-dev is libreadline8, changing the dependency saves 45 MB - libprotobuf-c-dev -> libprotobuf-c1, saving 100 kB - libossp-uuid-dev -> libossp-uuid16, saving 150 kB - gdal-bin + libgdal-dev -> libgeos-c1v5 + libgdal28 + libproj19, saving 747MB - binutils @ testing -> libc6 @ testing, saving 32 MB --- .github/workflows/build_and_test.yml | 2 +- Cargo.toml | 53 ++++++++++++++++++ Dockerfile.compute-node | 84 ++++++++++++++++++++++------ 3 files changed, 122 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a3314738fa..6fae36c6e4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -459,7 +459,7 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID promote-images: runs-on: dev diff --git a/Cargo.toml b/Cargo.toml index f0934853f0..a19f65a14f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,59 @@ members = [ # Besides, debug info should not affect the performance. 
debug = true +[profile.release-line-debug] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +[profile.release-line-debug-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +lto = true + +[profile.release-line-debug-size] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "s" +[profile.release-line-debug-zize] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "z" +[profile.release-line-debug-size-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "s" +lto = true +[profile.release-line-debug-zize-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "z" +lto = true + +[profile.release-no-debug] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only + +[profile.release-no-debug-size] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "s" +[profile.release-no-debug-zize] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "z" + +[profile.release-no-debug-size-lto] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "s" +lto = true + +[profile.release-no-debug-zize-lto] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "z" +lto = true + + # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
[patch.crates-io] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 2e031b17da..3298032030 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -3,12 +3,18 @@ ARG TAG=pinned # ARG POSTGIS_VERSION=3.3.0 # ARG PLV8_VERSION=3.1.4 +# +# Layer "build-deps" +# FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ libcurl4-openssl-dev libossp-uuid-dev +# +# Layer "pg-build" # Build Postgres from the neon postgres repository. +# FROM build-deps AS pg-build COPY vendor/postgres postgres RUN cd postgres && \ @@ -19,9 +25,14 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install -# Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes. -# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some -# investigation to check that it works, and also keeps working in the future. So for now, we compile our own binaries. +# +# Layer "postgis-build" +# Build PostGIS from the upstream PostGIS mirror. +# +# PostGIS compiles against neon postgres sources without changes. Perhaps we +# could even use the upstream binaries, compiled against vanilla Postgres, but +# it would require some investigation to check that it works, and also keeps +# working in the future. So for now, we compile our own binaries. 
FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -42,7 +53,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +# +# Layer "plv8-build" # Build plv8 +# FROM build-deps AS plv8-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -64,7 +78,10 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +# +# Layer "neon-pg-ext-build" # compile neon extensions +# FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ @@ -79,9 +96,32 @@ FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
-RUN cd compute_tools && cargo build --locked --release +RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +# +# Clean up postgres folder before inclusion +# +FROM neon-pg-ext-build AS postgres-cleanup-layer +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql + +# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) +RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp + +# Remove headers that we won't need anymore - we've completed installation of all extensions +RUN rm -r /usr/local/pgsql/include + +# Remove now-useless PGXS src infrastructure +RUN rm -r /usr/local/pgsql/lib/pgxs/src + +# Remove static postgresql libraries - all compilation is finished, so we +# can now remove these files - they must be included in other binaries by now +# if they were to be used by other libraries. +RUN rm /usr/local/pgsql/lib/lib*.a + +# +# Final layer # Put it all together into the final image +# FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -93,22 +133,34 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ # TODO: Check if we can make the extension setup more modular versus a linear build # currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# -COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +# Install: +# libreadline8 for psql +# libossp-uuid16 for extension ossp-uuid +# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS +# GLIBC 2.34 for plv8. 
+# Debian bullseye provides GLIBC 2.31, so we install the library from testing +# +# Lastly, link compute_ctl into zenith_ctl while we're at it, +# so that we don't need to put this in another layer. RUN apt update && \ - apt install -y libreadline-dev libossp-uuid-dev gdal-bin libgdal-dev libprotobuf-c-dev && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Debian bullseye provides GLIBC 2.31 when 2.34 is necessary as we compiled plv8 with that version -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + apt install --no-install-recommends -y \ + libreadline8 \ + libossp-uuid16 \ + libgeos-c1v5 \ + libgdal28 \ + libproj19 \ + libprotobuf-c1 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + echo "Installing GLIBC 2.34" && \ + echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ apt update && \ - apt install -y --no-install-recommends -t testing binutils && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# "temporary" symlink for old control-plane -RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + apt install -y --no-install-recommends -t testing libc6 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] From a4e79db348b4de57c55ff991f93e89831baec2c2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Sep 2022 15:46:46 +0300 Subject: [PATCH 012/166] Move `neon_local` to `control_plane`. Seems a bit silly to have a separate crate just for the executable. It relies on the control plane for everything it does, and it's the only user of the control plane. 
--- Cargo.lock | 20 +++---------------- Cargo.toml | 1 - control_plane/Cargo.toml | 3 +++ .../src/bin/neon_local.rs | 7 +++++++ neon_local/Cargo.toml | 19 ------------------ 5 files changed, 13 insertions(+), 37 deletions(-) rename neon_local/src/main.rs => control_plane/src/bin/neon_local.rs (99%) delete mode 100644 neon_local/Cargo.toml diff --git a/Cargo.lock b/Cargo.lock index 2e300e46f5..563a998601 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -495,6 +495,9 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", + "clap 3.2.16", + "comfy-table", + "git-version", "nix", "once_cell", "pageserver", @@ -1648,23 +1651,6 @@ dependencies = [ "tempfile", ] -[[package]] -name = "neon_local" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap 3.2.16", - "comfy-table", - "control_plane", - "git-version", - "pageserver", - "postgres", - "safekeeper", - "serde_json", - "utils", - "workspace_hack", -] - [[package]] name = "nix" version = "0.23.1" diff --git a/Cargo.toml b/Cargo.toml index a19f65a14f..1936b261f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,6 @@ members = [ "proxy", "safekeeper", "workspace_hack", - "neon_local", "libs/*", ] diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 425eb332c3..8a79a6e566 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition = "2021" [dependencies] +clap = "3.0" +comfy-table = "5.0.1" +git-version = "0.3.5" tar = "0.4.38" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } diff --git a/neon_local/src/main.rs b/control_plane/src/bin/neon_local.rs similarity index 99% rename from neon_local/src/main.rs rename to control_plane/src/bin/neon_local.rs index 78a465539a..828d6a2e5a 100644 --- a/neon_local/src/main.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1,3 +1,10 @@ +//! +//! 
`neon_local` is an executable that can be used to create a local +//! Neon environment, for testing purposes. The local environment is +//! quite different from the cloud environment with Kubernetes, but it +//! easier to work with locally. The python tests in `test_runner` +//! rely on `neon_local` to set up the environment for each test. +//! use anyhow::{anyhow, bail, Context, Result}; use clap::{App, AppSettings, Arg, ArgMatches}; use control_plane::compute::ComputeControlPlane; diff --git a/neon_local/Cargo.toml b/neon_local/Cargo.toml deleted file mode 100644 index 2fc38cfe02..0000000000 --- a/neon_local/Cargo.toml +++ /dev/null @@ -1,19 +0,0 @@ -[package] -name = "neon_local" -version = "0.1.0" -edition = "2021" - -[dependencies] -clap = "3.0" -anyhow = "1.0" -serde_json = "1" -comfy-table = "5.0.1" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -git-version = "0.3.5" - -# FIXME: 'pageserver' is needed for BranchInfo. 
Refactor -pageserver = { path = "../pageserver" } -control_plane = { path = "../control_plane" } -safekeeper = { path = "../safekeeper" } -utils = { path = "../libs/utils" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } From 71c965b0e162a5f1431b417b794e64fb5a39832f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 3 Sep 2022 08:48:28 +0300 Subject: [PATCH 013/166] Move backpressure throttling implementation to neon extension and function for monitoring throttling time (#2380) * Move backpressure throttling implementation to neon extension and function for monitoring throttling time * Add missing includes * Bump postgres version --- pgxn/neon/neon--1.0.sql | 7 +++++++ pgxn/neon/neon.c | 7 +++++++ pgxn/neon/walproposer.c | 46 +++++++++++++++++++++++++++++++++++++++-- pgxn/neon/walproposer.h | 3 +++ vendor/postgres | 2 +- 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql index 34f1ba78d4..58b98a5923 100644 --- a/pgxn/neon/neon--1.0.sql +++ b/pgxn/neon/neon--1.0.sql @@ -15,3 +15,10 @@ RETURNS record AS 'MODULE_PATHNAME', 'backpressure_lsns' LANGUAGE C STRICT PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_throttling_time() +RETURNS bigint +AS 'MODULE_PATHNAME', 'backpressure_throttling_time' +LANGUAGE C STRICT +PARALLEL UNSAFE; + diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 595a126f04..62d2624e56 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -40,6 +40,7 @@ void _PG_init(void) PG_FUNCTION_INFO_V1(pg_cluster_size); PG_FUNCTION_INFO_V1(backpressure_lsns); +PG_FUNCTION_INFO_V1(backpressure_throttling_time); Datum pg_cluster_size(PG_FUNCTION_ARGS) @@ -80,3 +81,9 @@ backpressure_lsns(PG_FUNCTION_ARGS) PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); } + +Datum +backpressure_throttling_time(PG_FUNCTION_ARGS) +{ + PG_RETURN_UINT64(BackpressureThrottlingTime()); +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 
9625325c0a..3baa4802b0 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -36,6 +36,7 @@ #include #include #include +#include "access/xact.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" #include "storage/latch.h" @@ -58,6 +59,7 @@ #include "utils/builtins.h" #include "utils/guc.h" #include "utils/memutils.h" +#include "utils/ps_status.h" #include "utils/timestamp.h" #include "neon.h" @@ -159,8 +161,9 @@ static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); +static bool backpressure_throttling_impl(void); - +static process_interrupts_callback_t PrevProcessInterruptsCallback; static shmem_startup_hook_type prev_shmem_startup_hook_type; @@ -175,9 +178,11 @@ void pg_init_walproposer(void) nwp_prepare_shmem(); delay_backend_us = &backpressure_lag_impl; + PrevProcessInterruptsCallback = ProcessInterruptsCallback; + ProcessInterruptsCallback = backpressure_throttling_impl; WalProposerRegister(); - + WalProposerInit = &WalProposerInitImpl; WalProposerStart = &WalProposerStartImpl; } @@ -1963,6 +1968,7 @@ WalproposerShmemInit(void) { memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); } LWLockRelease(AddinShmemInitLock); @@ -2401,3 +2407,39 @@ backpressure_lag_impl(void) } return 0; } + +#define BACK_PRESSURE_DELAY 10000L // 0.01 sec + +static bool backpressure_throttling_impl(void) +{ + int64 lag; + TimestampTz start, stop; + bool retry = PrevProcessInterruptsCallback + ? PrevProcessInterruptsCallback() + : false; + + // Don't throttle read only transactions and wal sender. 
+ if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + return retry; + + // Calculate replicas lag + lag = backpressure_lag_impl(); + if (lag == 0) + return retry; + + // Suspend writers until replicas catch up + set_ps_display("backpressure throttling"); + + elog(DEBUG2, "backpressure throttling: lag %lu", lag); + start = GetCurrentTimestamp(); + pg_usleep(BACK_PRESSURE_DELAY); + stop = GetCurrentTimestamp(); + pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start); + return true; +} + +uint64 +BackpressureThrottlingTime(void) +{ + return pg_atomic_read_u64(&walprop_shared->backpressureThrottlingTime); +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index b684d5264f..75167163f3 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -287,6 +287,7 @@ typedef struct WalproposerShmemState slock_t mutex; ReplicationFeedback feedback; term_t mineLastElectedTerm; + pg_atomic_uint64 backpressureThrottlingTime; } WalproposerShmemState; /* @@ -537,4 +538,6 @@ typedef struct WalProposerFunctionsType */ extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; +extern uint64 BackpressureThrottlingTime(void); + #endif /* __NEON_WALPROPOSER_H__ */ diff --git a/vendor/postgres b/vendor/postgres index 22d9ead36b..bbd2ab1544 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 22d9ead36beeab6b6a99c64f9b0b1576927ad91b +Subproject commit bbd2ab15443935a6871b39f90ed669160d9987ad From eef74754082a0e8e0f8486d9022be98c5f682ce8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 3 Sep 2022 17:06:19 +0300 Subject: [PATCH 014/166] Add tests for measuring effect of lsn caching (#2384) * Add tests for measuring effect of lsn caching * Fix formatting of test_latency.py * Fix test_lsn_mapping test --- test_runner/performance/test_latency.py | 29 +++++++++++++++++++++++++ test_runner/regress/test_lsn_mapping.py | 6 +++-- 2 files changed, 33 insertions(+), 2
deletions(-) create mode 100644 test_runner/performance/test_latency.py diff --git a/test_runner/performance/test_latency.py b/test_runner/performance/test_latency.py new file mode 100644 index 0000000000..9aa618650d --- /dev/null +++ b/test_runner/performance/test_latency.py @@ -0,0 +1,29 @@ +import threading + +import pytest +from fixtures.compare_fixtures import PgCompare +from fixtures.neon_fixtures import Postgres +from performance.test_perf_pgbench import get_scales_matrix +from performance.test_wal_backpressure import record_read_latency + + +def start_write_workload(pg: Postgres, scale: int = 10): + with pg.connect().cursor() as cur: + cur.execute(f"create table big as select generate_series(1,{scale*100_000})") + + +# Measure latency of reads on one table, while lots of writes are happening on another table. +# The fine-grained tracking of last-written LSNs helps to keep the latency low. Without it, the reads would +# often need to wait for the WAL records of the unrelated writes to be processed by the pageserver. 
+@pytest.mark.parametrize("scale", get_scales_matrix(1)) +def test_measure_read_latency_heavy_write_workload(neon_with_baseline: PgCompare, scale: int): + env = neon_with_baseline + pg = env.pg + + with pg.connect().cursor() as cur: + cur.execute(f"create table small as select generate_series(1,{scale*100_000})") + + write_thread = threading.Thread(target=start_write_workload, args=(pg, scale * 100)) + write_thread.start() + + record_read_latency(env, lambda: write_thread.is_alive(), "SELECT count(*) from small") diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index f6ca7000dd..9d1efec2c1 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,7 +1,7 @@ from datetime import timedelta from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.utils import query_scalar @@ -34,9 +34,11 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Execute one more transaction with synchronous_commit enabled, to flush # all the previous transactions - cur.execute("SET synchronous_commit=on") cur.execute("INSERT INTO foo VALUES (-1)") + # Wait until WAL is received by pageserver + wait_for_last_flush_lsn(env, pgmain, env.initial_tenant, new_timeline_id) + # Check edge cases: timestamp in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = query_scalar( From 2b6c49b2ea07fb95ef3eb571d4081b396bac08f2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 3 Sep 2022 14:06:00 +0300 Subject: [PATCH 015/166] Fix negative usize parsing --- pageserver/src/layered_repository/timeline.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 821995fad1..b050ef4030 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ 
b/pageserver/src/layered_repository/timeline.rs @@ -432,14 +432,12 @@ impl LogicalSize { .map(CurrentLogicalSize::Exact) } None => { - let non_negative_size_increment = size_increment.max(0); - u64::try_from(non_negative_size_increment) - .with_context(|| { - format!( - "Failed to convert size increment {non_negative_size_increment} to u64" - ) - }) - .map(CurrentLogicalSize::Approximate) + let non_negative_size_increment = if size_increment < 0 { + 0 + } else { + u64::try_from(size_increment).expect("not negative, cannot fail") + }; + Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) } } } From 846d71b948a3a471a13ca9e5d9b813d048b14138 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 4 Sep 2022 22:25:32 +0300 Subject: [PATCH 016/166] Add test for last written lsn cache (#1949) * Fix python style * Fix import of test_backpressure in test_latency * Apply changes to moved neon extension * Apply changes to moved neon extension * Merge with main * Update pgxn/neon/pagestore_smgr.c Co-authored-by: Heikki Linnakangas * Bump postgres version Co-authored-by: Heikki Linnakangas --- pgxn/neon/pagestore_smgr.c | 29 +++++++++++++++++++---------- vendor/postgres | 2 +- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3e1b74dba7..21d6dfec52 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -558,7 +558,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it.
*/ - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum); } @@ -603,7 +603,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest) +zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr lsn; @@ -630,9 +630,9 @@ zenith_get_request_lsn(bool *latest) * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenPageLSN(); + lsn = GetLastWrittenLSN(rnode, forknum, blkno); Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); @@ -716,7 +716,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); { ZenithExistsRequest request = { .req.tag = T_ZenithExistsRequest, @@ -791,7 +791,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation. Currently, we don't call SetLastWrittenLSN() when a new * relation created, so if we didn't remember the size in the relsize * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record hass been received by the page server. 
@@ -904,6 +904,8 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif + + SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); } /* @@ -1079,7 +1081,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -1284,7 +1286,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); { ZenithNblocksRequest request = { .req.tag = T_ZenithNblocksRequest, @@ -1343,8 +1345,9 @@ zenith_dbsize(Oid dbNode) int64 db_size; XLogRecPtr request_lsn; bool latest; + RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { ZenithDbSizeRequest request = { .req.tag = T_ZenithDbSizeRequest, @@ -1431,7 +1434,13 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - SetLastWrittenPageLSN(lsn); + /* + * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, + * or update LSN for "dummy" metadata block. Second approach seems more efficient. If the relation is extended + * again later, the extension will update the last-written LSN for the extended pages, so there's no harm in + * leaving behind obsolete entries for the truncated chunks. 
+ */ + SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/vendor/postgres b/vendor/postgres index bbd2ab1544..a4963aa6df 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit bbd2ab15443935a6871b39f90ed669160d9987ad +Subproject commit a4963aa6df6a44bdee17ef387c01bcf46f6017fd From 7a3e8bb7fb965bea6af0ae4c7393838e21f33d5e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 5 Sep 2022 11:02:13 +0300 Subject: [PATCH 017/166] Make tracing span names consistent for mgmt API handlers. --- pageserver/src/http/routes.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f1033eeb2a..52997da5a0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -162,7 +162,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered(); + let _enter = info_span!("tenant_detach", tenant = %tenant_id).entered(); tenant_mgr::detach_tenant(conf, tenant_id) }) .await From aeb1cf9c36d3a895828fd8376ea474c5b635c025 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 5 Sep 2022 11:09:32 +0300 Subject: [PATCH 018/166] Fix misc typos and grammar in comments. 
--- pageserver/src/tenant_mgr.rs | 8 ++++---- test_runner/fixtures/neon_fixtures.py | 4 ++-- test_runner/fixtures/types.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index cbf9f2094a..7c82745142 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -435,10 +435,10 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any tenants_state::write_tenants().remove(&tenant_id); // If removal fails there will be no way to successfully retry detach, - // because tenant no longer exists in in memory map. And it needs to be removed from it - // before we remove files because it contains references to repository - // which references ephemeral files which are deleted on drop. So if we keep these references - // code will attempt to remove files which no longer exist. This can be fixed by having shutdown + // because the tenant no longer exists in the in-memory map. And it needs to be removed from it + // before we remove files, because it contains references to repository + // which references ephemeral files which are deleted on drop. So if we keep these references, + // we will attempt to remove files which no longer exist. This can be fixed by having shutdown // mechanism for repository that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9ad9c0cd2f..8ffb2eb829 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2522,8 +2522,8 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post def wait_until(number_of_iterations: int, interval: float, func): """ - Wait until 'func' returns successfully, without exception. 
Returns the last return value - from the the function. + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. """ last_exception = None for i in range(number_of_iterations): diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index d5cb200080..bdf675a785 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -50,7 +50,7 @@ class ZId: """ Datatype for a Neon tenant and timeline IDs. Internally it's a 16-byte array, and the string representation is in hex. This corresponds to the ZId / ZTenantId / - ZTimelineIds in in the Rust code. + ZTimelineIds in the Rust code. """ def __init__(self, x: str): From ad057124beecafe188e3402cdd7e7581dc6ec096 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 5 Sep 2022 13:12:02 +0300 Subject: [PATCH 019/166] Update relation size cache only when latest LSN is requested (#2310) * Update relation size cache only when latest LSN is requested * Fix tests * Add a test case for timetravel query after pageserver restart. This test is currently failing, the queries return incorrect results. I don't know why, needs to be investigated. FAILED test_runner/batch_others/test_readonly_node.py::test_timetravel - assert 85 == 100000 If you remove the pageserver restart from the test, it passes. 
* yapf3 test_readonly_node.py * Add comment about cache correction in case of setting incorrect latest flag * Fix formatting for test_readonly_node.py * Remove unused imports * Fix mypy warning for test_readonly_node.py * Fix formatting of test_readonly_node.py * Bump postgres version Co-authored-by: Heikki Linnakangas --- pageserver/src/basebackup.rs | 6 +- pageserver/src/page_service.rs | 9 +- pageserver/src/pgdatadir_mapping.rs | 58 ++++++----- pageserver/src/walingest.rs | 112 ++++++++++++---------- test_runner/regress/test_readonly_node.py | 51 +++++++++- 5 files changed, 154 insertions(+), 82 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 864c5b8ac8..48b5f1a695 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -186,7 +186,7 @@ where } fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn)?; + let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?; // Function that adds relation segment data to archive let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { @@ -207,7 +207,9 @@ where for (seg, blocks) in chunks.into_iter().enumerate() { let mut segment_data: Vec = vec![]; for blknum in blocks { - let img = self.timeline.get_rel_page_at_lsn(tag, blknum, self.lsn)?; + let img = self + .timeline + .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?; segment_data.extend_from_slice(&img[..]); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7f7fa3c22b..358618f20c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -696,7 +696,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let exists = timeline.get_rel_exists(req.rel, lsn)?; + let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; 
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, @@ -712,7 +712,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let n_blocks = timeline.get_rel_size(req.rel, lsn)?; + let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -728,7 +728,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; + let total_blocks = + timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -754,7 +755,7 @@ impl PageServerHandler { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ - let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?; + let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 24002a36e5..7bba64179c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -83,10 +83,16 @@ impl Timeline { //------------------------------------------------------------------------------ /// Look up given page version. 
- pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + pub fn get_rel_page_at_lsn( + &self, + tag: RelTag, + blknum: BlockNumber, + lsn: Lsn, + latest: bool, + ) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); - let nblocks = self.get_rel_size(tag, lsn)?; + let nblocks = self.get_rel_size(tag, lsn, latest)?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -100,20 +106,20 @@ impl Timeline { } // Get size of a database in blocks - pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result { let mut total_blocks = 0; let rels = self.list_rels(spcnode, dbnode, lsn)?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn)?; + let n_blocks = self.get_rel_size(rel, lsn, latest)?; total_blocks += n_blocks as usize; } Ok(total_blocks) } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { @@ -122,7 +128,7 @@ impl Timeline { if (tag.forknum == pg_constants::FSM_FORKNUM || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn)? + && !self.get_rel_exists(tag, lsn, latest)? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -135,13 +141,21 @@ impl Timeline { let mut buf = self.get(key, lsn)?; let nblocks = buf.get_u32_le(); - // Update relation size cache - self.update_cached_rel_size(tag, lsn, nblocks); + if latest { + // Update relation size cache only if "latest" flag is set. + // This flag is set by compute when it is working with most recent version of relation. + // Typically master compute node always set latest=true. 
+ // Please notice, that even if compute node "by mistake" specifies old LSN but set + // latest=true, then it can not cause cache corruption, because with latest=true + // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be + // associated with most recent value of LSN. + self.update_cached_rel_size(tag, lsn, nblocks); + } Ok(nblocks) } /// Does relation exist? - pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); // first try to lookup relation in cache @@ -660,7 +674,7 @@ impl<'a> DatadirModification<'a> { pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn)?; + let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; // Remove entry from dbdir let buf = self.get(DBDIR_KEY)?; @@ -733,7 +747,7 @@ impl<'a> DatadirModification<'a> { pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn)? { + if self.tline.get_rel_exists(rel, last_lsn, true)? { let size_key = rel_size_to_key(rel); // Fetch the old size first let old_size = self.get(size_key)?.get_u32_le(); @@ -1499,19 +1513,19 @@ mod tests { writer.finish()?; // Test read before rel creation. Should error out. - assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err()); + assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10), false).is_err()); // Read block beyond end of relation at different points in time. // These reads should fall into different delta, image, and in-memory layers. 
- assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE); - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE); - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE); - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE); - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE); - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE); - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE); - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE); - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, ZERO_PAGE); // Test on an in-memory layer with no preceding layer let mut writer = tline.begin_record(Lsn(0x70)); @@ -1523,7 +1537,7 @@ mod tests { )?; writer.finish()?; - assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70), false)?, ZERO_PAGE); Ok(()) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index c0965e7a22..57592a46d3 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -504,7 +504,7 @@
impl<'a> WalIngest<'a> { assert_eq!(src_rel.spcnode, src_tablespace_id); assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = modification.tline.get_rel_size(src_rel, req_lsn)?; + let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?; let dst_rel = RelTag { spcnode: tablespace_id, dbnode: db_id, @@ -521,7 +521,7 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn)?; + .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; } @@ -680,7 +680,7 @@ impl<'a> WalIngest<'a> { relnode: xnode.relnode, }; let last_lsn = self.timeline.get_last_record_lsn(); - if modification.tline.get_rel_exists(rel, last_lsn)? { + if modification.tline.get_rel_exists(rel, last_lsn, true)? { self.put_rel_drop(modification, rel)?; } } @@ -924,10 +924,10 @@ impl<'a> WalIngest<'a> { } fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn)? { + let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? { 0 } else { - self.timeline.get_rel_size(rel, lsn)? + self.timeline.get_rel_size(rel, lsn, true)? }; Ok(nblocks) } @@ -943,12 +943,12 @@ impl<'a> WalIngest<'a> { // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? { + let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? { // create it with 0 size initially, the logic below will extend it modification.put_rel_creation(rel, 0)?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn)? + self.timeline.get_rel_size(rel, last_lsn, true)? }; if new_nblocks > old_nblocks { @@ -1082,43 +1082,43 @@ mod tests { assert_current_logical_size(&*tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10)).is_err()); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); + assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, 3); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); // Check page contents at each LSN assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, TEST_IMG("foo blk 2 at 5") ); @@ -1129,20 +1129,20 @@ mod tests { assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, 
Lsn(0x60))?, 2); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, 3); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, TEST_IMG("foo blk 2 at 5") ); @@ -1150,19 +1150,19 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x68)); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?, ZERO_PAGE ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?, TEST_IMG("foo blk 1") ); @@ -1170,15 +1170,15 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x80)); walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501); for blk in 2..1500 { assert_eq!( - 
tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?, + tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?, ZERO_PAGE ); } assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?, + tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?, TEST_IMG("foo blk 1500") ); @@ -1198,8 +1198,8 @@ mod tests { m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); @@ -1207,10 +1207,10 @@ mod tests { m.commit()?; // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false); // FIXME: should fail - //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none()); + //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none()); // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); @@ -1218,8 +1218,8 @@ mod tests { m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40))?, 1); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1); Ok(()) } @@ -1243,18 +1243,18 @@ mod tests { m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10)).is_err()); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); + assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, relsize); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn)?, + tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?, TEST_IMG(&data) ); } @@ -1266,24 +1266,24 @@ mod tests { m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?, + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?, TEST_IMG(&data) ); } // should still see all blocks with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, relsize); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize); for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?, + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?, TEST_IMG(&data) ); } @@ -1298,14 +1298,14 @@ mod tests { } m.commit()?; - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize); + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true); + 
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?, + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?, TEST_IMG(&data) ); } @@ -1332,14 +1332,17 @@ mod tests { assert_current_logical_size(&*tline, Lsn(lsn)); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE + 1); + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + RELSEG_SIZE + 1 + ); // Truncate one block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block @@ -1347,7 +1350,10 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE - 1); + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + RELSEG_SIZE - 1 + ); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time @@ -1359,7 +1365,7 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, size as BlockNumber ); diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index fac9d97a42..3be64e077f 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,6 +1,6 @@ import pytest from fixtures.log_helper import log -from 
fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -101,3 +101,52 @@ def test_readonly_node(neon_simple_env: NeonEnv): node_name="test_readonly_node_preinitdb", lsn=Lsn("0/42"), ) + + +# Similar test, but with more data, and we force checkpoints +def test_timetravel(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_timetravel", "empty") + pg = env.postgres.create_start("test_timetravel") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + lsns = [] + + with pg.cursor() as cur: + cur.execute( + """ + CREATE TABLE testtab(id serial primary key, iteration int, data text); + INSERT INTO testtab (iteration, data) SELECT 0, 'data' FROM generate_series(1, 100000); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((0, current_lsn)) + + for i in range(1, 5): + with pg.cursor() as cur: + cur.execute(f"UPDATE testtab SET iteration = {i}") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((i, current_lsn)) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to force a new layer file + env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + + ##### Restart pageserver + env.postgres.stop_all() + env.pageserver.stop() + env.pageserver.start() + + for (i, lsn) in lsns: + pg_old = env.postgres.create_start( + branch_name="test_timetravel", node_name=f"test_old_lsn_{i}", lsn=lsn + ) + with pg_old.cursor() as cur: + assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000 + assert query_scalar(cur, f"select count(*) from testtab where iteration<>{i}") == 0 From 
772078eb5ccdacba2e01a2a09d54d3813c0c3512 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 1 Sep 2022 01:28:18 +0300 Subject: [PATCH 020/166] Reword proxy SNI error message Be more strict with project id/name difference and explain how to get project id out of the domain name. --- proxy/src/auth.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 4e78c576e2..d09470d15e 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -46,9 +46,9 @@ pub enum AuthErrorImpl { MalformedPassword(&'static str), #[error( - "Project name is not specified. \ + "Project ID is not specified. \ Either please upgrade the postgres client library (libpq) for SNI support \ - or pass the project name as a parameter: '&options=project%3D'. \ + or pass the project ID (first part of the domain name) as a parameter: '?options=project%3D'. \ See more at https://neon.tech/sni" )] MissingProjectName, From ee0071e90d2084148e0ff931883ac1e3c8c8dc41 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Sep 2022 14:30:37 +0100 Subject: [PATCH 021/166] Fix nightly benchmark reports (#2392) --- .github/actions/run-python-test-set/action.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 2344fba13c..01ddced313 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -80,14 +80,14 @@ runs: env: NEON_BIN: /tmp/neon/bin TEST_OUTPUT: /tmp/test_output - # this variable will be embedded in perf test report - # and is needed to distinguish different environments - PLATFORM: github-actions-selfhosted BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} shell: bash -euxo pipefail {0} run: | + # PLATFORM will be embedded in the perf test report + # and 
it is needed to distinguish different environments + export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} if [ "${BUILD_TYPE}" = "remote" ]; then @@ -155,7 +155,7 @@ runs: if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO=local + export REPORT_TO="$PLATFORM" scripts/generate_and_push_perf_report.sh fi fi From 05e263d0d36bb2e49aa83b4488361b78ec3016e9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 5 Sep 2022 18:30:54 +0300 Subject: [PATCH 022/166] Prepare pg 15 support (build system and submodules) (#2337) * Add submodule postgres-15 * Support pg_15 in pgxn/neon * Renamed zenith -> neon in Makefile * fix name of codestyle check * Refactor build system to prepare for building multiple Postgres versions. Rename "vendor/postgres" to "vendor/postgres-v14" Change Postgres build and install directory paths to be version-specific: - tmp_install/build -> pg_install/build/14 - tmp_install/* -> pg_install/14/* And Makefile targets: - "make postgres" -> "make postgres-v14" - "make postgres-headers" -> "make postgres-v14-headers" - etc. Add Makefile aliases: - "make postgres" to build "postgres-v14" and in future, "postgres-v15" - similarly for "make postgres-headers" Fix POSTGRES_DISTRIB_DIR path in pytest scripts * Make postgres version a variable in codestyle workflow * Support vendor/postgres-v15 in codestyle check workflow * Support postgres-v15 building in Makefile * fix pg version in Dockerfile.compute-node * fix kaniko path * Build neon extensions in version-specific directories * fix obsolete mentions of vendor/postgres * use vendor/postgres-v14 in Dockerfile.compute-node.legacy * Use PG_VERSION_NUM to gate dependencies in inmem_smgr.c * Use versioned ECR repositories and image names for compute-node. 
The image name format is compute-node-vXX, where XX is postgres major version number. For now only v14 is supported. Old format unversioned name (compute-node) is left, because cloud repo depends on it. * update vendor/postgres submodule url (zenith->neondatabase rename) * Fix postgres path in python tests after rebase * fix path in regress test * Use separate dockerfiles to build compute-node: Dockerfile.compute-node-v15 should be identical to Dockerfile.compute-node-v14 except for the version number. This is a hack, because Kaniko doesn't support build ARGs properly * bump vendor/postgres-v14 and vendor/postgres-v15 * Don't use Kaniko cache for v14 and v15 compute-node images * Build compute-node images for different versions in different jobs Co-authored-by: Heikki Linnakangas --- .dockerignore | 3 +- .../actions/run-python-test-set/action.yml | 2 +- .github/workflows/build_and_test.yml | 67 +++++-- .github/workflows/codestyle.yml | 18 +- .github/workflows/pg_clients.yml | 2 +- .gitignore | 2 +- .gitmodules | 10 +- Dockerfile | 19 +- ...ompute-node => Dockerfile.compute-node-v14 | 7 +- Dockerfile.compute-node-v15 | 172 +++++++++++++++++ Dockerfile.compute-node.legacy | 3 +- Makefile | 174 ++++++++++++------ NOTICE | 4 +- README.md | 5 +- control_plane/src/local_env.rs | 4 +- docs/settings.md | 2 +- docs/sourcetree.md | 6 +- libs/postgres_ffi/build.rs | 9 +- libs/postgres_ffi/src/xlog_utils.rs | 2 +- .../wal_craft/src/bin/wal_craft.rs | 2 +- pageserver/src/config.rs | 2 +- pgxn/neon/inmem_smgr.c | 4 + pgxn/neon/pagestore_smgr.c | 9 + pgxn/neon/relsize_cache.c | 28 +++ pgxn/neon/walproposer.c | 58 +++++- pgxn/neon/walproposer_utils.c | 97 ++++++++-- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_pg_regress.py | 10 +- vendor/{postgres => postgres-v14} | 0 vendor/postgres-v15 | 1 + 30 files changed, 593 insertions(+), 133 deletions(-) rename Dockerfile.compute-node => Dockerfile.compute-node-v14 (98%) create mode 100644 
Dockerfile.compute-node-v15 rename vendor/{postgres => postgres-v14} (100%) create mode 160000 vendor/postgres-v15 diff --git a/.dockerignore b/.dockerignore index 2c78951923..9f8a22d598 100644 --- a/.dockerignore +++ b/.dockerignore @@ -13,6 +13,7 @@ !pgxn/ !proxy/ !safekeeper/ -!vendor/postgres/ +!vendor/postgres-v14/ +!vendor/postgres-v15/ !workspace_hack/ !neon_local/ diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 01ddced313..f04f5d11b8 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -88,7 +88,7 @@ runs: # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} - export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install/v14} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6fae36c6e4..6eddbc3335 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -78,8 +78,8 @@ jobs: fetch-depth: 1 - name: Set pg revision for caching - id: pg_ver - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) + id: pg_v14_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) shell: bash -euxo pipefail {0} # Set some environment variables used by all the steps. 
@@ -124,12 +124,12 @@ jobs: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - - name: Cache postgres build + - name: Cache postgres v14 build id: cache_pg uses: actions/cache@v3 with: - path: tmp_install/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' @@ -192,7 +192,7 @@ jobs: shell: bash -euxo pipefail {0} - name: Install postgres binaries - run: cp -a tmp_install /tmp/neon/pg_install + run: cp -a pg_install /tmp/neon/pg_install shell: bash -euxo pipefail {0} - name: Upload Neon artifact @@ -447,7 +447,6 @@ jobs: compute-node-image: runs-on: dev container: gcr.io/kaniko-project/executor:v1.9.0-debug - steps: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko @@ -458,18 +457,57 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build compute node with extensions - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . 
--dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + # compute-node uses postgres 14, which is default now + # cloud repo depends on this image name, thus duplicating it + # remove compute-node when cloud repo is updated + - name: Kaniko build compute node with extensions v14 (compatibility) + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + + compute-node-image-v14: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build compute node with extensions v14 + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID + + + compute-node-image-v15: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build compute node with extensions v15 + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID promote-images: runs-on: dev - needs: [ neon-image, compute-node-image, compute-tools-image ] + needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-tools-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: fail-fast: false matrix: - name: [ neon, compute-node, compute-tools ] + # compute-node uses postgres 14, which is default now + # cloud repo depends on this image name, thus duplicating it + # remove compute-node when cloud repo is updated + name: [ neon, compute-node, compute-node-v14, compute-tools ] steps: - name: Promote image to latest @@ -501,6 +539,9 @@ jobs: - name: Pull compute node image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node + - name: Pull compute node v14 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14 + - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust @@ -519,6 +560,9 @@ jobs: - name: Push compute node image to Docker Hub run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}} + - name: Push compute node v14 image to Docker Hub + run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned @@ -530,6 +574,7 @@ jobs: crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] diff --git 
a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index b64ea8a01f..a5e31d49ee 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -27,8 +27,10 @@ jobs: # Rust toolchains (e.g. nightly or 1.37.0), add them here. rust_toolchain: [1.58] os: [ubuntu-latest, macos-latest] + # To support several Postgres versions, add them here. + postgres_version: [v14, v15] timeout-minutes: 60 - name: run regression test suite + name: check codestyle rust and postgres runs-on: ${{ matrix.os }} steps: @@ -61,14 +63,14 @@ jobs: - name: Set pg revision for caching id: pg_ver - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-${{matrix.postgres_version}}) - - name: Cache postgres build + - name: Cache postgres ${{matrix.postgres_version}} build id: cache_pg uses: actions/cache@v3 with: path: | - tmp_install/ + pg_install/${{matrix.postgres_version}} key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} - name: Set extra env for macOS @@ -90,10 +92,10 @@ jobs: if: failure() continue-on-error: true run: | - echo '' && echo '=== config.log ===' && echo '' - cat tmp_install/build/config.log - echo '' && echo '=== configure.log ===' && echo '' - cat tmp_install/build/configure.log + echo '' && echo '=== Postgres ${{matrix.postgres_version}} config.log ===' && echo '' + cat pg_install/build/${{matrix.postgres_version}}/config.log + echo '' && echo '=== Postgres ${{matrix.postgres_version}} configure.log ===' && echo '' + cat pg_install/build/${{matrix.postgres_version}}/configure.log - name: Cache cargo deps id: cache_cargo diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 95052619cd..bf14865db2 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -52,7 +52,7 @@ jobs: REMOTE_ENV: 1 BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - POSTGRES_DISTRIB_DIR: 
/tmp/neon/pg_install + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 shell: bash -euxo pipefail {0} run: | # Test framework expects we have psql binary; diff --git a/.gitignore b/.gitignore index ed718c8c79..618ff2c5b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ +/pg_install /target /tmp_check -/tmp_install /tmp_check_cli __pycache__/ test_output/ diff --git a/.gitmodules b/.gitmodules index 8975c6e2fa..23765194c1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,8 @@ -[submodule "vendor/postgres"] - path = vendor/postgres - url = https://github.com/zenithdb/postgres +[submodule "vendor/postgres-v14"] + path = vendor/postgres-v14 + url = https://github.com/neondatabase/postgres.git branch = main +[submodule "vendor/postgres-v15"] + path = vendor/postgres-v15 + url = https://github.com/neondatabase/postgres.git + branch = REL_15_STABLE_neon diff --git a/Dockerfile b/Dockerfile index aa31e227da..d379c05051 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,20 +5,24 @@ ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com ARG IMAGE=rust ARG TAG=pinned +# ARGs don't get replaced in RUN commands in Kaniko +# so use hardcoded value below +# ARG PG_VERSION=v14 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot -COPY --chown=nonroot vendor/postgres vendor/postgres +ARG PG_VERSION=v14 +COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile ENV BUILD_TYPE release RUN set -e \ - && mold -run make -j $(nproc) -s neon-pg-ext \ - && rm -rf tmp_install/build \ - && tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz . + && mold -run make -j $(nproc) -s neon-pg-ext-v14 \ + && rm -rf pg_install/v14/build \ + && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . 
# Build zenith binaries FROM $REPOSITORY/$IMAGE:$TAG AS build @@ -35,7 +39,8 @@ ARG CACHEPOT_BUCKET=neon-github-dev #ARG AWS_ACCESS_KEY_ID #ARG AWS_SECRET_ACCESS_KEY -COPY --from=pg-build /home/nonroot/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +ARG PG_VERSION=v14 +COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY . . # Show build caching stats to check if it was used in the end. @@ -64,7 +69,9 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin -COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ +# v14 is default for now +ARG PG_VERSION=v14 +COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node-v14 similarity index 98% rename from Dockerfile.compute-node rename to Dockerfile.compute-node-v14 index 3298032030..8ddf752191 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node-v14 @@ -2,6 +2,7 @@ ARG TAG=pinned # apparently, ARGs don't get replaced in RUN commands in kaniko # ARG POSTGIS_VERSION=3.3.0 # ARG PLV8_VERSION=3.1.4 +# ARG PG_VERSION=v14 # # Layer "build-deps" @@ -16,7 +17,7 @@ RUN apt update && \ # Build Postgres from the neon postgres repository. 
# FROM build-deps AS pg-build -COPY vendor/postgres postgres +COPY vendor/postgres-v14 postgres RUN cd postgres && \ ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ @@ -28,8 +29,8 @@ RUN cd postgres && \ # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. -# -# PostGIS compiles against neon postgres sources without changes. Perhaps we +# +# PostGIS compiles against neon postgres sources without changes. Perhaps we # could even use the upstream binaries, compiled against vanilla Postgres, but # it would require some investigation to check that it works, and also keeps # working in the future. So for now, we compile our own binaries. diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 new file mode 100644 index 0000000000..f949ef7680 --- /dev/null +++ b/Dockerfile.compute-node-v15 @@ -0,0 +1,172 @@ +# +# This file is identical to the Dockerfile.compute-node-v14 file +# except for the version of Postgres that is built. +# + +ARG TAG=pinned +# apparently, ARGs don't get replaced in RUN commands in kaniko +# ARG POSTGIS_VERSION=3.3.0 +# ARG PLV8_VERSION=3.1.4 +# ARG PG_VERSION=v15 + +# +# Layer "build-deps" +# +FROM debian:bullseye-slim AS build-deps +RUN apt update && \ + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ + libcurl4-openssl-dev libossp-uuid-dev + +# +# Layer "pg-build" +# Build Postgres from the neon postgres repository. 
+# +FROM build-deps AS pg-build +COPY vendor/postgres-v15 postgres +RUN cd postgres && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install + +# +# Layer "postgis-build" +# Build PostGIS from the upstream PostGIS mirror. +# +# PostGIS compiles against neon postgres sources without changes. Perhaps we +# could even use the upstream binaries, compiled against vanilla Postgres, but +# it would require some investigation to check that it works, and also keeps +# working in the future. So for now, we compile our own binaries. +FROM build-deps AS postgis-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ + tar xvzf postgis-3.3.0.tar.gz && \ + cd postgis-3.3.0 && \ + ./autogen.sh && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + ./configure && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + cd extensions/postgis && \ + make clean && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control + +# +# Layer "plv8-build" +# Build plv8 +# +FROM build-deps AS plv8-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y git curl wget 
make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev + +# https://github.com/plv8/plv8/issues/475 +# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update && \ + apt install -y --no-install-recommends -t testing binutils + +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ + tar xvzf v3.1.4.tar.gz && \ + cd plv8-3.1.4 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + rm -rf /plv8-* && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control + +# +# Layer "neon-pg-ext-build" +# compile neon extensions +# +FROM build-deps AS neon-pg-ext-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY pgxn/ pgxn/ + +RUN make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon \ + -s install + +# Compile and run the Neon-specific `compute_ctl` binary +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +USER nonroot +# Copy entire project to get Cargo.* files with proper dependencies for the whole project +COPY --chown=nonroot . . 
+RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto + +# +# Clean up postgres folder before inclusion +# +FROM neon-pg-ext-build AS postgres-cleanup-layer +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql + +# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) +RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp + +# Remove headers that we won't need anymore - we've completed installation of all extensions +RUN rm -r /usr/local/pgsql/include + +# Remove now-useless PGXS src infrastructure +RUN rm -r /usr/local/pgsql/lib/pgxs/src + +# Remove static postgresql libraries - all compilation is finished, so we +# can now remove these files - they must be included in other binaries by now +# if they were to be used by other libraries. +RUN rm /usr/local/pgsql/lib/lib*.a + +# +# Final layer +# Put it all together into the final image +# +FROM debian:bullseye-slim +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +# TODO: Check if we can make the extension setup more modular versus a linear build +# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl + +# Install: +# libreadline8 for psql +# libossp-uuid16 for extension ossp-uuid +# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS +# GLIBC 2.34 for plv8. 
+# Debian bullseye provides GLIBC 2.31, so we install the library from testing +# +# Lastly, link compute_ctl into zenith_ctl while we're at it, +# so that we don't need to put this in another layer. +RUN apt update && \ + apt install --no-install-recommends -y \ + libreadline8 \ + libossp-uuid16 \ + libgeos-c1v5 \ + libgdal28 \ + libproj19 \ + libprotobuf-c1 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + echo "Installing GLIBC 2.34" && \ + echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update && \ + apt install -y --no-install-recommends -t testing libc6 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + +USER postgres +ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/Dockerfile.compute-node.legacy b/Dockerfile.compute-node.legacy index ba34e2486f..7689167156 100644 --- a/Dockerfile.compute-node.legacy +++ b/Dockerfile.compute-node.legacy @@ -37,7 +37,8 @@ RUN adduser postgres RUN mkdir /pg && chown postgres:postgres /pg # Copy source files -COPY ./vendor/postgres /pg/ +# version 14 is default for now +COPY ./vendor/postgres-v14 /pg/ COPY ./pgxn /pg/ # Build and install Postgres locally diff --git a/Makefile b/Makefile index 9d7e1497e5..0b2b097ebc 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -# Where to install Postgres, default is ./tmp_install, maybe useful for package managers -POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install - -# Seccomp BPF is only available for Linux -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Linux) - SECCOMP = --with-libseccomp -else - SECCOMP = -endif +# Where to install Postgres, default is ./pg_install, maybe useful for package managers +POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ # # We differentiate between release / debug build 
types using the BUILD_TYPE @@ -28,6 +20,13 @@ else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +# Seccomp BPF is only available for Linux +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + PG_CONFIGURE_OPTS += --with-libseccomp +endif + + # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable UNAME_S := $(shell uname -s) @@ -48,75 +47,136 @@ CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # -# Top level Makefile to build Zenith and PostgreSQL +# Top level Makefile to build Neon and PostgreSQL # .PHONY: all -all: zenith postgres neon-pg-ext +all: neon postgres neon-pg-ext -### Zenith Rust bits +### Neon Rust bits # # The 'postgres_ffi' depends on the Postgres headers. -.PHONY: zenith -zenith: postgres-headers - +@echo "Compiling Zenith" +.PHONY: neon +neon: postgres-v14-headers + +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts -$(POSTGRES_INSTALL_DIR)/build/config.status: - +@echo "Configuring postgres build" - mkdir -p $(POSTGRES_INSTALL_DIR)/build - (cd $(POSTGRES_INSTALL_DIR)/build && \ - $(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \ +# The rules are duplicated for Postgres v14 and 15. We may want to refactor +# to avoid the duplication in the future, but it's tolerable for now. 
+# +$(POSTGRES_INSTALL_DIR)/build/v14/config.status: + +@echo "Configuring Postgres v14 build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14 + (cd $(POSTGRES_INSTALL_DIR)/build/v14 && \ + $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \ $(PG_CONFIGURE_OPTS) \ - $(SECCOMP) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log) + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log) -# nicer alias for running 'configure' -.PHONY: postgres-configure -postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status +$(POSTGRES_INSTALL_DIR)/build/v15/config.status: + +@echo "Configuring Postgres v15 build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15 + (cd $(POSTGRES_INSTALL_DIR)/build/v15 && \ + $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \ + $(PG_CONFIGURE_OPTS) \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log) -# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include -.PHONY: postgres-headers -postgres-headers: postgres-configure - +@echo "Installing PostgreSQL headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install +# nicer alias to run 'configure' +.PHONY: postgres-v14-configure +postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status -# Compile and install PostgreSQL and contrib/neon -.PHONY: postgres -postgres: postgres-configure \ - postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` - +@echo "Compiling PostgreSQL" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install - +@echo "Compiling libpq" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq install - +@echo "Compiling pg_buffercache" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install - +@echo "Compiling pageinspect" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install +.PHONY: postgres-v15-configure +postgres-v15-configure: 
$(POSTGRES_INSTALL_DIR)/build/v15/config.status -.PHONY: postgres-clean -postgres-clean: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq clean +# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include +.PHONY: postgres-v14-headers +postgres-v14-headers: postgres-v14-configure + +@echo "Installing PostgreSQL v14 headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install -neon-pg-ext: postgres - +@echo "Compiling neon" - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \ - -C $(ROOT_PROJECT_DIR)/pgxn/neon install - +@echo "Compiling neon_test_utils" - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \ - -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils install +.PHONY: postgres-v15-headers +postgres-v15-headers: postgres-v15-configure + +@echo "Installing PostgreSQL v15 headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install + +# Compile and install PostgreSQL +.PHONY: postgres-v14 +postgres-v14: postgres-v14-configure \ + postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install + +@echo "Compiling libpq v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install + +@echo "Compiling pg_buffercache v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install + +@echo "Compiling pageinspect v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install + +.PHONY: postgres-v15 +postgres-v15: postgres-v15-configure \ + postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL v15" + $(MAKE) -C 
$(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install + +@echo "Compiling libpq v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install + +@echo "Compiling pg_buffercache v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install + +@echo "Compiling pageinspect v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install + +# shorthand to build all Postgres versions +postgres: postgres-v14 postgres-v15 + +.PHONY: postgres-v14-clean +postgres-v14-clean: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean + +.PHONY: postgres-v15-clean +postgres-v15-clean: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean + +neon-pg-ext-v14: postgres-v14 + +@echo "Compiling neon v14" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) + +@echo "Compiling neon_test_utils" v14 + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) + +neon-pg-ext-v15: postgres-v15 + +@echo "Compiling neon v15" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile 
install) + +@echo "Compiling neon_test_utils" v15 + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) .PHONY: neon-pg-ext-clean $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean +neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15 +postgres-headers: postgres-v14-headers postgres-v15-headers +postgres-clean: postgres-v14-clean postgres-v15-clean + # This doesn't remove the effects of 'configure'. .PHONY: clean clean: - cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean + cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean + cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean cd pgxn/neon && $(MAKE) clean cd pgxn/neon_test_utils && $(MAKE) clean diff --git a/NOTICE b/NOTICE index 47cc4e798f..4fbec9763b 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ Neon Copyright 2022 Neon Inc. -The PostgreSQL submodule in vendor/postgres is licensed under the -PostgreSQL license. See vendor/postgres/COPYRIGHT. +The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the +PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT. diff --git a/README.md b/README.md index f557b19987..57d0a144cb 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Pageserver consists of: - WAL receiver - service that receives WAL from WAL service and stores it in the repository. - Page service - service that communicates with compute nodes and responds with pages from the repository. 
- WAL redo - service that builds pages from base images and WAL records on Page service request + ## Running local installation @@ -101,7 +102,7 @@ make -j`sysctl -n hw.logicalcpu` ``` #### Dependency installation notes -To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. +To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory. @@ -208,7 +209,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne ```sh git clone --recursive https://github.com/neondatabase/neon.git -make # builds also postgres and installs it to ./tmp_install +make # builds also postgres and installs it to ./pg_install ./scripts/pytest ``` diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 75e552f6cc..c4a61dbd7b 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -289,13 +289,13 @@ impl LocalEnv { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install". + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". 
if env.pg_distrib_dir == Path::new("") { if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { env.pg_distrib_dir = postgres_bin.into(); } else { let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("tmp_install") + env.pg_distrib_dir = cwd.join("pg_install/v14") } } diff --git a/docs/settings.md b/docs/settings.md index 5a0e976b47..30db495dbe 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -157,7 +157,7 @@ for other files and for sockets for incoming connections. A directory with Postgres installation to use during pageserver activities. Inside that dir, a `bin/postgres` binary should be present. -The default distrib dir is `./tmp_install/`. +The default distrib dir is `./pg_install/`. #### workdir (-D) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 88f4b0e559..f3bc9230e2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -40,15 +40,15 @@ and create new databases and accounts (control plane API in our case). Integration tests, written in Python using the `pytest` framework. -`/vendor/postgres`: +`/vendor/postgres-v14`: PostgreSQL source tree, with the modifications needed for Neon. -`/vendor/postgres/contrib/neon`: +`/pgxn/neon`: PostgreSQL extension that implements storage manager API and network communications with remote page server. -`/vendor/postgres/contrib/neon_test_utils`: +`/pgxn/neon_test_utils`: PostgreSQL extension that contains functions needed for testing and debugging. 
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 69b2711c22..19507f0557 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -47,14 +47,17 @@ fn main() { println!("cargo:rerun-if-changed=bindgen_deps.h"); // Finding the location of C headers for the Postgres server: - // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` - // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/tmp_install/include/postgresql/server` + // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` + // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/v14/include/postgresql/server` let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { postgres_install_dir.into() } else { - PathBuf::from("tmp_install") + PathBuf::from("pg_install") }; + // Currently, we only expect to find PostgreSQL v14 sources, in "pg_install/v14". In the + // future, we will run this for all supported PostgreSQL versions. 
+ pg_install_dir.push("v14"); if pg_install_dir.is_relative() { let cwd = env::current_dir().unwrap(); diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index e7838c3f2c..0d9aaa4708 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -449,7 +449,7 @@ mod tests { .join("..") .join(".."); let cfg = Conf { - pg_distrib_dir: top_path.join("tmp_install"), + pg_distrib_dir: top_path.join("pg_install/v14"), datadir: top_path.join(format!("test_output/{}", test_name)), }; if cfg.datadir.exists() { diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 938f8f421b..2a607db6dc 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,7 +37,7 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)") + .help("Directory with Postgres distribution (bin and lib directories, e.g. 
pg_install/v14)") .default_value("/usr/local") ) ) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c1c4169e14..fb70ea327d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -205,7 +205,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join("tmp_install")), + .join("pg_install/v14")), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c index 7840292b08..13fd4d50b6 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon/inmem_smgr.c @@ -29,6 +29,10 @@ #include "storage/relfilenode.h" #include "storage/smgr.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#endif + /* Size of the in-memory smgr */ #define MAX_PAGES 64 diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 21d6dfec52..e3f083fd43 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -64,6 +64,11 @@ #include "catalog/pg_tablespace_d.h" #include "postmaster/autovacuum.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#include "access/xlogrecovery.h" +#endif + /* * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API * calls to md.c, and *also* do the calls to the Page Server. On every @@ -645,7 +650,11 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc * _bt_blwritepage logs the full page without flushing WAL before * smgrextend (files are fsynced before build ends). 
*/ +#if PG_VERSION_NUM >= 150000 + flushlsn = GetFlushRecPtr(NULL); +#else flushlsn = GetFlushRecPtr(); +#endif if (lsn > flushlsn) { elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 8dfcffe1d1..31021f3e41 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -24,6 +24,9 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#if PG_VERSION_NUM >= 150000 +#include "miscadmin.h" +#endif typedef struct { @@ -41,6 +44,10 @@ static HTAB *relsize_hash; static LWLockId relsize_lock; static int relsize_hash_size; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void relsize_shmem_request(void); +#endif /* * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, @@ -158,10 +165,31 @@ relsize_hash_init(void) if (relsize_hash_size > 0) { +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = relsize_shmem_request; +#else RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); RequestNamedLWLockTranche("neon_relsize", 1); +#endif prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = zenith_smgr_shmem_startup; } } + +#if PG_VERSION_NUM >= 150000 +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in zenith_smgr_shmem_startup(). 
+ */ +static void +relsize_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestNamedLWLockTranche("neon_relsize", 1); +} +#endif diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 3baa4802b0..a769a5216b 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -39,6 +39,10 @@ #include "access/xact.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" +#include "access/xloginsert.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif #include "storage/latch.h" #include "miscadmin.h" #include "pgstat.h" @@ -165,7 +169,10 @@ static bool backpressure_throttling_impl(void); static process_interrupts_callback_t PrevProcessInterruptsCallback; static shmem_startup_hook_type prev_shmem_startup_hook_type; - +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void walproposer_shmem_request(void); +#endif void pg_init_walproposer(void) @@ -221,19 +228,38 @@ static void nwp_register_gucs(void) GUC_UNIT_MS, NULL, NULL, NULL ); - + } /* shmem handling */ static void nwp_prepare_shmem(void) { +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = walproposer_shmem_request; +#else RequestAddinShmemSpace(WalproposerShmemSize()); - +#endif prev_shmem_startup_hook_type = shmem_startup_hook; shmem_startup_hook = nwp_shmem_startup_hook; } +#if PG_VERSION_NUM >= 150000 +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in nwp_shmem_startup_hook(). 
+ */ +static void +walproposer_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(WalproposerShmemSize()); +} +#endif + static void nwp_shmem_startup_hook(void) { if (prev_shmem_startup_hook_type) @@ -248,6 +274,10 @@ static void nwp_shmem_startup_hook(void) void WalProposerMain(Datum main_arg) { +#if PG_VERSION_NUM >= 150000 + TimeLineID tli; +#endif + /* Establish signal handlers. */ pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); @@ -255,9 +285,14 @@ WalProposerMain(Datum main_arg) BackgroundWorkerUnblockSignals(); +#if PG_VERSION_NUM >= 150000 + // FIXME pass proper tli to WalProposerInit ? + GetXLogReplayRecPtr(&tli); + WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier()); +#else GetXLogReplayRecPtr(&ThisTimeLineID); - WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); +#endif last_reconnect_attempt = GetCurrentTimestamp(); @@ -468,7 +503,12 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); +#if PG_VERSION_NUM >= 150000 +// FIXME don't use hardcoded timeline id + greetRequest.timeline = 1; +#else greetRequest.timeline = ThisTimeLineID; +#endif greetRequest.walSegSize = wal_segment_size; InitEventSet(); @@ -1702,7 +1742,12 @@ SendAppendRequests(Safekeeper *sk) &sk->outbuf.data[sk->outbuf.len], req->beginLsn, req->endLsn - req->beginLsn, + #if PG_VERSION_NUM >= 150000 + // FIXME don't use hardcoded timelineid here + 1, + #else ThisTimeLineID, + #endif &errinfo)) { WALReadRaiseError(&errinfo); @@ -2373,8 +2418,11 @@ backpressure_lag_impl(void) XLogRecPtr writePtr; XLogRecPtr flushPtr; XLogRecPtr applyPtr; +#if PG_VERSION_NUM >= 150000 + XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); +#else XLogRecPtr myFlushLsn = GetFlushRecPtr(); - +#endif replication_feedback_get_lsns(&writePtr, 
&flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024*1024) diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index 7b96fd580c..417a8c4586 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -21,6 +21,11 @@ #include #include +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#include "access/xlogrecovery.h" +#endif + /* * These variables are used similarly to openLogFile/SegNo, * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID @@ -85,7 +90,11 @@ static volatile sig_atomic_t replication_active = false; typedef void (*WalSndSendDataCallback) (void); static void WalSndLoop(WalSndSendDataCallback send_data); static void XLogSendPhysical(void); +#if PG_VERSION_NUM >= 150000 +static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); +#else static XLogRecPtr GetStandbyFlushRecPtr(void); +#endif static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p); @@ -222,10 +231,10 @@ SafekeeperStateDesiredEvents(SafekeeperState state) result = WL_SOCKET_READABLE; break; - /* + /* * Flush states require write-ready for flushing. * Active state does both reading and writing. - * + * * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. */ @@ -398,12 +407,21 @@ XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) if (walpropFile < 0) { + #if PG_VERSION_NUM >= 150000 + // FIXME Is it ok to use hardcoded value here? 
+ TimeLineID tli = 1; + #else bool use_existent = true; - + #endif /* Create/use new log file */ XLByteToSeg(recptr, walpropSegNo, wal_segment_size); + #if PG_VERSION_NUM >= 150000 + walpropFile = XLogFileInit(walpropSegNo, tli); + walpropFileTLI = tli; + #else walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); walpropFileTLI = ThisTimeLineID; + #endif } /* Calculate the start offset of the received logs */ @@ -488,11 +506,14 @@ void StartProposerReplication(StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; + TimeLineID currTLI; + #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); + #endif /* create xlogreader for physical replication */ xlogreader = @@ -534,10 +555,19 @@ StartProposerReplication(StartReplicationCmd *cmd) * Select the timeline. If it was given explicitly by the client, use * that. Otherwise use the timeline of the last replayed record, which is * kept in ThisTimeLineID. - * + * * Neon doesn't currently use PG Timelines, but it may in the future, so * we keep this code around to lighten the load for when we need it. 
*/ +#if PG_VERSION_NUM >= 150000 + if (am_cascading_walsender) + { + /* this also updates ThisTimeLineID */ + FlushPtr = GetStandbyFlushRecPtr(&currTLI); + } + else + FlushPtr = GetFlushRecPtr(&currTLI); +#else if (am_cascading_walsender) { /* this also updates ThisTimeLineID */ @@ -546,12 +576,16 @@ StartProposerReplication(StartReplicationCmd *cmd) else FlushPtr = GetFlushRecPtr(); + currTLI = ThisTimeLineID; +#endif + + if (cmd->timeline != 0) { XLogRecPtr switchpoint; sendTimeLine = cmd->timeline; - if (sendTimeLine == ThisTimeLineID) + if (sendTimeLine == currTLI) { sendTimeLineIsHistoric = false; sendTimeLineValidUpto = InvalidXLogRecPtr; @@ -566,7 +600,7 @@ StartProposerReplication(StartReplicationCmd *cmd) * Check that the timeline the client requested exists, and the * requested start location is on that timeline. */ - timeLineHistory = readTimeLineHistory(ThisTimeLineID); + timeLineHistory = readTimeLineHistory(currTLI); switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, &sendTimeLineNextTLI); list_free_deep(timeLineHistory); @@ -605,7 +639,7 @@ StartProposerReplication(StartReplicationCmd *cmd) } else { - sendTimeLine = ThisTimeLineID; + sendTimeLine = currTLI; sendTimeLineValidUpto = InvalidXLogRecPtr; sendTimeLineIsHistoric = false; } @@ -710,6 +744,34 @@ StartProposerReplication(StartReplicationCmd *cmd) EndReplicationCommand("START_STREAMING"); } +#if PG_VERSION_NUM >= 150000 +static XLogRecPtr +GetStandbyFlushRecPtr(TimeLineID *tli) +{ + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr receivePtr; + TimeLineID receiveTLI; + XLogRecPtr result; + + /* + * We can safely send what's already been replayed. Also, if walreceiver + * is streaming WAL from the same timeline, we can send anything that it + * has streamed, but hasn't been replayed yet. 
+ */ + + receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + + *tli = replayTLI; + + result = replayPtr; + if (receiveTLI == replayTLI && receivePtr > replayPtr) + result = receivePtr; + + return result; +} +#else /* * Returns the latest point in WAL that has been safely flushed to disk, and * can be sent to the standby. This should only be called when in recovery, @@ -744,6 +806,9 @@ GetStandbyFlushRecPtr(void) return result; } +#endif + + /* XLogReaderRoutine->segment_open callback */ static void @@ -878,6 +943,7 @@ XLogSendPhysical(void) XLogRecPtr startptr; XLogRecPtr endptr; Size nbytes PG_USED_FOR_ASSERTS_ONLY; + TimeLineID currTLI; /* If requested switch the WAL sender to the stopping state. */ if (got_STOPPING) @@ -919,9 +985,12 @@ XLogSendPhysical(void) * FlushPtr that was calculated before it became historic. */ bool becameHistoric = false; - +#if PG_VERSION_NUM >= 150000 + SendRqstPtr = GetStandbyFlushRecPtr(&currTLI); +#else SendRqstPtr = GetStandbyFlushRecPtr(); - + currTLI = ThisTimeLineID; +#endif if (!RecoveryInProgress()) { /* @@ -935,10 +1004,10 @@ XLogSendPhysical(void) { /* * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? ThisTimeLineID was + * still the one recovery is recovering from? currTLI was * updated by the GetStandbyFlushRecPtr() call above. */ - if (sendTimeLine != ThisTimeLineID) + if (sendTimeLine != currTLI) becameHistoric = true; } @@ -951,7 +1020,7 @@ XLogSendPhysical(void) */ List *history; - history = readTimeLineHistory(ThisTimeLineID); + history = readTimeLineHistory(currTLI); sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); Assert(sendTimeLine < sendTimeLineNextTLI); @@ -974,7 +1043,11 @@ XLogSendPhysical(void) * primary: if the primary subsequently crashes and restarts, standbys * must not have applied any WAL that got lost on the primary. 
*/ + #if PG_VERSION_NUM >= 150000 + SendRqstPtr = GetFlushRecPtr(NULL); + #else SendRqstPtr = GetFlushRecPtr(); + #endif } /* diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8ffb2eb829..b47e560325 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,7 +59,7 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_POSTGRES_DIR = "tmp_install" +DEFAULT_POSTGRES_DIR = "pg_install/v14" DEFAULT_BRANCH_NAME = "main" BASE_PORT = 15000 @@ -188,7 +188,7 @@ def can_bind(host: str, port: int) -> bool: Check whether a host:port is available to bind for listening Inspired by the can_bind() perl function used in Postgres tests, in - vendor/postgres/src/test/perl/PostgresNode.pm + vendor/postgres-v14/src/test/perl/PostgresNode.pm """ with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 119528b8f9..aa5a65f446 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -26,8 +26,8 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") - src_path = os.path.join(base_dir, "vendor/postgres/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") + src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/regress") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -80,8 +80,8 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "build/src/test/isolation") - src_path = os.path.join(base_dir, "vendor/postgres/src/test/isolation") + build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/isolation") + src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/isolation") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -124,7 +124,7 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. 
# This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") src_path = os.path.join(base_dir, "test_runner/sql_regress") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") diff --git a/vendor/postgres b/vendor/postgres-v14 similarity index 100% rename from vendor/postgres rename to vendor/postgres-v14 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 new file mode 160000 index 0000000000..26c6466873 --- /dev/null +++ b/vendor/postgres-v15 @@ -0,0 +1 @@ +Subproject commit 26c64668736b729a3e4c02c6fc0a84544118df26 From f081419e68a32b1420eb1a1337a1d666955278bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Tue, 6 Sep 2022 11:30:20 +0300 Subject: [PATCH 023/166] Cleanup tenant specific metrics once a tenant is detached. (#2328) * Add test for pageserver metric cleanup once a tenant is detached. * Remove tenant specific timeline metrics on detach. * Use definitions from timeline_metrics in page service. * Move metrics to own file from layered_repository/timeline.rs * TIMELINE_METRICS: define smgr metrics * REMOVE SMGR cleanup from timeline_metrics. Doesn't seem to work as expected. * Virtual file centralized metrics, except for evicted file as there's no tenant id or timeline id. * Use STORAGE_TIME from timeline_metrics in layered_repository. * Remove timelineless gc metrics for tenant on detach. * Rename timeline metrics -> metrics as it's more generic. * Don't create a TimelineMetrics instance for VirtualFile * Move the rest of the metric definitions to metrics.rs too. * UUID -> ZTenantId * Use consistent style for dict. * Use Repository's Drop trait for dropping STORAGE_TIME metrics. * No need for Arc, TimelineMetrics is used in just one place. Due to that, we can fall back using ZTenantId and ZTimelineId too to avoid additional string allocation.
--- pageserver/src/layered_repository.rs | 9 +- .../src/layered_repository/layer_map.rs | 8 +- pageserver/src/layered_repository/timeline.rs | 190 +------- pageserver/src/lib.rs | 12 +- pageserver/src/metrics.rs | 419 ++++++++++++++++++ pageserver/src/page_service.rs | 23 +- pageserver/src/storage_sync.rs | 35 +- pageserver/src/storage_sync/upload.rs | 12 +- pageserver/src/tenant_tasks.rs | 13 +- pageserver/src/virtual_file.rs | 35 +- .../src/walreceiver/walreceiver_connection.rs | 3 +- pageserver/src/walredo.rs | 69 +-- test_runner/fixtures/metrics.py | 31 +- test_runner/regress/test_tenants.py | 49 +- 14 files changed, 522 insertions(+), 386 deletions(-) create mode 100644 pageserver/src/metrics.rs diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 74abbeba86..200834300b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -32,9 +32,11 @@ use std::time::{Duration, Instant}; use self::metadata::{metadata_path, TimelineMetadata}; use crate::config::PageServerConf; +use crate::metrics::remove_tenant_metrics; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::metrics::STORAGE_TIME; use crate::repository::GcResult; use crate::tenant_mgr::LocalTimelineUpdate; use crate::thread_mgr; @@ -301,7 +303,7 @@ impl Repository { .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); - timeline::STORAGE_TIME + STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .observe_closure_duration(|| { self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) @@ -858,6 +860,11 @@ impl Repository { } } +impl Drop for Repository { + fn drop(&mut self) { + remove_tenant_metrics(&self.tenant_id); + } +} /// Dump contents of a layer file to stdout. 
pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { use std::os::unix::fs::FileExt; diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 8363d6314f..88dcf32409 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -13,21 +13,15 @@ use crate::layered_repository::inmemory_layer::InMemoryLayer; use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; +use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; use anyhow::Result; -use metrics::{register_int_gauge, IntGauge}; -use once_cell::sync::Lazy; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; -static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { - register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") - .expect("failed to define a metric") -}); - /// /// LayerMap tracks what layers exist on a timeline. 
/// diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index b050ef4030..aa9d636739 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -4,8 +4,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use fail::fail_point; use itertools::Itertools; -use metrics::core::{AtomicU64, GenericCounter}; -use once_cell::sync::{Lazy, OnceCell}; +use once_cell::sync::OnceCell; use tracing::*; use std::cmp::{max, min, Ordering}; @@ -17,12 +16,6 @@ use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; use std::{fs, thread}; -use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec, - register_uint_gauge_vec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, - IntGaugeVec, UIntGauge, UIntGaugeVec, -}; - use crate::layered_repository::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, @@ -37,6 +30,7 @@ use crate::layered_repository::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::reltag::RelTag; @@ -58,182 +52,6 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{page_cache, storage_sync}; -/// Prometheus histogram buckets (in seconds) that capture the majority of -/// latencies in the microsecond range but also extend far enough up to distinguish -/// "bad" from "really bad". 
-fn get_buckets_for_critical_operations() -> Vec { - let buckets_per_digit = 5; - let min_exponent = -6; - let max_exponent = 2; - - let mut buckets = vec![]; - // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp - // because it's more numerically stable and doesn't result in numbers like 9.999999 - for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) { - buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64)) - } - buckets -} - -// Metrics collected on operations on the storage repository. -pub static STORAGE_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", - &["operation", "tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), - ) - .expect("failed to define a metric") -}); - -// Metrics collected on operations on the storage repository. -static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value", - &["tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), - ) - .expect("failed to define a metric") -}); - -static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_materialized_cache_hits_total", - "Number of cache hits from materialized page cache", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static WAIT_LSN_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_wait_lsn_seconds", - "Time spent waiting for WAL to arrive", - &["tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), - ) - .expect("failed to define a metric") -}); - -static LAST_RECORD_LSN: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_last_record_lsn", - "Last record LSN grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -// Metrics for determining 
timeline's physical size. -// A layered timeline's physical is defined as the total size of -// (delta/image) layer files on disk. -static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { - register_uint_gauge_vec!( - "pageserver_current_physical_size", - "Current physical size grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { - register_uint_gauge_vec!( - "pageserver_current_logical_size", - "Current logical size grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define current logical size metric") -}); - -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. -static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric") -}); - -static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - ) - .expect("failed to define a metric") -}); - -struct TimelineMetrics { - pub reconstruct_time_histo: Histogram, - pub materialized_page_cache_hit_counter: GenericCounter, - pub flush_time_histo: Histogram, - pub compact_time_histo: Histogram, - pub create_images_time_histo: Histogram, - pub init_logical_size_histo: Histogram, - pub load_layer_map_histo: Histogram, - pub last_record_gauge: IntGauge, - pub wait_lsn_time_histo: Histogram, - pub current_physical_size_gauge: UIntGauge, - /// copy of LayeredTimeline.current_logical_size - pub current_logical_size_gauge: UIntGauge, -} - -impl TimelineMetrics { - fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { - let tenant_id = tenant_id.to_string(); - let timeline_id = 
timeline_id.to_string(); - - let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) - .unwrap(); - let init_logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) - .unwrap(); - let load_layer_map_histo = STORAGE_TIME - .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) - .unwrap(); - let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let current_logical_size_gauge = CURRENT_LOGICAL_SIZE - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - - TimelineMetrics { - reconstruct_time_histo, - materialized_page_cache_hit_counter, - flush_time_histo, - compact_time_histo, - create_images_time_histo, - init_logical_size_histo, - load_layer_map_histo, - last_record_gauge, - wait_lsn_time_histo, - current_physical_size_gauge, - current_logical_size_gauge, - } - } -} - pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -1494,8 +1312,8 @@ impl Timeline { let sz = new_delta_path.metadata()?.len(); self.metrics.current_physical_size_gauge.add(sz); // update metrics 
- NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(sz); + self.metrics.num_persistent_files_created.inc_by(1); + self.metrics.persistent_bytes_written.inc_by(sz); Ok(new_delta_path) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 06c5f552a4..4731179e22 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -4,6 +4,7 @@ pub mod http; pub mod import_datadir; pub mod keyspace; pub mod layered_repository; +pub mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; @@ -22,11 +23,9 @@ pub mod walreceiver; pub mod walrecord; pub mod walredo; -use once_cell::sync::Lazy; use tracing::info; use crate::thread_mgr::ThreadKind; -use metrics::{register_int_gauge_vec, IntGaugeVec}; /// Current storage format version /// @@ -39,15 +38,6 @@ pub const STORAGE_FORMAT_VERSION: u16 = 3; pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; -static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", - &["pageserver_connection_kind"] - ) - .expect("failed to define a metric") -}); - pub const LOG_FILE_NAME: &str = "pageserver.log"; /// Config for the Repository checkpointer diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs new file mode 100644 index 0000000000..35fdeacce5 --- /dev/null +++ b/pageserver/src/metrics.rs @@ -0,0 +1,419 @@ +use metrics::core::{AtomicU64, GenericCounter}; +use metrics::{ + register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, + register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, + IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, +}; +use once_cell::sync::Lazy; +use utils::zid::{ZTenantId, ZTimelineId}; + +/// Prometheus histogram buckets (in seconds) that capture the majority of +/// latencies in the microsecond range but also extend far 
enough up to distinguish +/// "bad" from "really bad". +fn get_buckets_for_critical_operations() -> Vec { + let buckets_per_digit = 5; + let min_exponent = -6; + let max_exponent = 2; + + let mut buckets = vec![]; + // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp + // because it's more numerically stable and doesn't result in numbers like 9.999999 + for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) { + buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64)) + } + buckets +} + +// Metrics collected on operations on the storage repository. +const STORAGE_TIME_OPERATIONS: &[&str] = &[ + "layer flush", + "compact", + "create images", + "init logical size", + "load layer map", + "gc", +]; + +pub static STORAGE_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_storage_operations_seconds", + "Time spent on storage operations", + &["operation", "tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +// Metrics collected on operations on the storage repository. 
+static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", + &["tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_materialized_cache_hits_total", + "Number of cache hits from materialized page cache", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static WAIT_LSN_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_wait_lsn_seconds", + "Time spent waiting for WAL to arrive", + &["tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +static LAST_RECORD_LSN: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_last_record_lsn", + "Last record LSN grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +// Metrics for determining timeline's physical size. +// A layered timeline's physical is defined as the total size of +// (delta/image) layer files on disk. +static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_current_physical_size", + "Current physical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_current_logical_size", + "Current logical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define current logical size metric") +}); + +// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, +// or in testing they estimate how much we would upload if we did. 
+static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { + IntCounter::new( + "pageserver_created_persistent_files_total", + "Number of files created that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric") +}); + +static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { + IntCounter::new( + "pageserver_written_persistent_bytes_total", + "Total bytes written that are meant to be uploaded to cloud storage", + ) + .expect("failed to define a metric") +}); + +// Metrics collected on disk IO operations +const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ + 0.000001, // 1 usec + 0.00001, // 10 usec + 0.0001, // 100 usec + 0.001, // 1 msec + 0.01, // 10 msec + 0.1, // 100 msec + 1.0, // 1 sec +]; + +const STORAGE_IO_TIME_OPERATIONS: &[&str] = + &["open", "close", "read", "write", "seek", "fsync", "gc"]; + +const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; + +pub static STORAGE_IO_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_io_operations_seconds", + "Time spent in IO operations", + &["operation", "tenant_id", "timeline_id"], + STORAGE_IO_TIME_BUCKETS.into() + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_io_operations_bytes_total", + "Total amount of bytes read/written in IO operations", + &["operation", "tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[ + "get_rel_exists", + "get_rel_size", + "get_page_at_lsn", + "get_db_size", +]; + +const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[ + 0.00001, // 1/100000 s + 0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s + 0.001, 0.0025, 0.005, 0.0075, // 1/1000 s + 0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s + 0.1, // 1/10 s +]; + +pub static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_smgr_query_seconds", + "Time spent on smgr query 
handling", + &["smgr_query_type", "tenant_id", "timeline_id"], + SMGR_QUERY_TIME_BUCKETS.into() + ) + .expect("failed to define a metric") +}); + +pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_live_connections", + "Number of live network connections", + &["pageserver_connection_kind"] + ) + .expect("failed to define a metric") +}); + +pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { + register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") + .expect("failed to define a metric") +}); + +pub static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_remote_storage_remaining_sync_items", + "Number of storage sync items left in the queue" + ) + .expect("failed to register pageserver remote storage remaining sync items int gauge") +}); + +pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_remote_storage_image_sync_seconds", + "Time took to synchronize (download or upload) a whole pageserver image. 
\ + Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", + &["tenant_id", "timeline_id", "operation_kind", "status"], + vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] + ) + .expect("failed to register pageserver image sync time histogram vec") +}); + +pub static REMOTE_INDEX_UPLOAD: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_remote_index_uploads_total", + "Number of remote index uploads", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver remote index upload vec") +}); + +pub static NO_LAYERS_UPLOAD: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_no_layers_uploads_total", + "Number of skipped uploads due to no layers", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver no layers upload vec") +}); + +pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_task_events", + "Number of task start/stop/fail events.", + &["event"], + ) + .expect("Failed to register tenant_task_events metric") +}); + +// Metrics collected on WAL redo operations +// +// We collect the time spent in actual WAL redo ('redo'), and time waiting +// for access to the postgres process ('wait') since there is only one for +// each tenant. + +/// Time buckets are small because we want to be able to measure the +/// smallest redo processing times. These buckets allow us to measure down +/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. +/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. +macro_rules! redo_histogram_time_buckets { + () => { + vec![ + 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, + ] + }; +} + +/// While we're at it, also measure the amount of records replayed in each +/// operation. 
We have a global 'total replayed' counter, but that's not +/// as useful as 'what is the skew for how many records we replay in one +/// operation'. +macro_rules! redo_histogram_count_buckets { + () => { + vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0] + }; +} + +pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_seconds", + "Time spent on WAL redo", + redo_histogram_time_buckets!() + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_wait_seconds", + "Time spent waiting for access to the WAL redo process", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_records_histogram", + "Histogram of number of records replayed per redo", + redo_histogram_count_buckets!(), + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_replayed_wal_records_total", + "Number of WAL records replayed in WAL redo process" + ) + .unwrap() +}); + +#[derive(Debug)] +pub struct TimelineMetrics { + tenant_id: String, + timeline_id: String, + pub reconstruct_time_histo: Histogram, + pub materialized_page_cache_hit_counter: GenericCounter, + pub flush_time_histo: Histogram, + pub compact_time_histo: Histogram, + pub create_images_time_histo: Histogram, + pub init_logical_size_histo: Histogram, + pub load_layer_map_histo: Histogram, + pub last_record_gauge: IntGauge, + pub wait_lsn_time_histo: Histogram, + pub current_physical_size_gauge: UIntGauge, + /// copy of LayeredTimeline.current_logical_size + pub current_logical_size_gauge: UIntGauge, + pub num_persistent_files_created: IntCounter, + pub persistent_bytes_written: IntCounter, +} + +impl TimelineMetrics { + pub fn new(tenant_id: &ZTenantId, timeline_id: 
&ZTimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let flush_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) + .unwrap(); + let compact_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) + .unwrap(); + let create_images_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) + .unwrap(); + let init_logical_size_histo = STORAGE_TIME + .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) + .unwrap(); + let load_layer_map_histo = STORAGE_TIME + .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) + .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_logical_size_gauge = CURRENT_LOGICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED.clone(); + let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN.clone(); + + TimelineMetrics { + tenant_id, + timeline_id, + reconstruct_time_histo, + materialized_page_cache_hit_counter, + flush_time_histo, + compact_time_histo, + create_images_time_histo, + init_logical_size_histo, + load_layer_map_histo, + last_record_gauge, + wait_lsn_time_histo, + current_physical_size_gauge, + 
current_logical_size_gauge, + num_persistent_files_created, + persistent_bytes_written, + } + } +} + +impl Drop for TimelineMetrics { + fn drop(&mut self) { + let tenant_id = &self.tenant_id; + let timeline_id = &self.timeline_id; + let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); + let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); + let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + + for op in STORAGE_TIME_OPERATIONS { + let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + for op in STORAGE_IO_TIME_OPERATIONS { + let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + + for op in STORAGE_IO_SIZE_OPERATIONS { + let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]); + } + + for op in SMGR_QUERY_TIME_OPERATIONS { + let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + } +} + +pub fn remove_tenant_metrics(tenant_id: &ZTenantId) { + let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 358618f20c..783fcb2412 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -11,7 +11,6 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use once_cell::sync::Lazy; use regex::Regex; use std::io::{self, Read}; use std::net::TcpListener; @@ -32,6 +31,7 @@ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; use crate::layered_repository::Timeline; +use crate::metrics::{LIVE_CONNECTIONS_COUNT, 
SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -39,7 +39,6 @@ use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; -use metrics::{register_histogram_vec, HistogramVec}; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; @@ -374,7 +373,7 @@ fn page_service_conn_main( // Immediately increment the gauge, then create a job to decrement it on thread exit. // One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); + let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); gauge.inc(); scopeguard::defer! { gauge.dec(); @@ -427,24 +426,6 @@ struct PageServerHandler { claims: Option, } -const TIME_BUCKETS: &[f64] = &[ - 0.00001, // 1/100000 s - 0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s - 0.001, 0.0025, 0.005, 0.0075, // 1/1000 s - 0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s - 0.1, // 1/10 s -]; - -static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_smgr_query_seconds", - "Time spent on smgr query handling", - &["smgr_query_type", "tenant_id", "timeline_id"], - TIME_BUCKETS.into() - ) - .expect("failed to define a metric") -}); - impl PageServerHandler { pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { PageServerHandler { diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 0bdc30a73f..491f882e0b 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -155,7 +155,7 @@ use std::{ use anyhow::{anyhow, bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; -use once_cell::sync::{Lazy, OnceCell}; +use once_cell::sync::OnceCell; use remote_storage::GenericRemoteStorage; use 
tokio::{ fs, @@ -170,6 +170,7 @@ use self::{ index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; +use crate::metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}; use crate::{ config::PageServerConf, exponential_backoff, @@ -183,44 +184,12 @@ use crate::{ thread_mgr::ThreadKind, }; -use metrics::{ - register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec, - IntCounterVec, IntGauge, -}; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; pub use self::download::TEMP_DOWNLOAD_EXTENSION; -static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { - register_int_gauge!( - "pageserver_remote_storage_remaining_sync_items", - "Number of storage sync items left in the queue" - ) - .expect("failed to register pageserver remote storage remaining sync items int gauge") -}); - -static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_remote_storage_image_sync_seconds", - "Time took to synchronize (download or upload) a whole pageserver image. 
\ - Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", - &["tenant_id", "timeline_id", "operation_kind", "status"], - vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] - ) - .expect("failed to register pageserver image sync time histogram vec") -}); - -static REMOTE_INDEX_UPLOAD: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_remote_storage_remote_index_uploads_total", - "Number of remote index uploads", - &["tenant_id", "timeline_id"], - ) - .expect("failed to register pageserver remote index upload vec") -}); - static SYNC_QUEUE: OnceCell = OnceCell::new(); /// A timeline status to share with pageserver's sync counterpart, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 38bad73d3b..8dd73d9431 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -4,7 +4,6 @@ use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use once_cell::sync::Lazy; use remote_storage::GenericRemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -15,19 +14,10 @@ use super::{ index::{IndexPart, RemoteTimeline}, LayersUpload, SyncData, SyncQueue, }; +use crate::metrics::NO_LAYERS_UPLOAD; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; -use metrics::{register_int_counter_vec, IntCounterVec}; - -static NO_LAYERS_UPLOAD: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_remote_storage_no_layers_uploads_total", - "Number of skipped uploads due to no layers", - &["tenant_id", "timeline_id"], - ) - .expect("failed to register pageserver no layers upload vec") -}); /// Serializes and uploads the given index part data to the remote storage. 
pub(super) async fn upload_index_part( diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index ca239ae254..11be13b80c 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -5,28 +5,19 @@ use std::collections::HashMap; use std::ops::ControlFlow; use std::time::Duration; +use crate::metrics::TENANT_TASK_EVENTS; use crate::tenant_mgr::TenantState; use crate::thread_mgr::ThreadKind; use crate::{tenant_mgr, thread_mgr}; use anyhow::{self, Context}; use futures::stream::FuturesUnordered; use futures::StreamExt; -use metrics::{register_int_counter_vec, IntCounterVec}; -use once_cell::sync::{Lazy, OnceCell}; +use once_cell::sync::OnceCell; use tokio::sync::mpsc; use tokio::sync::watch; use tracing::*; use utils::zid::ZTenantId; -static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_tenant_task_events", - "Number of task start/stop/fail events.", - &["event"], - ) - .expect("Failed to register tenant_task_events metric") -}); - /// /// Compaction task's main loop /// diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 5b24b848ad..7a2c699b44 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,7 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! 
-use once_cell::sync::Lazy; +use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; use once_cell::sync::OnceCell; use std::fs::{File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; @@ -19,38 +19,6 @@ use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{RwLock, RwLockWriteGuard}; -use metrics::{register_histogram_vec, register_int_gauge_vec, HistogramVec, IntGaugeVec}; - -// Metrics collected on disk IO operations -const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ - 0.000001, // 1 usec - 0.00001, // 10 usec - 0.0001, // 100 usec - 0.001, // 1 msec - 0.01, // 10 msec - 0.1, // 100 msec - 1.0, // 1 sec -]; - -static STORAGE_IO_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_io_operations_seconds", - "Time spent in IO operations", - &["operation", "tenant_id", "timeline_id"], - STORAGE_IO_TIME_BUCKETS.into() - ) - .expect("failed to define a metric") -}); - -static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_io_operations_bytes_total", - "Total amount of bytes read/written in IO operations", - &["operation", "tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - /// /// A virtual file descriptor. 
You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, @@ -85,7 +53,6 @@ pub struct VirtualFile { pub path: PathBuf, open_options: OpenOptions, - /// For metrics tenantid: String, timelineid: String, } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index d441bbb4ab..4c30481e02 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -19,6 +19,7 @@ use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; +use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -105,7 +106,7 @@ pub async fn handle_walreceiver_connection( // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); + let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); gauge.inc(); scopeguard::defer! 
{ gauge.dec(); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index bf48bd1759..4e49fd9373 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,7 +21,6 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; -use once_cell::sync::Lazy; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -39,11 +38,13 @@ use tracing::*; use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; use crate::config::PageServerConf; +use crate::metrics::{ + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, +}; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; -use metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, @@ -83,70 +84,6 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } -// Metrics collected on WAL redo operations -// -// We collect the time spent in actual WAL redo ('redo'), and time waiting -// for access to the postgres process ('wait') since there is only one for -// each tenant. - -/// Time buckets are small because we want to be able to measure the -/// smallest redo processing times. These buckets allow us to measure down -/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. -/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. -macro_rules! redo_histogram_time_buckets { - () => { - vec![ - 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, - 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, - ] - }; -} - -/// While we're at it, also measure the amount of records replayed in each -/// operation. 
We have a global 'total replayed' counter, but that's not -/// as useful as 'what is the skew for how many records we replay in one -/// operation'. -macro_rules! redo_histogram_count_buckets { - () => { - vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0] - }; -} - -static WAL_REDO_TIME: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_wal_redo_seconds", - "Time spent on WAL redo", - redo_histogram_time_buckets!() - ) - .expect("failed to define a metric") -}); - -static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the WAL redo process", - redo_histogram_time_buckets!(), - ) - .expect("failed to define a metric") -}); - -static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_wal_redo_records_histogram", - "Histogram of number of records replayed per redo", - redo_histogram_count_buckets!(), - ) - .expect("failed to define a metric") -}); - -static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_replayed_wal_records_total", - "Number of WAL records replayed in WAL redo process" - ) - .unwrap() -}); - /// /// This is the real implementation that uses a Postgres process to /// perform WAL replay. 
Only one thread can use the process at a time, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index b51c7250e0..4d680aa641 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -16,8 +16,11 @@ class Metrics: def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]: res = [] for sample in self.metrics[name]: - if all(sample.labels[k] == v for k, v in filter.items()): - res.append(sample) + try: + if all(sample.labels[k] == v for k, v in filter.items()): + res.append(sample) + except KeyError: + pass return res def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample: @@ -34,3 +37,27 @@ def parse_metrics(text: str, name: str = ""): metrics.metrics[sample.name].append(sample) return metrics + + +PAGESERVER_PER_TENANT_METRICS = [ + "pageserver_current_logical_size", + "pageserver_current_physical_size", + "pageserver_getpage_reconstruct_seconds_bucket", + "pageserver_getpage_reconstruct_seconds_count", + "pageserver_getpage_reconstruct_seconds_sum", + "pageserver_io_operations_bytes_total", + "pageserver_io_operations_seconds_bucket", + "pageserver_io_operations_seconds_count", + "pageserver_io_operations_seconds_sum", + "pageserver_last_record_lsn", + "pageserver_materialized_cache_hits_total", + "pageserver_smgr_query_seconds_bucket", + "pageserver_smgr_query_seconds_count", + "pageserver_smgr_query_seconds_sum", + "pageserver_storage_operations_seconds_bucket", + "pageserver_storage_operations_seconds_count", + "pageserver_storage_operations_seconds_sum", + "pageserver_wait_lsn_seconds_bucket", + "pageserver_wait_lsn_seconds_count", + "pageserver_wait_lsn_seconds_sum", +] diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 8bbf45205a..767f94d167 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,12 +1,14 @@ import os from contextlib import closing from datetime import datetime +from typing 
import List import pytest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics +from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import Lsn +from fixtures.types import Lsn, ZTenantId +from prometheus_client.samples import Sample @pytest.mark.parametrize("with_safekeepers", [False, True]) @@ -122,3 +124,46 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): log.info( f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" ) + + +def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder): + """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" + + neon_env_builder.num_safekeepers = 3 + + env = neon_env_builder.init_start() + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, _ = env.neon_cli.create_tenant() + + env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1) + env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2) + + pg_tenant1 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_1) + pg_tenant2 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_2) + + for pg in [pg_tenant1, pg_tenant2]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) + + def get_ps_metric_samples_for_tenant(tenant_id: ZTenantId) -> List[Sample]: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + samples = [] + for metric_name in ps_metrics.metrics: + for sample in ps_metrics.query_all( + name=metric_name, filter={"tenant_id": 
str(tenant_id)} + ): + samples.append(sample) + return samples + + for tenant in [tenant_1, tenant_2]: + pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) + assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS) + + env.pageserver.http_client().tenant_detach(tenant) + + post_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) + assert post_detach_samples == set() From cf157ad8e4541031bc2acded74a9584d6565ec24 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Sep 2022 13:00:40 +0300 Subject: [PATCH 024/166] Add test that repeatedly kills and restarts the pageserver. This caught or reproduced several bugs when I originally wrote this test back in May, including #1731, #1740, #1751, and #707. I believe all the issues have been fixed now, but since this was a very fruitful test, let's add it to the test suite. We didn't commit this earlier, because the test was very slow especially with a debug build. We've since changed the build options so that even the debug builds are not quite so slow anymore. --- .../regress/test_pageserver_restart.py | 72 ++++++++++++++++++- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index e2bd8be9b7..eac5e6e61d 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -1,3 +1,6 @@ +from contextlib import closing + +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -38,9 +41,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - # Stop and restart pageserver. This is a more or less graceful shutdown, although - # the page server doesn't currently have a shutdown routine so there's no difference - # between stopping and crashing. 
+ # Stop the pageserver gracefully and restart it. env.pageserver.stop() env.pageserver.start() @@ -58,3 +59,68 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Stop the page server by force, and restart it env.pageserver.stop() env.pageserver.start() + + +# Test that repeatedly kills and restarts the page server, while the +# safekeeper and compute node keep running. +@pytest.mark.timeout(540) +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + # Use a tiny checkpoint distance, to create a lot of layers quickly. + # That allows us to stress the compaction and layer flushing logic more. + tenant, _ = env.neon_cli.create_tenant( + conf={ + "checkpoint_distance": "5000000", + } + ) + env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant) + pg = env.postgres.create_start("test_pageserver_chaos", tenant_id=tenant) + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. 
+ with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo (id int, t text, updates int)") + cur.execute("CREATE INDEX ON foo (id)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 'long string to consume some space' || g, 0 + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Update the whole table, then immediately kill and restart the pageserver + for i in range(1, 15): + pg.safe_psql("UPDATE foo set updates = updates + 1") + + # This kills the pageserver immediately, to simulate a crash + env.pageserver.stop(immediate=True) + env.pageserver.start() + + # Stopping the pageserver breaks the connection from the postgres backend to + # the page server, and causes the next query on the connection to fail. Start a new + # postgres connection too, to avoid that error. (Ideally, the compute node would + # handle that and retry internally, without propagating the error to the user, but + # currently it doesn't...) + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Check that all the updates are visible + num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0] + assert num_updates == i * 100000 From f441fe57d47cade9454bc584f6b820c868d43272 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Sep 2022 17:35:40 +0300 Subject: [PATCH 025/166] Register prometheus counters correctly. Commit f081419e68 moved all the prometheus counters to `metrics.rs`, but accidentally replaced a couple of `register_int_counter!(...)` calls with just `IntCounter::new(...)`. 
Because of that, the counters were not registered in the metrics registry, and were not exposed through the metrics HTTP endpoint. Fixes failures we're seeing in a bunch of 'performance' tests because of the missing metrics. --- pageserver/src/metrics.rs | 4 ++-- test_runner/fixtures/benchmark_fixture.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 35fdeacce5..ada0bbd359 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -107,7 +107,7 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - IntCounter::new( + register_int_counter!( "pageserver_created_persistent_files_total", "Number of files created that are meant to be uploaded to cloud storage", ) @@ -115,7 +115,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { }); static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - IntCounter::new( + register_int_counter!( "pageserver_written_persistent_bytes_total", "Total bytes written that are meant to be uploaded to cloud storage", ) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 338cc47ea2..b9cdfdebc4 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -362,7 +362,7 @@ class NeonBenchmarker: # and round to integer. 
all_metrics = pageserver.http_client().get_metrics() matches = re.search(rf"^{metric_name} (\S+)$", all_metrics, re.MULTILINE) - assert matches + assert matches, f"metric {metric_name} not found" return int(round(float(matches.group(1)))) def get_timeline_size(self, repo_dir: Path, tenantid: ZTenantId, timelineid: ZTimelineId): From 65b592d4bd6d503816fd9f4fbb8e11505623f1cd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 5 Sep 2022 13:59:04 +0300 Subject: [PATCH 026/166] Remove deprecated management API for timeline detach. It is no longer used anywhere. --- pageserver/src/http/openapi_spec.yml | 45 ---------------------------- pageserver/src/http/routes.rs | 5 ---- 2 files changed, 50 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index fc3e80ba19..6beb938d6a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -257,51 +257,6 @@ paths: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/detach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: timeline_id - in: path - required: true - schema: - type: string - format: hex - post: - description: Deprecated, use DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id} instead - deprecated: true - responses: - "200": - description: Ok - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/detach: parameters: - 
name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 52997da5a0..09c4812067 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -696,10 +696,5 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, ) - // for backward compatibility - .post( - "/v1/tenant/:tenant_id/timeline/:timeline_id/detach", - timeline_delete_handler, - ) .any(handler_404)) } From 2794cd83c70abfd99a879614de9fd5766b707d52 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 7 Sep 2022 12:40:48 +0300 Subject: [PATCH 027/166] Prepare pg 15 support (generate bindings for pg15) (#2396) Another preparatory commit for pg15 support: * generate bindings for both pg14 and pg15; * update Makefile and CI scripts: now neon build depends on both PostgreSQL versions; * some code refactoring to decrease version-specific dependencies. --- .github/workflows/build_and_test.yml | 27 ++++- Dockerfile | 11 +- Makefile | 2 +- libs/postgres_ffi/README.md | 8 +- libs/postgres_ffi/build.rs | 162 +++++++++++++------------ libs/postgres_ffi/src/lib.rs | 16 ++- libs/postgres_ffi/src/pg_constants.rs | 9 +- libs/postgres_ffi/src/waldecoder.rs | 6 +- libs/postgres_ffi/src/xlog_utils.rs | 36 +++--- libs/postgres_ffi/wal_craft/src/lib.rs | 6 +- pageserver/src/basebackup.rs | 19 +-- pageserver/src/import_datadir.rs | 16 +-- pageserver/src/pgdatadir_mapping.rs | 3 +- pageserver/src/walrecord.rs | 4 +- safekeeper/src/json_ctrl.rs | 4 +- safekeeper/src/metrics.rs | 2 +- safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/send_wal.rs | 4 +- safekeeper/src/timeline.rs | 2 +- safekeeper/src/wal_backup.rs | 4 +- safekeeper/src/wal_storage.rs | 4 +- 21 files changed, 186 insertions(+), 161 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6eddbc3335..6d966ce0a2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ 
-77,11 +77,16 @@ jobs: submodules: true fetch-depth: 1 - - name: Set pg revision for caching + - name: Set pg 14 revision for caching id: pg_v14_rev run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) shell: bash -euxo pipefail {0} + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15) + shell: bash -euxo pipefail {0} + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. @@ -125,15 +130,27 @@ jobs: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres v14 build - id: cache_pg + id: cache_pg_14 uses: actions/cache@v3 with: path: pg_install/v14 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Build postgres - if: steps.cache_pg.outputs.cache-hit != 'true' - run: mold -run make postgres -j$(nproc) + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + shell: bash -euxo pipefail {0} + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) shell: bash -euxo pipefail {0} - name: Build neon extensions diff --git a/Dockerfile b/Dockerfile index d379c05051..3e173f4d5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,23 +5,21 @@ ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com ARG IMAGE=rust ARG TAG=pinned -# ARGs don't get replaced in RUN commands in Kaniko -# so use hardcoded value below -# ARG PG_VERSION=v14 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build 
WORKDIR /home/nonroot -ARG PG_VERSION=v14 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 +COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile ENV BUILD_TYPE release RUN set -e \ - && mold -run make -j $(nproc) -s neon-pg-ext-v14 \ + && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf pg_install/v14/build \ + && rm -rf pg_install/v15/build \ && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . # Build zenith binaries @@ -39,8 +37,8 @@ ARG CACHEPOT_BUCKET=neon-github-dev #ARG AWS_ACCESS_KEY_ID #ARG AWS_SECRET_ACCESS_KEY -ARG PG_VERSION=v14 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY . . # Show build caching stats to check if it was used in the end. @@ -70,7 +68,6 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin # v14 is default for now -ARG PG_VERSION=v14 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ diff --git a/Makefile b/Makefile index 0b2b097ebc..0e7ceec15b 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. .PHONY: neon -neon: postgres-v14-headers +neon: postgres-v14-headers postgres-v15-headers +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) diff --git a/libs/postgres_ffi/README.md b/libs/postgres_ffi/README.md index 5656314fd7..de046eb3da 100644 --- a/libs/postgres_ffi/README.md +++ b/libs/postgres_ffi/README.md @@ -9,9 +9,11 @@ should be auto-generated too, but that's a TODO. 
The PostgreSQL on-disk file format is not portable across different CPU architectures and operating systems. It is also subject to change -in each major PostgreSQL version. Currently, this module is based on -PostgreSQL v14, but in the future we will probably need a separate -copy for each PostgreSQL version. +in each major PostgreSQL version. Currently, this module supports +PostgreSQL v14 and v15: bindings and code that depends on them are version-specific. +This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15`. +Version-independent code is explicitly exported into the shared `postgres_ffi`. + TODO: Currently, there is also some code that deals with WAL records in pageserver/src/waldecoder.rs. That should be moved into this diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 19507f0557..8389ac37fe 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -48,90 +48,98 @@ fn main() { // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` - // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/v14/include/postgresql/server` - let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") - { + // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/{PG_MAJORVERSION}/include/postgresql/server` + let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { postgres_install_dir.into() } else { PathBuf::from("pg_install") }; - // Currently, we only expect to find PostgreSQL v14 sources, in "pg_install/v14". In the - // future, we will run this for all supported PostgreSQL versions.
- pg_install_dir.push("v14"); - if pg_install_dir.is_relative() { - let cwd = env::current_dir().unwrap(); - pg_install_dir = cwd.join("..").join("..").join(pg_install_dir); - } - - let pg_config_bin = pg_install_dir.join("bin").join("pg_config"); - let inc_server_path: String = if pg_config_bin.exists() { - let output = Command::new(pg_config_bin) - .arg("--includedir-server") - .output() - .expect("failed to execute `pg_config --includedir-server`"); - - if !output.status.success() { - panic!("`pg_config --includedir-server` failed") + for pg_version in &["v14", "v15"] { + let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); + if pg_install_dir_versioned.is_relative() { + let cwd = env::current_dir().unwrap(); + pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); } - String::from_utf8(output.stdout).unwrap().trim_end().into() - } else { - pg_install_dir - .join("include") - .join("postgresql") - .join("server") - .into_os_string() - .into_string() - .unwrap() - }; + let pg_config_bin = pg_install_dir_versioned + .join(pg_version) + .join("bin") + .join("pg_config"); + let inc_server_path: String = if pg_config_bin.exists() { + let output = Command::new(pg_config_bin) + .arg("--includedir-server") + .output() + .expect("failed to execute `pg_config --includedir-server`"); - // The bindgen::Builder is the main entry point - // to bindgen, and lets you build up options for - // the resulting bindings. - let bindings = bindgen::Builder::default() - // - // All the needed PostgreSQL headers are included from 'bindgen_deps.h' - // - .header("bindgen_deps.h") - // - // Tell cargo to invalidate the built crate whenever any of the - // included header files changed. 
- // - .parse_callbacks(Box::new(PostgresFfiCallbacks)) - // - // These are the types and constants that we want to generate bindings for - // - .allowlist_type("BlockNumber") - .allowlist_type("OffsetNumber") - .allowlist_type("MultiXactId") - .allowlist_type("MultiXactOffset") - .allowlist_type("MultiXactStatus") - .allowlist_type("ControlFileData") - .allowlist_type("CheckPoint") - .allowlist_type("FullTransactionId") - .allowlist_type("XLogRecord") - .allowlist_type("XLogPageHeaderData") - .allowlist_type("XLogLongPageHeaderData") - .allowlist_var("XLOG_PAGE_MAGIC") - .allowlist_var("PG_CONTROL_FILE_SIZE") - .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") - .allowlist_type("PageHeaderData") - .allowlist_type("DBState") - // Because structs are used for serialization, tell bindgen to emit - // explicit padding fields. - .explicit_padding(true) - // - .clang_arg(format!("-I{inc_server_path}")) - // - // Finish the builder and generate the bindings. - // - .generate() - .expect("Unable to generate bindings"); + if !output.status.success() { + panic!("`pg_config --includedir-server` failed") + } - // Write the bindings to the $OUT_DIR/bindings.rs file. - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - bindings - .write_to_file(out_path.join("bindings.rs")) - .expect("Couldn't write bindings!"); + String::from_utf8(output.stdout).unwrap().trim_end().into() + } else { + pg_install_dir_versioned + .join("include") + .join("postgresql") + .join("server") + .into_os_string() + .into_string() + .unwrap() + }; + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // + // All the needed PostgreSQL headers are included from 'bindgen_deps.h' + // + .header("bindgen_deps.h") + // + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. 
+ // + .parse_callbacks(Box::new(PostgresFfiCallbacks)) + // + // These are the types and constants that we want to generate bindings for + // + .allowlist_type("BlockNumber") + .allowlist_type("OffsetNumber") + .allowlist_type("XLogRecPtr") + .allowlist_type("XLogSegNo") + .allowlist_type("TimeLineID") + .allowlist_type("TimestampTz") + .allowlist_type("MultiXactId") + .allowlist_type("MultiXactOffset") + .allowlist_type("MultiXactStatus") + .allowlist_type("ControlFileData") + .allowlist_type("CheckPoint") + .allowlist_type("FullTransactionId") + .allowlist_type("XLogRecord") + .allowlist_type("XLogPageHeaderData") + .allowlist_type("XLogLongPageHeaderData") + .allowlist_var("XLOG_PAGE_MAGIC") + .allowlist_var("PG_CONTROL_FILE_SIZE") + .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") + .allowlist_type("PageHeaderData") + .allowlist_type("DBState") + // Because structs are used for serialization, tell bindgen to emit + // explicit padding fields. + .explicit_padding(true) + // + .clang_arg(format!("-I{inc_server_path}")) + // + // Finish the builder and generate the bindings. + // + .generate() + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file. + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + let filename = format!("bindings_{pg_version}.rs"); + + bindings + .write_to_file(out_path.join(filename)) + .expect("Couldn't write bindings!"); + } } diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 022355329c..f43232ed0c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -13,13 +13,17 @@ macro_rules! postgres_ffi { ($version:ident) => { #[path = "."] pub mod $version { - // fixme: does this have to be 'pub'? 
pub mod bindings { // bindgen generates bindings for a lot of stuff we don't need #![allow(dead_code)] use serde::{Deserialize, Serialize}; - include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + include!(concat!( + env!("OUT_DIR"), + "/bindings_", + stringify!($version), + ".rs" + )); } pub mod controlfile_utils; pub mod nonrelfile_utils; @@ -28,6 +32,8 @@ macro_rules! postgres_ffi { pub mod waldecoder; pub mod xlog_utils; + pub const PG_MAJORVERSION: &str = stringify!($version); + // Re-export some symbols from bindings pub use bindings::DBState_DB_SHUTDOWNED; pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; @@ -36,20 +42,26 @@ macro_rules! postgres_ffi { } postgres_ffi!(v14); +postgres_ffi!(v15); // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; pub use v14::bindings::{MultiXactId, TransactionId}; +pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; +pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. pub const BLCKSZ: u16 = 8192; pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); pub const XLOG_BLCKSZ: usize = 8192; +pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; + +pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. // diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 42b5c5d842..8cc9fa7af6 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -7,7 +7,7 @@ //! 
comments on them. //! -use super::bindings::PageHeaderData; +use super::bindings::{PageHeaderData, XLogRecord}; use crate::BLCKSZ; // @@ -176,7 +176,7 @@ pub const XLOG_DBASE_DROP: u8 = 0x10; pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; -pub const SIZEOF_XLOGRECORD: u32 = 24; +pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::() as u32; // // from xlogrecord.h @@ -206,15 +206,10 @@ pub const INVALID_TRANSACTION_ID: u32 = 0; pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; -/* FIXME: pageserver should request wal_seg_size from compute node */ -pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; - pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_LONG_HEADER: u16 = 0x0002; -pub const PG_MAJORVERSION: &str = "14"; - // List of subdirectories inside pgdata. // Copied from src/bin/initdb/initdb.c pub const PGDATA_SUBDIRS: [&str; 22] = [ diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index b509fc87a5..4d79e4b1d1 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -9,8 +9,8 @@ //! for that is in pageserver/src/walrecord.rs //! use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; -use super::pg_constants; use super::xlog_utils::*; +use crate::WAL_SEGMENT_SIZE; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; @@ -133,7 +133,7 @@ impl WalStreamDecoder { // However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason. match self.state { State::WaitingForRecord | State::ReassemblingRecord { .. 
} => { - if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { + if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 { // parse long header if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD { @@ -265,7 +265,7 @@ impl WalStreamDecoder { // to the next WAL segment. let next_lsn = if xlogrec.is_xlog_switch_record() { trace!("saw xlog switch record at {}", self.lsn); - self.lsn + self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) + self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64) } else { // Pad to an 8-byte boundary self.lsn.align() diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0d9aaa4708..f8606b6e47 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -10,14 +10,14 @@ use crc32c::crc32c_append; use super::bindings::{ - CheckPoint, FullTransactionId, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, - XLOG_PAGE_MAGIC, + CheckPoint, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, + XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; use super::pg_constants; -use super::pg_constants::WAL_SEGMENT_SIZE; -use crate::v14::waldecoder::WalStreamDecoder; +use super::waldecoder::WalStreamDecoder; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; +use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use bytes::BytesMut; use bytes::{Buf, Bytes}; @@ -37,11 +37,9 @@ use utils::bin_ser::SerializeError; use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; -pub const XLOG_BLCKSZ: usize = 8192; pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; -pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::(); pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::(); @@ -49,11 +47,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = 
std::mem::size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; -pub type XLogRecPtr = u64; -pub type TimeLineID = u32; -pub type TimestampTz = i64; -pub type XLogSegNo = u64; - /// Interval of checkpointing metadata file. We should store metadata file to enforce /// predicate that checkpoint.nextXid is larger than any XID in WAL. /// But flushing checkpoint file for each transaction seems to be too expensive, @@ -318,9 +311,9 @@ impl CheckPoint { // We need this segment to start compute node. // pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { - let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize); + let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize); - let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE); + let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); let hdr = XLogLongPageHeaderData { std: { XLogPageHeaderData { @@ -333,7 +326,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result Result Vec { #[cfg(test)] mod tests { + use super::super::PG_MAJORVERSION; use super::*; use regex::Regex; use std::cmp::min; @@ -434,23 +428,23 @@ mod tests { use utils::const_assert; fn init_logging() { - let _ = env_logger::Builder::from_env( - env_logger::Env::default() - .default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"), - ) + let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( + format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), + )) .is_test(true) .try_init(); } fn test_end_of_wal(test_name: &str) { use wal_craft::*; + // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); let cfg = Conf { - pg_distrib_dir: top_path.join("pg_install/v14"), - datadir: top_path.join(format!("test_output/{}", test_name)), + pg_distrib_dir: top_path.join(format!("pg_install/{PG_MAJORVERSION}")), + 
datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { fs::remove_dir_all(&cfg.datadir).unwrap(); diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 6ac5afb27f..2ad92d776d 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,10 +4,8 @@ use log::*; use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; -use postgres_ffi::v14::pg_constants::WAL_SEGMENT_SIZE; -use postgres_ffi::v14::xlog_utils::{ - XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, -}; +use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; +use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; use std::cmp::Ordering; use std::fs; use std::path::{Path, PathBuf}; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 48b5f1a695..cd99c3c67d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -30,7 +30,7 @@ use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFil use postgres_ffi::v14::{CheckPoint, ControlFileData}; use postgres_ffi::TransactionId; use postgres_ffi::PG_TLI; -use postgres_ffi::{BLCKSZ, RELSEG_SIZE}; +use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, @@ -268,8 +268,11 @@ where None }; + // TODO pass this as a parameter + let pg_version = "14"; + if spcnode == pg_constants::GLOBALTABLESPACE_OID { - let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); + let version_bytes = pg_version.as_bytes(); let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; self.ar.append(&header, version_bytes)?; @@ -312,7 +315,7 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); + let version_bytes = 
pg_version.as_bytes(); let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; self.ar.append(&header, version_bytes)?; @@ -358,7 +361,7 @@ where let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; // Generate new pg_control needed for bootstrap - checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0; + checkpoint.redo = normalize_lsn(self.lsn, WAL_SEGMENT_SIZE).0; //reset some fields we don't want to preserve //TODO Check this. @@ -392,13 +395,13 @@ where self.ar.append(&header, &pg_control_bytes[..])?; //send wal segment - let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE); + let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); + let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); - let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?; + let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; let wal_seg = generate_wal_segment(segno, pg_control.system_identifier) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE); + ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 4cc3aafb0e..f8f614f8f4 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -21,7 +21,7 @@ use postgres_ffi::v14::waldecoder::*; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; use postgres_ffi::Oid; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; // Returns checkpoint LSN from controlfile @@ -238,15 +238,15 @@ fn import_slru( fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, 
endpoint: Lsn) -> Result<()> { let mut waldecoder = WalStreamDecoder::new(startpoint); - let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE); + let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); + let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; let mut walingest = WalIngest::new(tline, startpoint)?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now - let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); + let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let mut buf = Vec::new(); // Read local file @@ -265,7 +265,7 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) } let nread = file.read_to_end(&mut buf)?; - if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize { + if nread != WAL_SEGMENT_SIZE - offset as usize { // Maybe allow this for .partial files? 
error!("read only {} bytes from WAL file", nread); } @@ -355,8 +355,8 @@ pub fn import_wal_from_tar( ) -> Result<()> { // Set up walingest mutable state let mut waldecoder = WalStreamDecoder::new(start_lsn); - let mut segno = start_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let mut offset = start_lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE); + let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); + let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; let mut walingest = WalIngest::new(tline, start_lsn)?; @@ -373,7 +373,7 @@ pub fn import_wal_from_tar( match header.entry_type() { tar::EntryType::Regular => { // FIXME: assume postgresql tli 1 for now - let expected_filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); + let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let file_name = file_path .file_name() .expect("missing wal filename") diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7bba64179c..ba48a77961 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -14,9 +14,8 @@ use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::TimestampTz; use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, TransactionId}; +use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::Range; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index c56b1c6c0c..c718a4c30c 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -4,10 +4,10 @@ use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::v14::xlog_utils::XLOG_SIZE_OF_XLOG_RECORD; use 
postgres_ffi::v14::XLogRecord; use postgres_ffi::BLCKSZ; -use postgres_ffi::{BlockNumber, OffsetNumber}; +use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use tracing::*; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 3f84e7b183..16c1d36131 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -18,8 +18,8 @@ use crate::safekeeper::{ }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::TimelineTools; -use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils; +use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ lsn::Lsn, postgres_backend::PostgresBackend, @@ -100,7 +100,7 @@ fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { ztli: spg.ztimelineid.unwrap(), tenant_id: spg.ztenantid.unwrap(), tli: 0, - wal_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, // 16MB, default for tests + wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests }); let response = spg.timeline.get().process_msg(&greeting_request)?; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 648f0634f8..c693035dd3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -7,7 +7,7 @@ use metrics::{ proto::MetricFamily, Gauge, IntGaugeVec, }; -use postgres_ffi::v14::xlog_utils::XLogSegNo; +use postgres_ffi::XLogSegNo; use utils::{lsn::Lsn, zid::ZTenantTimelineId}; use crate::{ diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index ed34669dde..a2bdcb55e7 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,7 +5,7 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use etcd_broker::subscription_value::SkTimelineInfo; -use postgres_ffi::v14::xlog_utils::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; +use 
postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 38523f9f82..293cf67c57 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -6,9 +6,9 @@ use crate::timeline::{ReplicaState, Timeline, TimelineTools}; use crate::wal_storage::WalReader; use anyhow::{bail, Context, Result}; -use postgres_ffi::v14::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; - use bytes::Bytes; +use postgres_ffi::v14::xlog_utils::get_current_timestamp; +use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::min; use std::net::Shutdown; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f482dbb3aa..8d101e6ff6 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -6,7 +6,7 @@ use anyhow::{bail, Context, Result}; use etcd_broker::subscription_value::SkTimelineInfo; use once_cell::sync::Lazy; -use postgres_ffi::v14::xlog_utils::XLogSegNo; +use postgres_ffi::XLogSegNo; use serde::Serialize; use tokio::sync::watch; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 5c6991c196..6acc70e85a 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,8 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr}; -use postgres_ffi::PG_TLI; +use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::{XLogSegNo, PG_TLI}; use remote_storage::GenericRemoteStorage; use tokio::fs::File; use tokio::runtime::Builder; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 6a45ae1411..644237a00d 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -14,9 +14,9 @@ use tokio::io::AsyncRead; use 
once_cell::sync::Lazy; use postgres_ffi::v14::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, + find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; -use postgres_ffi::PG_TLI; +use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::min; use std::fs::{self, remove_file, File, OpenOptions}; From dc2150a90eaeee5f4a297d896f4eeb9ded63a8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Wed, 7 Sep 2022 15:11:03 +0300 Subject: [PATCH 028/166] Add built files to gitignore (#2404) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 618ff2c5b9..f1afdee599 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,6 @@ test_output/ *.key *.crt +*.o +*.so +*.Po From 83dca73f85ad859d156b2550c5108faddb2cff0d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 7 Sep 2022 14:16:48 +0100 Subject: [PATCH 029/166] Store Allure tests statistics in database (#2367) --- .github/actions/allure-report/action.yml | 8 +- .github/workflows/build_and_test.yml | 18 +++++ scripts/ingest_perf_test_result.py | 7 +- scripts/ingest_regress_test_result.py | 97 ++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 scripts/ingest_regress_test_result.py diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index 34761f8df1..ec751f51fc 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -11,6 +11,10 @@ inputs: test_selection: description: '`test_selector` from run-python-test-set action' required: false +outputs: + report-url: + description: 'Allure report URL' + value: ${{ steps.generate-report.outputs.report-url }} runs: using: "composite" @@ -182,7 +186,7 @@ runs: aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" echo "[Allure Report](${REPORT_URL})" >> 
${GITHUB_STEP_SUMMARY} - echo "::set-output name=REPORT_URL::${REPORT_URL}" + echo "::set-output name=report-url::${REPORT_URL}" - name: Release Allure lock if: ${{ inputs.action == 'generate' && always() }} @@ -200,7 +204,7 @@ runs: - uses: actions/github-script@v6 if: ${{ inputs.action == 'generate' && always() }} env: - REPORT_URL: ${{ steps.generate-report.outputs.REPORT_URL }} + REPORT_URL: ${{ steps.generate-report.outputs.report-url }} BUILD_TYPE: ${{ inputs.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6d966ce0a2..1387514cc2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -253,6 +253,7 @@ jobs: real_s3_region: us-west-2 real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" + - name: Merge and upload coverage data if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data @@ -308,11 +309,28 @@ jobs: submodules: false - name: Create Allure report + id: create-allure-report uses: ./.github/actions/allure-report with: action: generate build_type: ${{ matrix.build_type }} + - name: Store Allure test stat in the DB + env: + BUILD_TYPE: ${{ matrix.build_type }} + SHA: ${{ github.event.pull_request.head.sha || github.sha }} + REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }} + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} + shell: bash -euxo pipefail {0} + run: | + curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json + ./scripts/pysync + + # Workaround for https://github.com/neondatabase/cloud/issues/2188 + psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10 + + DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} 
--ingest suites.json + coverage-report: runs-on: dev container: diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 71f7ad3262..7f2af290a2 100644 --- a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -2,6 +2,7 @@ import argparse import json import os +import sys from contextlib import contextmanager from datetime import datetime from pathlib import Path @@ -26,7 +27,7 @@ CREATE TABLE IF NOT EXISTS perf_test_results ( def err(msg): print(f"error: {msg}") - exit(1) + sys.exit(1) @contextmanager @@ -43,8 +44,8 @@ def create_table(cur): cur.execute(CREATE_TABLE) -def ingest_perf_test_result(cursor, data_dile: Path, recorded_at_timestamp: int) -> int: - run_data = json.loads(data_dile.read_text()) +def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int) -> int: + run_data = json.loads(data_file.read_text()) revision = run_data["revision"] platform = run_data["platform"] diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py new file mode 100644 index 0000000000..e07a972c67 --- /dev/null +++ b/scripts/ingest_regress_test_result.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +import argparse +import os +import sys +from contextlib import contextmanager +from pathlib import Path + +import psycopg2 + +CREATE_TABLE = """ +CREATE TABLE IF NOT EXISTS regress_test_results ( + id SERIAL PRIMARY KEY, + reference CHAR(255), + revision CHAR(40), + build_type CHAR(16), + data JSONB +) +""" + + +def err(msg): + print(f"error: {msg}") + sys.exit(1) + + +@contextmanager +def get_connection_cursor(): + connstr = os.getenv("DATABASE_URL") + if not connstr: + err("DATABASE_URL environment variable is not set") + with psycopg2.connect(connstr, connect_timeout=30) as conn: + with conn.cursor() as cur: + yield cur + + +def create_table(cur): + cur.execute(CREATE_TABLE) + + +def ingest_regress_test_result( + cursor, reference: str, revision: str, build_type: str, 
data_file: Path +): + values = ( + reference, + revision, + build_type, + data_file.read_text(), + ) + cursor.execute( + """ + INSERT INTO regress_test_results ( + reference, + revision, + build_type, + data + ) VALUES (%s, %s, %s, %s) + """, + values, + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Regress test result uploader. \ + Database connection string should be provided via DATABASE_URL environment variable", + ) + parser.add_argument("--initdb", action="store_true", help="Initialuze database") + parser.add_argument( + "--reference", type=str, required=True, help="git reference, for example refs/heads/main" + ) + parser.add_argument("--revision", type=str, required=True, help="git revision") + parser.add_argument( + "--build-type", type=str, required=True, help="build type: release, debug or remote" + ) + parser.add_argument( + "--ingest", type=Path, required=True, help="Path to regress test result file" + ) + + args = parser.parse_args() + with get_connection_cursor() as cur: + if args.initdb: + create_table(cur) + + if not args.ingest.exists(): + err(f"ingest path {args.ingest} does not exist") + + ingest_regress_test_result( + cur, + reference=args.reference, + revision=args.revision, + build_type=args.build_type, + data_file=args.ingest, + ) + + +if __name__ == "__main__": + main() From 9e3136ea378547308abee959e8175224fee79572 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 7 Sep 2022 21:40:08 +0100 Subject: [PATCH 030/166] scripts/ingest_regress_test_result.py: fix json data insertion (#2408) --- scripts/ingest_regress_test_result.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py index e07a972c67..974167483a 100644 --- a/scripts/ingest_regress_test_result.py +++ b/scripts/ingest_regress_test_result.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse import os +import re import sys from contextlib import 
contextmanager from pathlib import Path @@ -40,11 +41,17 @@ def create_table(cur): def ingest_regress_test_result( cursor, reference: str, revision: str, build_type: str, data_file: Path ): + data = data_file.read_text() + # In the JSON report we can have lines related to LazyFixture with escaped double-quote + # It's hard to insert them into jsonb field as is, so replace \" with ' to make it easier for us + # + # "" -> "" + data = re.sub(r'("")', r"\g<1>'\g<2>'\g<3>", data) values = ( reference, revision, build_type, - data_file.read_text(), + data, ) cursor.execute( """ From 1351beae19be72b148a2ae6bebec29c5aafa38c0 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 8 Sep 2022 12:57:30 +0200 Subject: [PATCH 031/166] Fix race condition in ginHeapTupleFastInsert (#2412) Because the metadata was not locked, it could be updated concurrently such that we wouldn't actually have the tail block. The current ordering works better, as we still only start XLogBeginInsert() once we have all potentially interesting buffers loaded in memory, but still have correct lock lifetimes. 
See also: access/transam/README section Write-Ahead Log Coding --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a4963aa6df..e8518d3fc8 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a4963aa6df6a44bdee17ef387c01bcf46f6017fd +Subproject commit e8518d3fc85e3da420d2f5a2742a21386e6585ec diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 26c6466873..313769bb62 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 26c64668736b729a3e4c02c6fc0a84544118df26 +Subproject commit 313769bb6229f46380e24d8f6ff535f9185458af From 171385ac14efa41b8e9cfe73851ff772c9722ce4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 8 Sep 2022 16:02:11 +0100 Subject: [PATCH 032/166] Pass COPT and PG_CFLAGS to Extension's CFLAGS (#2405) * fix incompatible-function-pointer-types warning * Pass COPT and PG_CFLAGS to Extension's CFLAGS --- .github/workflows/codestyle.yml | 1 + Makefile | 9 ++++----- pgxn/neon/pagestore_client.h | 2 +- pgxn/neon/pagestore_smgr.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index a5e31d49ee..bc21054e18 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -17,6 +17,7 @@ concurrency: env: RUST_BACKTRACE: 1 + COPT: '-Werror' jobs: check-codestyle-rust: diff --git a/Makefile b/Makefile index 0e7ceec15b..4d7b1bee07 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,6 @@ ifeq ($(UNAME_S),Linux) PG_CONFIGURE_OPTS += --with-libseccomp endif - # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable UNAME_S := $(shell uname -s) @@ -144,24 +143,24 @@ neon-pg-ext-v14: postgres-v14 +@echo "Compiling neon v14" mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14 (cd 
$(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) +@echo "Compiling neon_test_utils" v14 mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) neon-pg-ext-v15: postgres-v15 +@echo "Compiling neon v15" mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15 (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) +@echo "Compiling neon_test_utils" v15 mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) .PHONY: neon-pg-ext-clean diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index f79a3c9142..93ea6771eb 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -182,7 +182,7 @@ extern void zenith_write(SMgrRelation reln, ForkNumber forknum, extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); -extern const int64 zenith_dbsize(Oid dbNode); +extern int64 zenith_dbsize(Oid dbNode); extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, 
BlockNumber nblocks); extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index e3f083fd43..d49df7af58 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1347,7 +1347,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) /* * zenith_db_size() -- Get the size of the database in bytes. */ -const int64 +int64 zenith_dbsize(Oid dbNode) { ZenithResponse *resp; From 35b4816f09b0697fe2c7e1c7b15d87cdb85cf1b7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 31 Aug 2022 12:17:41 +0300 Subject: [PATCH 033/166] Turn GenericRemoteStorage into just a newtype around 'Arc<dyn RemoteStorage>' We had a pattern like this: match remote_storage { GenericRemoteStorage::Local(storage) => { let source = storage.remote_object_id(&file_path)?; ... storage .function(&source, ...) .await }, GenericRemoteStorage::S3(storage) => { ... exact same code as for the Local case ... }, This removes the code duplication, by allowing you to call the functions directly on GenericRemoteStorage. Also change RemoteObjectId to be just a type alias for String. Now that the callers of GenericRemoteStorage functions don't know whether they're dealing with the LocalFs or S3 implementation, RemoteObjectId must be the same type for both.
--- libs/remote_storage/src/lib.rs | 235 +++++++++++++----------- libs/remote_storage/src/local_fs.rs | 131 ++++++++----- libs/remote_storage/src/s3_bucket.rs | 164 +++++++---------- pageserver/src/bin/pageserver.rs | 12 +- pageserver/src/http/routes.rs | 6 +- pageserver/src/storage_sync.rs | 26 +-- pageserver/src/storage_sync/delete.rs | 31 ++-- pageserver/src/storage_sync/download.rs | 110 ++++------- pageserver/src/storage_sync/upload.rs | 24 ++- pageserver/src/tenant_mgr.rs | 2 +- safekeeper/src/wal_backup.rs | 13 +- 11 files changed, 374 insertions(+), 380 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 8a10e098a1..55db91dc31 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -14,8 +14,10 @@ use std::{ ffi::OsStr, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, + ops::Deref, path::{Path, PathBuf}, pin::Pin, + sync::Arc, }; use anyhow::{bail, Context}; @@ -24,10 +26,7 @@ use tokio::io; use toml_edit::Item; use tracing::info; -pub use self::{ - local_fs::LocalFs, - s3_bucket::{S3Bucket, S3ObjectKey}, -}; +pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket}; /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency @@ -42,22 +41,62 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; + +#[derive(Clone, PartialEq, Eq)] +pub struct RemoteObjectId(String); + +impl From for String { + fn from(id: RemoteObjectId) -> Self { + id.0 + } +} + +/// +/// A key that refers to an object in remote storage. 
It works much like a Path, +/// but it's a separate datatype so that you don't accidentally mix local paths +/// and remote keys. +/// +impl RemoteObjectId { + // Needed to retrieve last component for RemoteObjectId. + // In other words a file name + /// Turn a/b/c or a/b/c/ into c + pub fn object_name(&self) -> Option<&str> { + // corner case, char::to_string is not const, thats why this is more verbose than it needs to be + // see https://github.com/rust-lang/rust/issues/88674 + if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR { + return None; + } + + if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1) + } else { + self.0 + .rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR) + .map(|(_, last)| last) + } + } +} + +impl Debug for RemoteObjectId { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + self.0.fmt(fmt) + } +} + /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. #[async_trait::async_trait] -pub trait RemoteStorage: Send + Sync { - /// A way to uniquely reference a file in the remote storage. - type RemoteObjectId; - +pub trait RemoteStorage: Send + Sync + 'static { /// Attempts to derive the storage path out of the local path, if the latter is correct. - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; /// Gets the download path of the given storage file. - fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result; + fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result; /// Lists all items the storage has right now. 
- async fn list(&self) -> anyhow::Result>; + async fn list(&self) -> anyhow::Result>; /// Lists all top level subdirectories for a given prefix /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id @@ -65,34 +104,39 @@ pub trait RemoteStorage: Send + Sync { /// so this method doesnt need to. async fn list_prefixes( &self, - prefix: Option<&Self::RemoteObjectId>, - ) -> anyhow::Result>; + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result>; /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. from_size_bytes: usize, - to: &Self::RemoteObjectId, + to: &RemoteObjectId, metadata: Option, ) -> anyhow::Result<()>; /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. - async fn download(&self, from: &Self::RemoteObjectId) -> Result; + async fn download(&self, from: &RemoteObjectId) -> Result; /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. async fn download_byte_range( &self, - from: &Self::RemoteObjectId, + from: &RemoteObjectId, start_inclusive: u64, end_exclusive: Option, ) -> Result; - async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>; + async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>; + + /// Downcast to LocalFs implementation. For tests. 
+ fn as_local(&self) -> Option<&LocalFs> { + None + } } pub struct Download { @@ -135,34 +179,37 @@ impl std::error::Error for DownloadError {} /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. -pub enum GenericRemoteStorage { - Local(LocalFs), - S3(S3Bucket), +#[derive(Clone)] +pub struct GenericRemoteStorage(Arc); + +impl Deref for GenericRemoteStorage { + type Target = dyn RemoteStorage; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } } impl GenericRemoteStorage { - pub fn new( + pub fn new(storage: impl RemoteStorage) -> Self { + Self(Arc::new(storage)) + } + + pub fn from_config( working_directory: PathBuf, storage_config: &RemoteStorageConfig, - ) -> anyhow::Result { - match &storage_config.storage { + ) -> anyhow::Result { + Ok(match &storage_config.storage { RemoteStorageKind::LocalFs(root) => { info!("Using fs root '{}' as a remote storage", root.display()); - LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local) + GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?) } RemoteStorageKind::AwsS3(s3_config) => { info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3) + s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?) } - } - } - - pub fn as_local(&self) -> Option<&LocalFs> { - match self { - Self::Local(local_fs) => Some(local_fs), - _ => None, - } + }) } /// Takes storage object contents and its size and uploads to remote storage, @@ -172,47 +219,26 @@ impl GenericRemoteStorage { /// this path is used for the remote object id conversion only. 
pub async fn upload_storage_object( &self, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from: Box, from_size_bytes: usize, from_path: &Path, ) -> anyhow::Result<()> { - async fn do_upload_storage_object( - storage: &S, - from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, - from_size_bytes: usize, - from_path: &Path, - ) -> anyhow::Result<()> - where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, - { - let target_storage_path = storage.remote_object_id(from_path).with_context(|| { + let target_storage_path = self.remote_object_id(from_path).with_context(|| { + format!( + "Failed to get the storage path for source local path '{}'", + from_path.display() + ) + })?; + + self.upload(from, from_size_bytes, &target_storage_path, None) + .await + .with_context(|| { format!( - "Failed to get the storage path for source local path '{}'", - from_path.display() + "Failed to upload from '{}' to storage path '{:?}'", + from_path.display(), + target_storage_path ) - })?; - - storage - .upload(from, from_size_bytes, &target_storage_path, None) - .await - .with_context(|| { - format!( - "Failed to upload from '{}' to storage path '{:?}'", - from_path.display(), - target_storage_path - ) - }) - } - - match self { - GenericRemoteStorage::Local(storage) => { - do_upload_storage_object(storage, from, from_size_bytes, from_path).await - } - GenericRemoteStorage::S3(storage) => { - do_upload_storage_object(storage, from, from_size_bytes, from_path).await - } - } + }) } /// Downloads the storage object into the `to_path` provided. 
@@ -222,42 +248,22 @@ impl GenericRemoteStorage { byte_range: Option<(u64, Option)>, to_path: &Path, ) -> Result { - async fn do_download_storage_object( - storage: &S, - byte_range: Option<(u64, Option)>, - to_path: &Path, - ) -> Result - where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, - { - let remote_object_path = storage - .remote_object_id(to_path) - .with_context(|| { - format!( - "Failed to get the storage path for target local path '{}'", - to_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; + let remote_object_path = self + .remote_object_id(to_path) + .with_context(|| { + format!( + "Failed to get the storage path for target local path '{}'", + to_path.display() + ) + }) + .map_err(DownloadError::BadInput)?; - match byte_range { - Some((start, end)) => { - storage - .download_byte_range(&remote_object_path, start, end) - .await - } - None => storage.download(&remote_object_path).await, - } - } - - match self { - GenericRemoteStorage::Local(storage) => { - do_download_storage_object(storage, byte_range, to_path).await - } - GenericRemoteStorage::S3(storage) => { - do_download_storage_object(storage, byte_range, to_path).await + match byte_range { + Some((start, end)) => { + self.download_byte_range(&remote_object_path, start, end) + .await } + None => self.download(&remote_object_path).await, } } } @@ -463,4 +469,23 @@ mod tests { "/foo/bar.baz..temp" ); } + + #[test] + fn object_name() { + let k = RemoteObjectId("a/b/c".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = RemoteObjectId("a/b/c/".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = RemoteObjectId("a/".to_owned()); + assert_eq!(k.object_name(), Some("a")); + + // XXX is it impossible to have an empty key? 
+ let k = RemoteObjectId("".to_owned()); + assert_eq!(k.object_name(), None); + + let k = RemoteObjectId("/".to_owned()); + assert_eq!(k.object_name(), None); + } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index ddf6c01759..2561c0ca24 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -17,10 +17,19 @@ use tokio::{ }; use tracing::*; -use crate::{path_with_suffix_extension, Download, DownloadError}; +use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +/// Convert a Path in the remote storage into a RemoteObjectId +fn remote_object_id_from_path(path: &Path) -> anyhow::Result { + Ok(RemoteObjectId( + path.to_str() + .ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))? + .to_string(), + )) +} + pub struct LocalFs { working_directory: PathBuf, storage_root: PathBuf, @@ -43,11 +52,17 @@ impl LocalFs { }) } - fn resolve_in_storage(&self, path: &Path) -> anyhow::Result { + /// + /// Get the absolute path in the local filesystem to given remote object. + /// + /// This is public so that it can be used in tests. Should not be used elsewhere. 
+ /// + pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { + let path = PathBuf::from(&remote_object_id.0); if path.is_relative() { Ok(self.storage_root.join(path)) } else if path.starts_with(&self.storage_root) { - Ok(path.to_path_buf()) + Ok(path) } else { bail!( "Path '{}' does not belong to the current storage", @@ -85,38 +100,42 @@ impl LocalFs { #[async_trait::async_trait] impl RemoteStorage for LocalFs { - type RemoteObjectId = PathBuf; - - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { - Ok(self.storage_root.join( + /// Convert a "local" path into a "remote path" + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + let path = self.storage_root.join( strip_path_prefix(&self.working_directory, local_path) .context("local path does not belong to this storage")?, - )) + ); + remote_object_id_from_path(&path) } - fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result { - let relative_path = strip_path_prefix(&self.storage_root, storage_path) + fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { + let storage_path = PathBuf::from(&remote_object_id.0); + let relative_path = strip_path_prefix(&self.storage_root, &storage_path) .context("local path does not belong to this storage")?; Ok(self.working_directory.join(relative_path)) } - async fn list(&self) -> anyhow::Result> { + async fn list(&self) -> anyhow::Result> { get_all_files(&self.storage_root, true).await } async fn list_prefixes( &self, - prefix: Option<&Self::RemoteObjectId>, - ) -> anyhow::Result> { - let path = prefix.unwrap_or(&self.storage_root); + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result> { + let path = match prefix { + Some(prefix) => Path::new(&prefix.0), + None => &self.storage_root, + }; get_all_files(path, false).await } async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 
'static)>, from_size_bytes: usize, - to: &Self::RemoteObjectId, + to: &RemoteObjectId, metadata: Option, ) -> anyhow::Result<()> { let target_file_path = self.resolve_in_storage(to)?; @@ -197,7 +216,7 @@ impl RemoteStorage for LocalFs { Ok(()) } - async fn download(&self, from: &Self::RemoteObjectId) -> Result { + async fn download(&self, from: &RemoteObjectId) -> Result { let file_path = self .resolve_in_storage(from) .map_err(DownloadError::BadInput)?; @@ -231,7 +250,7 @@ impl RemoteStorage for LocalFs { async fn download_byte_range( &self, - from: &Self::RemoteObjectId, + from: &RemoteObjectId, start_inclusive: u64, end_exclusive: Option, ) -> Result { @@ -285,7 +304,7 @@ impl RemoteStorage for LocalFs { } } - async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { + async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> { let file_path = self.resolve_in_storage(path)?; if file_path.exists() && file_path.is_file() { Ok(fs::remove_file(file_path).await?) @@ -296,6 +315,10 @@ impl RemoteStorage for LocalFs { ) } } + + fn as_local(&self) -> Option<&LocalFs> { + Some(self) + } } fn storage_metadata_path(original_path: &Path) -> PathBuf { @@ -305,7 +328,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf { fn get_all_files<'a, P>( directory_path: P, recursive: bool, -) -> Pin>> + Send + Sync + 'a>> +) -> Pin>> + Send + Sync + 'a>> where P: AsRef + Send + Sync + 'a, { @@ -322,12 +345,12 @@ where debug!("{:?} us a symlink, skipping", entry_path) } else if file_type.is_dir() { if recursive { - paths.extend(get_all_files(entry_path, true).await?.into_iter()) + paths.extend(get_all_files(&entry_path, true).await?.into_iter()) } else { - paths.push(dir_entry.path()) + paths.push(remote_object_id_from_path(&dir_entry.path())?) 
} } else { - paths.push(dir_entry.path()); + paths.push(remote_object_id_from_path(&dir_entry.path())?); } } Ok(paths) @@ -389,9 +412,15 @@ mod pure_tests { .join("file_name"); let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?); + let actual_path = PathBuf::from( + storage + .remote_object_id(&local_path) + .expect("Matching path should map to storage path normally") + .0, + ); assert_eq!( expected_path, - storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"), + actual_path, "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir" ); @@ -452,7 +481,9 @@ mod pure_tests { assert_eq!( local_path, storage - .local_path(&storage_root.join(local_path.strip_prefix(&workdir)?)) + .local_path(&remote_object_id_from_path( + &storage_root.join(local_path.strip_prefix(&workdir)?) + )?) .expect("For a valid input, valid local path should be parsed"), "Should be able to parse metadata out of the correctly named remote delta file" ); @@ -476,8 +507,7 @@ mod pure_tests { #[test] fn local_path_negatives() -> anyhow::Result<()> { #[track_caller] - #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements - fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String { + fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String { match storage.local_path(storage_path) { Ok(wrong_path) => panic!( "Expected local path input {:?} to cause an error, but got file path: {:?}", @@ -494,7 +524,8 @@ mod pure_tests { }; let totally_wrong_path = "wrong_wrong_wrong"; - let error_message = local_path_error(&storage, &PathBuf::from(totally_wrong_path)); + let error_message = + local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string())); assert!(error_message.contains(totally_wrong_path)); Ok(()) @@ -537,7 +568,7 @@ mod fs_tests { storage: &LocalFs, #[allow(clippy::ptr_arg)] // have 
to use &PathBuf due to `storage.local_path` parameter requirements - remote_storage_path: &PathBuf, + remote_storage_path: &RemoteObjectId, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { let mut download = storage @@ -568,12 +599,20 @@ mod fs_tests { "whatever_contents", ) .await?; - let target_path = PathBuf::from("/").join("somewhere").join("else"); - match storage.upload(file, size, &target_path, None).await { + let target_path = "/somewhere/else"; + match storage + .upload( + Box::new(file), + size, + &RemoteObjectId(target_path.to_string()), + None, + ) + .await + { Ok(()) => panic!("Should not allow storing files with wrong target path"), Err(e) => { let message = format!("{:?}", e); - assert!(message.contains(&target_path.display().to_string())); + assert!(message.contains(target_path)); assert!(message.contains("does not belong to the current storage")); } } @@ -606,20 +645,20 @@ mod fs_tests { // Check that you get an error if the size parameter doesn't match the actual // size of the stream. storage - .upload(content.clone(), 0, &id, None) + .upload(Box::new(content.clone()), 0, &id, None) .await .expect_err("upload with zero size succeeded"); storage - .upload(content.clone(), 4, &id, None) + .upload(Box::new(content.clone()), 4, &id, None) .await .expect_err("upload with too short size succeeded"); storage - .upload(content.clone(), 6, &id, None) + .upload(Box::new(content.clone()), 6, &id, None) .await .expect_err("upload with too large size succeeded"); // Correct size is 5, this should succeed. 
- storage.upload(content, 5, &id, None).await?; + storage.upload(Box::new(content), 5, &id, None).await?; Ok(()) } @@ -643,8 +682,8 @@ mod fs_tests { "We should upload and download the same contents" ); - let non_existing_path = PathBuf::from("somewhere").join("else"); - match storage.download(&non_existing_path).await { + let non_existing_path = "somewhere/else"; + match storage.download(&RemoteObjectId(non_existing_path.to_string())).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -783,7 +822,7 @@ mod fs_tests { Err(e) => { let error_string = e.to_string(); assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&upload_target.display().to_string())); + assert!(error_string.contains(&upload_target.0)); } } Ok(()) @@ -844,15 +883,19 @@ mod fs_tests { storage: &LocalFs, name: &str, metadata: Option, - ) -> anyhow::Result { + ) -> anyhow::Result { let timeline_path = workdir.join("timelines").join("some_timeline"); let relative_timeline_path = timeline_path.strip_prefix(&workdir)?; let storage_path = storage.storage_root.join(relative_timeline_path).join(name); + let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string()); let from_path = storage.working_directory.join(name); let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?; - storage.upload(file, size, &storage_path, metadata).await?; - Ok(storage_path) + + storage + .upload(Box::new(file), size, &remote_object_id, metadata) + .await?; + remote_object_id_from_path(&storage_path) } async fn create_file_for_upload( @@ -877,9 +920,9 @@ mod fs_tests { format!("contents for {name}") } - async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { + async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { let mut files = storage.list().await?; - files.sort(); + 
files.sort_by(|a, b| a.0.cmp(&b.0)); Ok(files) } } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index db31200c36..74632430cd 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -19,7 +19,10 @@ use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config}; +use crate::{ + strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config, + REMOTE_STORAGE_PREFIX_SEPARATOR, +}; use super::StorageMetadata; @@ -88,50 +91,26 @@ pub(super) mod metrics { } } -const S3_PREFIX_SEPARATOR: char = '/'; +fn download_destination( + id: &RemoteObjectId, + workdir: &Path, + prefix_to_strip: Option<&str>, +) -> PathBuf { + let path_without_prefix = match prefix_to_strip { + Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| { + panic!( + "Could not strip prefix '{}' from S3 object key '{}'", + prefix, id.0 + ) + }), + None => &id.0, + }; -#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub struct S3ObjectKey(String); - -impl S3ObjectKey { - /// Turn a/b/c or a/b/c/ into c - pub fn object_name(&self) -> Option<&str> { - // corner case, char::to_string is not const, thats why this is more verbose than it needs to be - // see https://github.com/rust-lang/rust/issues/88674 - if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR { - return None; - } - - if self.0.ends_with(S3_PREFIX_SEPARATOR) { - self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1) - } else { - self.0 - .rsplit_once(S3_PREFIX_SEPARATOR) - .map(|(_, last)| last) - } - } - - fn key(&self) -> &str { - &self.0 - } - - fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf { - let path_without_prefix = match prefix_to_strip { - Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| { - panic!( - "Could not strip prefix '{}' from S3 object key 
'{}'", - prefix, self.0 - ) - }), - None => &self.0, - }; - - workdir.join( - path_without_prefix - .split(S3_PREFIX_SEPARATOR) - .collect::(), - ) - } + workdir.join( + path_without_prefix + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .collect::(), + ) } /// AWS S3 storage. @@ -193,12 +172,12 @@ impl S3Bucket { let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { let mut prefix = prefix; - while prefix.starts_with(S3_PREFIX_SEPARATOR) { + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix = &prefix[1..] } let mut prefix = prefix.to_string(); - while prefix.ends_with(S3_PREFIX_SEPARATOR) { + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { prefix.pop(); } prefix @@ -249,23 +228,25 @@ impl S3Bucket { #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - type RemoteObjectId = S3ObjectKey; - - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { let relative_path = strip_path_prefix(&self.workdir, local_path)?; let mut key = self.prefix_in_bucket.clone().unwrap_or_default(); for segment in relative_path { - key.push(S3_PREFIX_SEPARATOR); + key.push(REMOTE_STORAGE_PREFIX_SEPARATOR); key.push_str(&segment.to_string_lossy()); } - Ok(S3ObjectKey(key)) + Ok(RemoteObjectId(key)) } - fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result { - Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref())) + fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result { + Ok(download_destination( + storage_path, + &self.workdir, + self.prefix_in_bucket.as_deref(), + )) } - async fn list(&self) -> anyhow::Result> { + async fn list(&self) -> anyhow::Result> { let mut document_keys = Vec::new(); let mut continuation_token = None; @@ -296,7 +277,7 @@ impl RemoteStorage for S3Bucket { .contents .unwrap_or_default() .into_iter() - .filter_map(|o| Some(S3ObjectKey(o.key?))), + .filter_map(|o| 
Some(RemoteObjectId(o.key?))), ); match fetch_response.continuation_token { @@ -312,8 +293,8 @@ impl RemoteStorage for S3Bucket { /// Note: it wont include empty "directories" async fn list_prefixes( &self, - prefix: Option<&Self::RemoteObjectId>, - ) -> anyhow::Result> { + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result> { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| p.0.clone()) @@ -321,8 +302,8 @@ impl RemoteStorage for S3Bucket { .map(|mut p| { // required to end with a separator // otherwise request will return only the entry of a prefix - if !p.ends_with(S3_PREFIX_SEPARATOR) { - p.push(S3_PREFIX_SEPARATOR); + if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); } p }); @@ -345,7 +326,7 @@ impl RemoteStorage for S3Bucket { bucket: self.bucket_name.clone(), prefix: list_prefix.clone(), continuation_token, - delimiter: Some(S3_PREFIX_SEPARATOR.to_string()), + delimiter: Some(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()), ..ListObjectsV2Request::default() }) .await @@ -359,7 +340,7 @@ impl RemoteStorage for S3Bucket { .common_prefixes .unwrap_or_default() .into_iter() - .filter_map(|o| Some(S3ObjectKey(o.prefix?))), + .filter_map(|o| Some(RemoteObjectId(o.prefix?))), ); match fetch_response.continuation_token { @@ -373,9 +354,9 @@ impl RemoteStorage for S3Bucket { async fn upload( &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, from_size_bytes: usize, - to: &Self::RemoteObjectId, + to: &RemoteObjectId, metadata: Option, ) -> anyhow::Result<()> { let _guard = self @@ -392,7 +373,7 @@ impl RemoteStorage for S3Bucket { from_size_bytes, )), bucket: self.bucket_name.clone(), - key: to.key().to_owned(), + key: to.0.to_owned(), metadata: metadata.map(|m| m.0), ..PutObjectRequest::default() }) @@ -404,10 +385,10 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn 
download(&self, from: &Self::RemoteObjectId) -> Result { + async fn download(&self, from: &RemoteObjectId) -> Result { self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), - key: from.key().to_owned(), + key: from.0.to_owned(), ..GetObjectRequest::default() }) .await @@ -415,7 +396,7 @@ impl RemoteStorage for S3Bucket { async fn download_byte_range( &self, - from: &Self::RemoteObjectId, + from: &RemoteObjectId, start_inclusive: u64, end_exclusive: Option, ) -> Result { @@ -429,14 +410,14 @@ impl RemoteStorage for S3Bucket { self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), - key: from.key().to_owned(), + key: from.0.to_owned(), range, ..GetObjectRequest::default() }) .await } - async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { + async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> { let _guard = self .concurrency_limiter .acquire() @@ -448,7 +429,7 @@ impl RemoteStorage for S3Bucket { self.client .delete_object(DeleteObjectRequest { bucket: self.bucket_name.clone(), - key: path.key().to_owned(), + key: remote_object_id.0.to_owned(), ..DeleteObjectRequest::default() }) .await @@ -467,43 +448,24 @@ mod tests { use super::*; #[test] - fn object_name() { - let k = S3ObjectKey("a/b/c".to_owned()); - assert_eq!(k.object_name(), Some("c")); - - let k = S3ObjectKey("a/b/c/".to_owned()); - assert_eq!(k.object_name(), Some("c")); - - let k = S3ObjectKey("a/".to_owned()); - assert_eq!(k.object_name(), Some("a")); - - // XXX is it impossible to have an empty key? 
- let k = S3ObjectKey("".to_owned()); - assert_eq!(k.object_name(), None); - - let k = S3ObjectKey("/".to_owned()); - assert_eq!(k.object_name(), None); - } - - #[test] - fn download_destination() -> anyhow::Result<()> { + fn test_download_destination() -> anyhow::Result<()> { let workdir = tempdir()?.path().to_owned(); let local_path = workdir.join("one").join("two").join("test_name"); let relative_path = local_path.strip_prefix(&workdir)?; - let key = S3ObjectKey(format!( + let key = RemoteObjectId(format!( "{}{}", - S3_PREFIX_SEPARATOR, + REMOTE_STORAGE_PREFIX_SEPARATOR, relative_path .iter() .map(|segment| segment.to_str().unwrap()) .collect::>() - .join(&S3_PREFIX_SEPARATOR.to_string()), + .join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()), )); assert_eq!( local_path, - key.download_destination(&workdir, None), + download_destination(&key, &workdir, None), "Download destination should consist of s3 path joined with the workdir prefix" ); @@ -520,8 +482,8 @@ mod tests { let storage = dummy_storage(workdir); - let expected_key = S3ObjectKey(format!( - "{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}", + let expected_key = RemoteObjectId(format!( + "{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}", storage.prefix_in_bucket.as_deref().unwrap_or_default(), )); @@ -592,7 +554,7 @@ mod tests { storage.prefix_in_bucket.as_deref(), ); assert_eq!( - s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()), + download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), storage .local_path(&s3_key) .expect("For a valid input, valid S3 info should be parsed"), @@ -604,7 +566,7 @@ mod tests { storage.prefix_in_bucket.as_deref(), ); assert_eq!( - s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()), + download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), storage .local_path(&s3_key) .expect("For a valid input, valid S3 info should be 
parsed"), @@ -645,11 +607,11 @@ mod tests { } } - fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> S3ObjectKey { - S3ObjectKey(relative_file_path.iter().fold( + fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId { + RemoteObjectId(relative_file_path.iter().fold( prefix.unwrap_or_default().to_string(), |mut path_string, segment| { - path_string.push(S3_PREFIX_SEPARATOR); + path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR); path_string.push_str(segment.to_str().unwrap()); path_string }, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7a33a548e7..5a43516728 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,7 +1,7 @@ //! Main entry point for the Page Server executable. use remote_storage::GenericRemoteStorage; -use std::{env, ops::ControlFlow, path::Path, str::FromStr, sync::Arc}; +use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; use anyhow::{bail, Context, Result}; @@ -302,11 +302,13 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() let remote_storage = conf .remote_storage_config .as_ref() - .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config)) + .map(|storage_config| { + GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config) + }) .transpose() - .context("Failed to init generic remote storage")? 
- .map(Arc::new); - let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.as_ref().map(Arc::clone))?; + .context("Failed to init generic remote storage")?; + + let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?; // Spawn a new thread for the http endpoint // bind before launching separate thread so the error reported before startup exits diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 09c4812067..a31c2fd2a5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -34,7 +34,7 @@ struct State { auth: Option>, remote_index: RemoteIndex, allowlist_routes: Vec, - remote_storage: Option>, + remote_storage: Option, } impl State { @@ -42,7 +42,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, - remote_storage: Option>, + remote_storage: Option, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -659,7 +659,7 @@ pub fn make_router( conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, - remote_storage: Option>, + remote_storage: Option, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 491f882e0b..42fd6b8ea8 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -150,7 +150,7 @@ use std::{ num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, path::{Path, PathBuf}, - sync::{Arc, Condvar, Mutex}, + sync::{Condvar, Mutex}, }; use anyhow::{anyhow, bail, Context}; @@ -222,7 +222,7 @@ pub struct SyncStartupData { /// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. 
pub fn start_local_timeline_sync( config: &'static PageServerConf, - storage: Option>, + storage: Option, ) -> anyhow::Result { let local_timeline_files = local_tenant_timeline_files(config) .context("Failed to collect local tenant timeline files")?; @@ -766,7 +766,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { pub(super) fn spawn_storage_sync_thread( conf: &'static PageServerConf, local_timeline_files: HashMap)>, - storage: Arc, + storage: GenericRemoteStorage, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, ) -> anyhow::Result { @@ -825,12 +825,12 @@ pub(super) fn spawn_storage_sync_thread( fn storage_sync_loop( runtime: Runtime, conf: &'static PageServerConf, - (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, ) { info!("Starting remote storage sync loop"); loop { - let loop_storage = Arc::clone(&storage); + let loop_storage = storage.clone(); let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); @@ -939,7 +939,7 @@ enum UploadStatus { async fn process_batches( conf: &'static PageServerConf, max_sync_errors: NonZeroU32, - storage: Arc, + storage: GenericRemoteStorage, index: &RemoteIndex, batched_tasks: HashMap, sync_queue: &SyncQueue, @@ -947,7 +947,7 @@ async fn process_batches( let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { - let storage = Arc::clone(&storage); + let storage = storage.clone(); let index = index.clone(); async move { let state_update = process_sync_task_batch( @@ -981,7 +981,7 @@ async fn process_batches( async fn process_sync_task_batch( conf: &'static PageServerConf, - (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, batch: SyncTaskBatch, @@ -1009,7 +1009,7 @@ 
async fn process_sync_task_batch( ControlFlow::Continue(()) => { upload_timeline_data( conf, - (storage.as_ref(), &index, sync_queue), + (&storage, &index, sync_queue), current_remote_timeline.as_ref(), sync_id, upload_data, @@ -1020,7 +1020,7 @@ async fn process_sync_task_batch( } ControlFlow::Break(()) => match update_remote_data( conf, - storage.as_ref(), + &storage, &index, sync_id, RemoteDataUpdate::Upload { @@ -1053,7 +1053,7 @@ async fn process_sync_task_batch( ControlFlow::Continue(()) => { return download_timeline_data( conf, - (storage.as_ref(), &index, sync_queue), + (&storage, &index, sync_queue), current_remote_timeline.as_ref(), sync_id, download_data, @@ -1086,7 +1086,7 @@ async fn process_sync_task_batch( ControlFlow::Continue(()) => { delete_timeline_data( conf, - (storage.as_ref(), &index, sync_queue), + (&storage, &index, sync_queue), sync_id, delete_data, sync_start, @@ -1098,7 +1098,7 @@ async fn process_sync_task_batch( ControlFlow::Break(()) => { if let Err(e) = update_remote_data( conf, - storage.as_ref(), + &storage, &index, sync_id, RemoteDataUpdate::Delete(&delete_data.data.deleted_layers), diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index d80a082d0c..794ecbaeb3 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -7,15 +7,15 @@ use futures::stream::{FuturesUnordered, StreamExt}; use tracing::{debug, error, info}; use crate::storage_sync::{SyncQueue, SyncTask}; -use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use remote_storage::GenericRemoteStorage; use utils::zid::ZTenantTimelineId; use super::{LayersDeletion, SyncData}; /// Attempts to remove the timleline layers from the remote storage. /// If the task had not adjusted the metadata before, the deletion will fail. 
-pub(super) async fn delete_timeline_layers<'a>( - storage: &'a GenericRemoteStorage, +pub(super) async fn delete_timeline_layers( + storage: &GenericRemoteStorage, sync_queue: &SyncQueue, sync_id: ZTenantTimelineId, mut delete_data: SyncData, @@ -43,14 +43,7 @@ pub(super) async fn delete_timeline_layers<'a>( let mut delete_tasks = layers_to_delete .into_iter() .map(|local_layer_path| async { - match match storage { - GenericRemoteStorage::Local(storage) => { - remove_storage_object(storage, &local_layer_path).await - } - GenericRemoteStorage::S3(storage) => { - remove_storage_object(storage, &local_layer_path).await - } - } { + match remove_storage_object(storage, &local_layer_path).await { Ok(()) => Ok(local_layer_path), Err(e) => Err((e, local_layer_path)), } @@ -88,11 +81,10 @@ pub(super) async fn delete_timeline_layers<'a>( errored } -async fn remove_storage_object(storage: &S, local_layer_path: &Path) -> anyhow::Result<()> -where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +async fn remove_storage_object( + storage: &GenericRemoteStorage, + local_layer_path: &Path, +) -> anyhow::Result<()> { let storage_path = storage .remote_object_id(local_layer_path) .with_context(|| { @@ -132,7 +124,7 @@ mod tests { let harness = RepoHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), )?); @@ -167,7 +159,7 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), )?); @@ -180,7 
+172,8 @@ mod tests { let timeline_upload = create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = local_storage.remote_object_id(&local_path)?; + let remote_path = + local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index e11a863dcc..372ca0a463 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -9,9 +9,7 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{ - path_with_suffix_extension, DownloadError, GenericRemoteStorage, RemoteStorage, -}; +use remote_storage::{path_with_suffix_extension, DownloadError, GenericRemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -371,68 +369,6 @@ async fn get_timeline_sync_ids( tenant_path: &Path, tenant_id: ZTenantId, ) -> anyhow::Result> { - let timeline_ids: Vec = match storage { - GenericRemoteStorage::Local(storage) => list_prefixes(storage, tenant_path) - .await? - .into_iter() - .map(|timeline_directory_path| { - timeline_directory_path - .file_stem() - .with_context(|| { - format!( - "Failed to get timeline id string from file '{}'", - timeline_directory_path.display() - ) - })? - .to_string_lossy() - .as_ref() - .parse() - .with_context(|| { - format!( - "failed to parse directory name '{}' as timeline id", - timeline_directory_path.display() - ) - }) - }) - .collect::>(), - GenericRemoteStorage::S3(storage) => list_prefixes(storage, tenant_path) - .await? - .into_iter() - .map(|s3_path| { - s3_path - .object_name() - .with_context(|| { - format!("Failed to get object name out of S3 path {s3_path:?}") - })? 
- .parse() - .with_context(|| { - format!("failed to parse object name '{s3_path:?}' as timeline id") - }) - }) - .collect::>(), - } - .with_context(|| { - format!("Tenant {tenant_id} has at least one incorrect timeline subdirectory") - })?; - - if timeline_ids.is_empty() { - anyhow::bail!("no timelines found on the remote storage for tenant {tenant_id}") - } - - Ok(timeline_ids - .into_iter() - .map(|timeline_id| ZTenantTimelineId { - tenant_id, - timeline_id, - }) - .collect()) -} - -async fn list_prefixes(storage: &S, tenant_path: &Path) -> anyhow::Result> -where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| { format!( "Failed to get tenant storage path for local path '{}'", @@ -440,14 +376,37 @@ where ) })?; - storage + let timelines = storage .list_prefixes(Some(&tenant_storage_path)) .await .with_context(|| { format!( "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download" ) - }) + })?; + + if timelines.is_empty() { + anyhow::bail!("no timelines found on the remote storage") + } + + let mut sync_ids = HashSet::new(); + + for timeline_remote_storage_key in timelines { + let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + })?; + + let timeline_id: ZTimelineId = object_name.parse().with_context(|| { + format!("failed to parse object name into timeline id '{object_name}'") + })?; + + sync_ids.insert(ZTenantTimelineId { + tenant_id, + timeline_id, + }); + } + + Ok(sync_ids) } async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { @@ -459,6 +418,7 @@ mod tests { use std::{ collections::{BTreeSet, HashSet}, num::NonZeroUsize, + path::PathBuf, }; use remote_storage::{LocalFs, RemoteStorage}; @@ -482,7 +442,7 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); 
let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -494,7 +454,8 @@ mod tests { create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = local_storage.remote_object_id(&local_path)?; + let remote_path = + local_storage.resolve_in_storage(&storage.remote_object_id(&local_path)?)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -580,7 +541,7 @@ mod tests { let harness = RepoHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -639,7 +600,7 @@ mod tests { let harness = RepoHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -663,9 +624,10 @@ mod tests { let local_index_part_path = metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); - let storage_path = local_storage.remote_object_id(&local_index_part_path)?; - fs::create_dir_all(storage_path.parent().unwrap()).await?; - fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; + let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; + let index_part_local_path = 
PathBuf::from(String::from(index_part_remote_id)); + fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; + fs::write(&index_part_local_path, serde_json::to_vec(&index_part)?).await?; let downloaded_index_part = download_index_part(harness.conf, &storage, sync_id).await?; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 8dd73d9431..7070f941f5 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -34,7 +34,11 @@ pub(super) async fn upload_index_part( let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); storage - .upload_storage_object(index_part_bytes, index_part_size, &index_part_path) + .upload_storage_object( + Box::new(index_part_bytes), + index_part_size, + &index_part_path, + ) .await .with_context(|| format!("Failed to upload index part for '{sync_id}'")) } @@ -119,7 +123,7 @@ pub(super) async fn upload_timeline_layers<'a>( .len() as usize; match storage - .upload_storage_object(source_file, source_size, &source_path) + .upload_storage_object(Box::new(source_file), source_size, &source_path) .await .with_context(|| format!("Failed to upload layer file for {sync_id}")) { @@ -214,8 +218,8 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; - let storage = GenericRemoteStorage::Local(LocalFs::new( - tempdir()?.path().to_owned(), + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), )?); let local_storage = storage.as_local().unwrap(); @@ -302,7 +306,7 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -395,7 +399,7 @@ 
mod tests { let harness = RepoHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = GenericRemoteStorage::Local(LocalFs::new( + let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), )?); @@ -431,13 +435,13 @@ mod tests { let index_part_path = storage_files.first().unwrap(); assert_eq!( - index_part_path.file_name().and_then(|name| name.to_str()), + index_part_path.object_name(), Some(IndexPart::FILE_NAME), "Remote index part should have the correct name" ); - - let remote_index_part: IndexPart = - serde_json::from_slice(&fs::read(&index_part_path).await?)?; + let remote_index_part: IndexPart = serde_json::from_slice( + &fs::read(local_storage.resolve_in_storage(index_part_path)?).await?, + )?; assert_eq!( index_part, remote_index_part, "Remote index part should match the local one" diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 7c82745142..041bd50737 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -134,7 +134,7 @@ impl fmt::Display for TenantState { /// are scheduled for download and added to the repository once download is completed. 
pub fn init_tenant_mgr( conf: &'static PageServerConf, - remote_storage: Option>, + remote_storage: Option, ) -> anyhow::Result { let (timeline_updates_sender, timeline_updates_receiver) = mpsc::unbounded_channel::(); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 6acc70e85a..5d946e37a4 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -127,7 +127,8 @@ async fn wal_backup_launcher_main_loop( let conf_ = conf.clone(); REMOTE_STORAGE.get_or_init(|| { conf_.remote_storage.as_ref().map(|c| { - GenericRemoteStorage::new(conf_.workdir, c).expect("failed to create remote storage") + GenericRemoteStorage::from_config(conf_.workdir, c) + .expect("failed to create remote storage") }) }); @@ -417,7 +418,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { static REMOTE_STORAGE: OnceCell> = OnceCell::new(); async fn backup_object(source_file: &Path, size: usize) -> Result<()> { - let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); + let storage = REMOTE_STORAGE + .get() + .expect("failed to get remote storage") + .as_ref() + .unwrap(); let file = tokio::io::BufReader::new(File::open(&source_file).await.with_context(|| { format!( @@ -427,9 +432,7 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> { })?); storage - .as_ref() - .expect("Storage should be initialized by launcher at this point.") - .upload_storage_object(file, size, source_file) + .upload_storage_object(Box::new(file), size, source_file) .await } From 0b76b82e0ebfbbaaf9a6cf07217a5055c52ac196 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 8 Sep 2022 14:08:02 +0300 Subject: [PATCH 034/166] review clean up --- libs/remote_storage/src/lib.rs | 16 ++++++++-------- pageserver/src/storage_sync/download.rs | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 55db91dc31..e89f60de7e 100644 --- 
a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -12,7 +12,7 @@ use std::{ borrow::Cow, collections::HashMap, ffi::OsStr, - fmt::Debug, + fmt::{Debug, Display}, num::{NonZeroU32, NonZeroUsize}, ops::Deref, path::{Path, PathBuf}, @@ -46,12 +46,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; #[derive(Clone, PartialEq, Eq)] pub struct RemoteObjectId(String); -impl From for String { - fn from(id: RemoteObjectId) -> Self { - id.0 - } -} - /// /// A key that refers to an object in remote storage. It works much like a Path, /// but it's a separate datatype so that you don't accidentally mix local paths @@ -80,7 +74,13 @@ impl RemoteObjectId { impl Debug for RemoteObjectId { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - self.0.fmt(fmt) + Debug::fmt(&self.0, fmt) + } +} + +impl Display for RemoteObjectId { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + Display::fmt(&self.0, fmt) } } diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 372ca0a463..b0beb4219a 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -625,7 +625,7 @@ mod tests { metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; - let index_part_local_path = PathBuf::from(String::from(index_part_remote_id)); + let index_part_local_path = PathBuf::from(index_part_remote_id.to_string()); fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; fs::write(&index_part_local_path, serde_json::to_vec(&index_part)?).await?; From d3f83eda52a1f4e372f9149ffc8b824ef3478a25 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 00:07:14 +0300 Subject: [PATCH 035/166] Use regular agent for triggering e2e tests --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1387514cc2..bf9de7d857 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -412,7 +412,7 @@ jobs: trigger-e2e-tests: runs-on: dev container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init needs: [ build-neon ] steps: From c9e7c2f014a2d6ce269ccb7943a22d778378e512 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 7 Sep 2022 17:03:20 +0300 Subject: [PATCH 036/166] Ensure all temporary and empty directories and files are cleansed on pageserver startup --- libs/remote_storage/src/lib.rs | 7 + libs/remote_storage/src/local_fs.rs | 5 +- pageserver/src/http/routes.rs | 5 +- pageserver/src/layered_repository.rs | 105 ++-- .../src/layered_repository/delta_layer.rs | 7 +- .../src/layered_repository/image_layer.rs | 4 +- pageserver/src/lib.rs | 79 +++ pageserver/src/storage_sync.rs | 314 +++--------- pageserver/src/storage_sync/download.rs | 5 +- pageserver/src/tenant_mgr.rs | 480 +++++++++++++----- pageserver/src/tenant_tasks.rs | 10 - pageserver/src/timelines.rs | 21 +- pageserver/src/walredo.rs | 25 +- test_runner/regress/test_broken_timeline.py | 45 +- 14 files changed, 639 insertions(+), 473 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e89f60de7e..6b3fd29a0e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -344,6 +344,8 @@ impl Debug for S3Config { } } +/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension, +/// or if there's no extension, creates one and puts a suffix there. 
pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { let new_extension = match original_path .as_ref() @@ -468,6 +470,11 @@ mod tests { &path_with_suffix_extension(&p, ".temp").to_string_lossy(), "/foo/bar.baz..temp" ); + let p = PathBuf::from("/foo/bar/dir/"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar/dir..temp" + ); } #[test] diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 2561c0ca24..3ffbf3cb39 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -21,6 +21,8 @@ use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId} use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; + /// Convert a Path in the remote storage into a RemoteObjectId fn remote_object_id_from_path(path: &Path) -> anyhow::Result { Ok(RemoteObjectId( @@ -143,7 +145,8 @@ impl RemoteStorage for LocalFs { // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. 
This was really hit when pageserver shutdown // cancelled the upload and partial file was left on the fs - let temp_file_path = path_with_suffix_extension(&target_file_path, "temp"); + let temp_file_path = + path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX); let mut destination = io::BufWriter::new( fs::OpenOptions::new() .write(true) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a31c2fd2a5..59142bd9b2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -470,7 +470,7 @@ async fn tenant_list_handler(request: Request) -> Result, A let response_data = tokio::task::spawn_blocking(move || { let _enter = info_span!("tenant_list").entered(); - crate::tenant_mgr::list_tenants(&remote_index) + crate::tenant_mgr::list_tenant_info(&remote_index) }) .await .map_err(ApiError::from_err)?; @@ -640,7 +640,8 @@ async fn tenant_config_handler(mut request: Request) -> Result Result { + let _guard = match self.file_lock.try_read() { + Ok(g) => g, + Err(_) => { + info!("File lock write acquired, shutting down GC"); + return Ok(GcResult::default()); + } + }; + let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -315,6 +323,14 @@ impl Repository { /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. pub fn compaction_iteration(&self) -> Result<()> { + let _guard = match self.file_lock.try_read() { + Ok(g) => g, + Err(_) => { + info!("File lock write acquired, shutting down compaction"); + return Ok(()); + } + }; + // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. 
We don't want to block everything else while the @@ -401,10 +417,10 @@ impl Repository { pub fn init_attach_timelines( &self, - timelines: Vec<(ZTimelineId, TimelineMetadata)>, + timelines: HashMap, ) -> anyhow::Result<()> { let sorted_timelines = if timelines.len() == 1 { - timelines + timelines.into_iter().collect() } else if !timelines.is_empty() { tree_sort_timelines(timelines)? } else { @@ -442,7 +458,7 @@ impl Repository { /// perform a topological sort, so that the parent of each timeline comes /// before the children. fn tree_sort_timelines( - timelines: Vec<(ZTimelineId, TimelineMetadata)>, + timelines: HashMap, ) -> Result> { let mut result = Vec::with_capacity(timelines.len()); @@ -567,13 +583,8 @@ impl Repository { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag) } - pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) -> Result<()> { - let mut tenant_conf = self.tenant_conf.write().unwrap(); - - tenant_conf.update(&new_tenant_conf); - - Repository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?; - Ok(()) + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) { + self.tenant_conf.write().unwrap().update(&new_tenant_conf); } fn initialize_new_timeline( @@ -648,32 +659,37 @@ impl Repository { tenant_id: ZTenantId, ) -> anyhow::Result { let target_config_path = TenantConf::path(conf, tenant_id); + let target_config_display = target_config_path.display(); - info!("load tenantconf from {}", target_config_path.display()); + info!("loading tenantconf from {target_config_display}"); // FIXME If the config file is not found, assume that we're attaching // a detached tenant and config is passed via attach command. 
// https://github.com/neondatabase/neon/issues/1555 if !target_config_path.exists() { - info!( - "tenant config not found in {}", - target_config_path.display() - ); - return Ok(Default::default()); + info!("tenant config not found in {target_config_display}"); + return Ok(TenantConfOpt::default()); } // load and parse file - let config = fs::read_to_string(target_config_path)?; + let config = fs::read_to_string(&target_config_path).with_context(|| { + format!("Failed to load config from path '{target_config_display}'") + })?; - let toml = config.parse::()?; + let toml = config.parse::().with_context(|| { + format!("Failed to parse config from file '{target_config_display}' as toml file") + })?; - let mut tenant_conf: TenantConfOpt = Default::default(); + let mut tenant_conf = TenantConfOpt::default(); for (key, item) in toml.iter() { match key { "tenant_config" => { - tenant_conf = PageServerConf::parse_toml_tenant_conf(item)?; + tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| { + format!("Failed to parse config from file '{target_config_display}' as pageserver config") + })?; } - _ => bail!("unrecognized pageserver option '{}'", key), + _ => bail!("config file {target_config_display} has unrecognized pageserver option '{key}'"), + } } @@ -888,26 +904,6 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { Ok(()) } -pub fn load_metadata( - conf: &'static PageServerConf, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, -) -> anyhow::Result { - let metadata_path = metadata_path(conf, timeline_id, tenant_id); - let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { - format!( - "Failed to read metadata bytes from path {}", - metadata_path.display() - ) - })?; - TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { - format!( - "Failed to parse metadata bytes from path {}", - metadata_path.display() - ) - }) -} - #[cfg(test)] pub mod repo_harness { use bytes::{Bytes, BytesMut}; @@ -925,6 
+921,7 @@ pub mod repo_harness { walredo::{WalRedoError, WalRedoManager}, }; + use super::metadata::metadata_path; use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; @@ -1030,7 +1027,7 @@ pub mod repo_harness { false, ); // populate repo with locally available timelines - let mut timelines_to_load = Vec::new(); + let mut timelines_to_load = HashMap::new(); for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines dir") { @@ -1042,7 +1039,7 @@ pub mod repo_harness { .to_string_lossy() .parse()?; let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; - timelines_to_load.push((timeline_id, timeline_metadata)); + timelines_to_load.insert(timeline_id, timeline_metadata); } repo.init_attach_timelines(timelines_to_load)?; @@ -1054,6 +1051,26 @@ pub mod repo_harness { } } + fn load_metadata( + conf: &'static PageServerConf, + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + ) -> anyhow::Result { + let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { + format!( + "Failed to read metadata bytes from path {}", + metadata_path.display() + ) + })?; + TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { + format!( + "Failed to parse metadata bytes from path {}", + metadata_path.display() + ) + }) + } + // Mock WAL redo manager that doesn't do much pub struct TestRedoManager; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index ce5cb57745..af02f84bc0 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -34,7 +34,7 @@ use crate::layered_repository::storage_layer::{ use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; -use crate::walrecord; +use 
crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use rand::{distributions::Alphanumeric, Rng}; @@ -447,11 +447,12 @@ impl DeltaLayer { .collect(); conf.timeline_path(&timelineid, &tenantid).join(format!( - "{}-XXX__{:016X}-{:016X}.{}.temp", + "{}-XXX__{:016X}-{:016X}.{}.{}", key_start, u64::from(lsn_range.start), u64::from(lsn_range.end), - rand_string + rand_string, + TEMP_FILE_SUFFIX, )) } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index bb24553afd..4fe771bb3f 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -30,7 +30,7 @@ use crate::layered_repository::storage_layer::{ use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value, KEY_SIZE}; use crate::virtual_file::VirtualFile; -use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION}; +use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; @@ -255,7 +255,7 @@ impl ImageLayer { .collect(); conf.timeline_path(&timelineid, &tenantid) - .join(format!("{}.{}.temp", fname, rand_string)) + .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } /// diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 4731179e22..86bbf25b67 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -23,7 +23,10 @@ pub mod walreceiver; pub mod walrecord; pub mod walredo; +use std::collections::HashMap; + use tracing::info; +use utils::zid::{ZTenantId, ZTimelineId}; use crate::thread_mgr::ThreadKind; @@ -100,6 +103,50 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds } } +/// A newtype to store arbitrary data grouped by tenant and timeline ids. 
+/// One could use [`utils::zid::ZTenantTimelineId`] for grouping, but that would +/// not include the cases where a certain tenant has zero timelines. +/// This is sometimes important: a tenant could be registered during initial load from FS, +/// even if he has no timelines on disk. +#[derive(Debug)] +pub struct TenantTimelineValues(HashMap>); + +impl TenantTimelineValues { + fn new() -> Self { + Self(HashMap::new()) + } + + fn with_capacity(capacity: usize) -> Self { + Self(HashMap::with_capacity(capacity)) + } + + /// A convenience method to map certain values and omit some of them, if needed. + /// Tenants that won't have any timeline entries due to the filtering, will still be preserved + /// in the structure. + fn filter_map(self, map: F) -> TenantTimelineValues + where + F: Fn(T) -> Option, + { + let capacity = self.0.len(); + self.0.into_iter().fold( + TenantTimelineValues::::with_capacity(capacity), + |mut new_values, (tenant_id, old_values)| { + let new_timeline_values = new_values.0.entry(tenant_id).or_default(); + for (timeline_id, old_value) in old_values { + if let Some(new_value) = map(old_value) { + new_timeline_values.insert(timeline_id, new_value); + } + } + new_values + }, + ) + } +} + +/// A suffix to be used during file sync from the remote storage, +/// to ensure that we do not leave corrupted files that pretend to be layers. 
+const TEMP_FILE_SUFFIX: &str = "___temp"; + #[cfg(test)] mod backoff_defaults_tests { use super::*; @@ -130,3 +177,35 @@ mod backoff_defaults_tests { ); } } + +#[cfg(test)] +mod tests { + use crate::layered_repository::repo_harness::TIMELINE_ID; + + use super::*; + + #[test] + fn tenant_timeline_value_mapping() { + let first_tenant = ZTenantId::generate(); + let second_tenant = ZTenantId::generate(); + assert_ne!(first_tenant, second_tenant); + + let mut initial = TenantTimelineValues::new(); + initial + .0 + .entry(first_tenant) + .or_default() + .insert(TIMELINE_ID, "test_value"); + let _ = initial.0.entry(second_tenant).or_default(); + assert_eq!(initial.0.len(), 2, "Should have entries for both tenants"); + + let filtered = initial.filter_map(|_| None::<&str>).0; + assert_eq!( + filtered.len(), + 2, + "Should have entries for both tenants even after filtering away all entries" + ); + assert!(filtered.contains_key(&first_tenant)); + assert!(filtered.contains_key(&second_tenant)); + } +} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 42fd6b8ea8..57a964cb67 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -145,7 +145,6 @@ mod upload; use std::{ collections::{hash_map, HashMap, HashSet, VecDeque}, - ffi::OsStr, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, ops::ControlFlow, @@ -170,244 +169,56 @@ use self::{ index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; -use crate::metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}; use crate::{ config::PageServerConf, exponential_backoff, - layered_repository::{ - ephemeral_file::is_ephemeral_file, - metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, - }, - storage_sync::{self, index::RemoteIndex}, - tenant_mgr::attach_downloaded_tenants, + layered_repository::metadata::{metadata_path, TimelineMetadata}, + storage_sync::index::RemoteIndex, + 
tenant_mgr::attach_local_tenants, thread_mgr, thread_mgr::ThreadKind, }; +use crate::{ + metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, + TenantTimelineValues, +}; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; -pub use self::download::TEMP_DOWNLOAD_EXTENSION; static SYNC_QUEUE: OnceCell = OnceCell::new(); /// A timeline status to share with pageserver's sync counterpart, /// after comparing local and remote timeline state. -#[derive(Clone, Copy, Debug)] +#[derive(Clone)] pub enum LocalTimelineInitStatus { /// The timeline has every remote layer present locally. /// There could be some layers requiring uploading, /// but this does not block the timeline from any user interaction. - LocallyComplete, + LocallyComplete(TimelineMetadata), /// A timeline has some files remotely, that are not present locally and need downloading. /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, /// so the data needs to be downloaded first before the timeline can be used. NeedsSync, } -type LocalTimelineInitStatuses = HashMap>; +impl std::fmt::Debug for LocalTimelineInitStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::LocallyComplete(_) => write!(f, "LocallyComplete"), + Self::NeedsSync => write!(f, "NeedsSync"), + } + } +} /// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. /// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, /// to simplify the received code. 
pub struct SyncStartupData { pub remote_index: RemoteIndex, - pub local_timeline_init_statuses: LocalTimelineInitStatuses, -} - -/// Based on the config, initiates the remote storage connection and starts a separate thread -/// that ensures that pageserver and the remote storage are in sync with each other. -/// If no external configuration connection given, no thread or storage initialization is done. -/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. -pub fn start_local_timeline_sync( - config: &'static PageServerConf, - storage: Option, -) -> anyhow::Result { - let local_timeline_files = local_tenant_timeline_files(config) - .context("Failed to collect local tenant timeline files")?; - - match storage.zip(config.remote_storage_config.as_ref()) { - Some((storage, storage_config)) => storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - storage, - storage_config.max_concurrent_syncs, - storage_config.max_sync_errors, - ) - .context("Failed to spawn the storage sync thread"), - None => { - info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); - for ( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - _, - ) in local_timeline_files - { - local_timeline_init_statuses - .entry(tenant_id) - .or_default() - .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete); - } - Ok(SyncStartupData { - local_timeline_init_statuses, - remote_index: RemoteIndex::default(), - }) - } - } -} - -fn local_tenant_timeline_files( - config: &'static PageServerConf, -) -> anyhow::Result)>> { - let mut local_tenant_timeline_files = HashMap::new(); - let tenants_dir = config.tenants_path(); - for tenants_dir_entry in std::fs::read_dir(&tenants_dir) - .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? 
- { - match &tenants_dir_entry { - Ok(tenants_dir_entry) => { - match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) { - Ok(collected_files) => { - local_tenant_timeline_files.extend(collected_files.into_iter()) - } - Err(e) => error!( - "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", - tenants_dir.display(), - tenants_dir_entry, - e - ), - } - } - Err(e) => error!( - "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", - tenants_dir_entry, - tenants_dir.display(), - e - ), - } - } - - Ok(local_tenant_timeline_files) -} - -fn collect_timelines_for_tenant( - config: &'static PageServerConf, - tenant_path: &Path, -) -> anyhow::Result)>> { - let mut timelines = HashMap::new(); - let tenant_id = tenant_path - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse tenant id out of the tenant dir name")?; - let timelines_dir = config.timelines_path(&tenant_id); - - for timelines_dir_entry in std::fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines dir entry for tenant {}", - tenant_id - ) - })? 
{ - match timelines_dir_entry { - Ok(timelines_dir_entry) => { - let timeline_path = timelines_dir_entry.path(); - match collect_timeline_files(&timeline_path) { - Ok((timeline_id, metadata, timeline_files)) => { - timelines.insert( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - (metadata, timeline_files), - ); - } - Err(e) => error!( - "Failed to process timeline dir contents at '{}', reason: {:?}", - timeline_path.display(), - e - ), - } - } - Err(e) => error!( - "Failed to list timelines for entry tenant {}, reason: {:?}", - tenant_id, e - ), - } - } - - Ok(timelines) -} - -// discover timeline files and extract timeline metadata -// NOTE: ephemeral files are excluded from the list -fn collect_timeline_files( - timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { - let mut timeline_files = HashSet::new(); - let mut timeline_metadata_path = None; - - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse timeline id out of the timeline dir name")?; - let timeline_dir_entries = - std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; - for entry in timeline_dir_entries { - let entry_path = entry.context("Failed to list timeline dir entry")?.path(); - if entry_path.is_file() { - if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { - timeline_metadata_path = Some(entry_path); - } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { - debug!("skipping ephemeral file {}", entry_path.display()); - continue; - } else if entry_path.extension().and_then(OsStr::to_str) - == Some(TEMP_DOWNLOAD_EXTENSION) - { - info!("removing temp download file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { - format!( - "failed to remove temp download file at {}", - entry_path.display() - ) - })?; - } else if 
entry_path.extension().and_then(OsStr::to_str) == Some("temp") { - info!("removing temp layer file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { - format!( - "failed to remove temp layer file at {}", - entry_path.display() - ) - })?; - } else { - timeline_files.insert(entry_path); - } - } - } - - // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed - // then attach is lost. There would be no retries for that, - // initial collect will fail because there is no metadata. - // We either need to start download if we see empty dir after restart or attach caller should - // be aware of that and retry attach if awaits_download for timeline switched from true to false - // but timelinne didn't appear locally. - // Check what happens with remote index in that case. - let timeline_metadata_path = match timeline_metadata_path { - Some(path) => path, - None => bail!("No metadata file found in the timeline directory"), - }; - let metadata = TimelineMetadata::from_bytes( - &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, - ) - .context("Failed to parse timeline metadata file bytes")?; - - Ok((timeline_id, metadata, timeline_files)) + pub local_timeline_init_statuses: TenantTimelineValues, } /// Global queue of sync tasks. @@ -763,9 +574,9 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
-pub(super) fn spawn_storage_sync_thread( +pub fn spawn_storage_sync_thread( conf: &'static PageServerConf, - local_timeline_files: HashMap)>, + local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet)>, storage: GenericRemoteStorage, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, @@ -784,19 +595,43 @@ pub(super) fn spawn_storage_sync_thread( .build() .context("Failed to create storage sync runtime")?; + // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time: + // * we need to list every timeline for tenant on S3, that might be a costly operation + // * we need to download every timeline for the tenant, to activate it in memory + // + // When on-demand download gets merged, we're able to do this fast by storing timeline metadata only. + let mut empty_tenants = TenantTimelineValues::::new(); + let mut keys_for_index_part_downloads = HashSet::new(); + let mut timelines_to_sync = HashMap::new(); + + for (tenant_id, timeline_data) in local_timeline_files.0 { + if timeline_data.is_empty() { + let _ = empty_tenants.0.entry(tenant_id).or_default(); + } else { + for (timeline_id, timeline_data) in timeline_data { + let id = ZTenantTimelineId::new(tenant_id, timeline_id); + keys_for_index_part_downloads.insert(id); + timelines_to_sync.insert(id, timeline_data); + } + } + } + let applicable_index_parts = runtime.block_on(download_index_parts( conf, &storage, - local_timeline_files.keys().copied().collect(), + keys_for_index_part_downloads, )); let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; - let local_timeline_init_statuses = schedule_first_sync_tasks( + let mut local_timeline_init_statuses = schedule_first_sync_tasks( &mut runtime.block_on(remote_index.write()), sync_queue, - local_timeline_files, + timelines_to_sync, ); + local_timeline_init_statuses + .0 + .extend(empty_tenants.0.into_iter()); let remote_index_clone = remote_index.clone(); 
thread_mgr::spawn( @@ -872,10 +707,7 @@ fn storage_sync_loop( "Sync loop step completed, {} new tenant state update(s)", updated_tenants.len() ); - let mut timelines_to_attach: HashMap< - ZTenantId, - Vec<(ZTimelineId, TimelineMetadata)>, - > = HashMap::new(); + let mut timelines_to_attach = TenantTimelineValues::new(); let index_accessor = runtime.block_on(index.read()); for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { @@ -901,7 +733,7 @@ fn storage_sync_loop( // and register them all at once in a repository for download // to be submitted in a single operation to repository // so it can apply them at once to internal timeline map. - timelines_to_attach.insert( + timelines_to_attach.0.insert( tenant_id, tenant_entry .iter() @@ -912,7 +744,9 @@ fn storage_sync_loop( } drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - attach_downloaded_tenants(conf, &index, timelines_to_attach); + if let Err(e) = attach_local_tenants(conf, &index, timelines_to_attach) { + error!("Failed to attach new timelines: {e:?}"); + }; } } ControlFlow::Break(()) => { @@ -1443,11 +1277,10 @@ fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, local_timeline_files: HashMap)>, -) -> LocalTimelineInitStatuses { - let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); +) -> TenantTimelineValues { + let mut local_timeline_init_statuses = TenantTimelineValues::new(); - let mut new_sync_tasks = - VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len())); + let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len()); for (sync_id, (local_metadata, local_files)) in local_timeline_files { match index.timeline_entry_mut(&sync_id) { @@ -1459,18 +1292,27 @@ fn schedule_first_sync_tasks( local_files, remote_timeline, ); - let was_there = local_timeline_init_statuses 
+ match local_timeline_init_statuses + .0 .entry(sync_id.tenant_id) .or_default() - .insert(sync_id.timeline_id, timeline_status); - - if was_there.is_some() { - // defensive check - warn!( - "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", - sync_id.timeline_id - ); + .entry(sync_id.timeline_id) + { + hash_map::Entry::Occupied(mut o) => { + { + // defensive check + warn!( + "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}", + sync_id.timeline_id + ); + } + o.insert(timeline_status); + } + hash_map::Entry::Vacant(v) => { + v.insert(timeline_status); + } } + remote_timeline.awaits_download = awaits_download; } None => { @@ -1481,15 +1323,16 @@ fn schedule_first_sync_tasks( SyncTask::upload(LayersUpload { layers_to_upload: local_files, uploaded_layers: HashSet::new(), - metadata: Some(local_metadata), + metadata: Some(local_metadata.clone()), }), )); local_timeline_init_statuses + .0 .entry(sync_id.tenant_id) .or_default() .insert( sync_id.timeline_id, - LocalTimelineInitStatus::LocallyComplete, + LocalTimelineInitStatus::LocallyComplete(local_metadata), ); } } @@ -1523,7 +1366,10 @@ fn compare_local_and_remote_timeline( // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { - (LocalTimelineInitStatus::LocallyComplete, false) + ( + LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()), + false, + ) }; let layers_to_upload = local_files diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index b0beb4219a..91ee557b79 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -18,6 +18,7 @@ use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, + TEMP_FILE_SUFFIX, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -26,8 +27,6 @@ 
use super::{ LayersDownload, SyncData, SyncQueue, }; -pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; - // We collect timelines remotely available for each tenant // in case we failed to gather all index parts (due to an error) // Poisoned variant is returned. @@ -251,7 +250,7 @@ pub(super) async fn download_timeline_layers<'a>( // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = - path_with_suffix_extension(&layer_destination_path, TEMP_DOWNLOAD_EXTENSION); + path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX); let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 041bd50737..baa58f5eb5 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,24 +3,26 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::metadata::TimelineMetadata; -use crate::layered_repository::{load_metadata, Repository, Timeline}; +use crate::layered_repository::ephemeral_file::is_ephemeral_file; +use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; +use crate::layered_repository::{Repository, Timeline}; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr::ThreadKind; use crate::walredo::PostgresRedoManager; -use crate::{thread_mgr, timelines, walreceiver}; +use crate::{thread_mgr, timelines, walreceiver, TenantTimelineValues, TEMP_FILE_SUFFIX}; use anyhow::Context; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; +use std::collections::hash_map::{self, Entry}; +use 
std::collections::{HashMap, HashSet}; +use std::ffi::OsStr; use std::fmt; +use std::path::{Path, PathBuf}; use std::sync::Arc; use tokio::sync::mpsc; use tracing::*; -use utils::lsn::Lsn; pub use tenants_state::try_send_timeline_update; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -136,34 +138,49 @@ pub fn init_tenant_mgr( conf: &'static PageServerConf, remote_storage: Option, ) -> anyhow::Result { + let _entered = info_span!("init_tenant_mgr").entered(); let (timeline_updates_sender, timeline_updates_receiver) = mpsc::unbounded_channel::(); tenants_state::set_timeline_update_sender(timeline_updates_sender)?; walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; - let SyncStartupData { - remote_index, - local_timeline_init_statuses, - } = storage_sync::start_local_timeline_sync(conf, remote_storage) - .context("Failed to set up local files sync with external storage")?; + let local_tenant_files = local_tenant_timeline_files(conf) + .context("Failed to collect local tenant timeline files")?; - for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses { - if let Err(err) = - init_local_repository(conf, tenant_id, local_timeline_init_statuses, &remote_index) - { - // Report the error, but continue with the startup for other tenants. An error - // loading a tenant is serious, but it's better to complete the startup and - // serve other tenants, than fail completely. 
- error!("Failed to initialize local tenant {tenant_id}: {:?}", err); + let (remote_index, tenants_to_attach) = if let Some(storage) = remote_storage { + let storage_config = conf + .remote_storage_config + .as_ref() + .expect("remote storage without config"); - if let Err(err) = set_tenant_state(tenant_id, TenantState::Broken) { - error!( - "Failed to set tenant state to broken {tenant_id}: {:?}", - err - ); - } - } - } + let SyncStartupData { + remote_index, + local_timeline_init_statuses, + } = storage_sync::spawn_storage_sync_thread( + conf, + local_tenant_files, + storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + .context("Failed to spawn the storage sync thread")?; + + ( + remote_index, + local_timeline_init_statuses.filter_map(|init_status| match init_status { + LocalTimelineInitStatus::LocallyComplete(metadata) => Some(metadata), + LocalTimelineInitStatus::NeedsSync => None, + }), + ) + } else { + info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); + ( + RemoteIndex::default(), + local_tenant_files.filter_map(|(metadata, _)| Some(metadata)), + ) + }; + + attach_local_tenants(conf, &remote_index, tenants_to_attach)?; Ok(remote_index) } @@ -189,35 +206,69 @@ impl std::fmt::Debug for LocalTimelineUpdate { } } -/// Updates tenants' repositories, changing their timelines state in memory. -pub fn attach_downloaded_tenants( +/// Reads local files to load tenants and their timelines given into pageserver's memory. +/// Ignores other timelines that might be present for tenant, but were not passed as a parameter. +/// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", +/// and the load continues. 
+pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, - sync_status_updates: HashMap>, -) { - if sync_status_updates.is_empty() { - debug!("No sync status updates to apply"); - return; - } - for (tenant_id, downloaded_timelines) in sync_status_updates { - info!( - "Registering downlloaded timelines for {tenant_id} {} timelines", - downloaded_timelines.len() - ); - debug!("Downloaded timelines: {downloaded_timelines:?}"); + tenants_to_attach: TenantTimelineValues, +) -> anyhow::Result<()> { + let _entered = info_span!("attach_local_tenants").entered(); + let number_of_tenants = tenants_to_attach.0.len(); - let repo = match load_local_repo(conf, tenant_id, remote_index) { - Ok(repo) => repo, - Err(e) => { - error!("Failed to load repo for tenant {tenant_id} Error: {e:?}"); - continue; + for (tenant_id, local_timelines) in tenants_to_attach.0 { + info!( + "Attaching {} timelines for {tenant_id}", + local_timelines.len() + ); + debug!("Timelines to attach: {local_timelines:?}"); + + let repository = load_local_repo(conf, tenant_id, remote_index) + .context("Failed to load repository for tenant")?; + + let repo = Arc::clone(&repository); + { + match tenants_state::write_tenants().entry(tenant_id) { + hash_map::Entry::Occupied(_) => { + anyhow::bail!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + } + hash_map::Entry::Vacant(v) => { + v.insert(Tenant { + state: TenantState::Idle, + repo, + }); + } } - }; - match repo.init_attach_timelines(downloaded_timelines) { - Ok(()) => info!("successfully loaded local timelines for tenant {tenant_id}"), - Err(e) => error!("Failed to load local timelines for tenant {tenant_id}: {e:?}"), } + // XXX: current timeline init enables walreceiver that looks for tenant in the state, so insert the tenant entry before + repository + .init_attach_timelines(local_timelines) + .context("Failed to attach timelines for tenant")?; } + + info!("Processed {number_of_tenants} local 
tenants during attach"); + Ok(()) +} + +fn load_local_repo( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + remote_index: &RemoteIndex, +) -> anyhow::Result> { + let repository = Repository::new( + conf, + TenantConfOpt::default(), + Arc::new(PostgresRedoManager::new(conf, tenant_id)), + tenant_id, + remote_index.clone(), + conf.remote_storage_config.is_some(), + ); + let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; + repository.update_tenant_config(tenant_conf); + + Ok(Arc::new(repository)) } /// @@ -293,13 +344,14 @@ pub fn create_tenant_repository( } pub fn update_tenant_config( + conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - let repo = get_repository_for_tenant(tenant_id)?; + get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - repo.update_tenant_config(tenant_conf)?; + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; Ok(()) } @@ -392,7 +444,7 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow debug!("waiting for threads to shutdown"); thread_mgr::shutdown_threads(None, None, Some(timeline_id)); debug!("thread shutdown completed"); - match tenants_state::write_tenants().get_mut(&tenant_id) { + match tenants_state::read_tenants().get(&tenant_id) { Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), } @@ -428,12 +480,10 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any // need to use crossbeam-channel for (timeline_id, join_handle) in walreceiver_join_handles { info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); - join_handle.recv().context("failed to join walreceiver")?; + join_handle.recv().ok(); info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); } - tenants_state::write_tenants().remove(&tenant_id); - // 
If removal fails there will be no way to successfully retry detach, // because the tenant no longer exists in the in-memory map. And it needs to be removed from it // before we remove files, because it contains references to repository @@ -443,7 +493,7 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any let local_tenant_directory = conf.tenant_path(&tenant_id); std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( - "Failed to remove local timeline directory '{}'", + "Failed to remove local tenant directory '{}'", local_tenant_directory.display() ) })?; @@ -454,7 +504,7 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any /// /// Get list of tenants, for the mgmt API /// -pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { +pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { tenants_state::read_tenants() .iter() .map(|(id, tenant)| { @@ -478,98 +528,248 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { .collect() } -/// Check if a given timeline is "broken" \[1\]. -/// The function returns an error if the timeline is "broken". -/// -/// \[1\]: it's not clear now how should we classify a timeline as broken. -/// A timeline is categorized as broken when any of following conditions is true: -/// - failed to load the timeline's metadata -/// - the timeline's disk consistent LSN is zero -fn check_broken_timeline( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, -) -> anyhow::Result { - let metadata = - load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?; +/// Attempts to collect information about all tenant and timelines, existing on the local FS. +/// If finds any, deletes all temporary files and directories, created before. Also removes empty directories, +/// that may appear due to such removals. 
+/// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities. +fn local_tenant_timeline_files( + config: &'static PageServerConf, +) -> anyhow::Result)>> { + let _entered = info_span!("local_tenant_timeline_files").entered(); - // A timeline with zero disk consistent LSN can happen when the page server - // failed to checkpoint the timeline import data when creating that timeline. - if metadata.disk_consistent_lsn() == Lsn::INVALID { - anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN."); + let mut local_tenant_timeline_files = TenantTimelineValues::new(); + let tenants_dir = config.tenants_path(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + let tenant_dir_path = tenants_dir_entry.path(); + if is_temporary(&tenant_dir_path) { + info!( + "Found temporary tenant directory, removing: {}", + tenant_dir_path.display() + ); + if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { + error!( + "Failed to remove temporary directory '{}': {:?}", + tenant_dir_path.display(), + e + ); + } + } else { + match collect_timelines_for_tenant(config, &tenant_dir_path) { + Ok((tenant_id, collected_files)) => { + if collected_files.is_empty() { + match remove_if_empty(&tenant_dir_path) { + Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()), + Ok(false) => { + // insert empty timeline entry: it has some non-temporary files inside that we cannot remove + // so make obvious for HTTP API callers, that something exists there and try to load the tenant + let _ = local_tenant_timeline_files.0.entry(tenant_id).or_default(); + }, + Err(e) => error!("Failed to remove empty tenant directory: {e:?}"), + } + } else { + local_tenant_timeline_files.0.entry(tenant_id).or_default().extend(collected_files.into_iter()) + } + }, + 
Err(e) => error!( + "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", + tenants_dir.display(), + tenants_dir_entry, + e + ), + } + } + } + Err(e) => error!( + "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } } - Ok(metadata) + info!( + "Collected files for {} tenants", + local_tenant_timeline_files.0.len() + ); + Ok(local_tenant_timeline_files) } -/// Note: all timelines are attached at once if and only if all of them are locally complete -fn init_local_repository( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - local_timeline_init_statuses: HashMap, - remote_index: &RemoteIndex, -) -> anyhow::Result<(), anyhow::Error> { - let mut timelines_to_attach = Vec::new(); - for (timeline_id, init_status) in local_timeline_init_statuses { - match init_status { - LocalTimelineInitStatus::LocallyComplete => { - debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - let metadata = check_broken_timeline(conf, tenant_id, timeline_id) - .context("found broken timeline")?; - timelines_to_attach.push((timeline_id, metadata)); +fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result { + let directory_is_empty = tenant_dir_path + .read_dir() + .with_context(|| { + format!( + "Failed to read directory '{}' contents", + tenant_dir_path.display() + ) + })? 
+ .next() + .is_none(); + + if directory_is_empty { + std::fs::remove_dir_all(&tenant_dir_path).with_context(|| { + format!( + "Failed to remove empty directory '{}'", + tenant_dir_path.display(), + ) + })?; + + Ok(true) + } else { + Ok(false) + } +} + +fn is_temporary(path: &Path) -> bool { + match path.file_name() { + Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX), + None => false, + } +} + +#[allow(clippy::type_complexity)] +fn collect_timelines_for_tenant( + config: &'static PageServerConf, + tenant_path: &Path, +) -> anyhow::Result<( + ZTenantId, + HashMap)>, +)> { + let tenant_id = tenant_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse tenant id out of the tenant dir name")?; + let timelines_dir = config.timelines_path(&tenant_id); + + let mut tenant_timelines = HashMap::new(); + for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))? 
+ { + match timelines_dir_entry { + Ok(timelines_dir_entry) => { + let timeline_dir = timelines_dir_entry.path(); + if is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() + ); + if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else { + match collect_timeline_files(&timeline_dir) { + Ok((timeline_id, metadata, timeline_files)) => { + tenant_timelines.insert(timeline_id, (metadata, timeline_files)); + } + Err(e) => { + error!( + "Failed to process timeline dir contents at '{}', reason: {:?}", + timeline_dir.display(), + e + ); + match remove_if_empty(&timeline_dir) { + Ok(true) => info!( + "Removed empty timeline directory {}", + timeline_dir.display() + ), + Ok(false) => (), + Err(e) => { + error!("Failed to remove empty timeline directory: {e:?}") + } + } + } + } + } } - LocalTimelineInitStatus::NeedsSync => { - debug!( - "timeline {tenant_id} for tenant {timeline_id} needs sync, \ - so skipped for adding into repository until sync is finished" - ); - return Ok(()); + Err(e) => { + error!("Failed to list timelines for entry tenant {tenant_id}, reason: {e:?}") } } } - // initialize local tenant - let repo = load_local_repo(conf, tenant_id, remote_index) - .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; - - // Lets fail here loudly to be on the safe side. - // XXX: It may be a better api to actually distinguish between repository startup - // and processing of newly downloaded timelines. - repo.init_attach_timelines(timelines_to_attach) - .with_context(|| format!("Failed to init local timelines for tenant {tenant_id}"))?; - Ok(()) -} - -// Sets up wal redo manager and repository for tenant. Reduces code duplication. -// Used during pageserver startup, or when new tenant is attached to pageserver. 
-fn load_local_repo( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - remote_index: &RemoteIndex, -) -> anyhow::Result> { - let mut m = tenants_state::write_tenants(); - let tenant = m.entry(tenant_id).or_insert_with(|| { - // Set up a WAL redo manager, for applying WAL records. - let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); - - // Set up an object repository, for actual data storage. - let repo: Arc = Arc::new(Repository::new( - conf, - TenantConfOpt::default(), - Arc::new(walredo_mgr), - tenant_id, - remote_index.clone(), - conf.remote_storage_config.is_some(), - )); - Tenant { - state: TenantState::Idle, - repo, + if tenant_timelines.is_empty() { + match remove_if_empty(&timelines_dir) { + Ok(true) => info!( + "Removed empty tenant timelines directory {}", + timelines_dir.display() + ), + Ok(false) => (), + Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"), } - }); + } - // Restore tenant config - let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; - tenant.repo.update_tenant_config(tenant_conf)?; - - Ok(Arc::clone(&tenant.repo)) + Ok((tenant_id, tenant_timelines)) +} + +// discover timeline files and extract timeline metadata +// NOTE: ephemeral files are excluded from the list +fn collect_timeline_files( + timeline_dir: &Path, +) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { + let mut timeline_files = HashSet::new(); + let mut timeline_metadata_path = None; + + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse timeline id out of the timeline dir name")?; + let timeline_dir_entries = + std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + for entry in timeline_dir_entries { + let entry_path = entry.context("Failed to list timeline dir entry")?.path(); + if entry_path.is_file() { + if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { + 
timeline_metadata_path = Some(entry_path); + } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { + debug!("skipping ephemeral file {}", entry_path.display()); + continue; + } else if is_temporary(&entry_path) { + info!("removing temp timeline file at {}", entry_path.display()); + std::fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp download file at {}", + entry_path.display() + ) + })?; + } else { + timeline_files.insert(entry_path); + } + } + } + + // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed + // then attach is lost. There would be no retries for that, + // initial collect will fail because there is no metadata. + // We either need to start download if we see empty dir after restart or attach caller should + // be aware of that and retry attach if awaits_download for timeline switched from true to false + // but timeline didn't appear locally. + // Check what happens with remote index in that case. 
+ let timeline_metadata_path = match timeline_metadata_path { + Some(path) => path, + None => anyhow::bail!("No metadata file found in the timeline directory"), + }; + let metadata = TimelineMetadata::from_bytes( + &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + ) + .context("Failed to parse timeline metadata file bytes")?; + + anyhow::ensure!( + metadata.ancestor_timeline().is_some() || !timeline_files.is_empty(), + "Timeline has no ancestor and no layer files" + ); + + Ok((timeline_id, metadata, timeline_files)) } diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 11be13b80c..4e9a5fc6ec 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -34,11 +34,6 @@ async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { // Break if we're not allowed to write to disk let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - // TODO do this inside repo.compaction_iteration instead. - let _guard = match repo.file_lock.try_read() { - Ok(g) => g, - Err(_) => return Ok(ControlFlow::Break(())), - }; // Run compaction let compaction_period = repo.get_compaction_period(); @@ -233,11 +228,6 @@ async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { // Break if we're not allowed to write to disk let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - // TODO do this inside repo.gc_iteration instead. 
- let _guard = match repo.file_lock.try_read() { - Ok(g) => g, - Err(_) => return Ok(ControlFlow::Break(())), - }; // Run gc let gc_period = repo.get_gc_period(); diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 936699c2ec..9356893908 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -3,6 +3,7 @@ // use anyhow::{bail, ensure, Context, Result}; +use remote_storage::path_with_suffix_extension; use std::{ fs, @@ -18,12 +19,12 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::import_datadir; use crate::tenant_mgr; use crate::CheckpointConfig; use crate::{ config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; +use crate::{import_datadir, TEMP_FILE_SUFFIX}; use crate::{ layered_repository::{Repository, Timeline}, walredo::WalRedoManager, @@ -105,13 +106,17 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // fn bootstrap_timeline( conf: &'static PageServerConf, - tenantid: ZTenantId, - tli: ZTimelineId, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, repo: &Repository, ) -> Result> { - let initdb_path = conf - .tenant_path(&tenantid) - .join(format!("tmp-timeline-{}", tli)); + // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` + // temporary directory for basebackup files for the given timeline. + let initdb_path = path_with_suffix_extension( + conf.timelines_path(&tenant_id) + .join(format!("basebackup-{timeline_id}")), + TEMP_FILE_SUFFIX, + ); // Init temporarily repo to get bootstrap data run_initdb(conf, &initdb_path)?; @@ -123,7 +128,7 @@ fn bootstrap_timeline( // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. 
- let timeline = repo.create_empty_timeline(tli, lsn)?; + let timeline = repo.create_empty_timeline(timeline_id, lsn)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -134,7 +139,7 @@ fn bootstrap_timeline( info!( "created root timeline {} timeline.lsn {}", - tli, + timeline_id, timeline.get_last_record_lsn() ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 4e49fd9373..dd946659bb 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,6 +21,7 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; +use remote_storage::path_with_suffix_extension; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -37,7 +38,6 @@ use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; -use crate::config::PageServerConf; use crate::metrics::{ WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, }; @@ -45,6 +45,7 @@ use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; +use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, @@ -569,20 +570,24 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenantid: &ZTenantId) -> Result { + fn launch(conf: &PageServerConf, tenant_id: &ZTenantId) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. 
- let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir"); + let datadir = path_with_suffix_extension( + conf.tenant_path(tenant_id).join("wal-redo-datadir"), + TEMP_FILE_SUFFIX, + ); // Create empty data directory for wal-redo postgres, deleting old one first. if datadir.exists() { - info!("directory {:?} exists, removing", &datadir); - if let Err(e) = fs::remove_dir_all(&datadir) { - error!("could not remove old wal-redo-datadir: {:#}", e); - } + info!( + "old temporary datadir {} exists, removing", + datadir.display() + ); + fs::remove_dir_all(&datadir)?; } - info!("running initdb in {:?}", datadir.display()); + info!("running initdb in {}", datadir.display()); let initdb = Command::new(conf.pg_bin_dir().join("initdb")) .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") @@ -591,7 +596,7 @@ impl PostgresRedoProcess { .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .close_fds() .output() - .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?; + .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; if !initdb.status.success() { return Err(Error::new( @@ -645,7 +650,7 @@ impl PostgresRedoProcess { })?; info!( - "launched WAL redo postgres process on {:?}", + "launched WAL redo postgres process on {}", datadir.display() ); diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 4aba2494e9..1d083b3ef9 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -32,33 +32,34 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # Leave the first timeline alone, but corrupt the others in different ways (tenant0, timeline0, pg0) = tenant_timelines[0] + log.info(f"Timeline {tenant0}/{timeline0} is left intact") - # Corrupt metadata file on timeline 1 (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1) - 
print(f"overwriting metadata file at {metadata_path}") + metadata_path = f"{env.repo_dir}/tenants/{tenant1}/timelines/{timeline1}/metadata" f = open(metadata_path, "w") f.write("overwritten with garbage!") f.close() + log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") - # Missing layer files file on timeline 2. (This would actually work - # if we had Cloud Storage enabled in this test.) (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2) + timeline_path = f"{env.repo_dir}/tenants/{tenant2}/timelines/{timeline2}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Remove it os.remove(f"{timeline_path}/{filename}") + log.info( + f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)" + ) - # Corrupt layer files file on timeline 3 (tenant3, timeline3, pg3) = tenant_timelines[3] - timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3) + timeline_path = f"{env.repo_dir}/tenants/{tenant3}/timelines/{timeline3}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. 
Corrupt it f = open(f"{timeline_path}/{filename}", "w") f.write("overwritten with garbage!") f.close() + log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled") env.pageserver.start() @@ -69,20 +70,28 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # But all others are broken # First timeline would not get loaded into pageserver due to corrupt metadata file - (_tenant, _timeline, pg) = tenant_timelines[1] with pytest.raises( Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}" ) as err: - pg.start() + pg1.start() + log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") + + # Second timeline has no ancestors, only the metadata file and no layer files + # We don't have the remote storage enabled, which means timeline is in an incorrect state, + # it's not loaded at all + with pytest.raises( + Exception, match=f"Could not get timeline {timeline2} in tenant {tenant2}" + ) as err: + pg2.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline - for n in range(2, 4): - (_tenant, _timeline, pg) = tenant_timelines[n] + for n in range(3, 4): + (bad_tenant, bad_timeline, pg) = tenant_timelines[n] with pytest.raises(Exception, match="extracting base backup failed") as err: pg.start() log.info( - f"compute startup failed lazily for timeline with corrupt layers, during basebackup preparation: {err}" + f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}" ) @@ -107,6 +116,8 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): tenant_id, _ = env.neon_cli.create_tenant() + old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + # Introduce failpoint when creating a new timeline 
env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") with pytest.raises(Exception, match="before-checkpoint-new-timeline"): @@ -116,6 +127,8 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env.neon_cli.pageserver_stop(immediate=True) env.neon_cli.pageserver_start() - # Check that tenant with "broken" timeline is not loaded. - with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id}"): - env.neon_cli.list_timelines(tenant_id) + # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. + new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + assert ( + new_tenant_timelines == old_tenant_timelines + ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" From 31ec3b790686615448ee2d00e5b4b9b5ce143b74 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 5 Sep 2022 10:13:36 +0300 Subject: [PATCH 037/166] Use the toolchain file to define current rustc version used --- .dockerignore | 1 + README.md | 19 +++++++++++++++---- rust-toolchain.toml | 7 +++++++ 3 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 rust-toolchain.toml diff --git a/.dockerignore b/.dockerignore index 9f8a22d598..4bc8e5fa13 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ * +!rust-toolchain.toml !Cargo.toml !Cargo.lock !Makefile diff --git a/README.md b/README.md index 57d0a144cb..eb13b111f5 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,17 @@ brew install libpq brew link --force libpq ``` +#### Rustc version + +The project uses [rust toolchain file](./rust-toolchain.toml) to define the version it's built with in CI for testing and local builds. + +This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file. 
+ +rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. + +non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. +Never rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. + #### Building on Linux 1. Build neon and patched postgres @@ -78,9 +89,9 @@ brew link --force libpq git clone --recursive https://github.com/neondatabase/neon.git cd neon -# The preferred and default is to make a debug build. This will create a +# The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`nproc`" +# build, utilize "BUILD_TYPE=release make -j`nproc`" make -j`nproc` ``` @@ -94,9 +105,9 @@ make -j`nproc` git clone --recursive https://github.com/neondatabase/neon.git cd neon -# The preferred and default is to make a debug build. This will create a +# The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" make -j`sysctl -n hw.logicalcpu` ``` diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000000..ee699464c6 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,7 @@ +[toolchain] +channel = "1.60" +profile = "default" +# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
+# https://rust-lang.github.io/rustup/concepts/profiles.html +# but we also need `llvm-tools-preview` for coverage data merges on CI +components = ["llvm-tools-preview", "rustfmt", "clippy"] From 923f642549c9b3b96cb53b959f34f2cb47d799e1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 5 Sep 2022 11:18:22 +0300 Subject: [PATCH 038/166] Collect cargo build timings --- .github/workflows/build_and_test.yml | 27 +++++++++++++++++++++------ .github/workflows/codestyle.yml | 5 ++--- README.md | 10 +++++----- rust-toolchain.toml | 8 +++++++- test_runner/fixtures/utils.py | 5 ++++- 5 files changed, 39 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bf9de7d857..7ee694fa16 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,7 +54,11 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] + # TODO this version is currently needed to make build statuses more informative + # and to clear cargo caches in a more transparent way. 
+ # We should rather read this value from the file in the root of the repo, `rust-toolchain.toml` since it's + # truly setting what version of compiler the sources are built with + rust_toolchain: [ '1.60' ] env: BUILD_TYPE: ${{ matrix.build_type }} @@ -100,11 +104,11 @@ jobs: if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FEATURES="" - CARGO_FLAGS="--locked" + CARGO_FLAGS="--locked --timings" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FEATURES="--features profiling" - CARGO_FLAGS="--locked --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV @@ -218,6 +222,17 @@ jobs: name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact path: /tmp/neon + - name: Prepare cargo build timing stats for storing + run: | + mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/" + cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/" + shell: bash -euxo pipefail {0} + - name: Upload cargo build stats + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-build-stats + path: /tmp/neon/cargo-timings/ + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: matrix.build_type == 'debug' @@ -233,7 +248,7 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] + rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -269,7 +284,7 @@ jobs: fail-fast: false matrix: build_type: [ release ] - rust_toolchain: [ 1.58 ] + rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -341,7 +356,7 @@ jobs: fail-fast: false matrix: build_type: [ debug ] - rust_toolchain: [ 1.58 ] + rust_toolchain: 
[ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index bc21054e18..ac6bfe655f 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -24,9 +24,8 @@ jobs: strategy: fail-fast: false matrix: - # If we want to duplicate this job for different - # Rust toolchains (e.g. nightly or 1.37.0), add them here. - rust_toolchain: [1.58] + # TODO read from `rust-toolchain.toml` and do the same in the build and test workflow too. + rust_toolchain: ['1.60'] os: [ubuntu-latest, macos-latest] # To support several Postgres versions, add them here. postgres_version: [v14, v15] diff --git a/README.md b/README.md index eb13b111f5..977afc2a2c 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ This file is automatically picked up by [`rustup`](https://rust-lang.github.io/r rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. -Never rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. +Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. #### Building on Linux @@ -90,8 +90,8 @@ git clone --recursive https://github.com/neondatabase/neon.git cd neon # The preferred and default is to make a debug build. This will create a -# demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`nproc`" +# demonstrably slower build than a release build. 
For a release build, +# use "BUILD_TYPE=release make -j`nproc`" make -j`nproc` ``` @@ -106,8 +106,8 @@ git clone --recursive https://github.com/neondatabase/neon.git cd neon # The preferred and default is to make a debug build. This will create a -# demonstrably slower build than a release build. If you want to use a release -# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# demonstrably slower build than a release build. For a release build, +# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" make -j`sysctl -n hw.logicalcpu` ``` diff --git a/rust-toolchain.toml b/rust-toolchain.toml index ee699464c6..8023348aae 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,11 @@ [toolchain] -channel = "1.60" +# We try to stick to a toolchain version that is widely available on popular distributions, so that most people +# can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later +# version, we can consider updating. As of this writing, 1.60 is available on Debian 'experimental' but not yet on +# 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach +# 'testing' soon (and similarly for the other distributions). +# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. +channel = "1.60" # do update CI matrix values when updating this profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 726116e53c..5fb91344ad 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -155,7 +155,7 @@ def get_scale_for_db(size_mb: int) -> int: ATTACHMENT_NAME_REGEX = re.compile( - r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs" + r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs|.+\.html" ) @@ -180,6 +180,9 @@ def allure_attach_from_dir(dir: Path): elif source.endswith(".svg"): attachment_type = "image/svg+xml" extension = "svg" + elif source.endswith(".html"): + attachment_type = "text/html" + extension = "html" else: attachment_type = "text/plain" extension = attachment.suffix.removeprefix(".") From 648e86e9df9c06f3a961cdcca6f1c23f88272b6e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 16:02:29 +0300 Subject: [PATCH 039/166] Use Debian images with libc 2.31 to build legacy compute tools --- Dockerfile.compute-node.legacy | 4 ++-- rust-toolchain.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node.legacy b/Dockerfile.compute-node.legacy index 7689167156..6653d81019 100644 --- a/Dockerfile.compute-node.legacy +++ b/Dockerfile.compute-node.legacy @@ -22,7 +22,7 @@ FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps # # Image with Postgres build deps # -FROM debian:buster-slim AS build-deps +FROM debian:bullseye-slim AS build-deps RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ libcurl4-openssl-dev libossp-uuid-dev @@ -59,7 +59,7 @@ WORKDIR /pg # # Final compute node image to be exported # -FROM debian:buster-slim +FROM debian:bullseye-slim # libreadline-dev is required to run psql RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev diff --git a/rust-toolchain.toml 
b/rust-toolchain.toml index 8023348aae..1a27e92fec 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -5,7 +5,7 @@ # 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach # 'testing' soon (and similarly for the other distributions). # See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. -channel = "1.60" # do update CI matrix values when updating this +channel = "1.60" # do update GitHub CI cache values for rust builds, when changing this value profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 18dafbb9ba0f49e65b6382acf009255a13861eab Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Sep 2022 16:47:09 +0300 Subject: [PATCH 040/166] Remove deceiving rust version from the CI files --- .../actions/run-python-test-set/action.yml | 5 +---- .github/workflows/build_and_test.yml | 22 +++++-------------- .github/workflows/codestyle.yml | 17 +++++--------- 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f04f5d11b8..4c18641938 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -5,9 +5,6 @@ inputs: build_type: description: 'Type of Rust (neon) and C (postgres) builds. 
Must be "release" or "debug", or "remote" for the remote cluster' required: true - rust_toolchain: - description: 'Rust toolchain version to fetch the caches' - required: false test_selection: description: 'A python test suite to run' required: true @@ -55,7 +52,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact path: /tmp/neon - name: Checkout diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7ee694fa16..d586741d68 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,11 +54,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - # TODO this version is currently needed to make build statuses more informative - # and to clear cargo caches in a more transparent way. - # We should rather read this value from the file in the root of the repo, `rust-toolchain.toml` since it's - # truly setting what version of compiler the sources are built with - rust_toolchain: [ '1.60' ] env: BUILD_TYPE: ${{ matrix.build_type }} @@ -130,8 +125,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + v8-${{ runner.os }}-${{ matrix.build_type }}-cargo- - name: Cache postgres v14 build id: cache_pg_14 @@ -219,7 +214,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact path: /tmp/neon - 
name: Prepare cargo build timing stats for storing @@ -230,7 +225,7 @@ jobs: - name: Upload cargo build stats uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-build-stats + name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats path: /tmp/neon/cargo-timings/ # XXX: keep this after the binaries.list is formed, so the coverage can properly work later @@ -248,7 +243,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -260,7 +254,6 @@ jobs: uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} - rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: regress needs_postgres_source: true run_with_real_s3: true @@ -284,7 +277,6 @@ jobs: fail-fast: false matrix: build_type: [ release ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -296,7 +288,6 @@ jobs: uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} - rust_toolchain: ${{ matrix.rust_toolchain }} test_selection: performance run_in_parallel: false save_perf_report: true @@ -356,7 +347,6 @@ jobs: fail-fast: false matrix: build_type: [ debug ] - rust_toolchain: [ '1.60' ] steps: - name: Checkout uses: actions/checkout@v3 @@ -373,12 +363,12 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Get coverage artifact diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml 
index ac6bfe655f..53d0f9c5d8 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -24,8 +24,11 @@ jobs: strategy: fail-fast: false matrix: - # TODO read from `rust-toolchain.toml` and do the same in the build and test workflow too. - rust_toolchain: ['1.60'] + # XXX: both OSes have rustup + # * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools + # * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools + # this is all we need to install our toolchain later via rust-toolchain.toml + # so don't install any toolchain explicitly. os: [ubuntu-latest, macos-latest] # To support several Postgres versions, add them here. postgres_version: [v14, v15] @@ -40,14 +43,6 @@ jobs: submodules: true fetch-depth: 2 - - name: Install rust toolchain ${{ matrix.rust_toolchain }} - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: ${{ matrix.rust_toolchain }} - components: rustfmt, clippy - override: true - - name: Check formatting run: cargo fmt --all -- --check @@ -106,7 +101,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v3-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - name: Run cargo clippy run: ./run_clippy.sh From a48f9f377df5c076f0f6afa8b1812709ea334d35 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 10 Sep 2022 01:23:19 +0300 Subject: [PATCH 041/166] Fix typo in issue template --- .github/ISSUE_TEMPLATE/epic-template.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 33ad7b1ef5..7707e0aa67 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -1,6 +1,6 @@ --- name: Epic Template -about: A set of related tasks contributing towards specific outcome, 
comprizing of +about: A set of related tasks contributing towards specific outcome, comprising of more than 1 week of work. title: 'Epic: ' labels: t/Epic From 698d6d0badad9aa2a12b033a33d28c19ffaec79c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 12 Sep 2022 00:07:34 +0300 Subject: [PATCH 042/166] Use stable coverage API with rustc 1.60 --- scripts/coverage | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/scripts/coverage b/scripts/coverage index af0d067419..1dc92e57cc 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -75,8 +75,6 @@ class Cargo: def rustlib_dir(self) -> Path: if not self._rustlib_dir: cmd = [ - 'cargo', - '-Zunstable-options', 'rustc', '--print=target-libdir', ] @@ -397,7 +395,7 @@ class State: # Enable LLVM's source-based coverage # see: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html # see: https://blog.rust-lang.org/inside-rust/2020/11/12/source-based-code-coverage.html - '-Zinstrument-coverage', + '-Cinstrument-coverage', # Link every bit of code to prevent "holes" in coverage report # see: https://doc.rust-lang.org/rustc/codegen-options/index.html#link-dead-code '-Clink-dead-code', @@ -410,10 +408,6 @@ class State: f'--remap-path-prefix {self.cwd}=', ]) - # XXX: God, have mercy on our souls... - # see: https://github.com/rust-lang/rust/pull/90132 - os.environ['RUSTC_BOOTSTRAP'] = '1' - def _merge_profraw(self) -> bool: profdata_path = self.profdata_dir / '-'.join([ self.profraw_prefix, From 40c845e57d7060b1946e3a9e9d6bf076a8847e52 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 11 Sep 2022 21:48:01 +0300 Subject: [PATCH 043/166] Switch to async for all concurrency in the pageserver. Instead of spawning helper threads, we now use Tokio tasks. There are multiple Tokio runtimes, for different kinds of tasks. One for serving libpq client connections, another for background operations like GC and compaction, and so on. 
That's not strictly required, we could use just one runtime, but with this you can still get an overview of what's happening with "top -H". There's one subtle behavior in how TenantState is updated. Before this patch, if you deleted all timelines from a tenant, its GC and compaction loops were stopped, and the tenant went back to Idle state. We no longer do that. The empty tenant stays Active. The changes to test_tenant_tasks.py are related to that. There's still plenty of synchronous code and blocking. For example, we still use blocking std::io functions for all file I/O, and the communication with WAL redo processes still uses low-level unix poll(). We might want to rewrite those later, but this will do for now. The model is that local file I/O is considered to be fast enough that blocking - and preventing other tasks running in the same thread - is acceptable. --- Cargo.lock | 15 +- docs/pageserver-thread-mgmt.md | 47 +- libs/utils/Cargo.toml | 2 + libs/utils/src/lib.rs | 4 +- libs/utils/src/postgres_backend_async.rs | 485 +++++++++++++++ libs/utils/src/seqwait.rs | 53 +- libs/utils/src/seqwait_async.rs | 224 ------- pageserver/Cargo.toml | 5 +- pageserver/src/basebackup.rs | 5 +- pageserver/src/bin/pageserver.rs | 77 ++- pageserver/src/http/routes.rs | 35 +- pageserver/src/layered_repository.rs | 44 +- pageserver/src/layered_repository/timeline.rs | 180 +++--- pageserver/src/lib.rs | 27 +- pageserver/src/page_service.rs | 551 +++++++++--------- pageserver/src/storage_sync.rs | 71 +-- pageserver/src/storage_sync/upload.rs | 2 +- pageserver/src/task_mgr.rs | 463 +++++++++++++++ pageserver/src/tenant_mgr.rs | 255 +++----- pageserver/src/tenant_tasks.rs | 306 +++------- pageserver/src/thread_mgr.rs | 409 ------------- pageserver/src/timelines.rs | 49 +- pageserver/src/walreceiver.rs | 291 ++------- .../src/walreceiver/connection_manager.rs | 87 ++- .../src/walreceiver/walreceiver_connection.rs | 75 +-- test_runner/regress/test_tenant_tasks.py | 15 +-
workspace_hack/Cargo.toml | 4 +- 27 files changed, 1840 insertions(+), 1941 deletions(-) create mode 100644 libs/utils/src/postgres_backend_async.rs delete mode 100644 libs/utils/src/seqwait_async.rs create mode 100644 pageserver/src/task_mgr.rs delete mode 100644 pageserver/src/thread_mgr.rs diff --git a/Cargo.lock b/Cargo.lock index 563a998601..e9ebcdc5ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1831,6 +1831,8 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", + "async-trait", "byteorder", "bytes", "chrono", @@ -1871,6 +1873,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", + "tokio-util", "toml_edit", "tracing", "url", @@ -3481,9 +3484,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.34" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09" +checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" dependencies = [ "cfg-if", "log", @@ -3505,11 +3508,11 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.26" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f" +checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" dependencies = [ - "lazy_static", + "once_cell", "valuable", ] @@ -3626,6 +3629,7 @@ name = "utils" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "bincode", "byteorder", "bytes", @@ -3653,6 +3657,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", + "tokio-rustls", "tracing", "tracing-subscriber", "workspace_hack", diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index 9ee3e40085..e351c972cb 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -1,26 +1,39 @@ ## Thread 
management -Each thread in the system is tracked by the `thread_mgr` module. It -maintains a registry of threads, and which tenant or timeline they are -operating on. This is used for safe shutdown of a tenant, or the whole -system. +The pageserver uses Tokio for handling concurrency. Everything runs in +Tokio tasks, although some parts are written in blocking style and use +spawn_blocking(). + +Each Tokio task is tracked by the `task_mgr` module. It maintains a +registry of tasks, and which tenant or timeline they are operating +on. ### Handling shutdown -When a tenant or timeline is deleted, we need to shut down all threads -operating on it, before deleting the data on disk. A thread registered -in the thread registry can check if it has been requested to shut down, -by calling `is_shutdown_requested()`. For async operations, there's also -a `shudown_watcher()` async task that can be used to wake up on shutdown. +When a tenant or timeline is deleted, we need to shut down all tasks +operating on it, before deleting the data on disk. There's a function, +`shutdown_tasks`, to request all tasks of a particular tenant or +timeline to shut down. It will also wait for them to finish. + +A task registered in the task registry can check if it has been +requested to shut down, by calling `is_shutdown_requested()`. There's +also a `shutdown_watcher()` Future that can be used with `tokio::select!` +or similar, to wake up on shutdown. + ### Sync vs async -The primary programming model in the page server is synchronous, -blocking code. However, there are some places where async code is -used. Be very careful when mixing sync and async code. - -Async is primarily used to wait for incoming data on network -connections. For example, all WAL receivers have a shared thread pool, -with one async Task for each connection.
Once a piece of WAL has been -received from the network, the thread calls the blocking functions in +We use async to wait for incoming data on network connections, and to +perform other long-running operations. For example, each WAL receiver +connection is handled by a tokio Task. Once a piece of WAL has been +received from the network, the task calls the blocking functions in the Repository to process the WAL. + +The core storage code in `layered_repository/` is synchronous, with +blocking locks and I/O calls. The current model is that we consider +disk I/Os to be short enough that we perform them while running in a +Tokio task. If that becomes a problem, we should use `spawn_blocking` +before entering the synchronous parts of the code, or switch to using +tokio I/O functions. + +Be very careful when mixing sync and async code! diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 28ad658de4..ce55277f29 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +async-trait = "0.1" anyhow = "1.0" bincode = "1.3" bytes = "1.0.1" @@ -16,6 +17,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" thiserror = "1.0" tokio = { version = "1.17", features = ["macros"]} +tokio-rustls = "0.23" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } nix = "0.23.0" diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index fa7a37adf1..caa7ac6c09 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -14,11 +14,9 @@ pub mod simple_rcu; /// append only ordered map implemented with a Vec pub mod vec_map; -// Async version of SeqWait. Currently unused. 
-// pub mod seqwait_async; - pub mod bin_ser; pub mod postgres_backend; +pub mod postgres_backend_async; pub mod pq_proto; // dealing with connstring parsing and handy access to it's parts diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs new file mode 100644 index 0000000000..383ad3742f --- /dev/null +++ b/libs/utils/src/postgres_backend_async.rs @@ -0,0 +1,485 @@ +//! Server-side asynchronous Postgres connection, as limited as we need. +//! To use, create PostgresBackend and run() it, passing the Handler +//! implementation determining how to process the queries. Currently its API +//! is rather narrow, but we can extend it once required. + +use crate::postgres_backend::AuthType; +use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; +use anyhow::{bail, Context, Result}; +use bytes::{Bytes, BytesMut}; +use rand::Rng; +use std::future::Future; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::task::Poll; +use tracing::{debug, error, trace}; + +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_rustls::TlsAcceptor; + +#[async_trait::async_trait] +pub trait Handler { + /// Handle single query. + /// postgres_backend will issue ReadyForQuery after calling this (this + /// might be not what we want after CopyData streaming, but currently we don't + /// care). + async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>; + + /// Called on startup packet receival, allows to process params. + /// + /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users + /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow + /// to override whole init logic in implementations. 
+ fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + Ok(()) + } + + /// Check auth md5 + fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { + bail!("MD5 auth failed") + } + + /// Check auth jwt + fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { + bail!("JWT auth failed") + } +} + +/// PostgresBackend protocol state. +/// XXX: The order of the constructors matters. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub enum ProtoState { + Initialization, + Encrypted, + Authentication, + Established, + Closed, +} + +#[derive(Clone, Copy)] +pub enum ProcessMsgResult { + Continue, + Break, +} + +/// Always-writeable sock_split stream. +/// May not be readable. See [`PostgresBackend::take_stream_in`] +pub enum Stream { + Unencrypted(tokio::net::TcpStream), + Tls(Box>), + Broken, +} + +impl AsyncWrite for Stream { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Broken => unreachable!(), + } + } + fn poll_flush( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), + Self::Tls(stream) => Pin::new(stream).poll_flush(cx), + Self::Broken => unreachable!(), + } + } + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Broken => unreachable!(), + } + } +} +impl AsyncRead for Stream { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + match self.get_mut() 
{ + Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Broken => unreachable!(), + } + } +} + +pub struct PostgresBackend { + stream: Stream, + // Output buffer. c.f. BeMessage::write why we are using BytesMut here. + buf_out: BytesMut, + + pub state: ProtoState, + + md5_salt: [u8; 4], + auth_type: AuthType, + + peer_addr: SocketAddr, + pub tls_config: Option>, +} + +pub fn query_from_cstring(query_string: Bytes) -> Vec { + let mut query_string = query_string.to_vec(); + if let Some(ch) = query_string.last() { + if *ch == 0 { + query_string.pop(); + } + } + query_string +} + +// Cast a byte slice to a string slice, dropping null terminator if there's one. +fn cstr_to_str(bytes: &[u8]) -> Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); + std::str::from_utf8(without_null).map_err(|e| e.into()) +} + +impl PostgresBackend { + pub fn new( + socket: tokio::net::TcpStream, + auth_type: AuthType, + tls_config: Option>, + ) -> std::io::Result { + let peer_addr = socket.peer_addr()?; + + Ok(Self { + stream: Stream::Unencrypted(socket), + buf_out: BytesMut::with_capacity(10 * 1024), + state: ProtoState::Initialization, + md5_salt: [0u8; 4], + auth_type, + tls_config, + peer_addr, + }) + } + + pub fn get_peer_addr(&self) -> &SocketAddr { + &self.peer_addr + } + + /// Read full message or return None if connection is closed. + pub async fn read_message(&mut self) -> Result> { + use ProtoState::*; + match self.state { + Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, + Authentication | Established => FeMessage::read_fut(&mut self.stream).await, + Closed => Ok(None), + } + } + + /// Flush output buffer into the socket. + pub async fn flush(&mut self) -> std::io::Result<&mut Self> { + self.stream.write_all(&self.buf_out).await?; + self.buf_out.clear(); + Ok(self) + } + + /// Write message into internal output buffer. 
+ pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> { + BeMessage::write(&mut self.buf_out, message)?; + Ok(self) + } + + // Wrapper for run_message_loop() that shuts down socket when we are done + pub async fn run(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + let ret = self.run_message_loop(handler, shutdown_watcher).await; + let _ = self.stream.shutdown(); + ret + } + + async fn run_message_loop( + &mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + trace!("postgres backend to {:?} started", self.peer_addr); + + tokio::select!( + biased; + + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received during handshake"); + return Ok(()) + }, + + result = async { + while self.state < ProtoState::Established { + if let Some(msg) = self.read_message().await? { + trace!("got message {msg:?} during handshake"); + + match self.process_handshake_message(handler, msg).await? { + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + } else { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + Ok::<(), anyhow::Error>(()) + } => { + // Handshake complete. + result?; + } + ); + + // Authentication completed + let mut query_string = Bytes::new(); + while let Some(msg) = tokio::select!( + biased; + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received in run_message_loop"); + Ok(None) + }, + msg = self.read_message() => { msg }, + )? { + trace!("got message {:?}", msg); + + let result = self.process_message(handler, msg, &mut query_string).await; + self.flush().await?; + match result? 
{ + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => break, + } + } + + trace!("postgres backend to {:?} exited", self.peer_addr); + Ok(()) + } + + async fn start_tls(&mut self) -> anyhow::Result<()> { + if let Stream::Unencrypted(plain_stream) = + std::mem::replace(&mut self.stream, Stream::Broken) + { + let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap()); + let tls_stream = acceptor.accept(plain_stream).await?; + + self.stream = Stream::Tls(Box::new(tls_stream)); + return Ok(()); + }; + bail!("TLS already started"); + } + + async fn process_handshake_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + ) -> Result { + assert!(self.state < ProtoState::Established); + let have_tls = self.tls_config.is_some(); + match msg { + FeMessage::StartupPacket(m) => { + trace!("got startup message {m:?}"); + + match m { + FeStartupPacket::SslRequest => { + debug!("SSL requested"); + + self.write_message(&BeMessage::EncryptionResponse(have_tls))?; + if have_tls { + self.start_tls().await?; + self.state = ProtoState::Encrypted; + } + } + FeStartupPacket::GssEncRequest => { + debug!("GSS requested"); + self.write_message(&BeMessage::EncryptionResponse(false))?; + } + FeStartupPacket::StartupMessage { .. } => { + if have_tls && !matches!(self.state, ProtoState::Encrypted) { + self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; + bail!("client did not connect with TLS"); + } + + // NB: startup() may change self.auth_type -- we are using that in proxy code + // to bypass auth for new users. + handler.startup(self, &m)?; + + match self.auth_type { + AuthType::Trust => { + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? + // The async python driver requires a valid server_version + .write_message(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion("14.1"), + ))? 
+ .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + AuthType::MD5 => { + rand::thread_rng().fill(&mut self.md5_salt); + self.write_message(&BeMessage::AuthenticationMD5Password( + self.md5_salt, + ))?; + self.state = ProtoState::Authentication; + } + AuthType::ZenithJWT => { + self.write_message(&BeMessage::AuthenticationCleartextPassword)?; + self.state = ProtoState::Authentication; + } + } + } + FeStartupPacket::CancelRequest { .. } => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + } + + FeMessage::PasswordMessage(m) => { + trace!("got password message '{:?}'", m); + + assert!(self.state == ProtoState::Authentication); + + match self.auth_type { + AuthType::Trust => unreachable!(), + AuthType::MD5 => { + let (_, md5_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_md5(self, md5_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + AuthType::ZenithJWT => { + let (_, jwt_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_jwt(self, jwt_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + } + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + + _ => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + Ok(ProcessMsgResult::Continue) + } + + async fn process_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + unnamed_query_string: &mut Bytes, + ) -> Result { + // Allow only startup and password messages during auth. 
Otherwise client would be able to bypass auth + // TODO: change that to proper top-level match of protocol state with separate message handling for each state + assert!(self.state == ProtoState::Established); + + match msg { + FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { + bail!("protocol violation"); + } + + FeMessage::Query(body) => { + // remove null terminator + let query_string = cstr_to_str(&body)?; + + trace!("got query {:?}", query_string); + // xxx distinguish fatal and recoverable errors? + if let Err(e) = handler.process_query(self, query_string).await { + // ":?" uses the alternate formatting style, which makes anyhow display the + // full cause of the error, not just the top-level context + its trace. + // We don't want to send that in the ErrorResponse though, + // because it's not relevant to the compute node logs. + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + // TODO: untangle convoluted control flow + if e.to_string().contains("failed to run") { + return Ok(ProcessMsgResult::Break); + } + } + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Parse(m) => { + *unnamed_query_string = m.query_string; + self.write_message(&BeMessage::ParseComplete)?; + } + + FeMessage::Describe(_) => { + self.write_message(&BeMessage::ParameterDescription)? + .write_message(&BeMessage::NoData)?; + } + + FeMessage::Bind(_) => { + self.write_message(&BeMessage::BindComplete)?; + } + + FeMessage::Close(_) => { + self.write_message(&BeMessage::CloseComplete)?; + } + + FeMessage::Execute(_) => { + let query_string = cstr_to_str(unnamed_query_string)?; + trace!("got execute {:?}", query_string); + // xxx distinguish fatal and recoverable errors? 
+ if let Err(e) = handler.process_query(self, query_string).await { + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + } + // NOTE there is no ReadyForQuery message. This handler is used + // for basebackup and it uses CopyOut which doesn't require + // ReadyForQuery message and backend just switches back to + // processing mode after sending CopyDone or ErrorResponse. + } + + FeMessage::Sync => { + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Terminate => { + return Ok(ProcessMsgResult::Break); + } + + // We prefer explicit pattern matching to wildcards, because + // this helps us spot the places where new variants are missing + FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { + bail!("unexpected message type: {:?}", msg); + } + } + + Ok(ProcessMsgResult::Continue) + } +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index a531975d60..467b900a13 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -4,9 +4,10 @@ use std::cmp::{Eq, Ordering, PartialOrd}; use std::collections::BinaryHeap; use std::fmt::Debug; use std::mem; -use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Mutex; use std::time::Duration; +use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::time::timeout; /// An error happened while waiting for a number #[derive(Debug, PartialEq, Eq, thiserror::Error)] @@ -141,10 +142,10 @@ where /// /// This call won't complete until someone has called `advance` /// with a number greater than or equal to the one we're waiting for. 
- pub fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { + pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown), + Ok(Some(mut rx)) => rx.changed().await.map_err(|_| SeqWaitError::Shutdown), Err(e) => Err(e), } } @@ -156,13 +157,18 @@ where /// /// If that hasn't happened after the specified timeout duration, /// [`SeqWaitError::Timeout`] will be returned. - pub fn wait_for_timeout(&self, num: V, timeout_duration: Duration) -> Result<(), SeqWaitError> { + pub async fn wait_for_timeout( + &self, + num: V, + timeout_duration: Duration, + ) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e { - std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout, - std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown, - }), + Ok(Some(mut rx)) => match timeout(timeout_duration, rx.changed()).await { + Ok(Ok(())) => Ok(()), + Ok(Err(_)) => Err(SeqWaitError::Shutdown), + Err(_) => Err(SeqWaitError::Timeout), + }, Err(e) => Err(e), } } @@ -179,7 +185,7 @@ where } // Create a new channel. 
- let (tx, rx) = channel(); + let (tx, rx) = channel(()); internal.waiters.push(Waiter { wake_num: num, wake_channel: tx, @@ -235,7 +241,6 @@ mod tests { use super::*; use std::sync::Arc; use std::thread::sleep; - use std::thread::spawn; use std::time::Duration; impl MonotonicCounter for i32 { @@ -248,25 +253,25 @@ mod tests { } } - #[test] - fn seqwait() { + #[tokio::test] + async fn seqwait() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - spawn(move || { - seq2.wait_for(42).expect("wait_for 42"); + tokio::task::spawn(async move { + seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).expect_err("no 999"); + seq2.wait_for(999).await.expect_err("no 999"); }); - spawn(move || { - seq3.wait_for(42).expect("wait_for 42"); - seq3.wait_for(0).expect("wait_for 0"); + tokio::task::spawn(async move { + seq3.wait_for(42).await.expect("wait_for 42"); + seq3.wait_for(0).await.expect("wait_for 0"); }); sleep(Duration::from_secs(1)); let old = seq.advance(99); assert_eq!(old, 0); - seq.wait_for(100).expect("wait_for 100"); + seq.wait_for(100).await.expect("wait_for 100"); // Calling advance with a smaller value is a no-op assert_eq!(seq.advance(98), 100); @@ -275,16 +280,16 @@ mod tests { seq.shutdown(); } - #[test] - fn seqwait_timeout() { + #[tokio::test] + async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - spawn(move || { + tokio::task::spawn(async move { let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout); + let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_secs(1)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. 
let old = seq.advance(99); diff --git a/libs/utils/src/seqwait_async.rs b/libs/utils/src/seqwait_async.rs deleted file mode 100644 index f685e2b569..0000000000 --- a/libs/utils/src/seqwait_async.rs +++ /dev/null @@ -1,224 +0,0 @@ -//! -//! Async version of 'seqwait.rs' -//! -//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. -//! - -#![warn(missing_docs)] - -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::mem; -use std::sync::Mutex; -use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; -use tokio::time::timeout; - -/// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] -#[error("SeqWaitError")] -pub enum SeqWaitError { - /// The wait timeout was reached - Timeout, - /// [`SeqWait::shutdown`] was called - Shutdown, -} - -/// Internal components of a `SeqWait` -struct SeqWaitInt -where - T: Ord, -{ - waiters: BTreeMap, Receiver<()>)>, - current: T, - shutdown: bool, -} - -/// A tool for waiting on a sequence number -/// -/// This provides a way to await the arrival of a number. -/// As soon as the number arrives by another caller calling -/// [`advance`], then the waiter will be woken up. -/// -/// This implementation takes a blocking Mutex on both [`wait_for`] -/// and [`advance`], meaning there may be unexpected executor blocking -/// due to thread scheduling unfairness. There are probably better -/// implementations, but we can probably live with this for now. 
-/// -/// [`wait_for`]: SeqWait::wait_for -/// [`advance`]: SeqWait::advance -/// -pub struct SeqWait -where - T: Ord, -{ - internal: Mutex>, -} - -impl SeqWait -where - T: Ord + Debug + Copy, -{ - /// Create a new `SeqWait`, initialized to a particular number - pub fn new(starting_num: T) -> Self { - let internal = SeqWaitInt { - waiters: BTreeMap::new(), - current: starting_num, - shutdown: false, - }; - SeqWait { - internal: Mutex::new(internal), - } - } - - /// Shut down a `SeqWait`, causing all waiters (present and - /// future) to return an error. - pub fn shutdown(&self) { - let waiters = { - // Prevent new waiters; wake all those that exist. - // Wake everyone with an error. - let mut internal = self.internal.lock().unwrap(); - - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) - - // Drop the lock as we exit this scope. - }; - - // When we drop the waiters list, each Receiver will - // be woken with an error. - // This drop doesn't need to be explicit; it's done - // here to make it easier to read the code and understand - // the order of events. - drop(waiters); - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> { - let mut rx = { - let mut internal = self.internal.lock().unwrap(); - if internal.current >= num { - return Ok(()); - } - if internal.shutdown { - return Err(SeqWaitError::Shutdown); - } - - // If we already have a channel for waiting on this number, reuse it. - if let Some((_, rx)) = internal.waiters.get_mut(&num) { - // an Err from changed() means the sender was dropped. - rx.clone() - } else { - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.insert(num, (tx, rx.clone())); - rx - } - // Drop the lock as we exit this scope. 
- }; - rx.changed().await.map_err(|_| SeqWaitError::Shutdown) - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - /// - /// If that hasn't happened after the specified timeout duration, - /// [`SeqWaitError::Timeout`] will be returned. - pub async fn wait_for_timeout( - &self, - num: T, - timeout_duration: Duration, - ) -> Result<(), SeqWaitError> { - timeout(timeout_duration, self.wait_for(num)) - .await - .unwrap_or(Err(SeqWaitError::Timeout)) - } - - /// Announce a new number has arrived - /// - /// All waiters at this value or below will be woken. - /// - /// `advance` will panic if you send it a lower number than - /// a previous call. - pub fn advance(&self, num: T) { - let wake_these = { - let mut internal = self.internal.lock().unwrap(); - - if internal.current > num { - panic!( - "tried to advance backwards, from {:?} to {:?}", - internal.current, num - ); - } - internal.current = num; - - // split_off will give me all the high-numbered waiters, - // so split and then swap. Everything at or above `num` - // stays. - let mut split = internal.waiters.split_off(&num); - std::mem::swap(&mut split, &mut internal.waiters); - - // `split_at` didn't get the value at `num`; if it's - // there take that too. - if let Some(sleeper) = internal.waiters.remove(&num) { - split.insert(num, sleeper); - } - - split - }; - - for (_wake_num, (tx, _rx)) in wake_these { - // This can fail if there are no receivers. - // We don't care; discard the error. 
- let _ = tx.send(()); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::Arc; - use tokio::time::{sleep, Duration}; - - #[tokio::test] - async fn seqwait() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - let seq3 = Arc::clone(&seq); - tokio::spawn(async move { - seq2.wait_for(42).await.expect("wait_for 42"); - seq2.advance(100); - seq2.wait_for(999).await.expect_err("no 999"); - }); - tokio::spawn(async move { - seq3.wait_for(42).await.expect("wait_for 42"); - seq3.wait_for(0).await.expect("wait_for 0"); - }); - sleep(Duration::from_secs(1)).await; - seq.advance(99); - seq.wait_for(100).await.expect("wait_for 100"); - seq.shutdown(); - } - - #[tokio::test] - async fn seqwait_timeout() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - tokio::spawn(async move { - let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout).await; - assert_eq!(res, Err(SeqWaitError::Timeout)); - }); - sleep(Duration::from_secs(1)).await; - // This will attempt to wake, but nothing will happen - // because the waiter already dropped its Receiver. 
- seq.advance(99); - } -} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 902765f424..e73c73bd9c 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,8 @@ profiling = ["pprof"] failpoints = ["fail/failpoints"] [dependencies] +async-stream = "0.3" +async-trait = "0.1" chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" @@ -24,6 +26,7 @@ itertools = "0.10.3" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } +tokio-util = { version = "0.7.3", features = ["io", "io-util"] } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } @@ -43,7 +46,7 @@ pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallcl toml_edit = { version = "0.13", features = ["easy"] } scopeguard = "1.1.0" const_format = "0.2.21" -tracing = "0.1.27" +tracing = "0.1.36" signal-hook = "0.3.10" url = "2" nix = "0.23" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index cd99c3c67d..61facc852d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -81,9 +81,8 @@ where // an old LSN and it doesn't have any WAL of its own yet. We will set // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { - // Backup was requested at a particular LSN. Wait for it to arrive. - info!("waiting for {}", req_lsn); - timeline.wait_lsn(req_lsn)?; + // Backup was requested at a particular LSN. The caller should've + // already checked that it's a valid LSN. // If the requested point is the end of the timeline, we can // provide prev_lsn. 
(get_last_record_rlsn() might return it as diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5a43516728..ec71e5b320 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -4,7 +4,7 @@ use remote_storage::GenericRemoteStorage; use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use clap::{App, Arg}; use daemonize::Daemonize; @@ -12,13 +12,15 @@ use daemonize::Daemonize; use fail::FailScenario; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, - thread_mgr::ThreadKind, - virtual_file, LOG_FILE_NAME, + http, page_cache, page_service, profiling, task_mgr, + task_mgr::TaskKind, + task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, + }, + tenant_mgr, virtual_file, LOG_FILE_NAME, }; use utils::{ auth::JwtAuth, - http::endpoint, logging, postgres_backend::AuthType, project_git_version, @@ -286,7 +288,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); - pageserver::tenant_tasks::init_tenant_task_pool()?; + WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?; // initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -307,35 +309,54 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }) .transpose() .context("Failed to init generic remote storage")?; + let remote_index = { + let _rt_guard = BACKGROUND_RUNTIME.enter(); + tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())? 
+ }; - let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?; - - // Spawn a new thread for the http endpoint + // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. // bind before launching separate thread so the error reported before startup exits - let auth_cloned = auth.clone(); - thread_mgr::spawn( - ThreadKind::HttpEndpointListener, - None, - None, - "http_endpoint_thread", - true, - move || { - let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?; - endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) - }, - )?; - // Spawn a thread to listen for libpq connections. It will spawn further threads + // Create a Service from the router above to handle incoming requests. + { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + + let router = http::make_router(conf, auth.clone(), remote_index, remote_storage)?; + let service = + utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(service) + .with_graceful_shutdown(task_mgr::shutdown_watcher()); + + task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::HttpEndpointListener, + None, + None, + "http endpoint listener", + true, + async { + server.await?; + Ok(()) + }, + ); + } + + // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. - thread_mgr::spawn( - ThreadKind::LibpqEndpointListener, + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, None, None, - "libpq endpoint thread", + "libpq endpoint listener", true, - move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type), - )?; + async move { + page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await + }, + ); + // All started up! Now just sit and wait for shutdown signal. 
signals.handle(|signal| match signal { Signal::Quit => { info!( @@ -352,7 +373,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() signal.name() ); profiling::exit_profiler(conf, &profiler_guard); - pageserver::shutdown_pageserver(0); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } }) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 59142bd9b2..78f83511cb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -161,16 +161,14 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; @@ -184,9 +182,10 @@ async fn timeline_create_handler(mut request: Request) -> Result Ok(None), // timeline already exists Err(err) => Err(err), } - }) - .await - .map_err(ApiError::from_err)??; + } + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) + .await + .map_err(ApiError::from_err)?; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -426,12 +425,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; - tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_detach", tenant = %tenant_id).entered(); - tenant_mgr::detach_tenant(conf, tenant_id) - }) - .await - .map_err(ApiError::from_err)??; + tenant_mgr::detach_tenant(conf, tenant_id) + .instrument(info_span!("tenant_detach", tenant = %tenant_id)) + .await + .map_err(ApiError::from_err)?; let mut remote_index = state.remote_index.write().await; remote_index.remove_tenant_entry(&tenant_id); @@ -583,7 +578,7 @@ async fn tenant_create_handler(mut request: Request) -> Result, - // Overridden tenant-specific config parameters. 
// We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. @@ -284,7 +270,7 @@ impl Repository { } /// perform one garbage collection iteration, removing old data files from disk. - /// this function is periodically called by gc thread. + /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. /// /// 'timelineid' specifies the timeline to GC, or None for all. @@ -299,14 +285,6 @@ impl Repository { pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let _guard = match self.file_lock.try_read() { - Ok(g) => g, - Err(_) => { - info!("File lock write acquired, shutting down GC"); - return Ok(GcResult::default()); - } - }; - let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); @@ -319,18 +297,10 @@ impl Repository { } /// Perform one compaction iteration. - /// This function is periodically called by compactor thread. + /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. pub fn compaction_iteration(&self) -> Result<()> { - let _guard = match self.file_lock.try_read() { - Ok(g) => g, - Err(_) => { - info!("File lock write acquired, shutting down compaction"); - return Ok(()); - } - }; - // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. 
We don't want to block everything else while the @@ -624,10 +594,7 @@ impl Repository { .load_layer_map(new_disk_consistent_lsn) .context("failed to load layermap")?; - crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(self.tenant_id(), new_timeline_id), - timeline: Arc::clone(&new_timeline), - }); + new_timeline.launch_wal_receiver()?; Ok(new_timeline) } @@ -642,7 +609,6 @@ impl Repository { ) -> Repository { Repository { tenant_id, - file_lock: RwLock::new(()), conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), @@ -846,7 +812,7 @@ impl Repository { // See comments in [`Repository::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { // We were requested to shut down. Stop and return with the progress we // made. 
break; diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index aa9d636739..60abbe33e6 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -5,16 +5,17 @@ use bytes::Bytes; use fail::fail_point; use itertools::Itertools; use once_cell::sync::OnceCell; +use tokio::task::spawn_blocking; use tracing::*; use std::cmp::{max, min, Ordering}; use std::collections::{HashMap, HashSet}; +use std::fs; use std::ops::{Deref, Range}; use std::path::PathBuf; use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{mpsc, Arc, Mutex, MutexGuard, RwLock, TryLockError}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; -use std::{fs, thread}; use crate::layered_repository::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, @@ -46,8 +47,9 @@ use utils::{ use crate::repository::GcResult; use crate::repository::{Key, Value}; -use crate::thread_mgr; -use crate::walreceiver::IS_WAL_RECEIVER; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{page_cache, storage_sync}; @@ -56,7 +58,7 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - tenant_id: ZTenantId, + pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, pub layers: RwLock, @@ -110,11 +112,11 @@ pub struct Timeline { /// to avoid deadlock. write_lock: Mutex<()>, - /// Used to ensure that there is only one thread + /// Used to ensure that there is only task performing flushing at a time layer_flush_lock: Mutex<()>, /// Layer removal lock. - /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. + /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. 
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], /// and [`Repository::delete_timeline`]. layer_removal_cs: Mutex<()>, @@ -142,10 +144,7 @@ pub struct Timeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, - // TODO task management should be done outside timeline, managed along with other tasks. - #[allow(clippy::type_complexity)] - initial_size_computation_task: - Mutex>, mpsc::Receiver<()>)>>, + initial_size_computation_started: AtomicBool, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -413,23 +412,23 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead + pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + "wait_lsn cannot be called in WAL receiver" ); - self.metrics.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; + let _timer = self.metrics.wait_lsn_time_histo.start_timer(); + + self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await + .with_context(|| + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + )?; Ok(()) } @@ -587,7 +586,7 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - initial_size_computation_task: Mutex::new(None), + initial_size_computation_started: AtomicBool::new(false), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -598,6 +597,43 @@ impl Timeline { result } + pub fn launch_wal_receiver(self: &Arc) -> anyhow::Result<()> { + if !is_etcd_client_initialized() { + if cfg!(test) { + info!("not launching WAL receiver because etcd client hasn't been initialized"); + return Ok(()); + } else { + panic!("etcd client not initialized"); + } + } + + info!( + "launching WAL receiver for timeline {} of tenant {}", + self.timeline_id, self.tenant_id + ); + let tenant_conf_guard = self.tenant_conf.read().unwrap(); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); + let walreceiver_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); + drop(tenant_conf_guard); + let self_clone = Arc::clone(self); + let _ = spawn_connection_manager_task( + self.conf.broker_etcd_prefix.clone(), + self_clone, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + )?; + + Ok(()) + } + /// /// Scan the timeline directory to populate the layer map. /// Returns all timeline-related files that were found and loaded. 
@@ -715,61 +751,34 @@ impl Timeline { fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { let timeline_id = self.timeline_id; - let mut task_guard = match self.initial_size_computation_task.try_lock() { - Ok(guard) => guard, - Err(_) => { - debug!("Skipping timeline logical size init: task lock is taken already"); - return; - } - }; - - if let Some((old_task, task_finish_signal)) = task_guard.take() { - // TODO rust 1.61 would allow to remove `task_finish_signal` entirely and call `old_task.is_finished()` instead - match task_finish_signal.try_recv() { - // task has either signaled successfully that it finished or panicked and dropped the sender part without signalling - Ok(()) | Err(mpsc::TryRecvError::Disconnected) => { - match old_task.join() { - // we're here due to OnceCell::get not returning the value - Ok(Ok(())) => { - error!("Timeline {timeline_id} size init task finished, yet the size was not updated, rescheduling the computation") - } - Ok(Err(task_error)) => { - error!("Error during timeline {timeline_id} size init: {task_error:?}") - } - Err(e) => error!("Timeline {timeline_id} size init task panicked: {e:?}"), - } - } - // task had not yet finished: no signal was sent and the sender channel is not dropped - Err(mpsc::TryRecvError::Empty) => { - // let the task finish - *task_guard = Some((old_task, task_finish_signal)); - return; - } - } - } - - if task_guard.is_none() { - let thread_timeline = Arc::clone(self); - let (finish_sender, finish_receiver) = mpsc::channel(); - - match thread::Builder::new() - .name(format!( - "Timeline {timeline_id} initial logical size calculation" - )) - .spawn(move || { - let _enter = info_span!("initial_logical_size_calculation", timeline = %timeline_id).entered(); - let calculated_size = thread_timeline.calculate_logical_size(init_lsn)?; - match thread_timeline.current_logical_size.initial_logical_size.set(calculated_size) { + // Atomically check if the timeline size calculation had already started. 
+ // If the flag was not already set, this sets it. + if !self + .initial_size_computation_started + .swap(true, AtomicOrdering::SeqCst) + { + // We need to start the computation task. + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "initial size calculation", + false, + async move { + let calculated_size = self_clone.calculate_logical_size(init_lsn)?; + let result = spawn_blocking(move || { + self_clone.current_logical_size.initial_logical_size.set(calculated_size) + }).await?; + match result { Ok(()) => info!("Successfully calculated initial logical size"), Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), } - - finish_sender.send(()).ok(); Ok(()) - }) { - Ok(guard) => *task_guard = Some((guard, finish_receiver)), - Err(e) => error!("Failed to spawn timeline {timeline_id} size init task: {e}"), - } + } + .instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id)) + ); } } @@ -1099,22 +1108,23 @@ impl Timeline { self.last_freeze_at.store(last_lsn); *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - // Launch a thread to flush the frozen layer to disk, unless - // a thread was already running. (If the thread was running + // Launch a task to flush the frozen layer to disk, unless + // a task was already running. 
(If the task was running // at the time that we froze the layer, it must've seen the // the layer we just froze before it exited; see comments // in flush_frozen_layers()) if let Ok(guard) = self.layer_flush_lock.try_lock() { drop(guard); let self_clone = Arc::clone(self); - thread_mgr::spawn( - thread_mgr::ThreadKind::LayerFlushThread, + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, Some(self.tenant_id), Some(self.timeline_id), - "layer flush thread", + "layer flush task", false, - move || self_clone.flush_frozen_layers(false), - )?; + async move { self_clone.flush_frozen_layers(false) }, + ); } } } @@ -1123,8 +1133,8 @@ impl Timeline { /// Flush all frozen layers to disk. /// - /// Only one thread at a time can be doing layer-flushing for a - /// given timeline. If 'wait' is true, and another thread is + /// Only one task at a time can be doing layer-flushing for a + /// given timeline. If 'wait' is true, and another task is /// currently doing the flushing, this function will wait for it /// to finish. If 'wait' is false, this function will return /// immediately instead. diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 86bbf25b67..8b9251229e 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,10 +12,10 @@ pub mod profiling; pub mod reltag; pub mod repository; pub mod storage_sync; +pub mod task_mgr; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_tasks; -pub mod thread_mgr; pub mod timelines; pub mod virtual_file; pub mod walingest; @@ -28,7 +28,7 @@ use std::collections::HashMap; use tracing::info; use utils::zid::{ZTenantId, ZTimelineId}; -use crate::thread_mgr::ThreadKind; +use crate::task_mgr::TaskKind; /// Current storage format version /// @@ -52,30 +52,31 @@ pub enum CheckpointConfig { Forced, } -pub fn shutdown_pageserver(exit_code: i32) { - // Shut down the libpq endpoint thread. 
This prevents new connections from +pub async fn shutdown_pageserver(exit_code: i32) { + // Shut down the libpq endpoint task. This prevents new connections from // being accepted. - thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await; - // Shut down any page service threads. - thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); + // Shut down any page service tasks. + task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await; // Shut down all the tenants. This flushes everything to disk and kills - // the checkpoint and GC threads. - tenant_mgr::shutdown_all_tenants(); + // the checkpoint and GC tasks. + tenant_mgr::shutdown_all_tenants().await; // Stop syncing with remote storage. // - // FIXME: Does this wait for the sync thread to finish syncing what's queued up? + // FIXME: Does this wait for the sync tasks to finish syncing what's queued up? // Should it? - thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::StorageSync), None, None).await; // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. - thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None); + // FIXME: We should probably stop accepting commands like attach/detach earlier. 
+ task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await; // There should be nothing left, but let's be sure - thread_mgr::shutdown_threads(None, None, None); + task_mgr::shutdown_tasks(None, None, None).await; info!("Shut down successfully completed"); std::process::exit(exit_code); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 783fcb2412..149144bfe4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -11,17 +11,21 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use futures::{Stream, StreamExt}; use regex::Regex; -use std::io::{self, Read}; +use std::io; use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::Arc; +use tokio_util::io::StreamReader; +use tokio_util::io::SyncIoBridge; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, lsn::Lsn, - postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend}, + postgres_backend::AuthType, + postgres_backend_async::{self, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, simple_rcu::RcuReadGuard, zid::{ZTenantId, ZTimelineId}, @@ -35,9 +39,9 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; +use crate::task_mgr; +use crate::task_mgr::TaskKind; use crate::tenant_mgr; -use crate::thread_mgr; -use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; @@ -201,93 +205,49 @@ impl PagestreamBeMessage { } } -/// Implements Read for the server side of CopyIn -struct CopyInReader<'a> { - pgb: &'a mut PostgresBackend, +fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { + async_stream::try_stream! { + loop { + let msg = tokio::select! 
{ + biased; - /// Overflow buffer for bytes sent in CopyData messages - /// that the reader (caller of read) hasn't asked for yet. - /// TODO use BytesMut? - buf: Vec, + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + let msg = format!("pageserver is shutting down"); + let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg)); + Err(anyhow::anyhow!(msg)) + } - /// Bytes before `buf_begin` are considered as dropped. - /// This allows us to implement O(1) pop_front on Vec. - /// The Vec won't grow large because we only add to it - /// when it's empty. - buf_begin: usize, -} + msg = pgb.read_message() => { msg } + }; -impl<'a> CopyInReader<'a> { - // NOTE: pgb should be in copy in state already - fn new(pgb: &'a mut PostgresBackend) -> Self { - Self { - pgb, - buf: Vec::<_>::new(), - buf_begin: 0, - } - } -} - -impl<'a> Drop for CopyInReader<'a> { - fn drop(&mut self) { - // Finalize copy protocol so that self.pgb can be reused - // TODO instead, maybe take ownership of pgb and give it back at the end - let mut buf: Vec = vec![]; - let _ = self.read_to_end(&mut buf); - } -} - -impl<'a> Read for CopyInReader<'a> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - while !thread_mgr::is_shutdown_requested() { - // Return from buffer if nonempty - if self.buf_begin < self.buf.len() { - let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin); - buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]); - self.buf_begin += bytes_to_read; - return Ok(bytes_to_read); - } - - // Delete garbage - self.buf.clear(); - self.buf_begin = 0; - - // Wait for client to send CopyData bytes - match self.pgb.read_message() { + match msg { Ok(Some(message)) => { let copy_data_bytes = match message { FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => return Ok(0), + FeMessage::CopyDone => { break }, FeMessage::Sync => continue, m => { let msg = format!("unexpected message {:?}", m); - 
self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?; - return Err(io::Error::new(io::ErrorKind::Other, msg)); + pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; + break; } }; - // Return as much as we can, saving the rest in self.buf - let mut reader = copy_data_bytes.reader(); - let bytes_read = reader.read(buf)?; - reader.read_to_end(&mut self.buf)?; - return Ok(bytes_read); + yield copy_data_bytes; } Ok(None) => { let msg = "client closed connection"; - self.pgb.write_message(&BeMessage::ErrorResponse(msg))?; - return Err(io::Error::new(io::ErrorKind::Other, msg)); + pgb.write_message(&BeMessage::ErrorResponse(msg))?; + pgb.flush().await?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; } Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(io::Error::new(io::ErrorKind::Other, e)); - } + Err(io::Error::new(io::ErrorKind::Other, e))?; } - } + }; } - - // Shutting down - let msg = "Importer thread was shut down"; - Err(io::Error::new(io::ErrorKind::Other, msg)) } } @@ -296,61 +256,49 @@ impl<'a> Read for CopyInReader<'a> { /// /// Main loop of the page service. /// -/// Listens for connections, and launches a new handler thread for each. +/// Listens for connections, and launches a new handler task for each. /// -pub fn thread_main( +pub async fn libpq_listener_main( conf: &'static PageServerConf, auth: Option>, listener: TcpListener, auth_type: AuthType, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; - let basic_rt = tokio::runtime::Builder::new_current_thread() - .enable_io() - .build()?; - - let tokio_listener = { - let _guard = basic_rt.enter(); - tokio::net::TcpListener::from_std(listener) - }?; + let tokio_listener = tokio::net::TcpListener::from_std(listener)?; // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = basic_rt.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! 
{ - biased; + while let Some(res) = tokio::select! { + biased; - _ = shutdown_watcher => { - // We were requested to shut down. - None - } - - res = tokio_listener.accept() => { - Some(res) - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + None } - }) { + + res = tokio_listener.accept() => { + Some(res) + } + } { match res { Ok((socket, peer_addr)) => { - // Connection established. Spawn a new thread to handle it. + // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - // PageRequestHandler threads are not associated with any particular - // timeline in the thread manager. In practice most connections will + // PageRequestHandler tasks are not associated with any particular + // timeline in the task manager. In practice most connections will // only deal with a particular timeline, but we don't know which one // yet. - if let Err(err) = thread_mgr::spawn( - ThreadKind::PageRequestHandler, + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::PageRequestHandler, None, None, - "serving Page Service thread", + "serving compute connection task", false, - move || page_service_conn_main(conf, local_auth, socket, auth_type), - ) { - // Thread creation failed. Log the error and continue. - error!("could not spawn page service thread: {:?}", err); - } + page_service_conn_main(conf, local_auth, socket, auth_type), + ); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -364,13 +312,13 @@ pub fn thread_main( Ok(()) } -fn page_service_conn_main( +async fn page_service_conn_main( conf: &'static PageServerConf, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on thread exit. + // Immediately increment the gauge, then create a job to decrement it on task exit. 
// One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); @@ -379,22 +327,17 @@ fn page_service_conn_main( gauge.dec(); } - // We use Tokio to accept the connection, but the rest of the code works with a - // regular socket. Convert. - let socket = socket - .into_std() - .context("could not convert tokio::net:TcpStream to std::net::TcpStream")?; - socket - .set_nonblocking(false) - .context("could not put socket to blocking mode")?; - socket .set_nodelay(true) .context("could not set TCP_NODELAY")?; let mut conn_handler = PageServerHandler::new(conf, auth); - let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - match pgbackend.run(&mut conn_handler) { + let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + + let result = pgbackend + .run(&mut conn_handler, task_mgr::shutdown_watcher) + .await; + match result { Ok(()) => { // we've been requested to shut down Ok(()) @@ -435,92 +378,95 @@ impl PageServerHandler { } } - fn handle_pagerequests( + #[instrument(skip(self, pgb))] + async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - timeline_id: ZTimelineId, tenant_id: ZTenantId, + timeline_id: ZTimelineId, ) -> anyhow::Result<()> { - let _enter = - info_span!("pagestream", timeline = %timeline_id, tenant = %tenant_id).entered(); - // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Check that the timeline exists let timeline = get_local_timeline(tenant_id, timeline_id)?; - /* switch client to COPYBOTH */ + // switch client to COPYBOTH pgb.write_message(&BeMessage::CopyBothResponse)?; + pgb.flush().await?; - while !thread_mgr::is_shutdown_requested() { - let msg = pgb.read_message(); + loop { + let msg = tokio::select! 
{ + biased; - let profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - match msg { - Ok(message) => { - if let Some(message) = message { - trace!("query: {:?}", message); - - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - _ => continue, - }; - - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - - let response = match zenith_fe_msg { - PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_rel_exists_request(&timeline, &req) - }), - PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_nblocks_request(&timeline, &req) - }), - PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_get_page_at_lsn_request(&timeline, &req) - }), - PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME - .with_label_values(&["get_db_size", &tenant_id, &timeline_id]) - .observe_closure_duration(|| { - self.handle_db_size_request(&timeline, &req) - }), - }; - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough - error!("error reading relation or page version: {:?}", e); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; - } else { - break; - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. 
+ info!("shutdown request received in page handler"); + break; } - Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(e); - } + + msg = pgb.read_message() => { msg } + }; + + let copy_data_bytes = match msg? { + Some(FeMessage::CopyData(bytes)) => bytes, + Some(m) => { + bail!("unexpected message: {m:?} during COPY"); } - } - drop(profiling_guard); + None => break, // client disconnected + }; + + trace!("query: {:?}", copy_data_bytes); + + let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let tenant_str = tenant_id.to_string(); + let timeline_str = timeline_id.to_string(); + + let response = match zenith_fe_msg { + PagestreamFeMessage::Exists(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_rel_exists", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_rel_exists_request(&timeline, &req).await + } + PagestreamFeMessage::Nblocks(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_rel_size", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_nblocks_request(&timeline, &req).await + } + PagestreamFeMessage::GetPage(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_page_at_lsn", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_get_page_at_lsn_request(&timeline, &req).await + } + PagestreamFeMessage::DbSize(req) => { + let _timer = SMGR_QUERY_TIME + .with_label_values(&["get_db_size", &tenant_str, &timeline_str]) + .start_timer(); + self.handle_db_size_request(&timeline, &req).await + } + }; + + let response = response.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough + error!("error reading relation or page version: {:?}", e); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + pgb.flush().await?; } Ok(()) } - fn handle_import_basebackup( + #[instrument(skip(self, 
pgb))] + async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, tenant_id: ZTenantId, @@ -528,10 +474,7 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, ) -> anyhow::Result<()> { - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let _enter = - info_span!("import basebackup", timeline = %timeline_id, tenant = %tenant_id).entered(); - + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -550,8 +493,24 @@ impl PageServerHandler { // Import basebackup provided via CopyData info!("importing basebackup"); pgb.write_message(&BeMessage::CopyInResponse)?; - let reader = CopyInReader::new(pgb); - import_basebackup_from_tar(&*timeline, reader, base_lsn)?; + pgb.flush().await?; + + // import_basebackup_from_tar() is not async, mainly because the Tar crate + // it uses is not async. So we need to jump through some hoops: + // - convert the input from client connection to a synchronous Read + // - use block_in_place() + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?; + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -563,11 +522,14 @@ impl PageServerHandler { info!("flushing layers"); timeline.checkpoint(CheckpointConfig::Flush)?; + timeline.launch_wal_receiver()?; + info!("done"); Ok(()) } - fn handle_import_wal( + #[instrument(skip(self, pgb))] + async fn handle_import_wal( &self, pgb: &mut PostgresBackend, tenant_id: ZTenantId, @@ 
-575,9 +537,7 @@ impl PageServerHandler { start_lsn: Lsn, end_lsn: Lsn, ) -> anyhow::Result<()> { - thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let _enter = - info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered(); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let timeline = repo @@ -591,8 +551,22 @@ impl PageServerHandler { // Import wal provided via CopyData info!("importing wal"); pgb.write_message(&BeMessage::CopyInResponse)?; - let reader = CopyInReader::new(pgb); - import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)?; + pgb.flush().await?; + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| { + import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn) + })?; + info!("wal import complete"); + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } // TODO Does it make sense to overshoot? ensure!(timeline.get_last_record_lsn() >= end_lsn); @@ -619,7 +593,7 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( + async fn wait_or_get_last_lsn( timeline: &Timeline, mut lsn: Lsn, latest: bool, @@ -647,7 +621,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -657,7 +631,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -667,15 +641,15 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, ) -> Result { - let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; @@ -684,14 +658,15 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, ) -> Result { - let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; @@ -700,14 +675,15 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, ) -> Result { - let _enter = 
info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; @@ -719,15 +695,15 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + #[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, ) -> Result { - let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) - .entered(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -736,6 +712,11 @@ impl PageServerHandler { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ + + // FIXME: this profiling now happens at different place than it used to. 
The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -743,23 +724,23 @@ impl PageServerHandler { })) } - fn handle_basebackup_request( + #[instrument(skip(self, pgb))] + async fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, + tenant_id: ZTenantId, timeline_id: ZTimelineId, lsn: Option, prev_lsn: Option, - tenant_id: ZTenantId, full_backup: bool, ) -> anyhow::Result<()> { - let span = info_span!("basebackup", timeline = %timeline_id, tenant = %tenant_id, lsn = field::Empty); - let _enter = span.enter(); - info!("starting"); - // check that the timeline exists let timeline = get_local_timeline(tenant_id, timeline_id)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { + // Backup was requested at a particular LSN. Wait for it to arrive. 
+ info!("waiting for {}", lsn); + timeline.wait_lsn(lsn).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -767,18 +748,22 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message(&BeMessage::CopyOutResponse)?; + pgb.flush().await?; /* Send a tarball of the latest layer on the timeline */ - { - let mut writer = CopyDataSink { pgb }; - + let mut writer = CopyDataSink { + pgb, + rt: tokio::runtime::Handle::current(), + }; + tokio::task::block_in_place(|| { let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; - span.record("lsn", &basebackup.lsn.to_string().as_str()); - basebackup.send_tarball()?; - } + tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str()); + basebackup.send_tarball() + })?; pgb.write_message(&BeMessage::CopyDone)?; - info!("done"); + pgb.flush().await?; + info!("basebackup complete"); Ok(()) } @@ -801,7 +786,8 @@ impl PageServerHandler { } } -impl postgres_backend::Handler for PageServerHandler { +#[async_trait::async_trait] +impl postgres_backend_async::Handler for PageServerHandler { fn check_auth_jwt( &mut self, _pgb: &mut PostgresBackend, @@ -831,11 +817,7 @@ impl postgres_backend::Handler for PageServerHandler { Ok(()) } - fn is_shutdown_requested(&self) -> bool { - thread_mgr::is_shutdown_requested() - } - - fn process_query( + async fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, @@ -849,12 +831,13 @@ impl postgres_backend::Handler for PageServerHandler { params.len() == 2, "invalid param number for pagestream command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, timelineid, tenantid)?; + 
self.handle_pagerequests(pgb, tenant_id, timeline_id) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); @@ -864,10 +847,10 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for basebackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; let lsn = if params.len() == 3 { Some(Lsn::from_str(params[2])?) @@ -876,8 +859,9 @@ impl postgres_backend::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -897,11 +881,11 @@ impl postgres_backend::Handler for PageServerHandler { let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::text_col(b"prev_lsn"), RowDescriptor::text_col(b"last_lsn"), ]))? - .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(end_of_timeline.prev.to_string().as_bytes()), Some(end_of_timeline.last.to_string().as_bytes()), ]))? 
@@ -917,8 +901,8 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for fullbackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { @@ -932,11 +916,12 @@ impl postgres_backend::Handler for PageServerHandler { None }; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { // Import the `base` section (everything but the wal) of a basebackup. // Assumes the tenant already exists on this pageserver. 
@@ -952,18 +937,21 @@ impl postgres_backend::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant = ZTenantId::from_str(params[0])?; - let timeline = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - self.check_permission(Some(tenant))?; + self.check_permission(Some(tenant_id))?; - match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + match self + .handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? 
} }; } else if query_string.starts_with("import wal ") { @@ -974,24 +962,27 @@ impl postgres_backend::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant = ZTenantId::from_str(params[0])?; - let timeline = ZTimelineId::from_str(params[1])?; + let tenant_id = ZTenantId::from_str(params[0])?; + let timeline_id = ZTimelineId::from_str(params[1])?; let start_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - self.check_permission(Some(tenant))?; + self.check_permission(Some(tenant_id))?; - match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + match self + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? 
} }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("failpoints ") { ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); @@ -1016,7 +1007,7 @@ impl postgres_backend::Handler for PageServerHandler { bail!("Invalid failpoints format"); } } - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); @@ -1024,7 +1015,7 @@ impl postgres_backend::Handler for PageServerHandler { ensure!(params.len() == 1, "invalid param number for config command"); let tenantid = ZTenantId::from_str(params[0])?; let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"compaction_target_size"), @@ -1035,7 +1026,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"image_creation_threshold"), RowDescriptor::int8_col(b"pitr_interval"), ]))? 
- .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(repo.get_checkpoint_distance().to_string().as_bytes()), Some( repo.get_checkpoint_timeout() @@ -1072,10 +1063,10 @@ impl postgres_backend::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let gc_horizon: u64 = caps .get(4) @@ -1084,8 +1075,8 @@ impl postgres_backend::Handler for PageServerHandler { // Use tenant's pitr setting let pitr = repo.get_pitr_interval(); - let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ + let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), RowDescriptor::int8_col(b"layers_needed_by_pitr"), @@ -1094,7 +1085,7 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"layers_removed"), RowDescriptor::int8_col(b"elapsed"), ]))? - .write_message_noflush(&BeMessage::DataRow(&[ + .write_message(&BeMessage::DataRow(&[ Some(result.layers_total.to_string().as_bytes()), Some(result.layers_needed_by_cutoff.to_string().as_bytes()), Some(result.layers_needed_by_pitr.to_string().as_bytes()), @@ -1121,8 +1112,8 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = get_local_timeline(tenant_id, timeline_id)?; timeline.compact()?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&SINGLE_COL_ROWDESC)? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("checkpoint ") { // Run checkpoint immediately on given timeline. @@ -1140,8 +1131,8 @@ impl postgres_backend::Handler for PageServerHandler { // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). timeline.checkpoint(CheckpointConfig::Forced)?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message(&SINGLE_COL_ROWDESC)? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { // Locate LSN of last transaction with timestamp less or equal than sppecified // TODO lazy static @@ -1158,7 +1149,7 @@ impl postgres_backend::Handler for PageServerHandler { let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"lsn", )]))?; let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? 
{ @@ -1167,14 +1158,12 @@ impl postgres_backend::Handler for PageServerHandler { LsnForTimestamp::Past(_lsn) => "past".into(), LsnForTimestamp::NoData(_lsn) => "nodata".into(), }; - pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; + pgb.write_message(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } - pgb.flush()?; - Ok(()) } } @@ -1194,6 +1183,7 @@ fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result< /// struct CopyDataSink<'a> { pgb: &'a mut PostgresBackend, + rt: tokio::runtime::Handle, } impl<'a> io::Write for CopyDataSink<'a> { @@ -1205,6 +1195,7 @@ impl<'a> io::Write for CopyDataSink<'a> { // FIXME: flush isn't really required, but makes it easier // to view in wireshark self.pgb.write_message(&BeMessage::CopyData(data))?; + self.rt.block_on(self.pgb.flush())?; trace!("CopyData sent for {} bytes!", data.len()); Ok(data.len()) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 57a964cb67..8ebfa6a935 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -37,7 +37,7 @@ //! | access to this storage | //! +------------------------+ //! -//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. +//! First, during startup, the pageserver inits the storage sync task with the async loop, or leaves the loop uninitialised, if configured so. //! The loop inits the storage connection and checks the remote files stored. //! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). //! 
Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can @@ -158,7 +158,6 @@ use once_cell::sync::OnceCell; use remote_storage::GenericRemoteStorage; use tokio::{ fs, - runtime::Runtime, time::{Duration, Instant}, }; use tracing::*; @@ -174,9 +173,10 @@ use crate::{ exponential_backoff, layered_repository::metadata::{metadata_path, TimelineMetadata}, storage_sync::index::RemoteIndex, + task_mgr, + task_mgr::TaskKind, + task_mgr::BACKGROUND_RUNTIME, tenant_mgr::attach_local_tenants, - thread_mgr, - thread_mgr::ThreadKind, }; use crate::{ metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, @@ -264,7 +264,7 @@ impl SyncQueue { .unwrap() .0; - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { return (HashMap::new(), q.len()); } } @@ -574,7 +574,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
-pub fn spawn_storage_sync_thread( +pub fn spawn_storage_sync_task( conf: &'static PageServerConf, local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet)>, storage: GenericRemoteStorage, @@ -590,11 +590,6 @@ pub fn spawn_storage_sync_thread( None => bail!("Could not get sync queue during the sync loop step, aborting"), }; - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .context("Failed to create storage sync runtime")?; - // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time: // * we need to list every timeline for tenant on S3, that might be a costly operation // * we need to download every timeline for the tenant, to activate it in memory @@ -616,7 +611,7 @@ pub fn spawn_storage_sync_thread( } } - let applicable_index_parts = runtime.block_on(download_index_parts( + let applicable_index_parts = BACKGROUND_RUNTIME.block_on(download_index_parts( conf, &storage, keys_for_index_part_downloads, @@ -625,7 +620,7 @@ pub fn spawn_storage_sync_thread( let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; let mut local_timeline_init_statuses = schedule_first_sync_tasks( - &mut runtime.block_on(remote_index.write()), + &mut BACKGROUND_RUNTIME.block_on(remote_index.write()), sync_queue, timelines_to_sync, ); @@ -634,31 +629,30 @@ pub fn spawn_storage_sync_thread( .extend(empty_tenants.0.into_iter()); let remote_index_clone = remote_index.clone(); - thread_mgr::spawn( - ThreadKind::StorageSync, + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::StorageSync, None, None, - "Remote storage sync thread", + "Remote storage sync task", false, - move || { + async move { storage_sync_loop( - runtime, conf, (storage, remote_index_clone, sync_queue), max_sync_errors, - ); + ) + .await; Ok(()) }, - ) - .context("Failed to spawn remote storage sync thread")?; + ); Ok(SyncStartupData { remote_index, local_timeline_init_statuses, }) } -fn 
storage_sync_loop( - runtime: Runtime, +async fn storage_sync_loop( conf: &'static PageServerConf, (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, @@ -669,7 +663,7 @@ fn storage_sync_loop( let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); - if thread_mgr::is_shutdown_requested() { + if task_mgr::is_shutdown_requested() { info!("Shutdown requested, stopping"); break; } @@ -683,20 +677,19 @@ fn storage_sync_loop( } // Concurrently perform all the tasks in the batch - let loop_step = runtime.block_on(async { - tokio::select! { - step = process_batches( - conf, - max_sync_errors, - loop_storage, - &index, - batched_tasks, - sync_queue, - ) - .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step), - _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()), - } - }); + let loop_step = tokio::select! { + step = process_batches( + conf, + max_sync_errors, + loop_storage, + &index, + batched_tasks, + sync_queue, + ) + .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step) + , + _ = task_mgr::shutdown_watcher() => ControlFlow::Break(()), + }; match loop_step { ControlFlow::Continue(updated_tenants) => { @@ -708,7 +701,7 @@ fn storage_sync_loop( updated_tenants.len() ); let mut timelines_to_attach = TenantTimelineValues::new(); - let index_accessor = runtime.block_on(index.read()); + let index_accessor = index.read().await; for tenant_id in updated_tenants { let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { Some(tenant_entry) => tenant_entry, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 7070f941f5..a4285e426b 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -153,7 +153,7 @@ pub(super) async fn upload_timeline_layers<'a>( // We have run the upload sync task, but the file we wanted to upload is gone. 
// This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to // retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and - // run compaction/gc threads, removing redundant files from disk. + // run compaction/gc tasks, removing redundant files from disk. // It's not good to pause GC/compaction because of those and we would rather skip such uploads. // // Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance). diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs new file mode 100644 index 0000000000..2aa803d119 --- /dev/null +++ b/pageserver/src/task_mgr.rs @@ -0,0 +1,463 @@ +//! +//! This module provides centralized handling of tokio tasks in the Page Server. +//! +//! We provide a few basic facilities: +//! - A global registry of tasks that lists what kind of tasks they are, and +//! which tenant or timeline they are working on +//! +//! - The ability to request a task to shut down. +//! +//! +//! # How it works? +//! +//! There is a global hashmap of all the tasks (`TASKS`). Whenever a new +//! task is spawned, a PageServerTask entry is added there, and when a +//! task dies, it removes itself from the hashmap. If you want to kill a +//! task, you can scan the hashmap to find it. +//! +//! # Task shutdown +//! +//! To kill a task, we rely on co-operation from the victim. Each task is +//! expected to periodically call the `is_shutdown_requested()` function, and +//! if it returns true, exit gracefully. In addition to that, when waiting for +//! the network or other long-running operation, you can use +//! `shutdown_watcher()` function to get a Future that will become ready if +//! the current task has been requested to shut down. You can use that with +//! Tokio select!(). +//! +//! +//! TODO: This would be a good place to also handle panics in a somewhat sane way. +//! 
Depending on what task panics, we might want to kill the whole server, or +//! only a single tenant or timeline. +//! + +// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. +// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. +#![allow(clippy::declare_interior_mutable_const)] + +use std::collections::HashMap; +use std::future::Future; +use std::panic::AssertUnwindSafe; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; + +use futures::FutureExt; +use tokio::runtime::Runtime; +use tokio::sync::watch; +use tokio::task::JoinHandle; +use tokio::task_local; + +use tracing::{debug, error, info, warn}; + +use once_cell::sync::Lazy; + +use utils::zid::{ZTenantId, ZTimelineId}; + +use crate::shutdown_pageserver; + +// +// There are four runtimes: +// +// Compute request runtime +// - used to handle connections from compute nodes. Any tasks related to satisfying +// GetPage requests, base backups, import, and other such compute node operations +// are handled by the Compute request runtime +// - page_service.rs +// - this includes layer downloads from remote storage, if a layer is needed to +// satisfy a GetPage request +// +// Management request runtime +// - used to handle HTTP API requests +// +// WAL receiver runtime: +// - used to handle WAL receiver connections. +// - and to receiver updates from etcd +// +// Background runtime +// - layer flushing +// - garbage collection +// - compaction +// - remote storage uploads +// - initial tenant loading +// +// Everything runs in a tokio task. If you spawn new tasks, spawn it using the correct +// runtime. +// +// There might be situations when one task needs to wait for a task running in another +// Runtime to finish. For example, if a background operation needs a layer from remote +// storage, it will start to download it. 
If a background operation needs a remote layer, +// and the download was already initiated by a GetPage request, the background task +// will wait for the download - running in the Page server runtime - to finish. +// Another example: the initial tenant loading tasks are launched in the background ops +// runtime. If a GetPage request comes in before the load of a tenant has finished, the +// GetPage request will wait for the tenant load to finish. +// +// The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to +// protect data structures. Let's keep it that way. Synchronous code is easier to debug +// and analyze, and there's a lot of hairy, low-level, performance critical code there. +// +// It's nice to have different runtimes, so that you can quickly eyeball how much CPU +// time each class of operations is taking, with 'top -H' or similar. +// +// It's also good to avoid hogging all threads that would be needed to process +// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't +// happen, but still. 
+// +pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("compute request worker") + .enable_all() + .build() + .expect("Failed to create compute request runtime") +}); + +pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("mgmt request worker") + .enable_all() + .build() + .expect("Failed to create mgmt request runtime") +}); + +pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("walreceiver worker") + .enable_all() + .build() + .expect("Failed to create walreceiver runtime") +}); + +pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("background op worker") + .enable_all() + .build() + .expect("Failed to create background op runtime") +}); + +pub struct PageserverTaskId(u64); + +/// Each task that we track is associated with a "task ID". It's just an +/// increasing number that we assign. Note that it is different from tokio::task::Id. +static NEXT_TASK_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); + +/// Global registry of tasks +static TASKS: Lazy>>> = + Lazy::new(|| Mutex::new(HashMap::new())); + +task_local! { + // There is a Tokio watch channel for each task, which can be used to signal the + // task that it needs to shut down. This task local variable holds the receiving + // end of the channel. The sender is kept in the global registry, so that anyone + // can send the signal to request task shutdown. + static SHUTDOWN_RX: watch::Receiver; + + // Each task holds reference to its own PageServerTask here. + static CURRENT_TASK: Arc; +} + +/// +/// There are many kinds of tasks in the system. Some are associated with a particular +/// tenant or timeline, while others are global. +/// +/// Note that we don't try to limit how many task of a certain kind can be running +/// at the same time. 
+/// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum TaskKind { + // libpq listener task. It just accepts connection and spawns a + // PageRequestHandler task for each connection. + LibpqEndpointListener, + + // HTTP endpoint listener. + HttpEndpointListener, + + // Task that handles a single connection. A PageRequestHandler task + // starts detached from any particular tenant or timeline, but it can be + // associated with one later, after receiving a command from the client. + PageRequestHandler, + + // Manages the WAL receiver connection for one timeline. It subscribes to + // events from etcd, decides which safekeeper to connect to. It spawns a + // separate WalReceiverConnection task to handle each connection. + WalReceiverManager, + + // Handles a connection to a safekeeper, to stream WAL to a timeline. + WalReceiverConnection, + + // Garbage collection worker. One per tenant + GarbageCollector, + + // Compaction. One per tenant. + Compaction, + + // Initial logical size calculation + InitialLogicalSizeCalculation, + + // Task that flushes frozen in-memory layers to disk + LayerFlushTask, + + // Task that manages the remote upload queue + StorageSync, + + // task that handles the initial downloading of all tenants + InitialLoad, + + // task that handles attaching a tenant + Attach, +} + +#[derive(Default)] +struct MutableTaskState { + /// Tenant and timeline that this task is associated with. + tenant_id: Option, + timeline_id: Option, + + /// Handle for waiting for the task to exit. It can be None, if the + /// the task has already exited. + join_handle: Option>, +} + +struct PageServerTask { + #[allow(dead_code)] // unused currently + task_id: PageserverTaskId, + + kind: TaskKind, + + name: String, + + // To request task shutdown, send 'true' to the channel to notify the task. 
+ shutdown_tx: watch::Sender, + + mutable: Mutex, +} + +/// Launch a new task +/// Note: if shutdown_process_on_error is set to true failure +/// of the task will lead to shutdown of entire process +pub fn spawn( + runtime: &tokio::runtime::Handle, + kind: TaskKind, + tenant_id: Option, + timeline_id: Option, + name: &str, + shutdown_process_on_error: bool, + future: F, +) -> PageserverTaskId +where + F: Future> + Send + 'static, +{ + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed); + let task = Arc::new(PageServerTask { + task_id: PageserverTaskId(task_id), + kind, + name: name.to_string(), + shutdown_tx, + mutable: Mutex::new(MutableTaskState { + tenant_id, + timeline_id, + join_handle: None, + }), + }); + + TASKS.lock().unwrap().insert(task_id, Arc::clone(&task)); + + let mut task_mut = task.mutable.lock().unwrap(); + + let task_name = name.to_string(); + let task_cloned = Arc::clone(&task); + let join_handle = runtime.spawn(task_wrapper( + task_name, + task_id, + task_cloned, + shutdown_rx, + shutdown_process_on_error, + future, + )); + task_mut.join_handle = Some(join_handle); + drop(task_mut); + + // The task is now running. Nothing more to do here + PageserverTaskId(task_id) +} + +/// This wrapper function runs in a newly-spawned task. It initializes the +/// task-local variables and calls the payload function. +async fn task_wrapper( + task_name: String, + task_id: u64, + task: Arc, + shutdown_rx: watch::Receiver, + shutdown_process_on_error: bool, + future: F, +) where + F: Future> + Send + 'static, +{ + debug!("Starting task '{}'", task_name); + + let result = SHUTDOWN_RX + .scope( + shutdown_rx, + CURRENT_TASK.scope(task, { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. 
+ AssertUnwindSafe(future).catch_unwind() + }), + ) + .await; + task_finish(result, task_name, task_id, shutdown_process_on_error).await; +} + +async fn task_finish( + result: std::result::Result< + anyhow::Result<()>, + std::boxed::Box, + >, + task_name: String, + task_id: u64, + shutdown_process_on_error: bool, +) { + // Remove our entry from the global hashmap. + let task = TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); + + let mut shutdown_process = false; + { + let task_mut = task.mutable.lock().unwrap(); + + match result { + Ok(Ok(())) => { + debug!("Task '{}' exited normally", task_name); + } + Ok(Err(err)) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + Err(err) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + } + } + + if shutdown_process { + shutdown_pageserver(1).await; + } +} + +// expected to be called from the task of the given id. +pub fn associate_with(tenant_id: Option, timeline_id: Option) { + CURRENT_TASK.with(|ct| { + let mut task_mut = ct.mutable.lock().unwrap(); + task_mut.tenant_id = tenant_id; + task_mut.timeline_id = timeline_id; + }); +} + +/// Is there a task running that matches the criteria + +/// Signal and wait for tasks to shut down. +/// +/// +/// The arguments are used to select the tasks to kill. 
Any None arguments are +/// ignored. For example, to shut down all WalReceiver tasks: +/// +/// shutdown_tasks(Some(TaskKind::WalReceiver), None, None) +/// +/// Or to shut down all tasks for given timeline: +/// +/// shutdown_tasks(None, Some(tenantid), Some(timelineid)) +/// +pub async fn shutdown_tasks( + kind: Option, + tenant_id: Option, + timeline_id: Option, +) { + let mut victim_tasks = Vec::new(); + + { + let tasks = TASKS.lock().unwrap(); + for task in tasks.values() { + let task_mut = task.mutable.lock().unwrap(); + if (kind.is_none() || Some(task.kind) == kind) + && (tenant_id.is_none() || task_mut.tenant_id == tenant_id) + && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) + { + let _ = task.shutdown_tx.send_replace(true); + victim_tasks.push(Arc::clone(task)); + } + } + } + + for task in victim_tasks { + let join_handle = { + let mut task_mut = task.mutable.lock().unwrap(); + info!("waiting for {} to shut down", task.name); + let join_handle = task_mut.join_handle.take(); + drop(task_mut); + join_handle + }; + if let Some(join_handle) = join_handle { + let _ = join_handle.await; + } else { + // Possibly one of: + // * The task had not even fully started yet. + // * It was shut down concurrently and already exited + } + } +} + +pub fn current_task_kind() -> Option { + CURRENT_TASK.try_with(|ct| ct.kind).ok() +} + +/// A Future that can be used to check if the current task has been requested to +/// shut down. +pub async fn shutdown_watcher() { + let mut shutdown_rx = SHUTDOWN_RX + .try_with(|rx| rx.clone()) + .expect("shutdown_requested() called in an unexpected task or thread"); + + while !*shutdown_rx.borrow() { + if shutdown_rx.changed().await.is_err() { + break; + } + } +} + +/// Has the current task been requested to shut down? 
+pub fn is_shutdown_requested() -> bool { + if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) { + *shutdown_rx.borrow() + } else { + if !cfg!(test) { + warn!("is_shutdown_requested() called in an unexpected task or thread"); + } + false + } +} diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index baa58f5eb5..db256b0f65 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -5,14 +5,14 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; use crate::layered_repository::ephemeral_file::is_ephemeral_file; use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; -use crate::layered_repository::{Repository, Timeline}; +use crate::layered_repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; +use crate::task_mgr::{self, TaskKind}; use crate::tenant_config::TenantConfOpt; -use crate::thread_mgr::ThreadKind; -use crate::walredo::PostgresRedoManager; -use crate::{thread_mgr, timelines, walreceiver, TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::Context; +use crate::walredo::{PostgresRedoManager, WalRedoManager}; +use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; +use anyhow::{ensure, Context}; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use std::collections::hash_map::{self, Entry}; @@ -21,34 +21,24 @@ use std::ffi::OsStr; use std::fmt; use std::path::{Path, PathBuf}; use std::sync::Arc; -use tokio::sync::mpsc; use tracing::*; -pub use tenants_state::try_send_timeline_update; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::crashsafe_dir; +use utils::zid::{ZTenantId, ZTimelineId}; mod tenants_state { - use anyhow::ensure; use once_cell::sync::Lazy; use std::{ collections::HashMap, sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, }; - use tokio::sync::mpsc; - use tracing::{debug, error}; 
use utils::zid::ZTenantId; - use crate::tenant_mgr::{LocalTimelineUpdate, Tenant}; + use crate::tenant_mgr::Tenant; static TENANTS: Lazy>> = Lazy::new(|| RwLock::new(HashMap::new())); - /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, - /// so that it can enable/disable corresponding processes. - static TIMELINE_UPDATE_SENDER: Lazy< - RwLock>>, - > = Lazy::new(|| RwLock::new(None)); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { TENANTS .read() @@ -60,39 +50,6 @@ mod tenants_state { .write() .expect("Failed to write() tenants lock, it got poisoned") } - - pub(super) fn set_timeline_update_sender( - timeline_updates_sender: mpsc::UnboundedSender, - ) -> anyhow::Result<()> { - let mut sender_guard = TIMELINE_UPDATE_SENDER - .write() - .expect("Failed to write() timeline_update_sender lock, it got poisoned"); - ensure!(sender_guard.is_none(), "Timeline update sender already set"); - *sender_guard = Some(timeline_updates_sender); - Ok(()) - } - - pub fn try_send_timeline_update(update: LocalTimelineUpdate) { - match TIMELINE_UPDATE_SENDER - .read() - .expect("Failed to read() timeline_update_sender lock, it got poisoned") - .as_ref() - { - Some(sender) => { - if let Err(e) = sender.send(update) { - error!("Failed to send timeline update: {}", e); - } - } - None => debug!("Timeline update sender is not enabled, cannot send update {update:?}"), - } - } - - pub(super) fn stop_timeline_update_sender() { - TIMELINE_UPDATE_SENDER - .write() - .expect("Failed to write() timeline_update_sender lock, it got poisoned") - .take(); - } } struct Tenant { @@ -103,9 +60,6 @@ struct Tenant { #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] pub enum TenantState { - // All data for this tenant is complete on local disk, but we haven't loaded the Repository, - // Timeline and Layer structs into memory yet, so it cannot be accessed yet. 
- //Ready, // This tenant exists on local disk, and the layer map has been loaded into memory. // The local disk might have some newer files that don't exist in cloud storage yet. Active, @@ -139,10 +93,6 @@ pub fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result { let _entered = info_span!("init_tenant_mgr").entered(); - let (timeline_updates_sender, timeline_updates_receiver) = - mpsc::unbounded_channel::(); - tenants_state::set_timeline_update_sender(timeline_updates_sender)?; - walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; let local_tenant_files = local_tenant_timeline_files(conf) .context("Failed to collect local tenant timeline files")?; @@ -156,7 +106,7 @@ pub fn init_tenant_mgr( let SyncStartupData { remote_index, local_timeline_init_statuses, - } = storage_sync::spawn_storage_sync_thread( + } = storage_sync::spawn_storage_sync_task( conf, local_tenant_files, storage, @@ -185,27 +135,6 @@ pub fn init_tenant_mgr( Ok(remote_index) } -pub enum LocalTimelineUpdate { - Detach { - id: ZTenantTimelineId, - // used to signal to the detach caller that walreceiver successfully terminated for specified id - join_confirmation_sender: std::sync::mpsc::Sender<()>, - }, - Attach { - id: ZTenantTimelineId, - timeline: Arc, - }, -} - -impl std::fmt::Debug for LocalTimelineUpdate { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Detach { id, .. } => f.debug_tuple("Detach").field(id).finish(), - Self::Attach { id, .. } => f.debug_tuple("Attach").field(id).finish(), - } - } -} - /// Reads local files to load tenants and their timelines given into pageserver's memory. /// Ignores other timelines that might be present for tenant, but were not passed as a parameter. /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", @@ -274,24 +203,26 @@ fn load_local_repo( /// /// Shut down all tenants. 
This runs as part of pageserver shutdown. /// -pub fn shutdown_all_tenants() { - tenants_state::stop_timeline_update_sender(); - let mut m = tenants_state::write_tenants(); - let mut tenantids = Vec::new(); - for (tenantid, tenant) in m.iter_mut() { - match tenant.state { - TenantState::Active | TenantState::Idle | TenantState::Stopping => { - tenant.state = TenantState::Stopping; - tenantids.push(*tenantid) +pub async fn shutdown_all_tenants() { + let tenantids = { + let mut m = tenants_state::write_tenants(); + let mut tenantids = Vec::new(); + for (tenantid, tenant) in m.iter_mut() { + match tenant.state { + TenantState::Active | TenantState::Idle | TenantState::Stopping => { + tenant.state = TenantState::Stopping; + tenantids.push(*tenantid) + } + TenantState::Broken => {} } - TenantState::Broken => {} } - } - drop(m); + drop(m); + tenantids + }; - thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None); + task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - // Ok, no background threads running anymore. Flush any remaining data in + // Ok, no background tasks running anymore. Flush any remaining data in // memory to disk. 
// // We assume that any incoming connections that might request pages from @@ -314,7 +245,40 @@ pub fn shutdown_all_tenants() { } } -pub fn create_tenant_repository( +fn create_repo( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: ZTenantId, + wal_redo_manager: Arc, + remote_index: RemoteIndex, +) -> anyhow::Result> { + let repo_dir = conf.tenant_path(&tenant_id); + ensure!( + !repo_dir.exists(), + "cannot create new tenant repo: '{}' directory already exists", + tenant_id + ); + + // top-level dir may exist if we are creating it through CLI + crashsafe_dir::create_dir_all(&repo_dir) + .with_context(|| format!("could not create directory {}", repo_dir.display()))?; + crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; + info!("created directory structure in {}", repo_dir.display()); + + // Save tenant's config + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + + Ok(Arc::new(Repository::new( + conf, + tenant_conf, + wal_redo_manager, + tenant_id, + remote_index, + conf.remote_storage_config.is_some(), + ))) +} + +pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, @@ -327,17 +291,12 @@ pub fn create_tenant_repository( } Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); - let repo = timelines::create_repo( - conf, - tenant_conf, - tenant_id, - wal_redo_manager, - remote_index, - )?; + let repo = create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; v.insert(Tenant { - state: TenantState::Idle, + state: TenantState::Active, repo, }); + crate::tenant_tasks::start_background_loops(tenant_id); Ok(Some(tenant_id)) } } @@ -360,13 +319,15 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { } pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| 
format!("Tenant not found for id {tenant_id}"))?; - let old_state = tenant.state; - tenant.state = new_state; - drop(m); + let old_state = { + let mut m = tenants_state::write_tenants(); + let tenant = m + .get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {tenant_id}"))?; + let old_state = tenant.state; + tenant.state = new_state; + old_state + }; match (old_state, new_state) { (TenantState::Broken, TenantState::Broken) @@ -389,24 +350,15 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. - // TODO maybe use tokio::sync::watch instead? - crate::tenant_tasks::start_compaction_loop(tenant_id)?; - crate::tenant_tasks::start_gc_loop(tenant_id)?; + crate::tenant_tasks::start_background_loops(tenant_id); } (TenantState::Idle, TenantState::Stopping) => { info!("stopping idle tenant {tenant_id}"); } (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { - info!("stopping tenant {tenant_id} threads due to new state {new_state}"); - thread_mgr::shutdown_threads( - Some(ThreadKind::WalReceiverManager), - Some(tenant_id), - None, - ); + info!("stopping tenant {tenant_id} tasks due to new state {new_state}"); - // Wait until all gc/compaction tasks finish - let repo = get_repository_for_tenant(tenant_id)?; - let _guard = repo.file_lock.write().unwrap(); + // Note: The caller is responsible for waiting for any tasks to finish. 
} } @@ -422,28 +374,28 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result<()> { +pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the threads to join + // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join // we cannot create new timelines and tenants, and that can take quite some time, // it can even become stuck due to a bug making whole pageserver unavailable for some operations // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests // will synchronize and either fail with the not found error or succeed - let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, timeline_id), - join_confirmation_sender: sender, - }); - debug!("waiting for wal receiver to shutdown"); - let _ = receiver.recv(); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(tenant_id), + Some(timeline_id), + ) + .await; debug!("wal receiver shutdown confirmed"); - debug!("waiting for threads to shutdown"); - thread_mgr::shutdown_threads(None, None, Some(timeline_id)); - debug!("thread shutdown completed"); + + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; + info!("timeline task shutdown completed"); match tenants_state::read_tenants().get(&tenant_id) { Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} 
not found in local tenant state"), @@ -452,36 +404,17 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow Ok(()) } -pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> { +pub async fn detach_tenant( + conf: &'static PageServerConf, + tenant_id: ZTenantId, +) -> anyhow::Result<()> { set_tenant_state(tenant_id, TenantState::Stopping)?; - // shutdown the tenant and timeline threads: gc, compaction, page service threads) - thread_mgr::shutdown_threads(None, Some(tenant_id), None); + // shutdown all tenant and timeline tasks: gc, compaction, page service) + task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - let mut walreceiver_join_handles = Vec::new(); - let removed_tenant = { + { let mut tenants_accessor = tenants_state::write_tenants(); - tenants_accessor.remove(&tenant_id) - }; - if let Some(tenant) = removed_tenant { - for (timeline_id, _) in tenant.repo.list_timelines() { - let (sender, receiver) = std::sync::mpsc::channel::<()>(); - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, timeline_id), - join_confirmation_sender: sender, - }); - walreceiver_join_handles.push((timeline_id, receiver)); - } - } - - // wait for wal receivers to stop without holding the lock, because walreceiver - // will attempt to change tenant state which is protected by the same global tenants lock. - // TODO do we need a timeout here? how to handle it? 
- // recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631 - // need to use crossbeam-channel - for (timeline_id, join_handle) in walreceiver_join_handles { - info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); - join_handle.recv().ok(); - info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); + tenants_accessor.remove(&tenant_id); } // If removal fails there will be no way to successfully retry detach, diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 4e9a5fc6ec..9aaafe7f92 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -1,270 +1,130 @@ //! This module contains functions to serve per-tenant background processes, //! such as compaction and GC -use std::collections::HashMap; -use std::ops::ControlFlow; use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant_mgr; use crate::tenant_mgr::TenantState; -use crate::thread_mgr::ThreadKind; -use crate::{tenant_mgr, thread_mgr}; -use anyhow::{self, Context}; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use once_cell::sync::OnceCell; -use tokio::sync::mpsc; -use tokio::sync::watch; use tracing::*; use utils::zid::ZTenantId; +pub fn start_background_loops(tenant_id: ZTenantId) { + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::Compaction, + Some(tenant_id), + None, + &format!("compactor for tenant {tenant_id}"), + false, + compaction_loop(tenant_id), + ); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::GarbageCollector, + Some(tenant_id), + None, + &format!("garbage collector for tenant {tenant_id}"), + false, + gc_loop(tenant_id), + ); +} + /// /// Compaction task's main loop /// -async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { - loop { - trace!("waking up"); +async fn compaction_loop(tenant_id: ZTenantId) -> 
anyhow::Result<()> { + info!("starting compaction loop for {tenant_id}"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + let result = async { + loop { + trace!("waking up"); + + // Run blocking part of the task - // Run blocking part of the task - let period: Result, _> = tokio::task::spawn_blocking(move || { // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - return Ok(ControlFlow::Break(())); + if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { + break Ok(()); } - - // Break if we're not allowed to write to disk - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // This should not fail. If someone started us, it means that the tenant exists. + // And before you remove a tenant, you have to wait until all the associated tasks + // exit. + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; // Run compaction - let compaction_period = repo.get_compaction_period(); - repo.compaction_iteration()?; - Ok(ControlFlow::Continue(compaction_period)) - }) - .await; - - // Decide whether to sleep or break - let sleep_duration = match period { - Ok(Ok(ControlFlow::Continue(period))) => period, - Ok(Ok(ControlFlow::Break(()))) => break, - Ok(Err(e)) => { + let mut sleep_duration = repo.get_compaction_period(); + if let Err(e) = repo.compaction_iteration() { error!("Compaction failed, retrying: {}", e); - Duration::from_secs(2) + sleep_duration = Duration::from_secs(2) } - Err(e) => { - error!("Compaction join error, retrying: {}", e); - Duration::from_secs(2) - } - }; - // Sleep - tokio::select! { - _ = cancel.changed() => { - trace!("received cancellation request"); - break; - }, - _ = tokio::time::sleep(sleep_duration) => {}, + // Sleep + tokio::select! 
{ + _ = task_mgr::shutdown_watcher() => { + trace!("received cancellation request"); + break Ok(()); + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } } } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - trace!( + info!( "compaction loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenantid) + tenant_mgr::get_tenant_state(tenant_id) ); -} - -static START_GC_LOOP: OnceCell> = OnceCell::new(); -static START_COMPACTION_LOOP: OnceCell> = OnceCell::new(); - -/// Spawn a task that will periodically schedule garbage collection until -/// the tenant becomes inactive. This should be called on tenant -/// activation. -pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> { - START_GC_LOOP - .get() - .context("Failed to get START_GC_LOOP")? - .blocking_send(tenantid) - .context("Failed to send to START_GC_LOOP channel")?; - Ok(()) -} - -/// Spawn a task that will periodically schedule compaction until -/// the tenant becomes inactive. This should be called on tenant -/// activation. -pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> { - START_COMPACTION_LOOP - .get() - .context("failed to get START_COMPACTION_LOOP")? 
- .blocking_send(tenantid) - .context("failed to send to START_COMPACTION_LOOP")?; - Ok(()) -} - -/// Spawn the TenantTaskManager -/// This needs to be called before start_gc_loop or start_compaction_loop -pub fn init_tenant_task_pool() -> anyhow::Result<()> { - let runtime = tokio::runtime::Builder::new_multi_thread() - .thread_name("tenant-task-worker") - .enable_all() - .on_thread_start(|| { - thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker") - }) - .on_thread_stop(thread_mgr::deregister) - .build()?; - - let (gc_send, mut gc_recv) = mpsc::channel::(100); - START_GC_LOOP - .set(gc_send) - .expect("Failed to set START_GC_LOOP"); - - let (compaction_send, mut compaction_recv) = mpsc::channel::(100); - START_COMPACTION_LOOP - .set(compaction_send) - .expect("Failed to set START_COMPACTION_LOOP"); - - // TODO this is getting repetitive - let mut gc_loops = HashMap::>::new(); - let mut compaction_loops = HashMap::>::new(); - - thread_mgr::spawn( - ThreadKind::TenantTaskManager, - None, - None, - "Tenant task manager main thread", - true, - move || { - runtime.block_on(async move { - let mut futures = FuturesUnordered::new(); - loop { - tokio::select! 
{ - _ = thread_mgr::shutdown_watcher() => { - // Send cancellation to all tasks - for (_, cancel) in gc_loops.drain() { - cancel.send(()).ok(); - } - for (_, cancel) in compaction_loops.drain() { - cancel.send(()).ok(); - } - - // Exit after all tasks finish - while let Some(result) = futures.next().await { - match result { - Ok(()) => { - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - }, - Err(e) => { - TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); - error!("loop join error {}", e) - }, - } - } - break; - }, - tenantid = gc_recv.recv() => { - let tenantid = tenantid.expect("Gc task channel closed unexpectedly"); - - // Spawn new task, request cancellation of the old one if exists - let (cancel_send, cancel_recv) = watch::channel(()); - let handle = tokio::spawn(gc_loop(tenantid, cancel_recv) - .instrument(info_span!("gc loop", tenant = %tenantid))); - if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) { - old_cancel_send.send(()).ok(); - } - - // Update metrics, remember handle - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - futures.push(handle); - }, - tenantid = compaction_recv.recv() => { - let tenantid = tenantid.expect("Compaction task channel closed unexpectedly"); - - // Spawn new task, request cancellation of the old one if exists - let (cancel_send, cancel_recv) = watch::channel(()); - let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv) - .instrument(info_span!("compaction loop", tenant = %tenantid))); - if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) { - old_cancel_send.send(()).ok(); - } - - // Update metrics, remember handle - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - futures.push(handle); - }, - result = futures.next() => { - // Log and count any unhandled panics - match result { - Some(Ok(())) => { - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - }, - Some(Err(e)) => { - TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); - 
error!("loop join error {}", e) - }, - None => {}, - }; - }, - } - } - }); - Ok(()) - }, - )?; - - Ok(()) + result } /// /// GC task's main loop /// -async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { - loop { - trace!("waking up"); +async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { + info!("starting gc loop for {tenant_id}"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + let result = async { + loop { + trace!("waking up"); - // Run blocking part of the task - let period: Result, _> = tokio::task::spawn_blocking(move || { // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - return Ok(ControlFlow::Break(())); + if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { + break Ok(()); } - - // Break if we're not allowed to write to disk - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // This should not fail. If someone started us, it means that the tenant exists. + // And before you remove a tenant, you have to wait until all the associated tasks + // exit. + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; // Run gc let gc_period = repo.get_gc_period(); let gc_horizon = repo.get_gc_horizon(); + let mut sleep_duration = gc_period; if gc_horizon > 0 { - repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; + if let Err(e) = repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false) + { + error!("Gc failed, retrying: {}", e); + sleep_duration = Duration::from_secs(2) + } } - Ok(ControlFlow::Continue(gc_period)) - }) - .await; - - // Decide whether to sleep or break - let sleep_duration = match period { - Ok(Ok(ControlFlow::Continue(period))) => period, - Ok(Ok(ControlFlow::Break(()))) => break, - Ok(Err(e)) => { - error!("Gc failed, retrying: {}", e); - Duration::from_secs(2) + // Sleep + tokio::select! 
{ + _ = task_mgr::shutdown_watcher() => { + trace!("received cancellation request"); + break Ok(()); + }, + _ = tokio::time::sleep(sleep_duration) => {}, } - Err(e) => { - error!("Gc join error, retrying: {}", e); - Duration::from_secs(2) - } - }; - - // Sleep - tokio::select! { - _ = cancel.changed() => { - trace!("received cancellation request"); - break; - }, - _ = tokio::time::sleep(sleep_duration) => {}, } } - trace!( + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + info!( "GC loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenantid) + tenant_mgr::get_tenant_state(tenant_id) ); + result } diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs deleted file mode 100644 index cdd38febbc..0000000000 --- a/pageserver/src/thread_mgr.rs +++ /dev/null @@ -1,409 +0,0 @@ -//! -//! This module provides centralized handling of threads in the Page Server. -//! -//! We provide a few basic facilities: -//! - A global registry of threads that lists what kind of threads they are, and -//! which tenant or timeline they are working on -//! -//! - The ability to request a thread to shut down. -//! -//! -//! # How it works? -//! -//! There is a global hashmap of all the threads (`THREADS`). Whenever a new -//! thread is spawned, a PageServerThread entry is added there, and when a -//! thread dies, it removes itself from the hashmap. If you want to kill a -//! thread, you can scan the hashmap to find it. -//! -//! # Thread shutdown -//! -//! To kill a thread, we rely on co-operation from the victim. Each thread is -//! expected to periodically call the `is_shutdown_requested()` function, and -//! if it returns true, exit gracefully. In addition to that, when waiting for -//! the network or other long-running operation, you can use -//! `shutdown_watcher()` function to get a Future that will become ready if -//! the current thread has been requested to shut down. You can use that with -//! 
Tokio select!(), but note that it relies on thread-local storage, so it -//! will only work with the "current-thread" Tokio runtime! -//! -//! -//! TODO: This would be a good place to also handle panics in a somewhat sane way. -//! Depending on what thread panics, we might want to kill the whole server, or -//! only a single tenant or timeline. -//! - -use std::cell::RefCell; -use std::collections::HashMap; -use std::panic; -use std::panic::AssertUnwindSafe; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::{Arc, Mutex}; -use std::thread; -use std::thread::JoinHandle; - -use tokio::sync::watch; - -use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - -use utils::zid::{ZTenantId, ZTimelineId}; - -use crate::shutdown_pageserver; - -/// Each thread that we track is associated with a "thread ID". It's just -/// an increasing number that we assign, not related to any system thread -/// id. -static NEXT_THREAD_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); - -/// Global registry of threads -static THREADS: Lazy>>> = - Lazy::new(|| Mutex::new(HashMap::new())); - -// There is a Tokio watch channel for each thread, which can be used to signal the -// thread that it needs to shut down. This thread local variable holds the receiving -// end of the channel. The sender is kept in the global registry, so that anyone -// can send the signal to request thread shutdown. -thread_local!(static SHUTDOWN_RX: RefCell>> = RefCell::new(None)); - -// Each thread holds reference to its own PageServerThread here. -thread_local!(static CURRENT_THREAD: RefCell>> = RefCell::new(None)); - -/// -/// There are many kinds of threads in the system. Some are associated with a particular -/// tenant or timeline, while others are global. -/// -/// Note that we don't try to limit how may threads of a certain kind can be running -/// at the same time. -/// -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum ThreadKind { - // libpq listener thread. 
It just accepts connection and spawns a - // PageRequestHandler thread for each connection. - LibpqEndpointListener, - - // HTTP endpoint listener. - HttpEndpointListener, - - // Thread that handles a single connection. A PageRequestHandler thread - // starts detached from any particular tenant or timeline, but it can be - // associated with one later, after receiving a command from the client. - PageRequestHandler, - - // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL. - WalReceiverManager, - - // Thread that schedules new compaction and gc jobs - TenantTaskManager, - - // Worker thread for tenant tasks thread pool - TenantTaskWorker, - - // Thread that flushes frozen in-memory layers to disk - LayerFlushThread, - - // Thread for synchronizing pageserver layer files with the remote storage. - // Shared by all tenants. - StorageSync, -} - -#[derive(Default)] -struct MutableThreadState { - /// Tenant and timeline that this thread is associated with. - tenant_id: Option, - timeline_id: Option, - - /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. OR if this thread is managed externally - /// and was not spawned through thread_mgr.rs::spawn function. - join_handle: Option>, -} - -struct PageServerThread { - thread_id: u64, - - kind: ThreadKind, - - name: String, - - // To request thread shutdown, set the flag, and send a dummy message to the - // channel to notify it. 
- shutdown_requested: AtomicBool, - shutdown_tx: watch::Sender<()>, - - mutable: Mutex, -} - -/// Launch a new thread -/// Note: if shutdown_process_on_error is set to true failure -/// of the thread will lead to shutdown of entire process -pub fn spawn( - kind: ThreadKind, - tenant_id: Option, - timeline_id: Option, - name: &str, - shutdown_process_on_error: bool, - f: F, -) -> std::io::Result -where - F: FnOnce() -> anyhow::Result<()> + Send + 'static, -{ - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - let thread = Arc::new(PageServerThread { - thread_id, - kind, - name: name.to_string(), - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - mutable: Mutex::new(MutableThreadState { - tenant_id, - timeline_id, - join_handle: None, - }), - }); - - THREADS - .lock() - .unwrap() - .insert(thread_id, Arc::clone(&thread)); - - let mut thread_mut = thread.mutable.lock().unwrap(); - - let thread_cloned = Arc::clone(&thread); - let thread_name = name.to_string(); - let join_handle = match thread::Builder::new() - .name(name.to_string()) - .spawn(move || { - thread_wrapper( - thread_name, - thread_id, - thread_cloned, - shutdown_rx, - shutdown_process_on_error, - f, - ) - }) { - Ok(handle) => handle, - Err(err) => { - error!("Failed to spawn thread '{}': {}", name, err); - // Could not spawn the thread. Remove the entry - THREADS.lock().unwrap().remove(&thread_id); - return Err(err); - } - }; - thread_mut.join_handle = Some(join_handle); - drop(thread_mut); - - // The thread is now running. Nothing more to do here - Ok(thread_id) -} - -/// This wrapper function runs in a newly-spawned thread. 
It initializes the -/// thread-local variables and calls the payload function -fn thread_wrapper( - thread_name: String, - thread_id: u64, - thread: Arc, - shutdown_rx: watch::Receiver<()>, - shutdown_process_on_error: bool, - f: F, -) where - F: FnOnce() -> anyhow::Result<()> + Send + 'static, -{ - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - CURRENT_THREAD.with(|ct| { - *ct.borrow_mut() = Some(thread); - }); - - debug!("Starting thread '{}'", thread_name); - - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - let result = panic::catch_unwind(AssertUnwindSafe(f)); - - // Remove our entry from the global hashmap. - let thread = THREADS - .lock() - .unwrap() - .remove(&thread_id) - .expect("no thread in registry"); - - let thread_mut = thread.mutable.lock().unwrap(); - match result { - Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), - Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - shutdown_pageserver(1); - } else { - error!( - "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - } - } - Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - shutdown_pageserver(1); - } else { - error!( - "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err - ); - } - } - } -} - -// expected to be called from the thread of the given id. 
-pub fn associate_with(tenant_id: Option, timeline_id: Option) { - CURRENT_THREAD.with(|ct| { - let borrowed = ct.borrow(); - let mut thread_mut = borrowed.as_ref().unwrap().mutable.lock().unwrap(); - thread_mut.tenant_id = tenant_id; - thread_mut.timeline_id = timeline_id; - }); -} - -/// Is there a thread running that matches the criteria - -/// Signal and wait for threads to shut down. -/// -/// -/// The arguments are used to select the threads to kill. Any None arguments are -/// ignored. For example, to shut down all WalReceiver threads: -/// -/// shutdown_threads(Some(ThreadKind::WalReceiver), None, None) -/// -/// Or to shut down all threads for given timeline: -/// -/// shutdown_threads(None, Some(timelineid), None) -/// -pub fn shutdown_threads( - kind: Option, - tenant_id: Option, - timeline_id: Option, -) { - let mut victim_threads = Vec::new(); - - let threads = THREADS.lock().unwrap(); - for thread in threads.values() { - let thread_mut = thread.mutable.lock().unwrap(); - if (kind.is_none() || Some(thread.kind) == kind) - && (tenant_id.is_none() || thread_mut.tenant_id == tenant_id) - && (timeline_id.is_none() || thread_mut.timeline_id == timeline_id) - { - thread.shutdown_requested.store(true, Ordering::Relaxed); - // FIXME: handle error? - let _ = thread.shutdown_tx.send(()); - victim_threads.push(Arc::clone(thread)); - } - } - drop(threads); - - for thread in victim_threads { - let mut thread_mut = thread.mutable.lock().unwrap(); - info!("waiting for {} to shut down", thread.name); - if let Some(join_handle) = thread_mut.join_handle.take() { - drop(thread_mut); - let _ = join_handle.join(); - } else { - // Possibly one of: - // * The thread had not even fully started yet. - // * It was shut down concurrently and already exited - // * Is managed through `register`/`deregister` fns without providing a join handle - } - } -} - -/// A Future that can be used to check if the current thread has been requested to -/// shut down. 
-pub async fn shutdown_watcher() { - let _ = SHUTDOWN_RX - .with(|rx| { - rx.borrow() - .as_ref() - .expect("shutdown_requested() called in an unexpected thread") - .clone() - }) - .changed() - .await; -} - -/// Has the current thread been requested to shut down? -pub fn is_shutdown_requested() -> bool { - CURRENT_THREAD.with(|ct| { - if let Some(ct) = ct.borrow().as_ref() { - ct.shutdown_requested.load(Ordering::Relaxed) - } else { - if !cfg!(test) { - warn!("is_shutdown_requested() called in an unexpected thread"); - } - false - } - }) -} - -/// Needed to register threads that were not spawned through spawn function. -/// For example tokio blocking threads. This function is expected to be used -/// in tandem with `deregister`. -/// NOTE: threads registered through this function cannot be joined -pub fn register(kind: ThreadKind, name: &str) { - CURRENT_THREAD.with(|ct| { - let mut borrowed = ct.borrow_mut(); - if borrowed.is_some() { - panic!("thread already registered") - }; - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - - let thread = Arc::new(PageServerThread { - thread_id, - kind, - name: name.to_owned(), - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - mutable: Mutex::new(MutableThreadState { - tenant_id: None, - timeline_id: None, - join_handle: None, - }), - }); - - *borrowed = Some(Arc::clone(&thread)); - - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - - THREADS.lock().unwrap().insert(thread_id, thread); - }); -} - -// Expected to be used in tandem with `register`. 
See the doc for `register` for more details -pub fn deregister() { - CURRENT_THREAD.with(|ct| { - let mut borrowed = ct.borrow_mut(); - let thread = match borrowed.take() { - Some(thread) => thread, - None => panic!("calling deregister on unregistered thread"), - }; - - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = None; - }); - - THREADS.lock().unwrap().remove(&thread.thread_id) - }); -} diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 9356893908..35dec54d5c 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,7 +2,7 @@ //! Timeline management code // -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, Context, Result}; use remote_storage::path_with_suffix_extension; use std::{ @@ -14,21 +14,15 @@ use std::{ use tracing::*; use utils::{ - crashsafe_dir, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; +use crate::config::PageServerConf; +use crate::layered_repository::{Repository, Timeline}; use crate::tenant_mgr; use crate::CheckpointConfig; -use crate::{ - config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, -}; use crate::{import_datadir, TEMP_FILE_SUFFIX}; -use crate::{ - layered_repository::{Repository, Timeline}, - walredo::WalRedoManager, -}; #[derive(Debug, Clone, Copy)] pub struct PointInTime { @@ -36,39 +30,6 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn create_repo( - conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, - wal_redo_manager: Arc, - remote_index: RemoteIndex, -) -> Result> { - let repo_dir = conf.tenant_path(&tenant_id); - ensure!( - !repo_dir.exists(), - "cannot create new tenant repo: '{}' directory already exists", - tenant_id - ); - - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; - info!("created 
directory structure in {}", repo_dir.display()); - - // Save tenant's config - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; - - Ok(Arc::new(Repository::new( - conf, - tenant_conf, - wal_redo_manager, - tenant_id, - remote_index, - conf.remote_storage_config.is_some(), - ))) -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -158,7 +119,7 @@ fn bootstrap_timeline( /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, /// a new unique ID is generated. /// -pub(crate) fn create_timeline( +pub(crate) async fn create_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, new_timeline_id: Option, @@ -187,7 +148,7 @@ pub(crate) fn create_timeline( // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. *lsn = lsn.align(); - ancestor_timeline.wait_lsn(*lsn)?; + ancestor_timeline.wait_lsn(*lsn).await?; let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index d6420e1d18..deac299747 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -23,131 +23,61 @@ mod connection_manager; mod walreceiver_connection; +use crate::config::PageServerConf; +use crate::task_mgr::WALRECEIVER_RUNTIME; + use anyhow::{ensure, Context}; use etcd_broker::Client; use itertools::Itertools; -use std::cell::Cell; -use std::collections::{hash_map, HashMap, HashSet}; +use once_cell::sync::OnceCell; use std::future::Future; -use std::num::NonZeroU64; use std::sync::Arc; -use std::thread_local; -use std::time::Duration; -use tokio::{ - select, - sync::{mpsc, watch}, - task::JoinHandle, -}; +use tokio::sync::watch; use tracing::*; use url::Url; -use crate::config::PageServerConf; -use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; -use 
crate::thread_mgr::{self, ThreadKind}; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +pub use connection_manager::spawn_connection_manager_task; -thread_local! { - // Boolean that is true only for WAL receiver threads - // - // This is used in `wait_lsn` to guard against usage that might lead to a deadlock. - pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); -} +static ETCD_CLIENT: OnceCell = OnceCell::new(); -/// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline. -/// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities. -pub fn init_wal_receiver_main_thread( - conf: &'static PageServerConf, - mut timeline_updates_receiver: mpsc::UnboundedReceiver, -) -> anyhow::Result<()> { +/// +/// Initialize the etcd client. This must be called once at page server startup. +/// +pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> { let etcd_endpoints = conf.broker_endpoints.clone(); ensure!( !etcd_endpoints.is_empty(), "Cannot start wal receiver: etcd endpoints are empty" ); - let broker_prefix = &conf.broker_etcd_prefix; - info!( - "Starting wal receiver main thread, etcd endpoints: {}", - etcd_endpoints.iter().map(Url::to_string).join(", ") - ); - let runtime = tokio::runtime::Builder::new_multi_thread() - .thread_name("wal-receiver-runtime-thread") - .enable_all() - .on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true))) - .build() - .context("Failed to create storage sync runtime")?; - let etcd_client = runtime - .block_on(Client::connect(etcd_endpoints, None)) + let etcd_client = Client::connect(etcd_endpoints.clone(), None) + .await .context("Failed to connect to etcd")?; - thread_mgr::spawn( - ThreadKind::WalReceiverManager, - None, - None, - "WAL receiver manager main thread", - true, - move || { - runtime.block_on(async move { - let mut local_timeline_wal_receivers = HashMap::new(); - loop { - select! 
{ - _ = thread_mgr::shutdown_watcher() => { - info!("Shutdown signal received"); - shutdown_all_wal_connections(&mut local_timeline_wal_receivers).await; - break; - }, - _ = wal_receiver_main_thread_loop_step( - broker_prefix, - &etcd_client, - &mut timeline_updates_receiver, - &mut local_timeline_wal_receivers, - ) => {}, - } - } - }.instrument(info_span!("wal_receiver_main"))); + // FIXME: Should we still allow the pageserver to start, if etcd + // doesn't work? It could still serve GetPage requests, with the + // data it has locally and from what it can download from remote + // storage + if ETCD_CLIENT.set(etcd_client).is_err() { + panic!("etcd already initialized"); + } - info!("Wal receiver main thread stopped"); - Ok(()) - }, - ) - .map(|_thread_id| ()) - .context("Failed to spawn wal receiver main thread") + info!( + "Initialized etcd client with endpoints: {}", + etcd_endpoints.iter().map(Url::to_string).join(", ") + ); + Ok(()) } -async fn shutdown_all_wal_connections( - local_timeline_wal_receivers: &mut HashMap>>, -) { - info!("Shutting down all WAL connections"); - let mut broker_join_handles = Vec::new(); - for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { - for (timeline_id, handles) in timelines { - handles.cancellation.send(()).ok(); - broker_join_handles.push(( - ZTenantTimelineId::new(tenant_id, timeline_id), - handles.handle, - )); - } - } +/// +/// Get a handle to the etcd client +/// +pub fn get_etcd_client() -> &'static etcd_broker::Client { + ETCD_CLIENT.get().expect("etcd client not initialized") +} - let mut tenants = HashSet::with_capacity(broker_join_handles.len()); - for (id, broker_join_handle) in broker_join_handles { - tenants.insert(id.tenant_id); - debug!("Waiting for wal broker for timeline {id} to finish"); - if let Err(e) = broker_join_handle.await { - error!("Failed to join on wal broker for timeline {id}: {e}"); - } - } - if let Err(e) = tokio::task::spawn_blocking(move || { - for tenant_id in tenants { - if 
let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { - error!("Failed to make tenant {tenant_id} idle: {e:?}"); - } - } - }) - .await - { - error!("Failed to await a task to make all tenants idle: {e:?}"); - } +pub fn is_etcd_client_initialized() -> bool { + ETCD_CLIENT.get().is_some() } /// A handle of an asynchronous task. @@ -157,8 +87,7 @@ async fn shutdown_all_wal_connections( /// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission. /// That may lead to certain events not being observed by the listener. #[derive(Debug)] -struct TaskHandle { - handle: JoinHandle>, +pub struct TaskHandle { events_receiver: watch::Receiver>, cancellation: watch::Sender<()>, } @@ -167,7 +96,7 @@ struct TaskHandle { pub enum TaskEvent { Started, NewEvent(E), - End(Result<(), String>), + End, } impl TaskHandle { @@ -184,164 +113,28 @@ impl TaskHandle { let events_sender = Arc::new(events_sender); let sender = Arc::clone(&events_sender); - let handle = tokio::task::spawn(async move { + let _ = WALRECEIVER_RUNTIME.spawn(async move { events_sender.send(TaskEvent::Started).ok(); task(sender, cancellation_receiver).await }); TaskHandle { - handle, events_receiver, cancellation, } } async fn next_task_event(&mut self) -> TaskEvent { - select! { - next_task_event = self.events_receiver.changed() => match next_task_event { - Ok(()) => self.events_receiver.borrow().clone(), - Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await, - }, - task_completion_result = join_on_handle(&mut self.handle) => task_completion_result, + match self.events_receiver.changed().await { + Ok(()) => self.events_receiver.borrow().clone(), + Err(_task_channel_part_dropped) => TaskEvent::End, } } /// Aborts current task, waiting for it to finish. 
- async fn shutdown(self) { + pub async fn shutdown(mut self) { self.cancellation.send(()).ok(); - if let Err(e) = self.handle.await { - error!("Task failed to shut down: {e}") - } + // wait until the sender is dropped + while self.events_receiver.changed().await.is_ok() {} } } - -async fn join_on_handle(handle: &mut JoinHandle>) -> TaskEvent { - match handle.await { - Ok(task_result) => TaskEvent::End(task_result), - Err(e) => { - if e.is_cancelled() { - TaskEvent::End(Ok(())) - } else { - TaskEvent::End(Err(format!("WAL receiver task panicked: {e}"))) - } - } - } -} - -/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. -/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. -/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. -/// -/// Cannot fail, should always try to process the next timeline event even if the other one was not processed properly. -async fn wal_receiver_main_thread_loop_step<'a>( - broker_prefix: &'a str, - etcd_client: &'a Client, - timeline_updates_receiver: &'a mut mpsc::UnboundedReceiver, - local_timeline_wal_receivers: &'a mut HashMap>>, -) { - // Only react on updates from [`tenant_mgr`] on local timeline attach/detach. - match timeline_updates_receiver.recv().await { - Some(update) => { - info!("Processing timeline update: {update:?}"); - match update { - // Timeline got detached, stop all related tasks and remove public timeline data. 
- LocalTimelineUpdate::Detach { - id, - join_confirmation_sender, - } => { - match local_timeline_wal_receivers.get_mut(&id.tenant_id) { - Some(wal_receivers) => { - if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) { - o.remove().shutdown().await - } - if wal_receivers.is_empty() { - if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Idle).await { - error!("Failed to make tenant idle for id {id}: {e:#}"); - } - } - } - None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"), - }; - if let Err(e) = join_confirmation_sender.send(()) { - warn!("cannot send wal_receiver shutdown confirmation {e}") - } else { - info!("confirm walreceiver shutdown for {id}"); - } - } - // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. - LocalTimelineUpdate::Attach { id, timeline } => { - let timeline_connection_managers = local_timeline_wal_receivers - .entry(id.tenant_id) - .or_default(); - - if timeline_connection_managers.is_empty() { - if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Active).await - { - error!("Failed to make tenant active for id {id}: {e:#}"); - return; - } - } - - let vacant_connection_manager_entry = - match timeline_connection_managers.entry(id.timeline_id) { - hash_map::Entry::Occupied(_) => { - debug!("Attepted to readd an existing timeline {id}, ignoring"); - return; - } - hash_map::Entry::Vacant(v) => v, - }; - - let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = - match fetch_tenant_settings(id.tenant_id).await { - Ok(settings) => settings, - Err(e) => { - error!("Failed to fetch tenant settings for id {id}: {e:#}"); - return; - } - }; - - vacant_connection_manager_entry.insert( - connection_manager::spawn_connection_manager_task( - id, - broker_prefix.to_owned(), - etcd_client.clone(), - timeline, - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - ), - ); - } - } - } - None => 
{ - info!("Local timeline update channel closed"); - shutdown_all_wal_connections(local_timeline_wal_receivers).await; - } - } -} - -async fn fetch_tenant_settings( - tenant_id: ZTenantId, -) -> anyhow::Result<(Duration, Duration, NonZeroU64)> { - tokio::task::spawn_blocking(move || { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - Ok::<_, anyhow::Error>(( - repo.get_wal_receiver_connect_timeout(), - repo.get_lagging_wal_timeout(), - repo.get_max_lsn_wal_lag(), - )) - }) - .await - .with_context(|| format!("Failed to join on tenant {tenant_id} settings fetch task"))? -} - -async fn change_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - tokio::task::spawn_blocking(move || { - tenant_mgr::set_tenant_state(tenant_id, new_state) - .with_context(|| format!("Failed to activate tenant {tenant_id}")) - }) - .await - .with_context(|| format!("Failed to spawn activation task for tenant {tenant_id}"))? 
-} diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 0261203049..1fcb768ddf 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -17,6 +17,9 @@ use std::{ }; use crate::layered_repository::Timeline; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::task_mgr::WALRECEIVER_RUNTIME; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -26,7 +29,10 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use crate::{ + exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, +}; use utils::{ lsn::Lsn, zid::{NodeId, ZTenantTimelineId}, @@ -35,29 +41,38 @@ use utils::{ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; /// Spawns the loop to take care of the timeline's WAL streaming connection. 
-pub(super) fn spawn_connection_manager_task( - id: ZTenantTimelineId, +pub fn spawn_connection_manager_task( broker_loop_prefix: String, - mut client: Client, - local_timeline: Arc, + timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, -) -> TaskHandle<()> { - TaskHandle::spawn(move |_, mut cancellation| { +) -> anyhow::Result<()> { + let mut etcd_client = get_etcd_client().clone(); + + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverManager, + Some(tenant_id), + Some(timeline_id), + &format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id), + false, async move { info!("WAL receiver broker started, connecting to etcd"); let mut walreceiver_state = WalreceiverState::new( - id, - local_timeline, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, ); loop { select! { - _ = cancellation.changed() => { - info!("Broker subscription init cancelled, shutting down"); + _ = task_mgr::shutdown_watcher() => { + info!("WAL receiver shutdown requested, shutting down"); + // Kill current connection, if any if let Some(wal_connection) = walreceiver_state.wal_connection.take() { wal_connection.connection_task.shutdown().await; @@ -67,14 +82,15 @@ pub(super) fn spawn_connection_manager_task( _ = connection_manager_loop_step( &broker_loop_prefix, - &mut client, + &mut etcd_client, &mut walreceiver_state, ) => {}, } } } - .instrument(info_span!("wal_connection_manager", id = %id)) - }) + .instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + ); + Ok(()) } /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. 
@@ -85,7 +101,10 @@ async fn connection_manager_loop_step( etcd_client: &mut Client, walreceiver_state: &mut WalreceiverState, ) { - let id = walreceiver_state.id; + let id = ZTenantTimelineId { + tenant_id: walreceiver_state.timeline.tenant_id, + timeline_id: walreceiver_state.timeline.timeline_id, + }; // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, // running the entire loop step as much as possible to an end. @@ -98,6 +117,14 @@ async fn connection_manager_loop_step( loop { let time_until_next_retry = walreceiver_state.time_until_next_retry(); + // These things are happening concurrently: + // + // - keep receiving WAL on the current connection + // - if the shared state says we need to change connection, disconnect and return + // - this runs in a separate task and we receive updates via a watch channel + // - change connection if the rules decide so, or if the current connection dies + // - receive updates from broker + // - this might change the current desired connection select! { broker_connection_result = &mut broker_subscription.watcher_handle => { cleanup_broker_connection(broker_connection_result, walreceiver_state); @@ -110,7 +137,8 @@ async fn connection_manager_loop_step( None => None, } } => { - let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard"); + let wal_connection = walreceiver_state.wal_connection.as_mut() + .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { TaskEvent::Started => {}, TaskEvent::NewEvent(status) => { @@ -123,16 +151,14 @@ async fn connection_manager_loop_step( } wal_connection.status = status; }, - TaskEvent::End(end_result) => { - match end_result { - Ok(()) => debug!("WAL receiving task finished"), - Err(e) => warn!("WAL receiving task failed: {e}"), - }; + TaskEvent::End => { + debug!("WAL receiving task finished"); walreceiver_state.drop_old_connection(false).await; }, } }, + // Got a new update from etcd broker_update = broker_subscription.value_updates.recv() => { match broker_update { Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), @@ -241,8 +267,9 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { id: ZTenantTimelineId, + /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. 
@@ -299,15 +326,18 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( - id: ZTenantTimelineId, - local_timeline: Arc, + timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, ) -> Self { + let id = ZTenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }; Self { id, - local_timeline, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, @@ -323,10 +353,11 @@ impl WalreceiverState { let id = self.id; let connect_timeout = self.wal_connect_timeout; + let timeline = Arc::clone(&self.timeline); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( - id, + timeline, &new_wal_source_connstr, events_sender.as_ref(), cancellation, @@ -520,7 +551,7 @@ impl WalreceiverState { let current_lsn = match existing_wal_connection.status.streaming_lsn { Some(lsn) => lsn, - None => self.local_timeline.get_last_record_lsn(), + None => self.timeline.get_last_record_lsn(), }; let current_commit_lsn = existing_wal_connection .status @@ -1328,7 +1359,7 @@ mod tests { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - local_timeline: harness + timeline: harness .load() .create_empty_timeline(TIMELINE_ID, Lsn(0)) .expect("Failed to create an empty timeline for dummy wal connection manager"), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 4c30481e02..e8fa9f9aca 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -21,11 +21,17 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ - layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest, + layered_repository::{Timeline, WalReceiverInfo}, + task_mgr, + 
task_mgr::TaskKind, + task_mgr::WALRECEIVER_RUNTIME, + tenant_mgr, + walingest::WalIngest, walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; +use utils::zid::ZTenantTimelineId; +use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; /// Status of the connection. #[derive(Debug, Clone)] @@ -48,7 +54,7 @@ pub struct WalConnectionStatus { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. pub async fn handle_walreceiver_connection( - id: ZTenantTimelineId, + timeline: Arc, wal_source_connstr: &str, events_sender: &watch::Sender>, mut cancellation: watch::Receiver<()>, @@ -83,24 +89,31 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. let mut connection_cancellation = cancellation.clone(); - tokio::spawn( + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverConnection, + Some(timeline.tenant_id), + Some(timeline.timeline_id), + "walreceiver connection", + false, async move { select! 
{ - connection_result = connection => match connection_result{ - Ok(()) => info!("Walreceiver db connection closed"), - Err(connection_error) => { - if connection_error.is_closed() { - info!("Connection closed regularly: {connection_error}") - } else { - warn!("Connection aborted: {connection_error}") - } - } - }, + connection_result = connection => match connection_result{ + Ok(()) => info!("Walreceiver db connection closed"), + Err(connection_error) => { + if connection_error.is_closed() { + info!("Connection closed regularly: {connection_error}") + } else { + warn!("Connection aborted: {connection_error}") + } + } + }, - _ = connection_cancellation.changed() => info!("Connection cancelled"), + _ = connection_cancellation.changed() => info!("Connection cancelled"), } + Ok(()) } - .instrument(info_span!("safekeeper_handle_db")), + .instrument(info_span!("walreceiver connection")), ); // Immediately increment the gauge, then create a job to decrement it on task exit. @@ -117,10 +130,6 @@ pub async fn handle_walreceiver_connection( let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = id; connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); @@ -130,17 +139,10 @@ pub async fn handle_walreceiver_connection( return Ok(()); } - let (repo, timeline) = tokio::task::spawn_blocking(move || { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; - let timeline = repo.get_timeline(timeline_id) - .with_context(|| { - format!("local timeline {timeline_id} not found for tenant {tenant_id}") - })?; - Ok::<_, anyhow::Error>((repo, timeline)) - }) - .await - .with_context(|| format!("Failed to spawn blocking task to get repository and timeline for tenant {tenant_id} timeline {timeline_id}"))??; + let tenant_id = timeline.tenant_id; + 
let timeline_id = timeline.timeline_id; + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; // // Start streaming the WAL, from where we left off previously. @@ -273,11 +275,12 @@ pub async fn handle_walreceiver_connection( } } - let timeline_to_check = Arc::clone(&timeline); - tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) - .await - .with_context(|| format!("Spawned checkpoint check task panicked for timeline {id}"))? - .with_context(|| format!("Failed to check checkpoint distance for timeline {id}"))?; + timeline.check_checkpoint_distance().with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; if let Some(last_lsn) = status_update { let remote_index = repo.get_remote_index(); diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index befa4616be..315ec7f306 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_until from fixtures.types import ZTenantId, ZTimelineId @@ -39,9 +40,6 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for t in timelines: client.timeline_delete(tenant, t) - def assert_idle(tenant): - assert get_state(tenant) == "Idle" - # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) @@ -51,18 +49,21 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Stop compute pg.stop() - # Detach all tenants and wait for them to go idle - # TODO they should be already idle since there are no active computes + # Delete all timelines on all tenants for tenant_info in client.tenant_list(): tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) - wait_until(10, 0.2, lambda: 
assert_idle(tenant_id)) - # Assert that all tasks finish quickly after tenants go idle + # Assert that all tasks finish quickly after tenant is detached + assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + client.tenant_detach(tenant) + client.tenant_detach(env.initial_tenant) + def assert_tasks_finish(): tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended assert tasks_panicked == 0 diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index bfe61b9ced..096b3a5d70 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,9 +47,9 @@ scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } -tokio-util = { version = "0.7", features = ["codec", "io", "tracing"] } +tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } +tracing-core = { version = "0.1", features = ["once_cell", "std", "valuable"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } From 2a837d7de71a3f8bd74bbaa0d85f056bdac6f861 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: 
Tue, 13 Sep 2022 00:04:33 +0300 Subject: [PATCH 044/166] Create tenants in temporary directory first (#2426) --- pageserver/src/layered_repository.rs | 59 ++++++++++--- pageserver/src/tenant_mgr.rs | 127 ++++++++++++++++++++------- test_runner/regress/test_tenants.py | 41 ++++++++- 3 files changed, 182 insertions(+), 45 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 768bdd396b..ecc0bfe3b5 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -21,6 +21,8 @@ use std::collections::BTreeSet; use std::collections::HashMap; use std::fs; use std::fs::File; +use std::fs::OpenOptions; +use std::io::Write; use std::num::NonZeroU64; use std::ops::Bound::Included; use std::path::Path; @@ -38,6 +40,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::metrics::STORAGE_TIME; use crate::repository::GcResult; use crate::task_mgr; +use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -663,14 +666,14 @@ impl Repository { } pub fn persist_tenant_config( - conf: &'static PageServerConf, - tenant_id: ZTenantId, + target_config_path: &Path, tenant_conf: TenantConfOpt, + first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - let target_config_path = TenantConf::path(conf, tenant_id); - info!("save tenantconf to {}", target_config_path.display()); + info!("persisting tenantconf to {}", target_config_path.display()); + // TODO this will prepend comments endlessly let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. @@ -681,12 +684,48 @@ impl Repository { // Convert the config to a toml file. 
conf_content += &toml_edit::easy::to_string(&tenant_conf)?; - fs::write(&target_config_path, conf_content).with_context(|| { - format!( - "Failed to write config file into path '{}'", - target_config_path.display() - ) - }) + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("Failed to write toml bytes into file") + .and_then(|_| { + target_config_file + .sync_all() + .context("Faile to fsync config file") + }) + .with_context(|| { + format!( + "Failed to write config file into path '{}'", + target_config_path.display() + ) + })?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + target_config_path + .parent() + .context("Config file does not have a parent") + .and_then(|target_config_parent| { + File::open(target_config_parent).context("Failed to open config parent") + }) + .and_then(|tenant_dir| { + tenant_dir + .sync_all() + .context("Failed to fsync config parent") + }) + .with_context(|| { + format!( + "Failed to fsync on firts save for config {}", + target_config_path.display() + ) + })?; + } + + Ok(()) } // diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index db256b0f65..a9f015229f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,16 +9,14 @@ use crate::layered_repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::task_mgr::{self, TaskKind}; -use crate::tenant_config::TenantConfOpt; +use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::walredo::{PostgresRedoManager, WalRedoManager}; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::{ensure, Context}; -use remote_storage::GenericRemoteStorage; -use serde::{Deserialize, Serialize}; -use 
std::collections::hash_map::{self, Entry}; -use std::collections::{HashMap, HashSet}; +use anyhow::Context; +use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; +use std::collections::{hash_map, HashMap, HashSet}; use std::ffi::OsStr; -use std::fmt; +use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; use tracing::*; @@ -58,7 +56,7 @@ struct Tenant { repo: Arc, } -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TenantState { // This tenant exists on local disk, and the layer map has been loaded into memory. // The local disk might have some newer files that don't exist in cloud storage yet. @@ -74,8 +72,8 @@ pub enum TenantState { Broken, } -impl fmt::Display for TenantState { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for TenantState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Active => f.write_str("Active"), Self::Idle => f.write_str("Idle"), @@ -252,21 +250,71 @@ fn create_repo( wal_redo_manager: Arc, remote_index: RemoteIndex, ) -> anyhow::Result> { - let repo_dir = conf.tenant_path(&tenant_id); - ensure!( - !repo_dir.exists(), - "cannot create new tenant repo: '{}' directory already exists", - tenant_id + let target_tenant_directory = conf.tenant_path(&tenant_id); + anyhow::ensure!( + !target_tenant_directory.exists(), + "cannot create new tenant repo: '{tenant_id}' directory already exists", ); - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?; - info!("created directory structure in {}", repo_dir.display()); + let temporary_tenant_dir = + path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX); + debug!( + 
"Creating temporary directory structure in {}", + temporary_tenant_dir.display() + ); - // Save tenant's config - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + let temporary_tenant_timelines_dir = rebase_directory( + &conf.timelines_path(&tenant_id), + &target_tenant_directory, + &temporary_tenant_dir, + )?; + let temporary_tenant_config_path = rebase_directory( + &TenantConf::path(conf, tenant_id), + &target_tenant_directory, + &temporary_tenant_dir, + )?; + + // top-level dir may exist if we are creating it through CLI + crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| { + format!( + "could not create temporary tenant directory {}", + temporary_tenant_dir.display() + ) + })?; + // first, create a config in the top-level temp directory, fsync the file + Repository::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; + // then, create a subdirectory in the top-level temp directory, fsynced + crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { + format!( + "could not create temporary tenant timelines directory {}", + temporary_tenant_timelines_dir.display() + ) + })?; + + fail::fail_point!("tenant-creation-before-tmp-rename", |_| { + anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); + }); + + // move-rename tmp directory with all files synced into a permanent directory, fsync its parent + fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| { + format!( + "failed to move temporary tenant directory {} into the permanent one {}", + temporary_tenant_dir.display(), + target_tenant_directory.display() + ) + })?; + let target_dir_parent = target_tenant_directory.parent().with_context(|| { + format!( + "Failed to get tenant dir parent for {}", + target_tenant_directory.display() + ) + })?; + fs::File::open(target_dir_parent)?.sync_all()?; + + info!( + "created directory structure in {}", + target_tenant_directory.display() + ); 
Ok(Arc::new(Repository::new( conf, @@ -278,6 +326,17 @@ fn create_repo( ))) } +fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result { + let relative_path = original_path.strip_prefix(base).with_context(|| { + format!( + "Failed to strip base prefix '{}' off path '{}'", + base.display(), + original_path.display() + ) + })?; + Ok(new_base.join(relative_path)) +} + pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, @@ -285,11 +344,11 @@ pub fn create_tenant( remote_index: RemoteIndex, ) -> anyhow::Result> { match tenants_state::write_tenants().entry(tenant_id) { - Entry::Occupied(_) => { + hash_map::Entry::Occupied(_) => { debug!("tenant {tenant_id} already exists"); Ok(None) } - Entry::Vacant(v) => { + hash_map::Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let repo = create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; v.insert(Tenant { @@ -310,7 +369,7 @@ pub fn update_tenant_config( info!("configuring tenant {tenant_id}"); get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + Repository::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; Ok(()) } @@ -424,7 +483,7 @@ pub async fn detach_tenant( // we will attempt to remove files which no longer exist. 
This can be fixed by having shutdown // mechanism for repository that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); - std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { + fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( "Failed to remove local tenant directory '{}'", local_tenant_directory.display() @@ -472,7 +531,7 @@ fn local_tenant_timeline_files( let mut local_tenant_timeline_files = TenantTimelineValues::new(); let tenants_dir = config.tenants_path(); - for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + for tenants_dir_entry in fs::read_dir(&tenants_dir) .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? { match &tenants_dir_entry { @@ -483,7 +542,7 @@ fn local_tenant_timeline_files( "Found temporary tenant directory, removing: {}", tenant_dir_path.display() ); - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { + if let Err(e) = fs::remove_dir_all(&tenant_dir_path) { error!( "Failed to remove temporary directory '{}': {:?}", tenant_dir_path.display(), @@ -545,7 +604,7 @@ fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result { .is_none(); if directory_is_empty { - std::fs::remove_dir_all(&tenant_dir_path).with_context(|| { + fs::remove_dir_all(&tenant_dir_path).with_context(|| { format!( "Failed to remove empty directory '{}'", tenant_dir_path.display(), @@ -582,7 +641,7 @@ fn collect_timelines_for_tenant( let timelines_dir = config.timelines_path(&tenant_id); let mut tenant_timelines = HashMap::new(); - for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + for timelines_dir_entry in fs::read_dir(&timelines_dir) .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))? 
{ match timelines_dir_entry { @@ -593,7 +652,7 @@ fn collect_timelines_for_tenant( "Found temporary timeline directory, removing: {}", timeline_dir.display() ); - if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + if let Err(e) = fs::remove_dir_all(&timeline_dir) { error!( "Failed to remove temporary directory '{}': {:?}", timeline_dir.display(), @@ -660,7 +719,7 @@ fn collect_timeline_files( .parse::() .context("Could not parse timeline id out of the timeline dir name")?; let timeline_dir_entries = - std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; for entry in timeline_dir_entries { let entry_path = entry.context("Failed to list timeline dir entry")?.path(); if entry_path.is_file() { @@ -671,7 +730,7 @@ fn collect_timeline_files( continue; } else if is_temporary(&entry_path) { info!("removing temp timeline file at {}", entry_path.display()); - std::fs::remove_file(&entry_path).with_context(|| { + fs::remove_file(&entry_path).with_context(|| { format!( "failed to remove temp download file at {}", entry_path.display() @@ -695,7 +754,7 @@ fn collect_timeline_files( None => anyhow::bail!("No metadata file found in the timeline directory"), }; let metadata = TimelineMetadata::from_bytes( - &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, ) .context("Failed to parse timeline metadata file bytes")?; diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 767f94d167..bd53aae25c 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,16 +1,55 @@ import os from contextlib import closing from datetime import datetime +from pathlib import Path from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import 
PAGESERVER_PER_TENANT_METRICS, parse_metrics -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.types import Lsn, ZTenantId from prometheus_client.samples import Sample +def test_tenant_creation_fails(neon_simple_env: NeonEnv): + tenants_dir = Path(neon_simple_env.repo_dir) / "tenants" + initial_tenants = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + initial_tenant_dirs = set([d for d in tenants_dir.iterdir()]) + + neon_simple_env.pageserver.safe_psql("failpoints tenant-creation-before-tmp-rename=return") + with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): + _ = neon_simple_env.neon_cli.create_tenant() + + new_tenants = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + assert initial_tenants == new_tenants, "should not create new tenants" + + new_tenant_dirs = list(set([d for d in tenants_dir.iterdir()]) - initial_tenant_dirs) + assert len(new_tenant_dirs) == 1, "should have new tenant directory created" + tmp_tenant_dir = new_tenant_dirs[0] + assert str(tmp_tenant_dir).endswith( + ".___temp" + ), "new tenant directory created should be a temporary one" + + neon_simple_env.pageserver.stop() + neon_simple_env.pageserver.start() + + tenants_after_restart = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + dirs_after_restart = set([d for d in tenants_dir.iterdir()]) + assert ( + tenants_after_restart == initial_tenants + ), "should load all non-corrupt tenants after restart" + assert ( + dirs_after_restart == initial_tenant_dirs + ), "pageserver should clean its temp tenant dirs on restart" + + @pytest.mark.parametrize("with_safekeepers", [False, True]) def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): if with_safekeepers: From 4f7557fb58145022450bfb926913b9016c19aab9 
Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 09:45:45 +0100 Subject: [PATCH 045/166] github/workflows: Create projects using API (#2403) * github/actions: add neon projects related actions * workflows/benchmarking: create projects using API * workflows/pg_clients: create projects using API --- .../actions/neon-project-create/action.yml | 81 +++++++++++++ .../actions/neon-project-delete/action.yml | 54 +++++++++ .github/workflows/benchmarking.yml | 113 +++++++++++------- .github/workflows/pg_clients.yml | 18 ++- 4 files changed, 223 insertions(+), 43 deletions(-) create mode 100644 .github/actions/neon-project-create/action.yml create mode 100644 .github/actions/neon-project-delete/action.yml diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml new file mode 100644 index 0000000000..d4fced4196 --- /dev/null +++ b/.github/actions/neon-project-create/action.yml @@ -0,0 +1,81 @@ +name: 'Create Neon Project' +description: 'Create Neon Project using API' + +inputs: + api_key: + desctiption: 'Neon API key' + required: true + environment: + desctiption: 'dev (aka captest) or stage' + required: true + region_id: + desctiption: 'Region ID, if not set the project will be created in the default region' + required: false +outputs: + dsn: + description: 'Created Project DSN (for main database)' + value: ${{ steps.create-neon-project.outputs.dsn }} + project_id: + description: 'Created Project ID' + value: ${{ steps.create-neon-project.outputs.project_id }} + +runs: + using: "composite" + steps: + - name: Parse Input + id: parse-input + shell: bash -euxo pipefail {0} + run: | + case "${ENVIRONMENT}" in + dev) + API_HOST=console.dev.neon.tech + REGION_ID=${REGION_ID:-eu-west-1} + ;; + staging) + API_HOST=console.stage.neon.tech + REGION_ID=${REGION_ID:-us-east-1} + ;; + *) + echo 2>&1 "Unknown environment=${ENVIRONMENT}. 
Allowed 'dev' or 'staging' only" + exit 1 + ;; + esac + + echo "::set-output name=api_host::${API_HOST}" + echo "::set-output name=region_id::${REGION_ID}" + env: + ENVIRONMENT: ${{ inputs.environment }} + REGION_ID: ${{ inputs.region_id }} + + - name: Create Neon Project + id: create-neon-project + # A shell without `set -x` to not to expose password/dsn in logs + shell: bash -euo pipefail {0} + run: | + project=$(curl \ + "https://${API_HOST}/api/v1/projects" \ + --fail \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" \ + --data "{ + \"project\": { + \"platform_id\": \"serverless\", + \"region_id\": \"${REGION_ID}\", + \"settings\": { } + } + }") + + # Mask password + echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')" + + dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main + echo "::add-mask::${dsn}" + echo "::set-output name=dsn::${dsn}" + + project_id=$(echo $project | jq --raw-output '.id') + echo "::set-output name=project_id::${project_id}" + env: + API_KEY: ${{ inputs.api_key }} + API_HOST: ${{ steps.parse-input.outputs.api_host }} + REGION_ID: ${{ steps.parse-input.outputs.region_id }} diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml new file mode 100644 index 0000000000..e7c6f58901 --- /dev/null +++ b/.github/actions/neon-project-delete/action.yml @@ -0,0 +1,54 @@ +name: 'Delete Neon Project' +description: 'Delete Neon Project using API' + +inputs: + api_key: + desctiption: 'Neon API key' + required: true + environment: + desctiption: 'dev (aka captest) or stage' + required: true + project_id: + desctiption: 'ID of the Project to delete' + required: true + +runs: + using: "composite" + steps: + - name: Parse Input + id: parse-input + shell: bash -euxo pipefail {0} + run: | + case "${ENVIRONMENT}" in + dev) + 
API_HOST=console.dev.neon.tech + ;; + staging) + API_HOST=console.stage.neon.tech + ;; + *) + echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only" + exit 1 + ;; + esac + + echo "::set-output name=api_host::${API_HOST}" + env: + ENVIRONMENT: ${{ inputs.environment }} + + - name: Delete Neon Project + shell: bash -euxo pipefail {0} + run: | + # Allow PROJECT_ID to be empty/null for cases when .github/actions/neon-project-create failed + if [ -n "${PROJECT_ID}" ]; then + curl -X "POST" \ + "https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \ + --fail \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + fi + env: + API_KEY: ${{ inputs.api_key }} + PROJECT_ID: ${{ inputs.project_id }} + API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4c58dda6b6..49fbc74dd6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -14,6 +14,13 @@ on: - cron: '36 4 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually + inputs: + environment: + description: 'Environment to run remote tests on (dev or staging)' + required: false + region_id: + description: 'Use a particular region. If empty the default one will be used' + false: true defaults: run: @@ -62,19 +69,12 @@ jobs: echo Pgbench $POSTGRES_DISTRIB_DIR/bin/pgbench --version - # FIXME cluster setup is skipped due to various changes in console API - # for now pre created cluster is used. When API gain some stability - # after massive changes dynamic cluster setup will be revived. - # So use pre created cluster. 
It needs to be started manually, but stop is automatic after 5 minutes of inactivity - - name: Setup cluster - env: - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - run: | - set -e - - echo "Starting cluster" - # wake up the cluster - $POSTGRES_DISTRIB_DIR/bin/psql $BENCHMARK_CONNSTR -c "SELECT 1" + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: ${{ github.event.inputs.environment || 'staging' }} + api_key: ${{ ( github.event.inputs.environment || 'staging' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }} - name: Run benchmark # pgbench is installed system wide from official repo @@ -97,7 +97,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" PLATFORM: "neon-staging" - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally run: | # just to be sure that no data was cached on self hosted runner @@ -115,6 +115,14 @@ jobs: run: | REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh + - name: Delete Neon Project + if: ${{ always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: staging + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 @@ -131,11 +139,12 @@ jobs: POSTGRES_DISTRIB_DIR: /usr TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote + SAVE_PERF_REPORT: true strategy: fail-fast: false matrix: - connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ] + platform: [ neon-captest, rds-aurora ] runs-on: dev container: @@ -147,38 +156,52 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Calculate 
platform - id: calculate-platform - env: - CONNSTR: ${{ matrix.connstr }} - run: | - if [ "${CONNSTR}" = "BENCHMARK_CAPTEST_CONNSTR" ]; then - PLATFORM=neon-captest - elif [ "${CONNSTR}" = "BENCHMARK_RDS_CONNSTR" ]; then - PLATFORM=rds-aurora - else - echo 2>&1 "Unknown CONNSTR=${CONNSTR}. Allowed are BENCHMARK_CAPTEST_CONNSTR, and BENCHMARK_RDS_CONNSTR only" - exit 1 - fi - - echo "::set-output name=PLATFORM::${PLATFORM}" - - name: Install Deps run: | sudo apt -y update sudo apt install -y postgresql-14 + - name: Create Neon Project + if: matrix.platform == 'neon-captest' + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: ${{ github.event.inputs.environment || 'dev' }} + api_key: ${{ ( github.event.inputs.environment || 'dev' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${PLATFORM}" in + neon-captest) + CONNSTR=${{ steps.create-neon-project.outputs.dsn }} + ;; + rds-aurora) + CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} + ;; + *) + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest' or 'rds-aurora'" + exit 1 + ;; + esac + + echo "::set-output name=connstr::${CONNSTR}" + + psql ${CONNSTR} -c "SELECT version();" + env: + PLATFORM: ${{ matrix.platform }} + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -188,25 +211,25 @@ jobs: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - - name: Benchmark simple-update + - name: Benchmark select-only uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + PLATFORM: ${{ matrix.platform }} + 
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -216,6 +239,14 @@ jobs: action: generate build_type: ${{ env.BUILD_TYPE }} + - name: Delete Neon Project + if: ${{ matrix.platform == 'neon-captest' && always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: dev + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_CAPTEST_API_KEY }} + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index bf14865db2..d04d002811 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -47,11 +47,17 @@ jobs: shell: bash -euxo pipefail {0} run: ./scripts/pysync + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: staging + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + - name: Run pytest env: REMOTE_ENV: 1 - BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" - + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 shell: bash -euxo pipefail {0} run: | @@ -65,6 +71,14 @@ jobs: -m "remote_cluster" \ -rA "test_runner/pg_clients" + - name: Delete Neon Project + if: ${{ always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: staging + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. 
# It will be fixed after switching to gen2 runner - name: Upload python test logs From f44afbaf62efb2910cefb671457fe60ada9163d5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 13 Sep 2022 12:26:20 +0300 Subject: [PATCH 046/166] Changes of neon extension to support local prefetch (#2369) * Changes of neon extension to support local prefetch * Catch exceptions in pageserver_receive * Bump posgres version * Bump posgres version * Bump posgres version * Bump posgres version --- pgxn/neon/libpagestore.c | 158 +++++++++++++++++++++-------------- pgxn/neon/pagestore_client.h | 6 +- pgxn/neon/pagestore_smgr.c | 139 ++++++++++++++++++++++++++++-- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 5 files changed, 233 insertions(+), 74 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 649fc1037e..d0572e66cb 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -43,11 +43,6 @@ PGconn *pageserver_conn = NULL; char *page_server_connstring_raw; -static ZenithResponse *pageserver_call(ZenithRequest *request); -page_server_api api = { - .request = pageserver_call -}; - static void pageserver_connect() { @@ -154,60 +149,86 @@ retry: } -static ZenithResponse * -pageserver_call(ZenithRequest *request) +static void +pageserver_disconnect(void) +{ + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. 
+ */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } +} + +static void +pageserver_send(ZenithRequest *request) { StringInfoData req_buff; + + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + pageserver_connect(); + + req_buff = zm_pack_request(request); + + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output + * and TCP buffer. + */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + { + char* msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); + neon_log(ERROR, "failed to send page request: %s", msg); + } + pfree(req_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } +} + +static ZenithResponse * +pageserver_receive(void) +{ StringInfoData resp_buff; ZenithResponse *resp; PG_TRY(); { - /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } - - if (!connected) - pageserver_connect(); - - req_buff = zm_pack_request(request); - - /* - * Send request. - * - * In principle, this could block if the output buffer is full, and we - * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output - * and TCP buffer. 
- */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) - { - neon_log(ERROR, "failed to send page request: %s", - PQerrorMessage(pageserver_conn)); - } - pfree(req_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) request); - - neon_log(PageStoreTrace, "sent request: %s", msg); - pfree(msg); - } - /* read response */ resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); resp_buff.cursor = 0; - if (resp_buff.len == -1) - neon_log(ERROR, "end of COPY"); - else if (resp_buff.len == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - + if (resp_buff.len < 0) + { + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + } resp = zm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); @@ -221,20 +242,7 @@ pageserver_call(ZenithRequest *request) } PG_CATCH(); { - /* - * If anything goes wrong while we were sending a request, it's not - * clear what state the connection is in. For example, if we sent the - * request but didn't receive a response yet, we might receive the - * response some time later after we have already sent a new unrelated - * request. Close the connection to avoid getting confused. 
- */ - if (connected) - { - neon_log(LOG, "dropping connection to page server due to error"); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } + pageserver_disconnect(); PG_RE_THROW(); } PG_END_TRY(); @@ -243,6 +251,32 @@ pageserver_call(ZenithRequest *request) } +static void +pageserver_flush(void) +{ + if (PQflush(pageserver_conn)) + { + char* msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); + neon_log(ERROR, "failed to flush page requests: %s", msg); + } +} + +static ZenithResponse * +pageserver_call(ZenithRequest* request) +{ + pageserver_send(request); + pageserver_flush(); + return pageserver_receive(); +} + +page_server_api api = { + .request = pageserver_call, + .send = pageserver_send, + .flush = pageserver_flush, + .receive = pageserver_receive +}; + static bool check_zenith_id(char **newval, void **extra, GucSource source) { diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 93ea6771eb..5b21abc1bd 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -142,7 +142,10 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { ZenithResponse *(*request) (ZenithRequest *request); -} page_server_api; + void (*send) (ZenithRequest *request); + ZenithResponse *(*receive) (void); + void (*flush) (void); +} page_server_api; extern page_server_api *page_server; @@ -171,6 +174,7 @@ extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +extern void zenith_reset_prefetch(SMgrRelation reln); extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index d49df7af58..ebf899dfdb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -57,6 +57,8 @@ #include "postmaster/interrupt.h" 
#include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/relfilenode.h" +#include "storage/buf_internals.h" #include "storage/md.h" #include "fmgr.h" #include "miscadmin.h" @@ -110,6 +112,49 @@ typedef enum static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +/* + * Prefetch implementation: + * Prefetch is performed locally by each backend. + * There can be up to MAX_PREFETCH_REQUESTS registered using smgr_prefetch + * before smgr_read. All these requests are appended to the primary smgr_read request. + * It is assumed that pages will be requested in prefetch order. + * Reading of prefetch responses is delayed until they are actually needed (smgr_read). + * This makes it possible to parallelize processing and receiving of prefetched pages. + * In case of a prefetch miss or any SMGR request other than smgr_read, + * all prefetch responses have to be consumed. + */ + +#define MAX_PREFETCH_REQUESTS 128 + +BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; +BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; +int n_prefetch_requests; +int n_prefetch_responses; +int n_prefetched_buffers; +int n_prefetch_hits; +int n_prefetch_misses; +XLogRecPtr prefetch_lsn; + +static void +consume_prefetch_responses(void) +{ + for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { + ZenithResponse* resp = page_server->receive(); + pfree(resp); + } + n_prefetched_buffers = 0; + n_prefetch_responses = 0; +} + +static ZenithResponse* +page_server_request(void const* req) +{ + consume_prefetch_responses(); + return page_server->request((ZenithRequest*)req); +} + + StringInfoData zm_pack_request(ZenithRequest *msg) { @@ -735,7 +780,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) .forknum = forkNum }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -948,6 +993,16 @@ zenith_close(SMgrRelation reln,
ForkNumber forknum) mdclose(reln, forknum); } + +/* + * zenith_reset_prefetch() -- remove all previously registered prefetch requests + */ +void +zenith_reset_prefetch(SMgrRelation reln) +{ + n_prefetch_requests = 0; +} + /* * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -971,9 +1026,15 @@ zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); - return true; + if (n_prefetch_requests < MAX_PREFETCH_REQUESTS) + { + prefetch_requests[n_prefetch_requests].rnode = reln->smgr_rnode.node; + prefetch_requests[n_prefetch_requests].forkNum = forknum; + prefetch_requests[n_prefetch_requests].blockNum = blocknum; + n_prefetch_requests += 1; + return true; + } + return false; } /* @@ -1022,7 +1083,47 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno XLogRecPtr request_lsn, bool request_latest, char *buffer) { ZenithResponse *resp; + int i; + /* + * Try to find prefetched page. + * It is assumed that pages will be requested in the same order as they are prefetched, + * but some other backend may load page in shared buffers, so some prefetch responses should + * be skipped. + */ + for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) + { + resp = page_server->receive(); + if (resp->tag == T_ZenithGetPageResponse && + RelFileNodeEquals(prefetch_responses[i].rnode, rnode) && + prefetch_responses[i].forkNum == forkNum && + prefetch_responses[i].blockNum == blkno) + { + char* page = ((ZenithGetPageResponse *) resp)->page; + /* + * Check if prefetched page is still relevant. + * If it is updated by some other backend, then it should not + * be requested from smgr unless it is evicted from shared buffers. + * In the latter case last_evicted_lsn should be updated and + * request_lsn should be greater than prefetch_lsn.
+ * Maximum with page LSN is used because page returned by page server + * may have LSN either greater or smaller than requested. + */ + if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn) + { + n_prefetched_buffers = i+1; + n_prefetch_hits += 1; + n_prefetch_requests = 0; + memcpy(buffer, page, BLCKSZ); + pfree(resp); + return; + } + } + pfree(resp); + } + n_prefetched_buffers = 0; + n_prefetch_responses = 0; + n_prefetch_misses += 1; { ZenithGetPageRequest request = { .req.tag = T_ZenithGetPageRequest, @@ -1032,10 +1133,29 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno .forknum = forkNum, .blkno = blkno }; - - resp = page_server->request((ZenithRequest *) &request); + if (n_prefetch_requests > 0) + { + /* Combine all prefetch requests with primary request */ + page_server->send((ZenithRequest *) &request); + for (i = 0; i < n_prefetch_requests; i++) + { + request.rnode = prefetch_requests[i].rnode; + request.forknum = prefetch_requests[i].forkNum; + request.blkno = prefetch_requests[i].blockNum; + prefetch_responses[i] = prefetch_requests[i]; + page_server->send((ZenithRequest *) &request); + } + page_server->flush(); + n_prefetch_responses = n_prefetch_requests; + n_prefetch_requests = 0; + prefetch_lsn = request_lsn; + resp = page_server->receive(); + } + else + { + resp = page_server->request((ZenithRequest *) &request); + } } - switch (resp->tag) { case T_ZenithGetPageResponse: @@ -1305,7 +1425,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) .forknum = forknum, }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -1365,7 +1485,7 @@ zenith_dbsize(Oid dbNode) .dbNode = dbNode, }; - resp = page_server->request((ZenithRequest *) &request); + resp = page_server_request(&request); } switch (resp->tag) @@ -1680,6 +1800,7 @@ static const struct f_smgr zenith_smgr = .smgr_unlink = zenith_unlink, .smgr_extend = zenith_extend, .smgr_prefetch =
zenith_prefetch, + .smgr_reset_prefetch = zenith_reset_prefetch, .smgr_read = zenith_read, .smgr_write = zenith_write, .smgr_writeback = zenith_writeback, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index e8518d3fc8..114676d2ed 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit e8518d3fc85e3da420d2f5a2742a21386e6585ec +Subproject commit 114676d2edd5307226d9448ec467821fdb77467d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 313769bb62..b1dbd93e2b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 313769bb6229f46380e24d8f6ff535f9185458af +Subproject commit b1dbd93e2b1691e93860f7e59b9e1fe5a6e79786 From 1a8c8b04d70bd82a20055e2653c4aa593e3bfc34 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 7 Sep 2022 18:01:49 +0300 Subject: [PATCH 047/166] Merge Repository and Tenant entities, rework tenant background jobs --- control_plane/src/bin/neon_local.rs | 8 +- pageserver/src/basebackup.rs | 2 +- pageserver/src/bin/dump_layerfile.rs | 2 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/bin/update_metadata.rs | 2 +- pageserver/src/config.rs | 2 +- pageserver/src/http/models.rs | 5 +- pageserver/src/http/openapi_spec.yml | 4 +- pageserver/src/http/routes.rs | 123 ++++--- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 4 +- pageserver/src/page_cache.rs | 2 +- pageserver/src/page_service.rs | 56 ++-- pageserver/src/pgdatadir_mapping.rs | 10 +- pageserver/src/storage_sync.rs | 34 +- pageserver/src/storage_sync/delete.rs | 6 +- pageserver/src/storage_sync/download.rs | 10 +- pageserver/src/storage_sync/index.rs | 8 +- pageserver/src/storage_sync/upload.rs | 12 +- .../src/{layered_repository.rs => tenant.rs} | 255 +++++++++----- .../{layered_repository => tenant}/blob_io.rs | 2 +- .../block_io.rs | 2 +- .../delta_layer.rs | 12 +- .../disk_btree.rs | 2 +- .../disk_btree_test_data.rs | 0 .../ephemeral_file.rs | 14 +- .../filename.rs | 0 
.../image_layer.rs | 12 +- .../inmemory_layer.rs | 12 +- .../layer_map.rs | 6 +- .../metadata.rs | 4 +- .../par_fsync.rs | 0 .../storage_layer.rs | 0 .../timeline.rs | 4 +- pageserver/src/tenant_mgr.rs | 312 +++++++----------- pageserver/src/tenant_tasks.rs | 147 ++++++--- pageserver/src/timelines.rs | 31 +- pageserver/src/walingest.rs | 25 +- .../src/walreceiver/connection_manager.rs | 20 +- .../src/walreceiver/walreceiver_connection.rs | 7 +- test_runner/regress/test_broken_timeline.py | 4 +- test_runner/regress/test_tenant_tasks.py | 8 +- test_runner/regress/test_timeline_delete.py | 5 +- 43 files changed, 615 insertions(+), 563 deletions(-) rename pageserver/src/{layered_repository.rs => tenant.rs} (88%) rename pageserver/src/{layered_repository => tenant}/blob_io.rs (98%) rename pageserver/src/{layered_repository => tenant}/block_io.rs (98%) rename pageserver/src/{layered_repository => tenant}/delta_layer.rs (98%) rename pageserver/src/{layered_repository => tenant}/disk_btree.rs (99%) rename pageserver/src/{layered_repository => tenant}/disk_btree_test_data.rs (100%) rename pageserver/src/{layered_repository => tenant}/ephemeral_file.rs (97%) rename pageserver/src/{layered_repository => tenant}/filename.rs (100%) rename pageserver/src/{layered_repository => tenant}/image_layer.rs (97%) rename pageserver/src/{layered_repository => tenant}/inmemory_layer.rs (96%) rename pageserver/src/{layered_repository => tenant}/layer_map.rs (98%) rename pageserver/src/{layered_repository => tenant}/metadata.rs (98%) rename pageserver/src/{layered_repository => tenant}/par_fsync.rs (100%) rename pageserver/src/{layered_repository => tenant}/storage_layer.rs (100%) rename pageserver/src/{layered_repository => tenant}/timeline.rs (99%) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 828d6a2e5a..e3160db53b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -543,13 +543,7 @@ fn 
handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an match tenant_match.subcommand() { Some(("list", _)) => { for t in pageserver.tenant_list()? { - println!( - "{} {}", - t.id, - t.state - .map(|s| s.to_string()) - .unwrap_or_else(|| String::from("")) - ); + println!("{} {:?}", t.id, t.state); } } Some(("create", create_match)) => { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 61facc852d..eca6a3c87f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index 87390a1b06..7e766ce859 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -3,8 +3,8 @@ //! A handy tool for debugging, that's all. 
use anyhow::Result; use clap::{App, Arg}; -use pageserver::layered_repository::dump_layerfile_from_path; use pageserver::page_cache; +use pageserver::tenant::dump_layerfile_from_path; use pageserver::virtual_file; use std::path::PathBuf; use utils::project_git_version; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ec71e5b320..679c6f76e7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -182,7 +182,7 @@ fn initialize_config( cfg_file_path.display() ); } else { - // We're initializing the repo, so there's no config file yet + // We're initializing the tenant, so there's no config file yet ( DEFAULT_CONFIG_FILE .parse::() diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 983fdb8647..3339564b0f 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -3,7 +3,7 @@ //! A handy tool for debugging, that's all. use anyhow::Result; use clap::{App, Arg}; -use pageserver::layered_repository::metadata::TimelineMetadata; +use pageserver::tenant::metadata::TimelineMetadata; use std::path::PathBuf; use std::str::FromStr; use utils::{lsn::Lsn, project_git_version}; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index fb70ea327d..56171f46e3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -19,7 +19,7 @@ use utils::{ zid::{NodeId, ZTenantId, ZTimelineId}, }; -use crate::layered_repository::TIMELINES_SEGMENT_NAME; +use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; pub mod defaults { diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 7c7d7f7b0c..0ccf23776c 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -7,8 +7,7 @@ use utils::{ zid::{NodeId, ZTenantId, ZTimelineId}, }; -// These enums are used in the API response fields. 
-use crate::tenant_mgr::TenantState; +use crate::tenant::TenantState; #[serde_as] #[derive(Serialize, Deserialize)] @@ -108,7 +107,7 @@ impl TenantConfigRequest { pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: ZTenantId, - pub state: Option, + pub state: TenantState, pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 6beb938d6a..b9a62d0f32 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -489,6 +489,7 @@ components: type: object required: - id + - state properties: id: type: string @@ -573,7 +574,6 @@ components: required: - last_record_lsn - disk_consistent_lsn - - timeline_state properties: last_record_lsn: type: string @@ -581,8 +581,6 @@ components: disk_consistent_lsn: type: string format: hex - timeline_state: - type: string ancestor_timeline_id: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 78f83511cb..36ba2e9b66 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,9 +11,9 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::Timeline; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; +use crate::tenant::{TenantState, Timeline}; use crate::tenant_config::TenantConfOpt; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ @@ -132,12 +132,11 @@ fn list_local_timelines( include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> Result> { - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("Failed to get repo for tenant {tenant_id}"))?; - let repo_timelines = repo.list_timelines(); 
+ let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + let timelines = tenant.list_timelines(); - let mut local_timeline_info = Vec::with_capacity(repo_timelines.len()); - for (timeline_id, repository_timeline) in repo_timelines { + let mut local_timeline_info = Vec::with_capacity(timelines.len()); + for (timeline_id, repository_timeline) in timelines { local_timeline_info.push(( timeline_id, local_timeline_info_from_timeline( @@ -201,23 +200,31 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; - let local_timeline_infos = tokio::task::spawn_blocking(move || { + let timelines = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - list_local_timelines( - tenant_id, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) + Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines()) }) .await .map_err(ApiError::from_err)??; - let mut response_data = Vec::with_capacity(local_timeline_infos.len()); - for (timeline_id, local_timeline_info) in local_timeline_infos { + let mut response_data = Vec::with_capacity(timelines.len()); + for (timeline_id, timeline) in timelines { + let local = match local_timeline_info_from_timeline( + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) { + Ok(local) => Some(local), + Err(e) => { + error!("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}"); + None + } + }; + response_data.push(TimelineInfo { tenant_id, timeline_id, - local: Some(local_timeline_info), + local, remote: get_state(&request) .remote_index .read() @@ -259,28 +266,25 @@ async fn timeline_detail_handler(request: Request) -> Result(local_timeline) + let timeline = tokio::task::spawn_blocking(move || { + tenant_mgr::get_tenant(tenant_id, 
true)?.get_timeline(timeline_id) }) .await - .ok() - .and_then(|r| r.ok()) - .flatten(); + .map_err(ApiError::from_err)?; + + let local_timeline_info = match timeline.and_then(|timeline| { + local_timeline_info_from_timeline( + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) + }) { + Ok(local_info) => Some(local_info), + Err(e) => { + error!("Failed to get local timeline info: {e:#}"); + None + } + }; let remote_timeline_info = { let remote_index_read = get_state(&request).remote_index.read().await; @@ -294,25 +298,26 @@ async fn timeline_detail_handler(request: Request) -> Result((local_timeline_info, remote_timeline_info)) } .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) - .await; + .await?; if local_timeline_info.is_none() && remote_timeline_info.is_none() { - return Err(ApiError::NotFound(format!( + Err(ApiError::NotFound(format!( "Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely" - ))); + ))) + } else { + json_response( + StatusCode::OK, + TimelineInfo { + tenant_id, + timeline_id, + local: local_timeline_info, + remote: remote_timeline_info, + }, + ) } - - let timeline_info = TimelineInfo { - tenant_id, - timeline_id, - local: local_timeline_info, - remote: remote_timeline_info, - }; - - json_response(StatusCode::OK, timeline_info) } // TODO makes sense to provide tenant config right away the same way as it handled in tenant_create @@ -320,10 +325,10 @@ async fn tenant_attach_handler(request: Request) -> Result, let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - info!("Handling tenant attach {}", tenant_id); + info!("Handling tenant attach {tenant_id}"); tokio::task::spawn_blocking(move || { - if tenant_mgr::get_tenant_state(tenant_id).is_some() { + if tenant_mgr::get_tenant(tenant_id, false).is_ok() { anyhow::bail!("Tenant is already present locally") }; Ok(()) @@ -426,7 
+431,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, ApiErro check_permission(&request, Some(tenant_id))?; // if tenant is in progress of downloading it can be absent in global tenant map - let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id)) + let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false)) .await .map_err(ApiError::from_err)?; @@ -494,13 +499,25 @@ async fn tenant_status(request: Request) -> Result, ApiErro false }); + let tenant_state = match tenant { + Ok(tenant) => tenant.current_state(), + Err(e) => { + error!("Failed to get local tenant state: {e:#}"); + if has_in_progress_downloads { + TenantState::Paused + } else { + TenantState::Broken + } + } + }; + let current_physical_size = match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false)) .await .map_err(ApiError::from_err)? { Err(err) => { - // Getting local timelines can fail when no local repo is on disk (e.g, when tenant data is being downloaded). + // Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded). // In that case, put a warning message into log and operate normally. 
warn!("Failed to get local timelines for tenant {tenant_id}: {err}"); None diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index f8f614f8f4..ee0780f4b2 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -11,9 +11,9 @@ use bytes::Bytes; use tracing::*; use walkdir::WalkDir; -use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; use postgres_ffi::v14::relfile_utils::*; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 8b9251229e..5742568079 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,7 +3,6 @@ pub mod config; pub mod http; pub mod import_datadir; pub mod keyspace; -pub mod layered_repository; pub mod metrics; pub mod page_cache; pub mod page_service; @@ -13,6 +12,7 @@ pub mod reltag; pub mod repository; pub mod storage_sync; pub mod task_mgr; +pub mod tenant; pub mod tenant_config; pub mod tenant_mgr; pub mod tenant_tasks; @@ -181,7 +181,7 @@ mod backoff_defaults_tests { #[cfg(test)] mod tests { - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; use super::*; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 27b1400243..15c3c22dd6 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -53,8 +53,8 @@ use utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::layered_repository::writeback_ephemeral_file; use crate::repository::Key; +use crate::tenant::writeback_ephemeral_file; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 149144bfe4..b03dab20e0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -34,13 +34,13 @@ use utils::{ use 
crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; -use crate::layered_repository::Timeline; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; use crate::tenant_mgr; use crate::CheckpointConfig; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; @@ -477,8 +477,8 @@ impl PageServerHandler { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?; + let timeline = tenant_mgr::get_tenant(tenant_id, true)? + .create_empty_timeline(timeline_id, base_lsn)?; // TODO mark timeline as not ready until it reaches end_lsn. // We might have some wal to import as well, and we should prevent compute @@ -539,10 +539,7 @@ impl PageServerHandler { ) -> anyhow::Result<()> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - let timeline = repo - .get_timeline(timeline_id) - .with_context(|| format!("Timeline {timeline_id} was not found"))?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; ensure!(timeline.get_last_record_lsn() == start_lsn); // TODO leave clean state on error. 
For now you can use detach to clean @@ -770,7 +767,7 @@ impl PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenantid: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -782,7 +779,7 @@ impl PageServerHandler { .claims .as_ref() .expect("claims presence already checked"); - auth::check_permission(claims, tenantid) + auth::check_permission(claims, tenant_id) } } @@ -809,7 +806,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } info!( - "jwt auth succeeded for scope: {:#?} by tenantid: {:?}", + "jwt auth succeeded for scope: {:#?} by tenant id: {:?}", data.claims.scope, data.claims.tenant_id, ); @@ -1013,8 +1010,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); - let tenantid = ZTenantId::from_str(params[0])?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + let tenant_id = ZTenantId::from_str(params[0])?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -1027,25 +1024,27 @@ impl postgres_backend_async::Handler for PageServerHandler { RowDescriptor::int8_col(b"pitr_interval"), ]))? 
.write_message(&BeMessage::DataRow(&[ - Some(repo.get_checkpoint_distance().to_string().as_bytes()), + Some(tenant.get_checkpoint_distance().to_string().as_bytes()), Some( - repo.get_checkpoint_timeout() + tenant + .get_checkpoint_timeout() .as_secs() .to_string() .as_bytes(), ), - Some(repo.get_compaction_target_size().to_string().as_bytes()), + Some(tenant.get_compaction_target_size().to_string().as_bytes()), Some( - repo.get_compaction_period() + tenant + .get_compaction_period() .as_secs() .to_string() .as_bytes(), ), - Some(repo.get_compaction_threshold().to_string().as_bytes()), - Some(repo.get_gc_horizon().to_string().as_bytes()), - Some(repo.get_gc_period().as_secs().to_string().as_bytes()), - Some(repo.get_image_creation_threshold().to_string().as_bytes()), - Some(repo.get_pitr_interval().as_secs().to_string().as_bytes()), + Some(tenant.get_compaction_threshold().to_string().as_bytes()), + Some(tenant.get_gc_horizon().to_string().as_bytes()), + Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), + Some(tenant.get_image_creation_threshold().to_string().as_bytes()), + Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? 
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("do_gc ") { @@ -1066,16 +1065,16 @@ impl postgres_backend_async::Handler for PageServerHandler { let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; let gc_horizon: u64 = caps .get(4) .map(|h| h.as_str().parse()) - .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?; + .unwrap_or_else(|| Ok(tenant.get_gc_horizon()))?; // Use tenant's pitr setting - let pitr = repo.get_pitr_interval(); - let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + let pitr = tenant.get_pitr_interval(); + let result = tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"layers_total"), RowDescriptor::int8_col(b"layers_needed_by_cutoff"), @@ -1169,12 +1168,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result> { - tenant_mgr::get_repository_for_tenant(tenant_id) - .and_then(|repo| { - repo.get_timeline(timeline_id) - .context("No timeline in tenant's repository") - }) - .with_context(|| format!("Could not get timeline {timeline_id} in tenant {tenant_id}")) + tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id)) } /// diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ba48a77961..2454b6f54f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,9 +7,9 @@ //! Clarify that) //! 
use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; +use crate::tenant::Timeline; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; @@ -1398,16 +1398,12 @@ fn is_slru_block_key(key: Key) -> bool { && key.field6 != 0xffffffff // and not SlruSegSize } -// -//-- Tests that should work the same with any Repository/Timeline implementation. -// - #[cfg(test)] pub fn create_test_timeline( - repo: &crate::layered_repository::Repository, + tenant: &crate::tenant::Tenant, timeline_id: utils::zid::ZTimelineId, ) -> Result> { - let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 8ebfa6a935..c104dba298 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -46,10 +46,10 @@ //! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. //! If the storage sync loop was successfully started before, pageserver schedules the layer files and the updated metadata file for upload, every time a layer is flushed to disk. //! The uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). -//! See [`crate::layered_repository`] for the upload calls and the adjacent logic. +//! See [`crate::tenant`] for the upload calls and the adjacent logic. //! -//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`], -//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state. +//! 
Synchronization logic is able to communicate back with updated timeline sync states, submitted via [`crate::tenant_mgr::attach_local_tenants`] function. +//! Tenant manager applies corresponding timeline updates in pageserver's in-memory state. //! Such submissions happen in two cases: //! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future //! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory @@ -171,11 +171,11 @@ use self::{ use crate::{ config::PageServerConf, exponential_backoff, - layered_repository::metadata::{metadata_path, TimelineMetadata}, storage_sync::index::RemoteIndex, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, + tenant::metadata::{metadata_path, TimelineMetadata}, tenant_mgr::attach_local_tenants, }; use crate::{ @@ -714,17 +714,17 @@ async fn storage_sync_loop( }; if tenant_entry.has_in_progress_downloads() { - info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration"); + info!("Tenant {tenant_id} has pending timeline downloads, skipping tenant registration"); continue; } else { info!( - "Tenant {tenant_id} download completed. Picking to register in repository" + "Tenant {tenant_id} download completed. Picking to register in tenant" ); // Here we assume that if tenant has no in-progress downloads that // means that it is the last completed timeline download that triggered // sync status update. So we look at the index for available timelines - // and register them all at once in a repository for download - // to be submitted in a single operation to repository + // and register them all at once in a tenant for download + // to be submitted in a single operation to tenant // so it can apply them at once to internal timeline map. 
timelines_to_attach.0.insert( tenant_id, @@ -737,9 +737,7 @@ async fn storage_sync_loop( } drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - if let Err(e) = attach_local_tenants(conf, &index, timelines_to_attach) { - error!("Failed to attach new timelines: {e:?}"); - }; + attach_local_tenants(conf, &index, timelines_to_attach); } } ControlFlow::Break(()) => { @@ -1038,13 +1036,7 @@ async fn update_local_metadata( timeline_id, } = sync_id; tokio::task::spawn_blocking(move || { - crate::layered_repository::save_metadata( - conf, - timeline_id, - tenant_id, - &cloned_metadata, - true, - ) + crate::tenant::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) }) .await .with_context(|| { @@ -1411,12 +1403,12 @@ fn register_sync_status( mod test_utils { use utils::lsn::Lsn; - use crate::layered_repository::repo_harness::RepoHarness; + use crate::tenant::harness::TenantHarness; use super::*; pub(super) async fn create_local_timeline( - harness: &RepoHarness<'_>, + harness: &TenantHarness<'_>, timeline_id: ZTimelineId, filenames: &[&str], metadata: TimelineMetadata, @@ -1456,7 +1448,7 @@ mod test_utils { #[cfg(test)] mod tests { use super::test_utils::dummy_metadata; - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; use hex_literal::hex; use utils::lsn::Lsn; diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 794ecbaeb3..945f5fded8 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -112,8 +112,8 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::test_utils::{create_local_timeline, dummy_metadata}, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use remote_storage::{LocalFs, RemoteStorage}; @@ -121,7 +121,7 @@ mod tests { #[tokio::test] async 
fn delete_timeline_negative() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline_negative")?; + let harness = TenantHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( @@ -154,7 +154,7 @@ mod tests { #[tokio::test] async fn delete_timeline() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline")?; + let harness = TenantHarness::create("delete_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 91ee557b79..32f228b447 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -17,7 +17,7 @@ use tokio::{ use tracing::{debug, error, info, warn}; use crate::{ - config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, + config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, TEMP_FILE_SUFFIX, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; @@ -425,18 +425,18 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, }, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use super::*; #[tokio::test] async fn download_timeline() -> anyhow::Result<()> { - let harness = RepoHarness::create("download_timeline")?; + let harness = TenantHarness::create("download_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -537,7 +537,7 @@ mod tests { #[tokio::test] async fn download_timeline_negatives() -> 
anyhow::Result<()> { - let harness = RepoHarness::create("download_timeline_negatives")?; + let harness = TenantHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( @@ -596,7 +596,7 @@ mod tests { #[tokio::test] async fn test_download_index_part() -> anyhow::Result<()> { - let harness = RepoHarness::create("test_download_index_part")?; + let harness = TenantHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index b17bb40da4..cff14cde49 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -15,7 +15,7 @@ use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; use tracing::log::warn; -use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata}; +use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata}; use utils::{ lsn::Lsn, zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, @@ -340,11 +340,11 @@ mod tests { use std::collections::BTreeSet; use super::*; - use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; #[test] fn index_part_conversion() { - let harness = RepoHarness::create("index_part_conversion").unwrap(); + let harness = TenantHarness::create("index_part_conversion").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); let metadata = TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); @@ -462,7 +462,7 @@ mod tests { #[test] fn index_part_conversion_negatives() { - let harness = RepoHarness::create("index_part_conversion_negatives").unwrap(); + let harness = 
TenantHarness::create("index_part_conversion_negatives").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); let metadata = TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index a4285e426b..bd09e6b898 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -15,9 +15,7 @@ use super::{ LayersUpload, SyncData, SyncQueue, }; use crate::metrics::NO_LAYERS_UPLOAD; -use crate::{ - config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, -}; +use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path}; /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part( @@ -202,18 +200,18 @@ mod tests { use utils::lsn::Lsn; use crate::{ - layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, }, + tenant::harness::{TenantHarness, TIMELINE_ID}, }; use super::{upload_index_part, *}; #[tokio::test] async fn regular_layer_upload() -> anyhow::Result<()> { - let harness = RepoHarness::create("regular_layer_upload")?; + let harness = TenantHarness::create("regular_layer_upload")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -301,7 +299,7 @@ mod tests { // Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario. 
#[tokio::test] async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { - let harness = RepoHarness::create("layer_upload_after_local_fs_update")?; + let harness = TenantHarness::create("layer_upload_after_local_fs_update")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); @@ -396,7 +394,7 @@ mod tests { #[tokio::test] async fn test_upload_index_part() -> anyhow::Result<()> { - let harness = RepoHarness::create("test_upload_index_part")?; + let harness = TenantHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/tenant.rs similarity index 88% rename from pageserver/src/layered_repository.rs rename to pageserver/src/tenant.rs index ecc0bfe3b5..4ef810faba 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/tenant.rs @@ -1,6 +1,6 @@ //! //! Timeline repository implementation that keeps old data in files on disk, and -//! the recent changes in memory. See layered_repository/*_layer.rs files. +//! the recent changes in memory. See tenant/*_layer.rs files. //! The functions here are responsible for locating the correct layer for the //! get/put call, walking back the timeline branching history as needed. //! @@ -12,6 +12,7 @@ //! use anyhow::{bail, ensure, Context, Result}; +use tokio::sync::watch; use tracing::*; use std::cmp::min; @@ -71,24 +72,26 @@ use storage_layer::Layer; pub use timeline::Timeline; // re-export this function so that page_cache.rs can use it. 
-pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; +pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; // re-export for use in storage_sync.rs -pub use crate::layered_repository::metadata::save_metadata; +pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver -pub use crate::layered_repository::timeline::WalReceiverInfo; +pub use crate::tenant::timeline::WalReceiverInfo; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// -/// Repository consists of multiple timelines. Keep them in a hash table. +/// Tenant consists of multiple timelines. Keep them in a hash table. /// -pub struct Repository { +pub struct Tenant { // Global pageserver config parameters pub conf: &'static PageServerConf, + state: watch::Sender, + // Overridden tenant-specific config parameters. // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. @@ -114,17 +117,40 @@ pub struct Repository { upload_layers: bool, } +/// A state of a tenant in pageserver's memory. +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TenantState { + /// Tenant is fully operational, its background jobs might be running or not. + Active { background_jobs_running: bool }, + /// A tenant is recognized by pageserver, but not yet ready to operate: + /// e.g. not present locally and being downloaded or being read into memory from the file system. + Paused, + /// A tenant is recognized by the pageserver, but no longer used for any operations, as it failed to get activated. + Broken, +} + /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. -impl Repository { +impl Tenant { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent.
It doesn't change internal state in any way. - pub fn get_timeline(&self, timeline_id: ZTimelineId) -> Option> { - self.timelines.lock().unwrap().get(&timeline_id).cloned() + pub fn get_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result> { + self.timelines + .lock() + .unwrap() + .get(&timeline_id) + .with_context(|| { + format!( + "Timeline {} was not found for tenant {}", + timeline_id, + self.tenant_id() + ) + }) + .map(Arc::clone) } - /// Lists timelines the repository contains. - /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. + /// Lists timelines the tenant contains. + /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. pub fn list_timelines(&self) -> Vec<(ZTimelineId, Arc)> { self.timelines .lock() @@ -425,6 +451,54 @@ impl Repository { pub fn get_remote_index(&self) -> &RemoteIndex { &self.remote_index } + + pub fn current_state(&self) -> TenantState { + *self.state.borrow() + } + + pub fn is_active(&self) -> bool { + matches!(self.current_state(), TenantState::Active { .. }) + } + + pub fn should_run_tasks(&self) -> bool { + matches!( + self.current_state(), + TenantState::Active { + background_jobs_running: true + } + ) + } + + /// Changes tenant status to active, if it was not broken before. + /// Otherwise, ignores the state change, logging an error. 
+ pub fn activate(&self, enable_background_jobs: bool) { + self.set_state(TenantState::Active { + background_jobs_running: enable_background_jobs, + }); + } + + pub fn set_state(&self, new_state: TenantState) { + match (self.current_state(), new_state) { + (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { + debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (TenantState::Broken, _) => { + error!("Ignoring state update {new_state:?} for broken tenant"); + } + (_, new_state) => { + self.state.send_replace(new_state); + if self.should_run_tasks() { + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. + crate::tenant_tasks::start_background_loops(self.tenant_id); + } + } + } + } + + pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + self.state.subscribe() + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -471,7 +545,7 @@ fn tree_sort_timelines( } /// Private functions -impl Repository { +impl Tenant { pub fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -609,8 +683,9 @@ impl Repository { tenant_id: ZTenantId, remote_index: RemoteIndex, upload_layers: bool, - ) -> Repository { - Repository { + ) -> Tenant { + let (state, _) = watch::channel(TenantState::Paused); + Tenant { tenant_id, conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), @@ -619,6 +694,7 @@ impl Repository { walredo_mgr, remote_index, upload_layers, + state, } } @@ -848,7 +924,7 @@ impl Repository { // compaction (both require `layer_removal_cs` lock), // but the GC iteration can run concurrently with branch creation. // - // See comments in [`Repository::branch_timeline`] for more information + // See comments in [`Tenant::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. 
for timeline in gc_timelines { if task_mgr::is_shutdown_requested() { @@ -881,7 +957,7 @@ impl Repository { } } -impl Drop for Repository { +impl Drop for Tenant { fn drop(&mut self) { remove_tenant_metrics(&self.tenant_id); } @@ -910,7 +986,7 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { } #[cfg(test)] -pub mod repo_harness { +pub mod harness { use bytes::{Bytes, BytesMut}; use once_cell::sync::Lazy; use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; @@ -920,8 +996,8 @@ pub mod repo_harness { use crate::storage_sync::index::RemoteIndex; use crate::{ config::PageServerConf, - layered_repository::Repository, repository::Key, + tenant::Tenant, walrecord::ZenithWalRecord, walredo::{WalRedoError, WalRedoManager}, }; @@ -968,7 +1044,7 @@ pub mod repo_harness { } } - pub struct RepoHarness<'a> { + pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: ZTenantId, @@ -979,7 +1055,7 @@ pub mod repo_harness { ), } - impl<'a> RepoHarness<'a> { + impl<'a> TenantHarness<'a> { pub fn create(test_name: &'static str) -> Result { Self::create_internal(test_name, false) } @@ -1016,14 +1092,14 @@ pub mod repo_harness { }) } - pub fn load(&self) -> Repository { - self.try_load().expect("failed to load test repo") + pub fn load(&self) -> Tenant { + self.try_load().expect("failed to load test tenant") } - pub fn try_load(&self) -> Result { + pub fn try_load(&self) -> Result { let walredo_mgr = Arc::new(TestRedoManager); - let repo = Repository::new( + let tenant = Tenant::new( self.conf, TenantConfOpt::from(self.tenant_conf), walredo_mgr, @@ -1031,7 +1107,7 @@ pub mod repo_harness { RemoteIndex::default(), false, ); - // populate repo with locally available timelines + // populate tenant with locally available timelines let mut timelines_to_load = HashMap::new(); for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) .expect("should be able to read timelines 
dir") @@ -1043,12 +1119,13 @@ pub mod repo_harness { .unwrap() .to_string_lossy() .parse()?; + let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; timelines_to_load.insert(timeline_id, timeline_metadata); } - repo.init_attach_timelines(timelines_to_load)?; + tenant.init_attach_timelines(timelines_to_load)?; - Ok(repo) + Ok(tenant) } pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { @@ -1110,8 +1187,8 @@ mod tests { use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; - use crate::layered_repository::repo_harness::*; use crate::repository::{Key, Value}; + use crate::tenant::harness::*; use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; @@ -1122,8 +1199,8 @@ mod tests { #[test] fn test_basic() -> Result<()> { - let repo = RepoHarness::create("test_basic")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_basic")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1144,10 +1221,10 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { - let repo = RepoHarness::create("no_duplicate_timelines")?.load(); - let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); + let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0)) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -1170,8 +1247,8 @@ mod tests { /// #[test] fn test_branch() -> Result<()> { - let repo = RepoHarness::create("test_branch")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_branch")?.load(); + 
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); use std::str::from_utf8; @@ -1193,8 +1270,8 @@ mod tests { //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); let new_writer = newtline.writer(); @@ -1263,19 +1340,20 @@ mod tests { #[test] fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + .load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 // FIXME: this doesn't actually remove any layer currently, given how the checkpointing // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. 
- repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); @@ -1292,11 +1370,12 @@ mod tests { #[test] fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { - let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); @@ -1336,36 +1415,37 @@ mod tests { #[test] fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { - let repo = - RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant 
.get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); Ok(()) } #[test] fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60))?; // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // Check that the data is still accessible on the branch. 
assert_eq!( @@ -1379,16 +1459,17 @@ mod tests { #[test] fn timeline_load() -> Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = RepoHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME)?; { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + let tenant = harness.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } - let repo = harness.load(); - repo.get_timeline(TIMELINE_ID) + let tenant = harness.load(); + tenant + .get_timeline(TIMELINE_ID) .expect("cannot load timeline"); Ok(()) @@ -1397,18 +1478,18 @@ mod tests { #[test] fn timeline_load_with_ancestor() -> Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = RepoHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = harness.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo + let newtline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("Should have a local timeline"); @@ -1417,14 +1498,14 @@ mod tests { } // check that both of them are initially unloaded - let repo = harness.load(); + let tenant = harness.load(); // check that both, child and ancestor are loaded - let _child_tline = repo + let _child_tline = tenant .get_timeline(NEW_TIMELINE_ID) .expect("cannot get child timeline loaded"); - let _ancestor_tline = repo + let _ancestor_tline = tenant .get_timeline(TIMELINE_ID) .expect("cannot get ancestor timeline loaded"); @@ -1434,11 
+1515,11 @@ mod tests { #[test] fn corrupt_metadata() -> Result<()> { const TEST_NAME: &str = "corrupt_metadata"; - let harness = RepoHarness::create(TEST_NAME)?; - let repo = harness.load(); + let harness = TenantHarness::create(TEST_NAME)?; + let tenant = harness.load(); - repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - drop(repo); + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1473,8 +1554,8 @@ mod tests { #[test] fn test_images() -> Result<()> { - let repo = RepoHarness::create("test_images")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_images")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1523,8 +1604,8 @@ mod tests { // #[test] fn test_bulk_insert() -> Result<()> { - let repo = RepoHarness::create("test_bulk_insert")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_bulk_insert")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let mut lsn = Lsn(0x10); @@ -1563,8 +1644,8 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { - let repo = RepoHarness::create("test_random_updates")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_random_updates")?.load(); + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 1000; @@ -1633,8 +1714,8 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { - let repo = RepoHarness::create("test_traverse_branches")?.load(); - let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_traverse_branches")?.load(); + let mut tline = 
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 1000; @@ -1667,8 +1748,8 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo + tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tline = tenant .get_timeline(new_tline_id) .expect("Should have the branched timeline"); tline_id = new_tline_id; @@ -1712,8 +1793,8 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { - let repo = RepoHarness::create("test_traverse_ancestors")?.load(); - let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -1728,8 +1809,8 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; - tline = repo + tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tline = tenant .get_timeline(new_tline_id) .expect("Should have the branched timeline"); tline_id = new_tline_id; diff --git a/pageserver/src/layered_repository/blob_io.rs b/pageserver/src/tenant/blob_io.rs similarity index 98% rename from pageserver/src/layered_repository/blob_io.rs rename to pageserver/src/tenant/blob_io.rs index a4c6186056..78ecbcb9c1 100644 --- a/pageserver/src/layered_repository/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -11,8 +11,8 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
-use crate::layered_repository::block_io::{BlockCursor, BlockReader}; use crate::page_cache::PAGE_SZ; +use crate::tenant::block_io::{BlockCursor, BlockReader}; use std::cmp::min; use std::io::{Error, ErrorKind}; diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/tenant/block_io.rs similarity index 98% rename from pageserver/src/layered_repository/block_io.rs rename to pageserver/src/tenant/block_io.rs index 5e32b8833a..bbcdabe1cd 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -60,7 +60,7 @@ where /// the underlying BlockReader. For example: /// /// ```no_run -/// # use pageserver::layered_repository::block_io::{BlockReader, FileBlockReader}; +/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; /// # let reader: FileBlockReader = todo!(); /// let cursor = reader.block_cursor(); /// let buf = cursor.read_blk(1); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs similarity index 98% rename from pageserver/src/layered_repository/delta_layer.rs rename to pageserver/src/tenant/delta_layer.rs index af02f84bc0..ff6d3652f9 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -24,15 +24,13 @@ //! "values" part. //! 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; -use crate::layered_repository::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; -use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::page_cache::{PageReadGuard, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; +use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::filename::{DeltaFileName, PathOrConf}; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::virtual_file::VirtualFile; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs similarity index 99% rename from pageserver/src/layered_repository/disk_btree.rs rename to pageserver/src/tenant/disk_btree.rs index c130a42a8e..33255dbd82 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -25,7 +25,7 @@ use std::{cmp::Ordering, io, result}; use thiserror::Error; use tracing::error; -use crate::layered_repository::block_io::{BlockReader, BlockWriter}; +use crate::tenant::block_io::{BlockReader, BlockWriter}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. 
pub const VALUE_SZ: usize = 5; diff --git a/pageserver/src/layered_repository/disk_btree_test_data.rs b/pageserver/src/tenant/disk_btree_test_data.rs similarity index 100% rename from pageserver/src/layered_repository/disk_btree_test_data.rs rename to pageserver/src/tenant/disk_btree_test_data.rs diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs similarity index 97% rename from pageserver/src/layered_repository/ephemeral_file.rs rename to pageserver/src/tenant/ephemeral_file.rs index a1b2d68cd5..c675e4e778 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -2,11 +2,11 @@ //! used to keep in-memory layers spilled on disk. use crate::config::PageServerConf; -use crate::layered_repository::blob_io::BlobWriter; -use crate::layered_repository::block_io::BlockReader; use crate::page_cache; use crate::page_cache::PAGE_SZ; use crate::page_cache::{ReadBufResult, WriteBufResult}; +use crate::tenant::blob_io::BlobWriter; +use crate::tenant::block_io::BlockReader; use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::cmp::min; @@ -330,13 +330,13 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error { #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::blob_io::{BlobCursor, BlobWriter}; - use crate::layered_repository::block_io::BlockCursor; + use crate::tenant::blob_io::{BlobCursor, BlobWriter}; + use crate::tenant::block_io::BlockCursor; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn repo_harness( + fn harness( test_name: &str, ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> { let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -368,7 +368,7 @@ mod tests { #[test] fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?; + let (conf, tenantid, timelineid) = 
harness("ephemeral_files")?; let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -399,7 +399,7 @@ mod tests { #[test] fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?; + let (conf, tenantid, timelineid) = harness("ephemeral_blobs")?; let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/tenant/filename.rs similarity index 100% rename from pageserver/src/layered_repository/filename.rs rename to pageserver/src/tenant/filename.rs diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/tenant/image_layer.rs similarity index 97% rename from pageserver/src/layered_repository/image_layer.rs rename to pageserver/src/tenant/image_layer.rs index 4fe771bb3f..518643241d 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -20,15 +20,13 @@ //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; -use crate::layered_repository::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::layered_repository::filename::{ImageFileName, PathOrConf}; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value, KEY_SIZE}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; +use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::filename::{ImageFileName, PathOrConf}; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::virtual_file::VirtualFile; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs similarity index 96% rename from pageserver/src/layered_repository/inmemory_layer.rs rename to pageserver/src/tenant/inmemory_layer.rs index 5f269a868f..0e7b215b1e 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -5,14 +5,12 @@ //! its position in the file, is kept in memory, though. //! 
use crate::config::PageServerConf; -use crate::layered_repository::blob_io::{BlobCursor, BlobWriter}; -use crate::layered_repository::block_io::BlockReader; -use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter}; -use crate::layered_repository::ephemeral_file::EphemeralFile; -use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, -}; use crate::repository::{Key, Value}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter}; +use crate::tenant::block_io::BlockReader; +use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter}; +use crate::tenant::ephemeral_file::EphemeralFile; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::walrecord; use anyhow::{bail, ensure, Result}; use std::cell::RefCell; diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/tenant/layer_map.rs similarity index 98% rename from pageserver/src/layered_repository/layer_map.rs rename to pageserver/src/tenant/layer_map.rs index 88dcf32409..c24e3976fb 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -10,11 +10,11 @@ //! corresponding files are written to disk. //! 
-use crate::layered_repository::inmemory_layer::InMemoryLayer; -use crate::layered_repository::storage_layer::Layer; -use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; +use crate::tenant::inmemory_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; +use crate::tenant::storage_layer::{range_eq, range_overlaps}; use anyhow::Result; use std::collections::VecDeque; use std::ops::Range; diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/tenant/metadata.rs similarity index 98% rename from pageserver/src/layered_repository/metadata.rs rename to pageserver/src/tenant/metadata.rs index 910dba4644..4ea2b7d55b 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,4 +1,4 @@ -//! Every image of a certain timeline from [`crate::layered_repository::Repository`] +//! Every image of a certain timeline from [`crate::tenant::Tenant`] //! has a metadata that needs to be stored persistently. //! //! 
Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of @@ -216,7 +216,7 @@ pub fn save_metadata( #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::repo_harness::TIMELINE_ID; + use crate::tenant::harness::TIMELINE_ID; #[test] fn metadata_serializes_correctly() { diff --git a/pageserver/src/layered_repository/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs similarity index 100% rename from pageserver/src/layered_repository/par_fsync.rs rename to pageserver/src/tenant/par_fsync.rs diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs similarity index 100% rename from pageserver/src/layered_repository/storage_layer.rs rename to pageserver/src/tenant/storage_layer.rs diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/tenant/timeline.rs similarity index 99% rename from pageserver/src/layered_repository/timeline.rs rename to pageserver/src/tenant/timeline.rs index 60abbe33e6..c96ad99909 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,7 +17,7 @@ use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering} use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; -use crate::layered_repository::{ +use crate::tenant::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, filename::{DeltaFileName, ImageFileName}, @@ -118,7 +118,7 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Repository::delete_timeline`]. + /// and [`Tenant::delete_timeline`]. 
layer_removal_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index a9f015229f..a8a9926c77 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,26 +1,31 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use crate::config::PageServerConf; -use crate::http::models::TenantInfo; -use crate::layered_repository::ephemeral_file::is_ephemeral_file; -use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; -use crate::layered_repository::Repository; -use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; -use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; -use crate::task_mgr::{self, TaskKind}; -use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::walredo::{PostgresRedoManager, WalRedoManager}; -use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use anyhow::Context; -use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; use std::collections::{hash_map, HashMap, HashSet}; use std::ffi::OsStr; use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; + +use anyhow::Context; use tracing::*; +use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; + +use crate::config::PageServerConf; +use crate::http::models::TenantInfo; +use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; +use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::{ + ephemeral_file::is_ephemeral_file, + metadata::{TimelineMetadata, METADATA_FILE_NAME}, + Tenant, TenantState, +}; +use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::walredo::PostgresRedoManager; +use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; + use utils::crashsafe_dir; use 
utils::zid::{ZTenantId, ZTimelineId}; @@ -28,64 +33,31 @@ mod tenants_state { use once_cell::sync::Lazy; use std::{ collections::HashMap, - sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, + sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; use utils::zid::ZTenantId; - use crate::tenant_mgr::Tenant; + use crate::tenant::Tenant; - static TENANTS: Lazy>> = + static TENANTS: Lazy>>> = Lazy::new(|| RwLock::new(HashMap::new())); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { + pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { TENANTS .read() .expect("Failed to read() tenants lock, it got poisoned") } - pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap> { + pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { TENANTS .write() .expect("Failed to write() tenants lock, it got poisoned") } } -struct Tenant { - state: TenantState, - /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. - repo: Arc, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub enum TenantState { - // This tenant exists on local disk, and the layer map has been loaded into memory. - // The local disk might have some newer files that don't exist in cloud storage yet. - Active, - // Tenant is active, but there is no walreceiver connection. - Idle, - // This tenant exists on local disk, and the layer map has been loaded into memory. - // The local disk might have some newer files that don't exist in cloud storage yet. - // The tenant cannot be accessed anymore for any reason, but graceful shutdown. 
- Stopping, - - // Something went wrong loading the tenant state - Broken, -} - -impl std::fmt::Display for TenantState { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Active => f.write_str("Active"), - Self::Idle => f.write_str("Idle"), - Self::Stopping => f.write_str("Stopping"), - Self::Broken => f.write_str("Broken"), - } - } -} - /// Initialize repositories with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) -/// are scheduled for download and added to the repository once download is completed. +/// are scheduled for download and added to the tenant once download is completed. pub fn init_tenant_mgr( conf: &'static PageServerConf, remote_storage: Option, @@ -128,7 +100,7 @@ pub fn init_tenant_mgr( ) }; - attach_local_tenants(conf, &remote_index, tenants_to_attach)?; + attach_local_tenants(conf, &remote_index, tenants_to_attach); Ok(remote_index) } @@ -141,7 +113,7 @@ pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, tenants_to_attach: TenantTimelineValues, -) -> anyhow::Result<()> { +) { let _entered = info_span!("attach_local_tenants").entered(); let number_of_tenants = tenants_to_attach.0.len(); @@ -152,104 +124,109 @@ pub fn attach_local_tenants( ); debug!("Timelines to attach: {local_timelines:?}"); - let repository = load_local_repo(conf, tenant_id, remote_index) - .context("Failed to load repository for tenant")?; - - let repo = Arc::clone(&repository); + let tenant = load_local_tenant(conf, tenant_id, remote_index); { match tenants_state::write_tenants().entry(tenant_id) { hash_map::Entry::Occupied(_) => { - anyhow::bail!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); + continue; } hash_map::Entry::Vacant(v) => { - v.insert(Tenant { - state: 
TenantState::Idle, - repo, - }); + v.insert(Arc::clone(&tenant)); + } + } + } + + if tenant.current_state() == TenantState::Broken { + warn!("Skipping timeline load for broken tenant {tenant_id}") + } else { + let has_timelines = !local_timelines.is_empty(); + match tenant.init_attach_timelines(local_timelines) { + Ok(()) => { + info!("successfully loaded local timelines for tenant {tenant_id}"); + tenant.activate(has_timelines); + } + Err(e) => { + error!("Failed to attach tenant timelines: {e:?}"); + tenant.set_state(TenantState::Broken); } } } - // XXX: current timeline init enables walreceiver that looks for tenant in the state, so insert the tenant entry before - repository - .init_attach_timelines(local_timelines) - .context("Failed to attach timelines for tenant")?; } - info!("Processed {number_of_tenants} local tenants during attach"); - Ok(()) + info!("Processed {number_of_tenants} local tenants during attach") } -fn load_local_repo( +fn load_local_tenant( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> anyhow::Result> { - let repository = Repository::new( +) -> Arc { + let tenant = Arc::new(Tenant::new( conf, TenantConfOpt::default(), Arc::new(PostgresRedoManager::new(conf, tenant_id)), tenant_id, remote_index.clone(), conf.remote_storage_config.is_some(), - ); - let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; - repository.update_tenant_config(tenant_conf); - - Ok(Arc::new(repository)) + )); + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } + } + tenant } /// /// Shut down all tenants. This runs as part of pageserver shutdown. 
/// pub async fn shutdown_all_tenants() { - let tenantids = { + let tenants_to_shut_down = { let mut m = tenants_state::write_tenants(); - let mut tenantids = Vec::new(); - for (tenantid, tenant) in m.iter_mut() { - match tenant.state { - TenantState::Active | TenantState::Idle | TenantState::Stopping => { - tenant.state = TenantState::Stopping; - tenantids.push(*tenantid) - } - TenantState::Broken => {} + let mut tenants_to_shut_down = Vec::with_capacity(m.len()); + for (_, tenant) in m.drain() { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_state(TenantState::Paused); + tenants_to_shut_down.push(tenant) } } drop(m); - tenantids + tenants_to_shut_down }; + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; // Ok, no background tasks running anymore. Flush any remaining data in // memory to disk. // // We assume that any incoming connections that might request pages from - // the repository have already been terminated by the caller, so there + // the tenant have already been terminated by the caller, so there // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. 
- for tenant_id in tenantids { + for tenant in tenants_to_shut_down { + let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); - match get_repository_for_tenant(tenant_id) { - Ok(repo) => { - if let Err(err) = repo.checkpoint() { - error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); - } - } - Err(err) => { - error!("Could not get repository for tenant {tenant_id} during shutdown: {err:?}"); - } + + if let Err(err) = tenant.checkpoint() { + error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); } } } -fn create_repo( +fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, - wal_redo_manager: Arc, - remote_index: RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result<()> { let target_tenant_directory = conf.tenant_path(&tenant_id); anyhow::ensure!( !target_tenant_directory.exists(), @@ -282,7 +259,7 @@ fn create_repo( ) })?; // first, create a config in the top-level temp directory, fsync the file - Repository::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; + Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; // then, create a subdirectory in the top-level temp directory, fsynced crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( @@ -312,18 +289,11 @@ fn create_repo( fs::File::open(target_dir_parent)?.sync_all()?; info!( - "created directory structure in {}", + "created tenant directory structure in {}", target_tenant_directory.display() ); - Ok(Arc::new(Repository::new( - conf, - tenant_conf, - wal_redo_manager, - tenant_id, - remote_index, - conf.remote_storage_config.is_some(), - ))) + Ok(()) } fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result { @@ -350,12 +320,17 @@ pub fn create_tenant( } hash_map::Entry::Vacant(v) => { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); - let repo = 
create_repo(conf, tenant_conf, tenant_id, wal_redo_manager, remote_index)?; - v.insert(Tenant { - state: TenantState::Active, - repo, - }); - crate::tenant_tasks::start_background_loops(tenant_id); + create_tenant_files(conf, tenant_conf, tenant_id)?; + let tenant = Arc::new(Tenant::new( + conf, + tenant_conf, + wal_redo_manager, + tenant_id, + remote_index, + conf.remote_storage_config.is_some(), + )); + tenant.activate(false); + v.insert(tenant); Ok(Some(tenant_id)) } } @@ -367,70 +342,23 @@ pub fn update_tenant_config( tenant_id: ZTenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_repository_for_tenant(tenant_id)?.update_tenant_config(tenant_conf); - - Repository::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; + get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); + Tenant::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; Ok(()) } -pub fn get_tenant_state(tenantid: ZTenantId) -> Option { - Some(tenants_state::read_tenants().get(&tenantid)?.state) -} - -pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { - let old_state = { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| format!("Tenant not found for id {tenant_id}"))?; - let old_state = tenant.state; - tenant.state = new_state; - old_state - }; - - match (old_state, new_state) { - (TenantState::Broken, TenantState::Broken) - | (TenantState::Active, TenantState::Active) - | (TenantState::Idle, TenantState::Idle) - | (TenantState::Stopping, TenantState::Stopping) => { - debug!("tenant {tenant_id} already in state {new_state}"); - } - (TenantState::Broken, ignored) => { - debug!("Ignoring {ignored} since tenant {tenant_id} is in broken state"); - } - (_, TenantState::Broken) => { - debug!("Setting tenant {tenant_id} status to broken"); - } - (TenantState::Stopping, ignored) => { - debug!("Ignoring {ignored} since 
tenant {tenant_id} is in stopping state"); - } - (TenantState::Idle, TenantState::Active) => { - info!("activating tenant {tenant_id}"); - - // Spawn gc and compaction loops. The loops will shut themselves - // down when they notice that the tenant is inactive. - crate::tenant_tasks::start_background_loops(tenant_id); - } - (TenantState::Idle, TenantState::Stopping) => { - info!("stopping idle tenant {tenant_id}"); - } - (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { - info!("stopping tenant {tenant_id} tasks due to new state {new_state}"); - - // Note: The caller is responsible for waiting for any tasks to finish. - } - } - - Ok(()) -} - -pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { +/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. +/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. +pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found"))?; - - Ok(Arc::clone(&tenant.repo)) + .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; + if active_only && !tenant.is_active() { + anyhow::bail!("Tenant {tenant_id} is not active") + } else { + Ok(Arc::clone(tenant)) + } } pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { @@ -455,9 +383,14 @@ pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; info!("timeline task shutdown completed"); - match tenants_state::read_tenants().get(&tenant_id) { - Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, - None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), 
+ match get_tenant(tenant_id, true) { + Ok(tenant) => { + tenant.delete_timeline(timeline_id)?; + if tenant.list_timelines().is_empty() { + tenant.activate(false); + } + } + Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), } Ok(()) @@ -467,21 +400,24 @@ pub async fn detach_tenant( conf: &'static PageServerConf, tenant_id: ZTenantId, ) -> anyhow::Result<()> { - set_tenant_state(tenant_id, TenantState::Stopping)?; + let tenant = match { + let mut tenants_accessor = tenants_state::write_tenants(); + tenants_accessor.remove(&tenant_id) + } { + Some(tenant) => tenant, + None => anyhow::bail!("Tenant not found for id {tenant_id}"), + }; + + tenant.set_state(TenantState::Paused); // shutdown all tenant and timeline tasks: gc, compaction, page service) task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - { - let mut tenants_accessor = tenants_state::write_tenants(); - tenants_accessor.remove(&tenant_id); - } - // If removal fails there will be no way to successfully retry detach, // because the tenant no longer exists in the in-memory map. And it needs to be removed from it - // before we remove files, because it contains references to repository + // before we remove files, because it contains references to tenant // which references ephemeral files which are deleted on drop. So if we keep these references, // we will attempt to remove files which no longer exist. 
This can be fixed by having shutdown - // mechanism for repository that will clean temporary data to avoid any references to ephemeral files + // mechanism for tenant that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( @@ -512,7 +448,7 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { TenantInfo { id: *id, - state: Some(tenant.state), + state: tenant.current_state(), current_physical_size: None, has_in_progress_downloads, } diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 9aaafe7f92..3ef54838af 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -1,12 +1,14 @@ //! This module contains functions to serve per-tenant background processes, //! such as compaction and GC +use std::ops::ControlFlow; +use std::sync::Arc; use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::{Tenant, TenantState}; use crate::tenant_mgr; -use crate::tenant_mgr::TenantState; use tracing::*; use utils::zid::ZTenantId; @@ -18,7 +20,10 @@ pub fn start_background_loops(tenant_id: ZTenantId) { None, &format!("compactor for tenant {tenant_id}"), false, - compaction_loop(tenant_id), + async move { + compaction_loop(tenant_id).await; + Ok(()) + }, ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), @@ -27,43 +32,50 @@ pub fn start_background_loops(tenant_id: ZTenantId) { None, &format!("garbage collector for tenant {tenant_id}"), false, - gc_loop(tenant_id), + async move { + gc_loop(tenant_id).await; + Ok(()) + }, ); } /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { +async fn compaction_loop(tenant_id: ZTenantId) { + let wait_duration = Duration::from_secs(2); + info!("starting compaction loop for {tenant_id}"); 
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - let result = async { + async { loop { trace!("waking up"); + let tenant = tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received compaction cancellation request"); + return; + }, + tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(tenant) => tenant, + }, + }; + // Run blocking part of the task - // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { - break Ok(()); - } - // This should not fail. If someone started us, it means that the tenant exists. - // And before you remove a tenant, you have to wait until all the associated tasks - // exit. - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - // Run compaction - let mut sleep_duration = repo.get_compaction_period(); - if let Err(e) = repo.compaction_iteration() { - error!("Compaction failed, retrying: {}", e); - sleep_duration = Duration::from_secs(2) + let mut sleep_duration = tenant.get_compaction_period(); + if let Err(e) = tenant.compaction_iteration() { + error!("Compaction failed, retrying: {e:#}"); + sleep_duration = wait_duration; } // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - trace!("received cancellation request"); - break Ok(()); + info!("received compaction cancellation request during idling"); + break ; }, _ = tokio::time::sleep(sleep_duration) => {}, } @@ -72,49 +84,49 @@ async fn compaction_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { .await; TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - info!( - "compaction loop stopped. 
State is {:?}", - tenant_mgr::get_tenant_state(tenant_id) - ); - result + trace!("compaction loop stopped."); } /// /// GC task's main loop /// -async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { +async fn gc_loop(tenant_id: ZTenantId) { + let wait_duration = Duration::from_secs(2); + info!("starting gc loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - let result = async { + async { loop { trace!("waking up"); - // Break if tenant is not active - if tenant_mgr::get_tenant_state(tenant_id) != Some(TenantState::Active) { - break Ok(()); - } - // This should not fail. If someone started us, it means that the tenant exists. - // And before you remove a tenant, you have to wait until all the associated tasks - // exit. - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received GC cancellation request"); + return; + }, + tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(tenant) => tenant, + }, + }; // Run gc - let gc_period = repo.get_gc_period(); - let gc_horizon = repo.get_gc_horizon(); + let gc_period = tenant.get_gc_period(); + let gc_horizon = tenant.get_gc_horizon(); let mut sleep_duration = gc_period; if gc_horizon > 0 { - if let Err(e) = repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false) + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false) { - error!("Gc failed, retrying: {}", e); - sleep_duration = Duration::from_secs(2) + error!("Gc failed, retrying: {e:#}"); + sleep_duration = wait_duration; } } // Sleep tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { - trace!("received cancellation request"); - break Ok(()); + info!("received GC cancellation request during idling"); + break; }, _ = tokio::time::sleep(sleep_duration) => {}, } @@ -122,9 +134,50 @@ async fn gc_loop(tenant_id: ZTenantId) -> anyhow::Result<()> { } .await; TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - info!( - "GC loop stopped. State is {:?}", - tenant_mgr::get_tenant_state(tenant_id) - ); - result + trace!("GC loop stopped."); +} + +async fn wait_for_active_tenant( + tenant_id: ZTenantId, + wait: Duration, +) -> ControlFlow<(), Arc> { + let tenant = loop { + match tenant_mgr::get_tenant(tenant_id, false) { + Ok(tenant) => break tenant, + Err(e) => { + error!("Failed to get a tenant {tenant_id}: {e:#}"); + tokio::time::sleep(wait).await; + } + } + }; + + // if the tenant has a proper status already, no need to wait for anything + if tenant.should_run_tasks() { + ControlFlow::Continue(tenant) + } else { + let mut tenant_state_updates = tenant.subscribe_for_state_updates(); + loop { + match tenant_state_updates.changed().await { + Ok(()) => { + let new_state = *tenant_state_updates.borrow(); + match new_state { + TenantState::Active { + background_jobs_running: true, + } => { + debug!("Tenant state changed to active with background jobs enabled, continuing the task loop"); + return ControlFlow::Continue(tenant); + } + state => { + debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}"); + tokio::time::sleep(wait).await; + } + } + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop"); + return ControlFlow::Break(()); + } + } + } + } } diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 35dec54d5c..69d14babf0 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -2,34 +2,28 @@ //! 
Timeline management code // -use anyhow::{bail, Context, Result}; -use remote_storage::path_with_suffix_extension; - use std::{ fs, path::Path, process::{Command, Stdio}, sync::Arc, }; + +use anyhow::{bail, Context, Result}; use tracing::*; +use remote_storage::path_with_suffix_extension; use utils::{ lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; -use crate::layered_repository::{Repository, Timeline}; +use crate::tenant::{Tenant, Timeline}; use crate::tenant_mgr; use crate::CheckpointConfig; use crate::{import_datadir, TEMP_FILE_SUFFIX}; -#[derive(Debug, Clone, Copy)] -pub struct PointInTime { - pub timeline_id: ZTimelineId, - pub lsn: Lsn, -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -69,7 +63,7 @@ fn bootstrap_timeline( conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId, - repo: &Repository, + tenant: &Tenant, ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. @@ -89,7 +83,7 @@ fn bootstrap_timeline( // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. 
- let timeline = repo.create_empty_timeline(timeline_id, lsn)?; + let timeline = tenant.create_empty_timeline(timeline_id, lsn)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -127,16 +121,16 @@ pub(crate) async fn create_timeline( mut ancestor_start_lsn: Option, ) -> Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - debug!("timeline {} already exists", new_timeline_id); + debug!("timeline {new_timeline_id} already exists"); return Ok(None); } let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { - let ancestor_timeline = repo + let ancestor_timeline = tenant .get_timeline(ancestor_timeline_id) .context("Cannot branch off the timeline that's not present in pageserver")?; @@ -162,10 +156,13 @@ pub(crate) async fn create_timeline( } } - repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + tenant.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? } - None => bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?, + None => bootstrap_timeline(conf, tenant_id, new_timeline_id, &tenant)?, }; + // Have added new timeline into the tenant, now its background tasks are needed. 
+ tenant.activate(true); + Ok(Some(loaded_timeline)) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 57592a46d3..45d0916dec 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -30,9 +30,9 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; -use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; +use crate::tenant::Timeline; use crate::walrecord::*; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::v14::pg_constants; @@ -1022,16 +1022,13 @@ impl<'a> WalIngest<'a> { } } -/// -/// Tests that should work the same with any Repository/Timeline implementation. -/// #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { use super::*; - use crate::layered_repository::repo_harness::*; - use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::create_test_timeline; + use crate::tenant::harness::*; + use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -1061,8 +1058,8 @@ mod tests { #[test] fn test_relsize() -> Result<()> { - let repo = RepoHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_relsize")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1189,8 +1186,8 @@ mod tests { // and then created it again within the same layer. 
#[test] fn test_drop_extend() -> Result<()> { - let repo = RepoHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_drop_extend")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1229,8 +1226,8 @@ mod tests { // and then extended it again within the same layer. #[test] fn test_truncate_extend() -> Result<()> { - let repo = RepoHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_truncate_extend")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1317,8 +1314,8 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[test] fn test_large_rel() -> Result<()> { - let repo = RepoHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(&repo, TIMELINE_ID)?; + let tenant = TenantHarness::create("test_large_rel")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 1fcb768ddf..69e400f291 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,10 +16,10 @@ use std::{ time::Duration, }; -use crate::layered_repository::Timeline; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::tenant::Timeline; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -767,11 +767,11 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { use super::*; - use 
crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; #[test] fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -857,7 +857,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -948,7 +948,7 @@ mod tests { #[test] fn no_connection_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("no_connection_candidate")?; + let harness = TenantHarness::create("no_connection_candidate")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -1053,7 +1053,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = RepoHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures")?; let mut state = dummy_state(&harness); let now = Utc::now().naive_utc(); @@ -1117,7 +1117,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1204,7 +1204,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_connection_threshhold_current_candidate")?; + let 
harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1276,7 +1276,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; let mut state = dummy_state(&harness); let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1353,7 +1353,7 @@ mod tests { const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - fn dummy_state(harness: &RepoHarness) -> WalreceiverState { + fn dummy_state(harness: &TenantHarness) -> WalreceiverState { WalreceiverState { id: ZTenantTimelineId { tenant_id: harness.tenant_id, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index e8fa9f9aca..6f1fbc2c9d 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -21,10 +21,10 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ - layered_repository::{Timeline, WalReceiverInfo}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, + tenant::{Timeline, WalReceiverInfo}, tenant_mgr, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -141,8 +141,7 @@ pub async fn handle_walreceiver_connection( let tenant_id = timeline.tenant_id; let timeline_id = timeline.timeline_id; - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; // // Start streaming the WAL, from where we left off previously. 
@@ -283,7 +282,7 @@ pub async fn handle_walreceiver_connection( })?; if let Some(last_lsn) = status_update { - let remote_index = repo.get_remote_index(); + let remote_index = tenant.get_remote_index(); let timeline_remote_consistent_lsn = remote_index .read() .await diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 1d083b3ef9..ce3a74930e 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -71,7 +71,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # First timeline would not get loaded into pageserver due to corrupt metadata file with pytest.raises( - Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}" + Exception, match=f"Timeline {timeline1} was not found for tenant {tenant1}" ) as err: pg1.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") @@ -80,7 +80,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # We don't have the remote storage enabled, which means timeline is in an incorrect state, # it's not loaded at all with pytest.raises( - Exception, match=f"Could not get timeline {timeline2} in tenant {tenant2}" + Exception, match=f"Timeline {timeline2} was not found for tenant {tenant2}" ) as err: pg2.start() log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 315ec7f306..1214d703d0 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -40,11 +40,16 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for t in timelines: client.timeline_delete(tenant, t) + def assert_active_without_jobs(tenant): + assert get_state(tenant) == {"Active": {"background_jobs_running": False}} + # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() 
env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) - assert get_state(tenant) == "Active" + assert get_state(tenant) == { + "Active": {"background_jobs_running": True} + }, "Pageserver should activate a tenant and start background jobs if timelines are loaded" # Stop compute pg.stop() @@ -53,6 +58,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for tenant_info in client.tenant_list(): tenant_id = ZTenantId(tenant_info["id"]) delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_active_without_jobs(tenant_id)) # Assert that all tasks finish quickly after tenant is detached assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index a5dadc535b..5a20dbd232 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -18,7 +18,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): invalid_tenant_id = ZTenantId.generate() with pytest.raises( NeonPageserverApiException, - match=f"Tenant {invalid_tenant_id} not found in local tenant state", + match=f"Tenant {invalid_tenant_id} not found in the local state", ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) @@ -64,7 +64,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # check 404 with pytest.raises( - NeonPageserverApiException, match="is not found neither locally nor remotely" + NeonPageserverApiException, + match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} is not found neither locally nor remotely", ): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) From 59d04ab66aa68be3a7b3cd7997182f9b62636190 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 18:24:11 +0100 Subject: [PATCH 048/166] test_runner: redact passwords from log messages (#2434) --- 
test_runner/fixtures/log_helper.py | 13 +++++++++++++ test_runner/fixtures/neon_fixtures.py | 3 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 17f2402391..7d112fce89 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,5 +1,6 @@ import logging import logging.config +import re """ This file configures logging to use in python tests. @@ -29,6 +30,17 @@ LOGGING = { } +class PasswordFilter(logging.Filter): + """Filter out password from logs.""" + + # Good enough to filter our passwords produced by PgProtocol.connstr + FILTER = re.compile(r"(\s*)password=[^\s]+(\s*)") + + def filter(self, record: logging.LogRecord) -> bool: + record.msg = self.FILTER.sub(r"\1password=\2", str(record.msg)) + return True + + def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. @@ -38,5 +50,6 @@ def getLogger(name="root") -> logging.Logger: # default logger for tests log = getLogger() +log.addFilter(PasswordFilter()) logging.config.dictConfig(LOGGING) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b47e560325..69c6d31315 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -125,7 +125,8 @@ def pytest_configure(config): if env_neon_bin: neon_binpath = env_neon_bin else: - neon_binpath = os.path.join(base_dir, "target/debug") + build_type = os.environ.get("BUILD_TYPE", "debug") + neon_binpath = os.path.join(base_dir, "target", build_type) log.info(f"neon_binpath is {neon_binpath}") if not os.path.exists(os.path.join(neon_binpath, "pageserver")): raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) From db0c49148db3bbc74d314313b601e6f1e7c0be3a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 20:07:16 +0300 Subject: [PATCH 049/166] clean up metrics in handle_pagerequests --- 
pageserver/src/page_service.rs | 53 +++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b03dab20e0..388f40f916 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -362,6 +362,39 @@ async fn page_service_conn_main( } } +struct PageRequestMetrics { + get_rel_exists: metrics::Histogram, + get_rel_size: metrics::Histogram, + get_page_at_lsn: metrics::Histogram, + get_db_size: metrics::Histogram, +} + +impl PageRequestMetrics { + fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + + let get_rel_exists = + SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]); + + let get_rel_size = + SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]); + + let get_page_at_lsn = + SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]); + + let get_db_size = + SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]); + + Self { + get_rel_exists, + get_rel_size, + get_page_at_lsn, + get_db_size, + } + } +} + #[derive(Debug)] struct PageServerHandler { conf: &'static PageServerConf, @@ -396,6 +429,8 @@ impl PageServerHandler { pgb.write_message(&BeMessage::CopyBothResponse)?; pgb.flush().await?; + let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id); + loop { let msg = tokio::select! 
{ biased; @@ -420,32 +455,22 @@ impl PageServerHandler { trace!("query: {:?}", copy_data_bytes); let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let tenant_str = tenant_id.to_string(); - let timeline_str = timeline_id.to_string(); let response = match zenith_fe_msg { PagestreamFeMessage::Exists(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_rel_exists.start_timer(); self.handle_get_rel_exists_request(&timeline, &req).await } PagestreamFeMessage::Nblocks(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_rel_size", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_rel_size.start_timer(); self.handle_get_nblocks_request(&timeline, &req).await } PagestreamFeMessage::GetPage(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_page_at_lsn.start_timer(); self.handle_get_page_at_lsn_request(&timeline, &req).await } PagestreamFeMessage::DbSize(req) => { - let _timer = SMGR_QUERY_TIME - .with_label_values(&["get_db_size", &tenant_str, &timeline_str]) - .start_timer(); + let _timer = metrics.get_db_size.start_timer(); self.handle_db_size_request(&timeline, &req).await } }; From d4d57ea2ddb49c6d40b90e171188dbeecee8f9fe Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Sep 2022 19:26:26 +0100 Subject: [PATCH 050/166] github/workflows: fix project creation via API (#2437) --- .github/actions/neon-project-create/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index d4fced4196..ba81afaaff 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -60,7 +60,7 @@ runs: --header "Authorization: Bearer ${API_KEY}" \ 
--data "{ \"project\": { - \"platform_id\": \"serverless\", + \"platform_id\": \"aws\", \"region_id\": \"${REGION_ID}\", \"settings\": { } } From 1d53173e62673aecc9e2c73ab6ba6f0488249207 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 20:41:26 +0300 Subject: [PATCH 051/166] update openapi spec (tenant state has changed) --- pageserver/src/http/openapi_spec.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b9a62d0f32..1f2eba05ec 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -494,7 +494,13 @@ components: id: type: string state: - type: string + oneOf: + - type: string + - type: object + properties: + background_jobs_running: + type: boolean + current_physical_size: type: integer has_in_progress_downloads: From 32b7259d5e639e3dd16e3758a1534f0f47d9a6f2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 22:37:20 +0300 Subject: [PATCH 052/166] Timeline data management RFC (#2152) --- docs/SUMMARY.md | 1 + docs/rfcs/017-timeline-data-management.md | 413 ++++++++++++++++++ .../lock_legend.svg | 4 + .../proposed_timeline_data_access_sync_1.svg | 4 + .../proposed_timeline_data_access_sync_2.svg | 4 + .../proposed_timeline_tenant_state.svg | 4 + .../timeline_data_access_sync_1.svg | 4 + .../timeline_data_access_sync_2.svg | 4 + .../timeline_tenant_state.svg | 4 + 9 files changed, 442 insertions(+) create mode 100644 docs/rfcs/017-timeline-data-management.md create mode 100644 docs/rfcs/images/017-timeline-data-management/lock_legend.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg create mode 100644 
docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg create mode 100644 docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 95ac512ea8..fb6467ffd5 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -79,4 +79,5 @@ - [014-storage-lsm](rfcs/014-storage-lsm.md) - [015-storage-messaging](rfcs/015-storage-messaging.md) - [016-connection-routing](rfcs/016-connection-routing.md) +- [017-timeline-data-management](rfcs/017-timeline-data-management.md) - [cluster-size-limits](rfcs/cluster-size-limits.md) diff --git a/docs/rfcs/017-timeline-data-management.md b/docs/rfcs/017-timeline-data-management.md new file mode 100644 index 0000000000..a8ca3c7ca9 --- /dev/null +++ b/docs/rfcs/017-timeline-data-management.md @@ -0,0 +1,413 @@ +# Name + +Tenant and timeline data management in pageserver + +## Summary + +This RFC attempts to describe timeline-related data management as it's done now in pageserver, highlight current complexities caused by this and propose a set of changes to mitigate them. + +The main goal is to prepare for future [on-demand layer downloads](https://github.com/neondatabase/neon/issues/2029), yet timeline data is one of the core primitives of pageserver, so a number of other RFCs are affected as well. +Due to that, this document won't have a single implementation, rather requiring a set of code changes to achieve the final state. + +RFC considers the repository at the `main` branch, commit [`28243d68e60ffc7e69f158522f589f7d2e09186d`](https://github.com/neondatabase/neon/tree/28243d68e60ffc7e69f158522f589f7d2e09186d) at the time of writing. + +## Motivation + +In recent discussions, it became more clear that timeline-related code becomes harder to change: it consists of multiple disjoint modules, each requiring a synchronization to access.
+The lower the code is, the more complex the sync gets since many concurrent processes are involved and require orchestration to keep the data consistent. +As the number of modules and isolated data grows per timeline, more questions and corner cases arise: + +- https://github.com/neondatabase/neon/issues/1559 + right now it's not straightened out what to do when the synchronization task fails too many times: every separate module's data has to be treated differently. + +- https://github.com/neondatabase/neon/issues/1751 + GC and compaction file activities are not well known outside their tasks code, causing race bugs + +- https://github.com/neondatabase/neon/issues/2003 + Even the tenant management gets affected: we have to alter its state based on timeline state, yet the data for making the decision is separated and the synchronisation logic has bugs + +- more issues were brought in discussions, but apparently they were too specific to the code to mention them in the issues. + For instance, `tenant_mgr` itself is a static object that we can not mock anyhow, which reduces our capabilities to test the data synchronization logic. + In fact, we have zero Rust tests that cover the case of synchronizing more than one module's data. + +On demand layer downloads would require us to dynamically manage the layer files, which we are almost not doing at all on the module level, resulting in most of their APIs dealing with timelines, rather than the layer files. +The disjoint data would require data synchronization with possibly a chain of lock acquisitions, some async and some sync, and it would be hard to unit test it with the current code state. + +This neither helps to easily start the on-demand download epic, nor makes it easy to add more timeline-related code on top, whatever the task is.
+We have to develop a vision on a number of topics before progressing safely: + +- timeline and tenant data structure and how we should access it +- sync and async worlds and in what way that should evolve +- unit tests for the complex logic + +This RFC aims to provide a general overview of the existing situation and propose ways to improve it. +The changes proposed are quite big and no single PR is expected to do the adjustments, they should gradually be done during the on-demand download work later. + +## What is a timeline and its data + +First, we need to define what data we want to manage per timeline. +Currently, the data every timeline operates on is: + +- a set of layer files, on the FS + + Never updated files, created after pageserver's checkpoints and compaction runs, can be removed from the local FS due to compaction, gc or timeline deletion. + +- a set of layer files, on the remote storage + + Identically named and placed in tenant subdirectories files on the remote storage (S3), copied by a special background sync thread + +- a `metadata` file, on the FS + + Updated after every checkpoint with the newer `disk_consistent_lsn` and `latest_gc_cutoff_lsn` values. Used to quickly restore timeline's basic metadata on pageserver restart. + Also contains data about the ancestor, if the timeline was branched off another timeline. + +- an `index_part.json` file, on the remote storage + + Contains `metadata` file contents and a list of layer files, available in the current S3 "directory" for the timeline. + Used to avoid potentially slow and expensive `S3 list` command, updated by the remote storage sync thread after every operation with the remote layer files. + +- LayerMap and PageCache, in memory + + Dynamic, used to store and retrieve the page data to users. + +- timeline info, in memory + + LSNs, walreceiver data, `RemoteTimelineIndex` and other data to share via HTTP API and internal processes.
+ +- metrics data, in memory + + Data to push or provide to Prometheus, Opentelemetry, etc. + +Besides the data, every timeline currently needs an etcd connection to receive WAL events and connect to safekeepers. + +Timeline could be an ancestor to another one, forming a dependency tree, which is implicit right now: every time relations are looked up in place, based on the corresponding `TimelineMetadata` struct contents. +Yet, there's knowledge on a tenant as a group of timelines, belonging to a single user which is used in GC and compaction tasks, run on every tenant. +`tenant_mgr` manages tenant creation and its task startup, along with the remote storage sync for timeline layers. + +Last file being managed per-tenant is the tenant config file, created and updated on the local FS to hold tenant-specific configuration between restarts. +It's not yet anyhow synchronized with the remote storage, so only exists on the local FS. + +### How the data is stored + +We have multiple places where timeline data is stored: + +- `tenant_mgr` [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L43) a static `static ref TENANTS: RwLock>` with the `Tenant` having the `local_timelines: HashMap>` inside + +- same `Tenant` above has actually two references to timelines: another via its `repo: Arc` with `pub type RepositoryImpl = LayeredRepository;` that [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L178) `Mutex>` + +- `RemoteTimelineIndex` [contains](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync/index.rs#L84) the metadata about timelines on the remote storage (S3) for sync reasons and possible HTTP API queries + +- `walreceiver` [stores](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver.rs#L60) the metadata for 
possible HTTP API queries and its [internal state](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver/connection_manager.rs#L245) with a reference to the timeline, its current connections and etcd subscription (if any) + +- `PageCache` contains timeline-related data, and is created globally for the whole pageserver + +- implicitly, we also have files on local FS, that contain timeline state. We operate on those files for some operations (GC, compaction), yet we don't synchronize the access to the files per se: there are more high-level locks, ensuring only one of a group of operations is running at a time. + + In practice though, `LayerMap` and layer files are tightly coupled together: current low-level code requires a timeline to be loaded into the memory to work with it, and the code removes the layer files after removing the entry from the `LayerMap` first. + +Based on this, a high-level pageserver's module diagram with data and entities could be: + +![timeline tenant state diagram](./images/017-timeline-data-management/timeline_tenant_state.svg) + +A few comments on the diagram: + +- the diagram does not show all the data and replaces a few newtypes and type aliases (for example, completely ignores "unloaded" timelines due to reasons described below) + + It aims to show main data and means of synchronizing it. + +- modules tend to isolate their data inside and provide access to it via API + +Due to multitenancy, that results in a common pattern for storing both tenant and timeline data: `RwLock` or `Mutex` around the `HashMap`, gc and compaction tasks also use the same lock pattern to ensure no concurrent runs are happening. + +- part of the modules is asynchronous, while the other is not, which complicates the data access + +Currently, anything that's not related to tasks (walreceiver, storage sync, GC, compaction) is blocking.
+ +Async tasks that try to access the data in the sync world, have to call `std::sync::Mutex::lock` method, which blocks the thread the callee async task runs on, also blocking other async tasks running in the same thread. Methods of `std::sync::RwLock` have the same issues, forcing async tasks either to block or spawn another, "blocking" task on a separate thread. + +Sync tasks that try to access the data in the async world, cannot use `.await` hence have to have some `Runtime` doing those calls for them. [`tokio::sync::Mutex`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.Mutex.html#method.blocking_lock) and [`tokio::sync::RwLock`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.RwLock.html#method.blocking_read) provide an API to simplify such calls. Similarly, both `std::sync` and `tokio::sync` have channels that are able to communicate into one direction without blocking and requiring `.await` calls, hence can be used to connect both worlds without locking. + +Some modules are in transition, started as async "blocking" tasks and being fully synchronous in their entire code below the start. Current idea is to transfer them to the async further, but it's not yet done. + +- locks are used in two different ways: + + - `RwLock>` ones to hold the shared data and ensure its atomic updates + - `Mutex<()>` for synchronizing the tasks, used to implicitly order the data access + + The "shared data" locks of the first kind are mainly accessed briefly to either look up or alter the data, yet there are a few notable exceptions, such as + `latest_gc_cutoff_lsn: RwLock` that is explicitly held in a few places to prevent GC thread from progressing. Those are covered later in the data access diagrams. + +- some synchronizations are not yet implemented + +E.g. asynchronous storage sync module does not synchronize with almost synchronous GC and compaction tasks when the layer files are uploaded to the remote storage. 
+That occasionally results in the files being deleted before the storage upload task is run for this layer, but due to the incremental nature of the layer files, we can handle such situations without issues. + +- `LayeredRepository` covers lots of responsibilities: GC and compaction task synchronisation, timeline access (`local_timelines` in `Tenant` is not used directly before the timeline from the repository is accessed), layer flushing to FS, layer sync to remote storage scheduling, etc. + +### How is this data accessed? + +There are multiple ways the data is accessed, from different sources: + +1. [HTTP requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/http/routes.rs) + +High-level CRUD API for managing tenants, timelines and getting data about them. +Current API list (modified for readability): + +```rust +.get("/v1/status", status_handler) // pageserver status +.get("/v1/tenant", tenant_list_handler) +.post("/v1/tenant", tenant_create_handler) // can create "empty" timelines or branch off the existing ones +.get("/v1/tenant/:tenant_id", tenant_status) // the only tenant public metadata +.put("/v1/tenant/config", tenant_config_handler) // tenant config data and local file manager +.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) +.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) +.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) // download entire tenant from the remote storage and load its timelines memory +.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) // delete all tenant timelines from memory, remote corresponding storage and local FS files +.get("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler) +.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler) +.get("/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", wal_receiver_get_handler) // get walreceiver stats metadata +``` + +Overall, neither 
HTTP operation goes below `LayeredRepository` level and does not interact with layers: instead, they manage tenant and timeline entities, their configuration and metadata. + +`GET` data is small (relative to layer files contents), updated via brief `.write()/.lock()` calls and read via copying/cloning the data to release the lock soon. +It does not mean that the operations themselves are short, e.g. `tenant_attach_handler` downloads multiple files from the remote storage which might take time, yet the final data is inserted in memory via one brief write under the lock. + +Non-`GET` operations mostly follow the same rule, with two differences: + +- `tenant_detach_handler` has to wait for its background tasks to stop before shutting down, which requires more work with locks +- `timeline_create_handler` currently requires GC to be paused before branching the timeline, which requires orchestrating too. + This is the only HTTP operation, able to load the timeline into memory: rest of the operations are reading the metadata or, as in `tenant_attach_handler`, schedule a deferred task to download timeline and load it into memory. + +"Timeline data synchronization" section below describes both complex cases in more details. + +2. [libpq requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/page_service.rs) + +Is the main interface of pageserver, intended to handle libpq (and similar) requests. +Operates on `LayeredTimeline` and, lower, `LayerMap` modules; all timelines accessed during the operation are loaded into memory immediately (if not loaded already), operations bail on timeline load errors. + +- `pagestream` + + Page requests: `get_rel_exists`, `get_rel_size`, `get_page_at_lsn`, `get_db_size` + + Main API points, intended to be used by `compute` to show the data to the user. 
All require requests to be made at certain Lsn, if this Lsn is not available in the memory, request processing is paused until that happens or bails after a timeout. + +- `basebackup` and `fullbackup` + + Options to generate postgres-compatible backup archives. + +- `import basebackup` + +- `import wal` + + Import the `pg_wal` section of the basebackup archive. + +- `get_last_record_rlsn`, `get_lsn_by_timestamp` + +"Metadata" retrieval methods, that still requires internal knowledge about layers. + +- `set`, `fallpoints`, `show` + +Utility methods to support various edge cases or help with debugging/testing. + +- `do_gc`, `compact`, `checkpoint` + +Manual triggers for corresponding tenant tasks (GC, compaction) and inmemory layer flushing on disk (checkpointing), with upload task scheduling as a follow-up. + +Apart from loading into memory, every timeline layer has to be accessed using specific set of locking primitives, especially if a write operations happens: otherwise, GC or compaction might spoil the data. User API is implicitly affected by this synchronization during branching, when a GC has to be orchestrated properly before the new timeline could be branched off the existing one. +See "Timeline data synchronization" section for the united synchronization diagram on the topic. + +3. internal access + +Entities within pageserver that update files on local FS and remote storage, metadata in memory; has to use internal data for those operations. +Places that access internal, lower data are also required to have the corresponding timeline successfully loaded into memory and accessed with corresponding synchronization. + +If ancestors' data is accessed via its child branch, it means more than one timeline has to be loaded into memory entirely and more locking primitives usage involved. 
+Right now, all ancestors are resolved in-place: every place that has to check timeline's ancestor has to lock the timelines map, check if one is loaded into the memory, load it there or bail if it's not present, and get the information required and so on. + +- periodic GC and compaction tasks + +Alter metadata (GC info), in-memory data (layer relations, page caches, etc.) and layer files on disk. +Same as its libpq counterparts, needs full synchronization with the low level layer management code. + +- storage sync task + +Alters metadata (`RemoteTimelineIndex`), layer files on remote storage (upload, delete) and local FS (download) and in-memory data (registers downloaded timelines in the repository). +Currently, does not know anything about layer files contents, rather focusing on the file structure and metadata file updates: due to the fact that the layer files cannot be updated (only created or deleted), storage sync is able to back up the files to the remote storage without further low-level synchronizations: only when the timeline is downloaded, a load operation is needed to run, possibly pausing GC and compaction tasks. + +- walreceiver and walingest task + +Per timeline, subscribes for etcd events from safekeeper and eventually spawns a walreceiver connection task to receive WAL from a safekeeper node. +Fills memory with data, eventually triggering a checkpoint task that creates a new layer file in the local FS and schedules a remote storage sync upload task. +During WAL receiving, also updates a separate in-memory data structure with the walreceiver stats, used later via HTTP API. + +Layer updates require low-level set of sync primitives used to preserve the data consistency. + +- checkpoint (layer freeze) task + +Periodic, short-lived tasks to generate a new layer file in the FS. 
Requires low level synchronization in the end, when the layer is being registered after creating and has additional mode to ensure only one concurrent compaction happens at a time. + +### Timeline data synchronization + +Here's a high-level timeline data access diagram, considering the synchronization locks, based on the state diagram above. + +For brevity, diagrams do not show `RwLock>` data accesses, considering them almost instant to happen. +`RwLock` is close to be an exception to the previous rule, since it's taken in a multiple places to ensure all layers are inserted correctly. +Yet the only long operation in the current code is a `.write()` lock on the map during its creation, while all other lock usages tend to be short in the current code. +Note though, that due to current "working with loaded timeline only", prevailing amount of the locks taken on the struct are `.write()` locks, not the `.read()` ones. +To simplify the diagrams, these accesses are now considered "fast" data access, not the synchronization attempts. + +`write_lock` synchronization diagram: + +![timeline data access synchronization(1)](./images/017-timeline-data-management/timeline_data_access_sync_1.svg) + +Comments: + +- `write_lock: Mutex<()>` ensures that all timeline data being written into **in-memory layers** is done without races, one concurrent write at a time +- `layer_flush_lock: Mutex<()>` and layer flushing seems to be slightly bloated with various ways to create a layer on disk and write it in memory + The lock itself seem to repeat `write_lock` purpose when it touches in-memory layers, and also to limit the on-disk layer creations. 
+ Yet the latter is not really done consistently, since remote storage sync manages to download and register the new layers without touching the locks +- `freeze_inmem_layer(true)` that touches both `write_lock` and `layer_flush_lock` seems not very aligned with the rest of the locks to those primitives; it also now restricts the layer creation concurrency even more, yet there are various `freeze_inmem_layer(false)` that are ignoring those restrictions at the same time + +![timeline data access synchronization(2)](./images/017-timeline-data-management/timeline_data_access_sync_2.svg) + +Comments: + +- `partitioning: Mutex<(KeyPartitioning, Lsn)>` lock is a data sync lock that's not used to synchronize the tasks (all other such kinds were considered "almost instant" and omitted on the diagram), yet is very similar to what `write_lock` and `layer_flush_lock` do: it ensures the timeline in-memory data is up-to-date with the layer files state on disk, which is what `LayerMap` is for. + +- there are multiple locks that do similar task management operations: + - `gc_cs: Mutex<()>` and `latest_gc_cutoff_lsn: RwLock<Lsn>` ensure that branching and gc are not run concurrently + - `layer_removal_cs: Mutex<()>` lock ensures gc, compaction and timeline deletion via HTTP API do not run concurrently + - `file_lock: RwLock<()>` is used as a semaphore, to ensure "all" gc and compaction tasks are shut down and do not start + Yet that lock does take only gc and compaction from internal loops: libpq call is not cancelled and waited upon. + +Those operations do not seem to belong to a timeline. Moreover, some of those could be eliminated entirely due to duplication of their tasks.
+ +## Proposed implementation + +### How to structure timeline data access better + +- adjust tenant state handling + +Current [`TenantState`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L108) [changes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L317) mainly indicate whether GC and compaction tasks are running or not; another state, `Broken`, shows only in case any timeline does not load during startup. + +We could start both GC and compaction tasks at the time the tenant is created and adjust the tasks to throttle/sleep on timeline absence and wake up when the first one is added. +The latter becomes more important on download on demand, since we won't have the entire timeline in reach to verify its correctness. Moreover, if any network connection happens, the timeline could fail temporarily and entire tenant should be marked as broken due to that. + +Since nothing verifies the `TenantState` via HTTP API currently, it makes sense to remove the whole state entirely and not write the code to synchronize its changes. +Instead, we could indicate internal issues for every timeline and have a better API to "stop" timeline processing without deleting its data, making our API less restrictive. + +- remove the "unloaded" status for the timeline + +Current approach to timeline management [assumes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L486-L493) + +```rust +#[derive(Clone)] +enum LayeredTimelineEntry { + Loaded(Arc<LayeredTimeline>), + Unloaded { + id: ZTimelineId, + metadata: TimelineMetadata, + }, +} +``` + +that timelines have to be in `Unloaded` state. + +The difference between both variants is whether its layer map was loaded from disk and kept in memory (Loaded) or not (Unloaded).
+The idea behind such separation was to lazy load timelines in memory with all their layers only after their first access and potentially unload them later. + +Yet now there are no public API methods that deal with unloaded timelines' layers: all of them either bail when such timeline is worked on, or load it into memory and continue working. +Moreover, every timeline in the local FS is loaded on pageserver startup now, so the only two places where `Unloaded` variant is used are branching and timeline attach, with both loading the timeline into memory before the end of the operation. +Even if that loading into memory bails for some reason, next GC or compaction task periodic run would load such timeline into memory. +There are a few timeline methods that return timeline metadata without loading its layers, but such metadata also comes from the `metadata` FS file, not the layer files (so no page info could be retrieved without loading the entire layer map first). + +With the layer on-demand download, it's not feasible anymore to wait for the entire layer map to be loaded into the memory, since it might not even be available on the local FS when requested: `LayerMap` needs to be changed to contain metadata to retrieve the missing layers and handle partially present on the local FS timeline state. + +To accommodate that and move away from the redundant status, a timeline should always be "loaded" with its metadata read from the disk and its layer map prepared to be downloaded when requested, per layer. + +Layers in the layer map, on the other hand, could be in various states: loaded, unloaded, downloading, downloading failed, etc. and their state has to be handled instead, if we want to support on-demand download in the future. + +This way, tenants and timelines could always try to serve requests and do their internal tasks periodically, trying to recover.
+ +- scale down the remote storage sync to per layer file, not per timeline as now + +Due to the reasons from the previous bullet, current remote storage model needs its timeline download approach to be changed. +Right now, a timeline is marked as "ready" only after all its layers on the remote storage are downloaded on the local storage. +With the on-demand download approach, only remote storage timeline metadata should be downloaded from S3, leaving the rest of the layers ready for download if/when it's requested. + +Note: while the remote storage sync should operate per layer, it should stay global for all tenants, to better manage S3 limits and sync queue priorities. +Yet the only place using remote storage should be the layer map. + +- encapsulate `tenant_mgr` logic into a regular Rust struct, unite with part of the `Repository` and anything else needed to manage the timeline data in a single place and to test it independently + +[`Repository`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/repository.rs#L187) trait gets closer to `tenant_mgr` in terms of functionality: there are two background task-related functions, that are run on all timelines of a tenant: `gc_iteration` (it does allow running on a single timeline, but GC task runs it on all timelines) and `compaction_iteration` that are related to service tasks, not the data storage; and the metadata management functions, also not really related to the timeline contents. + +`tenant_mgr` proxies some of the `Repository` calls, yet both service tasks use `tenant_mgr` to access the data they need, creating a circular dependency between their APIs. +To avoid excessive synchronization between components, taking multiple locks for that and static state, we can organize the data access and updates in one place. +One potential benefit Rust gets from this is the ability to track and manage timeline resources, if all the related data is located in one place. 
+ +- move `RemoteStorage` usage from `LayeredRepository` into `LayerMap`, as the rest of the layer-based entities (layer files, etc.) + +Layer == file in our model, since pageserver always either tries to load the LayerMap from disk for the timeline not in memory, or assumes the file contents matches its memory. +`LayeredRepository` is one of the most loaded objects currently and not everything from it deserves unification with the `tenant_mgr`. +In particular, layer files need to be better prepared for future download on demand functionality, where every layer could be dynamically loaded and unloaded from memory and local FS. +Current amount of locks and sync-async separation would make it hard to implement truly dynamic (un)loading; moreover, we would need retries with backoffs, since the unloaded layer files are most probably not available on the local FS either and network is not always reliable. + +One of the solutions to the issue is already being developed for the remote storage sync: [SyncQueue](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync.rs#L463) +The queue is able to batch CRUD layer operations (both for local and remote FS contexts) and reorder them to increase the sync speed. +Similar approach could be generalized for all layer modifications, including in-memory ones such as GC or compaction: this way, we could manage all layer modifications and reads in one place with lesser locks and tests that are closer to unit tests. + +- change the approach to locking synchronization + +A number of locks in the timeline seem to be used to coordinate gc, compaction tasks and related processes. +It should be done in a task manager or other place, external to the timeline. + +Timeline contents still needs to be synchronized, considering the task work, so fields like `latest_gc_cutoff_lsn: RwLock` are expected to stay for that purpose, but general amount of locks should be reduced. 
+ +### Putting it all together + +If the proposal bullets are applied to the diagrams above, the state could be represented as: + +![timeline timeline tenant state](./images/017-timeline-data-management/proposed_timeline_tenant_state.svg) + +The reorders aim to put all tasks into separated modules, with strictly defined interfaces and as little knowledge about other components as possible. +This way, all timeline data is now in the `data_storage`, including the GC, walreceiver, `RemoteTimelineIndex`, `LayerMap`, etc. with some API to get the data in a way +more convenient for the data sync system inside. +So far, it seems that a few maps with `Arc<RwLock<SeparateData>>` would suffice, with actual data operations added inside each `SeparateData` struct, if needed. + +`page_cache` is proposed to be placed into the same `data_storage` since it contains tenant timelines' data: this way, all metadata and data is in the same struct, simplifying things with Rust's borrow checker and allowing us to share internals between data modules and later might simplify timeline in-memory size tracking. + +`task_manager` is related to data storage and manages all tenant and timeline tasks, manages shared resources (runtimes, thread pools, etcd connection, etc.) and synchronizes tasks. +All locks such as `gc_cs` belong to this module tree, as primitives inherently related to the task synchronization. +Tasks have to access timelines and their metadata, but should do that through `data_storage` API and similar. + +`task_manager` should (re)start, stop and track all tasks that are run in it, selecting an appropriate runtime depending on a task kind (we have async/sync task separation, CPU and IO bound tasks separation, ...) +Some locks, such as the `layer_removal_cs` one, are not needed if the only component that starts the tasks ensures they don't run concurrently.
+ +`LayeredTimeline` is still split into two parts, more high-level with whatever primitives needed to sync its state, and the actual state storage with `LayerMap` and other low level entities. +Only `LayerMap` knows what storage its layer files are taken from (inmem, local FS, etc.), and it's responsible for synchronizing the layers when needed, as well as reacting to sync events, successful or not. + +Last but not least, `tenant config file` has to be backed up into a remote storage, as tenant-specific information for all timelines. +Tenant and timelines have volatile information that's now partially mixed with constant information (e.g. fields in `metadata` file), that model should be better split and handled, in case we want to properly support its backups and synchronization. + +![proposed timeline data access synchronization(1)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg) + +There's still a need to keep inmemory layer buffer synchronized during layer freezing, yet that could happen on a layer level, not on a timeline level, as `write_lock` used to be, so we could lower the sync primitives one layer deeper, preparing us for download on demand feature, where multiple layers could be concurrently streamed and written from various data sources. + +Flushing the frozen layer requires creating a new layer on disk and further remote storage upload, so `LayerMap` has to get those flushed bytes and queue them later: no need to block in the timeline itself for anything again, rather locking on the layer level, if needed.
+ +![proposed timeline data access synchronization(2)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg) + +Lock diagrams legend: + +![lock diagrams legend](./images/017-timeline-data-management/lock_legend.svg) + +After the frozen layers are flushed, something has to ensure that the layer structure is intact, so a repartitioning lock is needed still, and could also guard the layer map structure changes, since both are needed either way. +This locking belongs to the `LowLevelLayeredTimeline` from the proposed data structure diagram, as the place with all such data being held. + +Similarly, branching is still required to be done after certain Lsn in our current model, but this needs only one lock to synchronize and that could be the `gc_cs: Mutex<()>` lock. +It raises the question of where this lock has to be placed, it's the only place that requires pausing a GC task during external, HTTP request handling. +The right place for the lock seems to be the `task_manager` that could manage GC in more fine-grained way to accommodate the incoming branching request. + +There's no explicit lock sync between GC, compaction or other mutually exclusive tasks: it is a job of the `task_manager` to ensure those are not run concurrently. diff --git a/docs/rfcs/images/017-timeline-data-management/lock_legend.svg b/docs/rfcs/images/017-timeline-data-management/lock_legend.svg new file mode 100644 index 0000000000..d6d2bc00ae --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/lock_legend.svg @@ -0,0 +1,4 @@ + + + +
Lock interaction legend:

Lock interaction legend:...
LOCK NAME
LOCK NAME
LOCK NAME
LOCK NAME
Event flow
Event flow
or
or
lock acquisition, 
every lock is shown with a single line
Different lines of the same shape denote different locks
lock acquisition,...
Continuous lock acquisition,
lock release is explicitly shown later
Continuous lock acquisition,...
Lock release
Lock release
Instant lock acquisition and release
Instant lock acquisition and rele...
Lock details (RwLock/Mutex)
are shown on the corresponding arrows
and lock names
Lock details (RwLock/Mutex)...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg new file mode 100644 index 0000000000..d1c97d1738 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg @@ -0,0 +1,4 @@ + + + +
walreceiver loop
walreceiver loop
DatadirModification::flush after every file
DatadirModification::flush aft...
HTTP API call
to create an empty timeline
HTTP API call...
libpq call
to import basebackup archive
libpq call...
libpq call
to import wal
libpq call...
zenith.signal
file processed
zenith.signal...
process timeline wal
(walingest)
process timeline wal...
DatadirModification::commit
DatadirModification::commit
process timeline wal
(walingest)
process timeline wal...
process timeline wal
(walingest)
process timeline wal...
process timeline files
process timeline files
DatadirModification::commit
DatadirModification::commit
layer_write_lock.lock()
layer_write_lock.lock()
timeline::writer call
timeline::writer call
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
after all files processed
after all files processed
and
and
timeline::writer call
timeline::writer call
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
libpq call
to checkpoint
libpq call...
checkpoint(Forced)
checkpoint(Forced)
libpq call
to do_gc
libpq call...
checkpoint(Flush)
checkpoint(Flush)
shutdown() system call
shutdown() system call
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex inside the repo
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex i...
held through entire freezing
held through entire freezing
flush_frozen_layers
schedules the operation into LayerMap
flush_frozen_layers...

freeze_inmem_layer(true)

freeze_inmem_layer(true)...
checkpoint(Flush)
checkpoint(Flush)
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg new file mode 100644 index 0000000000..81918fcd98 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg @@ -0,0 +1,4 @@ + + + +
libpq pagerequest calls
basebackup
libpq pagerequest calls...
libpq do_gc call
libpq do_gc call
periodic GC
periodic GC
checkpoint(Forced)
checkpoint(Forced)
periodic compaction
periodic compaction
gc
gc
compact
compact
partitioning.lock()
partitioning.lock()
gc
gc
compact
compact
HTTP API call
to branch a timeline
HTTP API call...
checkpoint(Forced)
checkpoint(Forced)
takes the lock when ready to do gc
holds during entire operation
takes the lock when ready to do gc...
gc_cs.lock()
gc_cs.lock()
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
other checkpoint sources
other checkpoint sources
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
holds lock during
entire operation
holds lock during...
holds lock during
entire branching
holds lock during...
wait_or_get_last_lsn
@
page request Lsn
wait_or_get_last_lsn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg new file mode 100644 index 0000000000..207017fb1b --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg @@ -0,0 +1,4 @@ + + + +sLayer 1Layer 2
contained in
contained in
metadataLayer 1
...
...
...
...
index_part.json
Files in the remote storage
Files in the remote storage
Files in the local FS
Files in the local FS
Tenanta number of maps with Arc<RwLock<Data>> patternfor tenants, timelines, gc, walreceiver, remove storage, etc. metadataLayeredTimelinewrite_lock: Mutex<()>latest_gc_cutoff_lsn: RwLock<Lsn>process: Mutex<Option<PostgresRedoPorcess>> inside               PostgresRedoManagercompactionPeriodically runs on all tenant timelines, each processed separately. Merges (removes and adds) layer fileswalreceiver tasksetcd subscriptions, periodic timeline writes and checkpointstenant config fileLowLevelLayeredTimelinepartitioning: Mutex<(KeyPartitioning, Lsn)>layers: RwLock<LayerMap>
tenant contains timeline layer data
tenant con...
remote storage syncstorage sync queue and S3 connectionsperiodically writes into the remote indexgcPeriodically runs on all tenant timelines, with shared context.Removes layer files
Tasks interact with layers, via LayerMap
Tasks interact with layers, via LayerMap
task_managerruntime, threadpools, shared connections (etcd), etc.logic to manage tenant/timeline taskstenant config file in any form
layer map schedules sync tasks
and calls logic on their completion
layer map schedules sync tasks...
page cachematerialized_page_map: RwLock<HashMap<...>>ephemeral_page_map: RwLock<HashMap<...>>immutable_page_map: RwLock<HashMap<...>>tenant storageHashMap<TenantId, Tenant>Tenant state information, its sync and task manager interaction
layer map manages local and remote files
in a queue-based manner
layer map manages local and remote files...
tasks update or read metadata via the storage
tasks update or read metadata via the storage
Legend:
Legend:
interaction between components,
arrows show which component does the data access
interaction between components,...
data relation,
arrows show where current data is contained in
data relation,...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg new file mode 100644 index 0000000000..b968fedd8c --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg @@ -0,0 +1,4 @@ + + + +
walreceiver loop
walreceiver loop
DatadirModification::flush after every file
DatadirModification::flush aft...
HTTP API call
to create an empty timeline
HTTP API call...
libpq call
to import basebackup archive
libpq call...
libpq call
to import wal
libpq call...
zenith.signal
file processed
zenith.signal...
process timeline wal
(walingest)
process timeline wal...
DatadirModification::commit
DatadirModification::commit
process timeline wal
(walingest)
process timeline wal...
process timeline wal
(walingest)
process timeline wal...
process timeline files
process timeline files
DatadirModification::commit
DatadirModification::commit
write_lock.lock()
w...
timeline::writer call
timeline::writer call
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
after all files processed
after all files processed
and
and
timeline::writer call
timeline::writer call
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
checkpoint(Flush)
checkpoint(Flush)
check_checkpoint_distance
check_checkpoint_distance
checkpoint(Forced)
checkpoint(Forced)
libpq call
to checkpoint
libpq call...
checkpoint(Forced)
checkpoint(Forced)
libpq call
to do_gc
libpq call...
checkpoint(Flush)
checkpoint(Flush)
shutdown() system call
shutdown() system call
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex inside the repo
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex i...
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
held through entire freezing
h...
 layer_flush_lock.lock() 
...
skips both flushes if the lock is taken
s...
skips the flush if the lock is taken 
s...
always waits for the lock
and runs
frozen layers flush 
holding the lock
always waits f...
flush_frozen_layers(false)
flush_frozen_layers(false)

freeze_inmem_layer(true)

freeze_inmem_layer(true)...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg new file mode 100644 index 0000000000..382d834517 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg @@ -0,0 +1,4 @@ + + + +
libpq pagerequest calls
basebackup
libpq pagerequest calls...
libpq do_gc call
libpq do_gc call
periodic GC
periodic GC
checkpoint(Forced)
checkpoint(Forced)
periodic compaction
periodic compaction
tenant idle/detach
shutdown
tenant idle/detach...
gc
gc
compact
compact
lock is held for
almost entire operations
lock is held for...
RwLock(file_lock)
RwLock(file_lock)
read
read
read
read
write
write
HTTP API call
delete timeline
HTTP API call...
layer_removal_cs.lock()
layer_removal_cs.lock()
lock is held for
the entire operation
lock is held for...
partitioning.lock()
partitioning.lock()
gc
gc
compact
compact
HTTP API call
to branch a timeline
HTTP API call...
gc_cs.lock()
gc_cs.lock()
held during entire
branching
held during entire...
checkpoint(Forced)
checkpoint(Forced)
write updated value,
release the lock
write updated value,...
RwLock(latest_gc_cutoff_lsn)
RwLock(latest_gc_cutoff_lsn)
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
other checkpoint sources
other checkpoint sources
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
holds read during
entire operation
holds read during...
holds read during
entire branching
holds read during...
wait_or_get_last_lsn
@
page request Lsn
wait_or_get_last_lsn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg b/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg new file mode 100644 index 0000000000..c4bc36f309 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg @@ -0,0 +1,4 @@ + + + +                                             Tasks                                                                                                                                                                   StateLayer 1Layer 2
contained in
contained in
metadataLayer 1
...
...
...
...
index_part.json
Files in the remote storage
Files in the remote storage
Files in the local FS
Files in the local FS
LayeredRepositorytimelines: Mutex<HashMap<TimelineId, LayeredTimeline>>gc_cs: Mutex<()>file_lock: RwLock<()>tenant_conf: Arc<RwLock<TenantConfOpt>>remote_index: Arc<RwLock<HashMap<                        TenantTimelineId, RemoteTimelineMetadata>>tenant_mgrstatic ref TENANTS: RwLock<HashMap<TenantId, Tenant>>Tenantstate: TenantStaterepo: Arc<LayeredRepository>local_timelines: HashMap<TimelineId, Arc<DatadirTimelineImpl>>PageCachematerialized_page_map: RwLock<HashMap<...>>ephemeral_page_map: RwLock<HashMap<...>>immutable_page_map: RwLock<HashMap<...>>DatadirTimelineImplpartitioning: Mutex<(KeyPartitioning, Lsn)>tline: Arc<LayeredTimeline>compactionPeriodically runs on all tenant timelines, each processed separately. Merges (removes and adds) layer fileswalreceiver tasksetcd subscriptions, periodic timeline writes and checkpointstenant config fileLayeredTimelinewrite_lock: Mutex<()>layer_flush_lock: Mutex<()>layer_removal_cs: Mutex<()>latest_gc_cutoff_lsn: RwLock<Lsn>tenant_conf: Arc<RwLock<TenantConfOpt>>gc_info: RwLock<GcInfo>process: Mutex<Option<PostgresRedoPorcess>> inside               PostgresRedoManagerlayers: RwLock<LayerMap>layer flush taskPer timeline, moves in-memory data to disk when scheduled (adds layers)remote storage sync taskstorage sync queue and S3 connectionsperiodically writes into the remote indexgcPeriodically runs on all tenant timelines, with shared context.Removes layer files
Backed by repository:
Backed by repository:
get page requests lookup and update
get page requests lookup and update
flushes new files on disk, loads existing into memory
flushes new files on disk, loads existing into memory
Tasks interact with files on disk, full CRUD
Remote storage sync task is the only one to interact with other storage
Tasks interact with files on disk, full CRUD...
schedules layer sync
schedules layer sync
Text is not SVG - cannot display
\ No newline at end of file From 35761ac6b6f4daee78bcaabd083e88ec3b877958 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Tue, 13 Sep 2022 23:55:18 +0200 Subject: [PATCH 053/166] docs/sourcetree: add info about IDE config (#2332) --- docs/sourcetree.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index f3bc9230e2..339a90e0ba 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -134,3 +134,42 @@ Also consider: To add new package or change an existing one you can use `poetry add` or `poetry update` or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case. More details are available in poetry's [documentation](https://python-poetry.org/docs/). + +## Configuring IDEs +Neon consists of three projects in different languages which use different project models. + +* A bunch of Rust crates, all available from the root `Cargo.toml`. +* Integration tests in Python in the `test_runner` directory. Some stand-alone Python scripts exist as well. +* Postgres and our Postgres extensions in C built with Makefiles under `vendor/postgres` and `pgxn`. + +### CLion +You can use CLion with the [Rust plugin](https://plugins.jetbrains.com/plugin/8182-rust) to develop Neon. It should pick up Rust and Python projects whenever you open Neon's repository as a project. We have not tried setting up a debugger, though. + +C code requires some extra care, as it's built via Make, not CMake. Some of our developers have successfully used [compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_generate) for CLion. It is a JSON file which lists all C source files and corresponding compilation keys. CLion can use it instead of `CMakeLists.txt`. To set up a project with a compilation database: + +1. Clone the Neon repository and install all dependencies, including Python. Do not open it with CLion just yet. +2. 
Run the following commands in the repository's root: + ```bash + # Install a `compiledb` tool which can parse make's output and generate the compilation database. + poetry add -D compiledb + # Run Make without actually compiling code so we can generate the compilation database. It still may take a few minutes. + make --dry-run --print-directory --keep-going --assume-new=* postgres neon-pg-ext | poetry run compiledb --verbose --no-build + # Uninstall the tool + poetry remove -D compiledb + # Make sure the compile_commands.json file is not committed. + echo /compile_commands.json >>.git/info/exclude + ``` +3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. _Do not_ open the directory as a project, open the file. +4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain). +5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code. +6. Now you have a CLion project which knows about C files and Rust files. It should pick up Python files automatically as well. + +You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter. + +Whenever you change layout of C files, you may need to regenerate the compilation database. No need to re-create the CLion project, changes should be picked up automatically. + +Known issues (fixes and suggestions are welcome): + +* Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead. +* CLion does not support non-local Python interpreters, unlike PyCharm. E.g.
if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited. +* Cargo Clippy diagnostics in CLion may take a lot of resources. From ba8698bbcbc4f3a4d46e0eeaa48cec3191c0d440 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 13 Sep 2022 21:06:10 +0300 Subject: [PATCH 054/166] update neon_local output in readme --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 977afc2a2c..03ed57a0fa 100644 --- a/README.md +++ b/README.md @@ -125,16 +125,18 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > ./target/debug/neon_local init -initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c -created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50 -initial timeline de200bd42b49cc1814412c7e592dd6e9 created -pageserver init succeeded +Starting pageserver at '127.0.0.1:64000' in '.neon' + +Pageserver started +Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7 +Stopping pageserver gracefully...done! 
# start pageserver and safekeeper > ./target/debug/neon_local start +Starting etcd broker using /usr/bin/etcd Starting pageserver at '127.0.0.1:64000' in '.neon' + Pageserver started -initializing for sk 1 for 7676 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1' Safekeeper started From 260ec20a0218f3da95a2393c9ba377049967dcb2 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 23:58:27 +0300 Subject: [PATCH 055/166] Refotmat pgxn code, add typedefs.list that was used --- pgxn/neon/inmem_smgr.c | 28 +- pgxn/neon/libpagestore.c | 25 +- pgxn/neon/libpqwalproposer.c | 237 +- pgxn/neon/neon.c | 9 +- pgxn/neon/neon.h | 2 +- pgxn/neon/pagestore_client.h | 19 +- pgxn/neon/pagestore_smgr.c | 169 +- pgxn/neon/walproposer.c | 682 +++--- pgxn/neon/walproposer.h | 343 +-- pgxn/neon/walproposer_utils.c | 142 +- pgxn/neon/walproposer_utils.h | 26 +- pgxn/neon_test_utils/neontest.c | 30 +- pgxn/typedefs.list | 3776 +++++++++++++++++++++++++++++++ 13 files changed, 4691 insertions(+), 797 deletions(-) create mode 100644 pgxn/typedefs.list diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c index 13fd4d50b6..4926d759e8 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon/inmem_smgr.c @@ -188,10 +188,10 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { /* * We assume the buffer cache is large enough to hold all the buffers - * needed for most operations. Overflowing to this "in-mem smgr" in rare - * cases is OK. But if we find that we're using more than WARN_PAGES, - * print a warning so that we get alerted and get to investigate why - * we're accessing so many buffers. + * needed for most operations. Overflowing to this "in-mem smgr" in + * rare cases is OK. But if we find that we're using more than + * WARN_PAGES, print a warning so that we get alerted and get to + * investigate why we're accessing so many buffers. */ elog(used_pages >= WARN_PAGES ? 
WARNING : DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", @@ -207,7 +207,9 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, pg = used_pages; used_pages++; INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); - } else { + } + else + { elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -226,14 +228,14 @@ BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { /* - * It's not clear why a WAL redo function would call smgrnblocks(). - * During recovery, at least before reaching consistency, the size of a - * relation could be arbitrarily small, if it was truncated after the - * record being replayed, or arbitrarily large if it was extended - * afterwards. But one place where it's called is in - * XLogReadBufferExtended(): it extends the relation, if it's smaller than - * the requested page. That's a waste of time in the WAL redo - * process. Pretend that all relations are maximally sized to avoid it. + * It's not clear why a WAL redo function would call smgrnblocks(). During + * recovery, at least before reaching consistency, the size of a relation + * could be arbitrarily small, if it was truncated after the record being + * replayed, or arbitrarily large if it was extended afterwards. But one + * place where it's called is in XLogReadBufferExtended(): it extends the + * relation, if it's smaller than the requested page. That's a waste of + * time in the WAL redo process. Pretend that all relations are maximally + * sized to avoid it. */ return MaxBlockNumber; } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index d0572e66cb..55285a6345 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -153,11 +153,11 @@ static void pageserver_disconnect(void) { /* - * If anything goes wrong while we were sending a request, it's not - * clear what state the connection is in. 
For example, if we sent the - * request but didn't receive a response yet, we might receive the - * response some time later after we have already sent a new unrelated - * request. Close the connection to avoid getting confused. + * If anything goes wrong while we were sending a request, it's not clear + * what state the connection is in. For example, if we sent the request + * but didn't receive a response yet, we might receive the response some + * time later after we have already sent a new unrelated request. Close + * the connection to avoid getting confused. */ if (connected) { @@ -191,12 +191,13 @@ pageserver_send(ZenithRequest *request) * * In principle, this could block if the output buffer is full, and we * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output - * and TCP buffer. + * practice, our requests are small enough to always fit in the output and + * TCP buffer. */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { - char* msg = PQerrorMessage(pageserver_conn); + char *msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); neon_log(ERROR, "failed to send page request: %s", msg); } @@ -205,6 +206,7 @@ pageserver_send(ZenithRequest *request) if (message_level_is_interesting(PageStoreTrace)) { char *msg = zm_to_string((ZenithMessage *) request); + neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); } @@ -255,15 +257,16 @@ static void pageserver_flush(void) { if (PQflush(pageserver_conn)) - { - char* msg = PQerrorMessage(pageserver_conn); + { + char *msg = PQerrorMessage(pageserver_conn); + pageserver_disconnect(); neon_log(ERROR, "failed to flush page requests: %s", msg); } } static ZenithResponse * -pageserver_call(ZenithRequest* request) +pageserver_call(ZenithRequest *request) { pageserver_send(request); pageserver_flush(); diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c index 2b2b7a1a6a..1f739f3722 
100644 --- a/pgxn/neon/libpqwalproposer.c +++ b/pgxn/neon/libpqwalproposer.c @@ -7,38 +7,40 @@ /* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ struct WalProposerConn { - PGconn* pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from libpqprop_async_read */ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from + * libpqprop_async_read */ }; /* Prototypes for exported functions */ -static char* libpqprop_error_message(WalProposerConn* conn); -static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); -static WalProposerConn* libpqprop_connect_start(char* conninfo); -static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); -static bool libpqprop_send_query(WalProposerConn* conn, char* query); -static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); -static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn); -static void libpqprop_finish(WalProposerConn* conn); -static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); -static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); -static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); +static char *libpqprop_error_message(WalProposerConn * conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn * conn); +static WalProposerConn * libpqprop_connect_start(char *conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn * conn); +static bool libpqprop_send_query(WalProposerConn * conn, char *query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn * conn); +static pgsocket libpqprop_socket(WalProposerConn * conn); +static int 
libpqprop_flush(WalProposerConn * conn); +static void libpqprop_finish(WalProposerConn * conn); +static PGAsyncReadResult libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size); -static WalProposerFunctionsType PQWalProposerFunctions = { +static WalProposerFunctionsType PQWalProposerFunctions = +{ libpqprop_error_message, - libpqprop_status, - libpqprop_connect_start, - libpqprop_connect_poll, - libpqprop_send_query, - libpqprop_get_query_result, - libpqprop_socket, - libpqprop_flush, - libpqprop_finish, - libpqprop_async_read, - libpqprop_async_write, - libpqprop_blocking_write, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_socket, + libpqprop_flush, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, + libpqprop_blocking_write, }; /* Module initialization */ @@ -52,7 +54,7 @@ pg_init_libpqwalproposer(void) /* Helper function */ static bool -ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking) { /* If we're already correctly blocking or nonblocking, all good */ if (is_nonblocking == conn->is_nonblocking) @@ -67,14 +69,14 @@ ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) } /* Exported function definitions */ -static char* -libpqprop_error_message(WalProposerConn* conn) +static char * +libpqprop_error_message(WalProposerConn * conn) { return PQerrorMessage(conn->pg_conn); } static WalProposerConnStatusType -libpqprop_status(WalProposerConn* conn) +libpqprop_status(WalProposerConn * conn) { switch (PQstatus(conn->pg_conn)) { @@ -87,35 +89,38 @@ libpqprop_status(WalProposerConn* conn) } } -static WalProposerConn* 
-libpqprop_connect_start(char* conninfo) +static WalProposerConn * +libpqprop_connect_start(char *conninfo) { - WalProposerConn* conn; - PGconn* pg_conn; + WalProposerConn *conn; + PGconn *pg_conn; pg_conn = PQconnectStart(conninfo); + /* - * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the - * behavior of PQconnectStart here. + * Allocation of a PQconn can fail, and will return NULL. We want to fully + * replicate the behavior of PQconnectStart here. */ if (!pg_conn) return NULL; /* - * And in theory this allocation can fail as well, but it's incredibly unlikely if we just - * successfully allocated a PGconn. + * And in theory this allocation can fail as well, but it's incredibly + * unlikely if we just successfully allocated a PGconn. * - * palloc will exit on failure though, so there's not much we could do if it *did* fail. + * palloc will exit on failure though, so there's not much we could do if + * it *did* fail. */ conn = palloc(sizeof(WalProposerConn)); conn->pg_conn = pg_conn; - conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ conn->recvbuf = NULL; return conn; } static WalProposerConnectPollStatusType -libpqprop_connect_poll(WalProposerConn* conn) +libpqprop_connect_poll(WalProposerConn * conn) { WalProposerConnectPollStatusType return_val; @@ -134,26 +139,34 @@ libpqprop_connect_poll(WalProposerConn* conn) return_val = WP_CONN_POLLING_OK; break; - /* There's a comment at its source about this constant being unused. We'll expect it's never - * returned. */ + /* + * There's a comment at its source about this constant being + * unused. We'll expect it's never returned. 
+ */ case PGRES_POLLING_ACTIVE: elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); - /* This return is never actually reached, but it's here to make the compiler happy */ + + /* + * This return is never actually reached, but it's here to make + * the compiler happy + */ return WP_CONN_POLLING_FAILED; default: Assert(false); - return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ } return return_val; } static bool -libpqprop_send_query(WalProposerConn* conn, char* query) +libpqprop_send_query(WalProposerConn * conn, char *query) { - /* We need to be in blocking mode for sending the query to run without - * requiring a call to PQflush */ + /* + * We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush + */ if (!ensure_nonblocking_status(conn, false)) return false; @@ -165,13 +178,13 @@ libpqprop_send_query(WalProposerConn* conn, char* query) } static WalProposerExecStatusType -libpqprop_get_query_result(WalProposerConn* conn) +libpqprop_get_query_result(WalProposerConn * conn) { - PGresult* result; + PGresult *result; WalProposerExecStatusType return_val; /* Marker variable if we need to log an unexpected success result */ - char* unexpected_success = NULL; + char *unexpected_success = NULL; /* Consume any input that we might be missing */ if (!PQconsumeInput(conn->pg_conn)) @@ -182,8 +195,11 @@ libpqprop_get_query_result(WalProposerConn* conn) result = PQgetResult(conn->pg_conn); - /* PQgetResult returns NULL only if getting the result was successful & there's no more of the - * result to get. */ + + /* + * PQgetResult returns NULL only if getting the result was successful & + * there's no more of the result to get. 
+ */ if (!result) { elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); @@ -191,7 +207,7 @@ libpqprop_get_query_result(WalProposerConn* conn) } /* Helper macro to reduce boilerplate */ - #define UNEXPECTED_SUCCESS(msg) \ +#define UNEXPECTED_SUCCESS(msg) \ return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ unexpected_success = msg; \ break; @@ -199,12 +215,12 @@ libpqprop_get_query_result(WalProposerConn* conn) switch (PQresultStatus(result)) { - /* "true" success case */ + /* "true" success case */ case PGRES_COPY_BOTH: return_val = WP_EXEC_SUCCESS_COPYBOTH; break; - /* Unexpected success case */ + /* Unexpected success case */ case PGRES_EMPTY_QUERY: UNEXPECTED_SUCCESS("empty query return"); case PGRES_COMMAND_OK: @@ -220,7 +236,7 @@ libpqprop_get_query_result(WalProposerConn* conn) case PGRES_PIPELINE_SYNC: UNEXPECTED_SUCCESS("pipeline sync point"); - /* Failure cases */ + /* Failure cases */ case PGRES_BAD_RESPONSE: case PGRES_NONFATAL_ERROR: case PGRES_FATAL_ERROR: @@ -230,7 +246,7 @@ libpqprop_get_query_result(WalProposerConn* conn) default: Assert(false); - return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ } if (unexpected_success) @@ -240,19 +256,19 @@ libpqprop_get_query_result(WalProposerConn* conn) } static pgsocket -libpqprop_socket(WalProposerConn* conn) +libpqprop_socket(WalProposerConn * conn) { return PQsocket(conn->pg_conn); } static int -libpqprop_flush(WalProposerConn* conn) +libpqprop_flush(WalProposerConn * conn) { return (PQflush(conn->pg_conn)); } static void -libpqprop_finish(WalProposerConn* conn) +libpqprop_finish(WalProposerConn * conn) { if (conn->recvbuf != NULL) PQfreemem(conn->recvbuf); @@ -267,9 +283,9 @@ libpqprop_finish(WalProposerConn* conn) * to this function. 
*/ static PGAsyncReadResult -libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount) { - int result; + int result; if (conn->recvbuf != NULL) { @@ -285,12 +301,11 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) return PG_ASYNC_READ_FAIL; } - /* The docs for PQgetCopyData list the return values as: - * 0 if the copy is still in progress, but no "complete row" is - * available - * -1 if the copy is done - * -2 if an error occured - * (> 0) if it was successful; that value is the amount transferred. + /* + * The docs for PQgetCopyData list the return values as: 0 if the copy is + * still in progress, but no "complete row" is available -1 if the copy is + * done -2 if an error occured (> 0) if it was successful; that value is + * the amount transferred. * * The protocol we use between walproposer and safekeeper means that we * *usually* wouldn't expect to see that the copy is done, but this can @@ -304,25 +319,28 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) *buf = NULL; return PG_ASYNC_READ_TRY_AGAIN; case -1: - { - /* - * If we get -1, it's probably because of a server error; the - * safekeeper won't normally send a CopyDone message. - * - * We can check PQgetResult to make sure that the server failed; - * it'll always result in PGRES_FATAL_ERROR - */ - ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message. 
+ * + * We can check PQgetResult to make sure that the server + * failed; it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); - if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); - /* If there was actually an error, it'll be properly reported by - * calls to PQerrorMessage -- we don't have to do anything else */ - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } + /* + * If there was actually an error, it'll be properly reported + * by calls to PQerrorMessage -- we don't have to do anything + * else + */ + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } case -2: *amount = 0; *buf = NULL; @@ -336,23 +354,25 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) } static PGAsyncWriteResult -libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) +libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size) { - int result; + int result; /* If we aren't in non-blocking mode, switch to it. 
*/ if (!ensure_nonblocking_status(conn, true)) return PG_ASYNC_WRITE_FAIL; - /* The docs for PQputcopyData list the return values as: - * 1 if the data was queued, - * 0 if it was not queued because of full buffers, or - * -1 if an error occured + /* + * The docs for PQputcopyData list the return values as: 1 if the data was + * queued, 0 if it was not queued because of full buffers, or -1 if an + * error occured */ result = PQputCopyData(conn->pg_conn, buf, size); - /* We won't get a result of zero because walproposer always empties the - * connection's buffers before sending more */ + /* + * We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more + */ Assert(result != 0); switch (result) @@ -366,16 +386,17 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) elog(FATAL, "invalid return %d from PQputCopyData", result); } - /* After queueing the data, we still need to flush to get it to send. - * This might take multiple tries, but we don't want to wait around - * until it's done. + /* + * After queueing the data, we still need to flush to get it to send. This + * might take multiple tries, but we don't want to wait around until it's + * done. 
* - * PQflush has the following returns (directly quoting the docs): - * 0 if sucessful, - * 1 if it was unable to send all the data in the send queue yet - * -1 if it failed for some reason + * PQflush has the following returns (directly quoting the docs): 0 if + * sucessful, 1 if it was unable to send all the data in the send queue + * yet -1 if it failed for some reason */ - switch (result = PQflush(conn->pg_conn)) { + switch (result = PQflush(conn->pg_conn)) + { case 0: return PG_ASYNC_WRITE_SUCCESS; case 1: @@ -388,16 +409,18 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) } static bool -libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) +libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size) { - int result; + int result; /* If we are in non-blocking mode, switch out of it. */ if (!ensure_nonblocking_status(conn, false)) return false; - /* Ths function is very similar to libpqprop_async_write. For more - * information, refer to the comments there */ + /* + * Ths function is very similar to libpqprop_async_write. 
For more + * information, refer to the comments there + */ if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) return false; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 62d2624e56..5346680b0b 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -29,7 +29,8 @@ PG_MODULE_MAGIC; void _PG_init(void); -void _PG_init(void) +void +_PG_init(void) { pg_init_libpagestore(); pg_init_libpqwalproposer(); @@ -59,9 +60,9 @@ pg_cluster_size(PG_FUNCTION_ARGS) Datum backpressure_lsns(PG_FUNCTION_ARGS) { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; Datum values[3]; bool nulls[3]; TupleDesc tupdesc; diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 2c66bc7bf0..dad9c1b508 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -16,4 +16,4 @@ extern void pg_init_libpagestore(void); extern void pg_init_libpqwalproposer(void); extern void pg_init_walproposer(void); -#endif /* NEON_H */ +#endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 5b21abc1bd..7dc38c13fb 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -83,8 +83,8 @@ typedef struct typedef struct { ZenithRequest req; - Oid dbNode; -} ZenithDbSizeRequest; + Oid dbNode; +} ZenithDbSizeRequest; typedef struct @@ -123,12 +123,13 @@ typedef struct { ZenithMessageTag tag; int64 db_size; -} ZenithDbSizeResponse; +} ZenithDbSizeResponse; typedef struct { ZenithMessageTag tag; - char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error + * message */ } ZenithErrorResponse; extern StringInfoData zm_pack_request(ZenithRequest *msg); @@ -142,12 +143,12 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { ZenithResponse *(*request) (ZenithRequest *request); - void (*send) (ZenithRequest *request); + void (*send) (ZenithRequest *request); ZenithResponse 
*(*receive) (void); - void (*flush) (void); + void (*flush) (void); } page_server_api; -extern page_server_api *page_server; +extern page_server_api * page_server; extern char *page_server_connstring; extern char *zenith_timeline; @@ -179,7 +180,7 @@ extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber block char *buffer); extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, bool request_latest, char *buffer); extern void zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); @@ -217,7 +218,7 @@ extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); /* utils for zenith relsize cache */ extern void relsize_hash_init(void); -extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ebf899dfdb..504ae60d4a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -94,7 +94,9 @@ const int SmgrTrace = DEBUG5; page_server_api *page_server; /* GUCs */ -char *page_server_connstring; // with substituted password +char *page_server_connstring; + +//with substituted password char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; @@ -107,7 +109,7 @@ typedef enum UNLOGGED_BUILD_PHASE_1, UNLOGGED_BUILD_PHASE_2, UNLOGGED_BUILD_NOT_PERMANENT -} UnloggedBuildPhase; +} UnloggedBuildPhase; static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase unlogged_build_phase = 
UNLOGGED_BUILD_NOT_IN_PROGRESS; @@ -127,31 +129,33 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; #define MAX_PREFETCH_REQUESTS 128 -BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; -BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; -int n_prefetch_requests; -int n_prefetch_responses; -int n_prefetched_buffers; -int n_prefetch_hits; -int n_prefetch_misses; -XLogRecPtr prefetch_lsn; +BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS]; +BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS]; +int n_prefetch_requests; +int n_prefetch_responses; +int n_prefetched_buffers; +int n_prefetch_hits; +int n_prefetch_misses; +XLogRecPtr prefetch_lsn; static void consume_prefetch_responses(void) { - for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { - ZenithResponse* resp = page_server->receive(); + for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) + { + ZenithResponse *resp = page_server->receive(); + pfree(resp); } n_prefetched_buffers = 0; n_prefetch_responses = 0; } -static ZenithResponse* -page_server_request(void const* req) +static ZenithResponse * +page_server_request(void const *req) { consume_prefetch_responses(); - return page_server->request((ZenithRequest*)req); + return page_server->request((ZenithRequest *) req); } @@ -196,11 +200,11 @@ zm_pack_request(ZenithRequest *msg) { ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->dbNode); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); - break; + break; } case T_ZenithGetPageRequest: { @@ -546,21 +550,22 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, else if (lsn == InvalidXLogRecPtr) { /* - * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, - * and we can just ignore that in Zenith. 
We do need to remember the new size, - * though, so that smgrnblocks() returns the right answer after the rel has - * been extended. We rely on the relsize cache for that. + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Zenith. We do need + * to remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. We rely on the + * relsize cache for that. * - * A completely empty heap page doesn't need to be WAL-logged, either. The - * heapam can leave such a page behind, if e.g. an insert errors out after - * initializing the page, but before it has inserted the tuple and WAL-logged - * the change. When we read the page from the page server, it will come back - * as all-zeros. That's OK, the heapam will initialize an all-zeros page on - * first use. + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. * - * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies - * that the page was not WAL-logged, and its contents will be lost when it's - * evicted. + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. */ if (PageIsNew(buffer)) { @@ -691,9 +696,9 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc * Is it possible that the last-written LSN is ahead of last flush * LSN? Generally not, we shouldn't evict a page from the buffer cache * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. 
However, such case does exist at index building, - * _bt_blwritepage logs the full page without flushing WAL before - * smgrextend (files are fsynced before build ends). + * "WAL before data" rule. However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends). */ #if PG_VERSION_NUM >= 150000 flushlsn = GetFlushRecPtr(NULL); @@ -728,10 +733,12 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) switch (reln->smgr_relpersistence) { case 0: + /* - * We don't know if it's an unlogged rel stored locally, or permanent - * rel stored in the page server. First check if it exists locally. - * If it does, great. Otherwise check if it exists in the page server. + * We don't know if it's an unlogged rel stored locally, or + * permanent rel stored in the page server. First check if it + * exists locally. If it does, great. Otherwise check if it exists + * in the page server. */ if (mdexists(reln, forkNum)) return true; @@ -755,11 +762,11 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) /* * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server - * will error out if you check that, because the whole dbdir for tablespace - * 0, db 0 doesn't exists. We possibly should change the page server to - * accept that and return 'false', to be consistent with mdexists(). But - * we probably also should fix pg_table_size() to not call smgrexists() - * with bogus relfilenode. + * will error out if you check that, because the whole dbdir for + * tablespace 0, db 0 doesn't exists. We possibly should change the page + * server to accept that and return 'false', to be consistent with + * mdexists(). But we probably also should fix pg_table_size() to not call + * smgrexists() with bogus relfilenode. * * For now, handle that special case here. 
*/ @@ -880,13 +887,13 @@ void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { /* - * Might or might not exist locally, depending on whether it's - * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is - * set). Try to unlink, it won't do any harm if the file doesn't - * exist. + * Might or might not exist locally, depending on whether it's an unlogged + * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to + * unlink, it won't do any harm if the file doesn't exist. */ mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) { + if (!RelFileNodeBackendIsTemp(rnode)) + { forget_cached_relsize(rnode.node, forkNum); } } @@ -926,8 +933,9 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, /* * Check that the cluster size limit has not been exceeded. * - * Temporary and unlogged relations are not included in the cluster size measured - * by the page server, so ignore those. Autovacuum processes are also exempt. + * Temporary and unlogged relations are not included in the cluster size + * measured by the page server, so ignore those. Autovacuum processes are + * also exempt. 
*/ if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && @@ -937,10 +945,10 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", - max_cluster_size), - errhint("This limit is defined by neon.max_cluster_size GUC"))); + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), + errhint("This limit is defined by neon.max_cluster_size GUC"))); } zenith_wallog_page(reln, forkNum, blkno, buffer); @@ -987,8 +995,8 @@ void zenith_close(SMgrRelation reln, ForkNumber forknum) { /* - * Let md.c close it, if it had it open. Doesn't hurt to do this - * even for permanent relations that have no local storage. + * Let md.c close it, if it had it open. Doesn't hurt to do this even for + * permanent relations that have no local storage. */ mdclose(reln, forknum); } @@ -1079,17 +1087,18 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, * While function is defined in the zenith extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. */ -void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) +void +zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) { ZenithResponse *resp; - int i; + int i; /* - * Try to find prefetched page. - * It is assumed that pages will be requested in the same order as them are prefetched, - * but some other backend may load page in shared buffers, so some prefetch responses should - * be skipped. + * Try to find prefetched page. 
It is assumed that pages will be requested + * in the same order as them are prefetched, but some other backend may + * load page in shared buffers, so some prefetch responses should be + * skipped. */ for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) { @@ -1099,19 +1108,20 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno prefetch_responses[i].forkNum == forkNum && prefetch_responses[i].blockNum == blkno) { - char* page = ((ZenithGetPageResponse *) resp)->page; + char *page = ((ZenithGetPageResponse *) resp)->page; + /* - * Check if prefetched page is still relevant. - * If it is updated by some other backend, then it should not - * be requested from smgr unless it is evicted from shared buffers. - * In the last case last_evicted_lsn should be updated and - * request_lsn should be greater than prefetch_lsn. - * Maximum with page LSN is used because page returned by page server - * may have LSN either greater either smaller than requested. + * Check if prefetched page is still relevant. If it is updated by + * some other backend, then it should not be requested from smgr + * unless it is evicted from shared buffers. In the last case + * last_evicted_lsn should be updated and request_lsn should be + * greater than prefetch_lsn. Maximum with page LSN is used + * because page returned by page server may have LSN either + * greater either smaller than requested. 
*/ if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn) { - n_prefetched_buffers = i+1; + n_prefetched_buffers = i + 1; n_prefetch_hits += 1; n_prefetch_requests = 0; memcpy(buffer, page, BLCKSZ); @@ -1133,6 +1143,7 @@ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno .forknum = forkNum, .blkno = blkno }; + if (n_prefetch_requests > 0) { /* Combine all prefetch requests with primary request */ @@ -1471,8 +1482,8 @@ int64 zenith_dbsize(Oid dbNode) { ZenithResponse *resp; - int64 db_size; - XLogRecPtr request_lsn; + int64 db_size; + XLogRecPtr request_lsn; bool latest; RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; @@ -1564,10 +1575,12 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) XLogFlush(lsn); /* - * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, - * or update LSN for "dummy" metadata block. Second approach seems more efficient. If the relation is extended - * again later, the extension will update the last-written LSN for the extended pages, so there's no harm in - * leaving behind obsolete entries for the truncated chunks. + * Truncate may affect several chunks of relations. So we should either + * update last written LSN for all of them, or update LSN for "dummy" + * metadata block. Second approach seems more efficient. If the relation + * is extended again later, the extension will update the last-written LSN + * for the extended pages, so there's no harm in leaving behind obsolete + * entries for the truncated chunks. 
*/ SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a769a5216b..05257ced4c 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -88,8 +88,9 @@ WalProposerFunctionsType *WalProposerFunctions = NULL; static int n_safekeepers = 0; static int quorum = 0; static Safekeeper safekeeper[MAX_SAFEKEEPERS]; -static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ +static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to + * safekeepers */ static ProposerGreeting greetRequest; static VoteRequest voteRequest; /* Vote request for safekeeper */ static WaitEventSet *waitEvents; @@ -99,6 +100,7 @@ static AppendResponse quorumFeedback; * record-aligned (first record which might not yet received by someone). */ static XLogRecPtr truncateLsn; + /* * Term of the proposer. We want our term to be highest and unique, * so we collect terms from safekeepers quorum, choose max and +1. 
@@ -116,7 +118,7 @@ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; -static WalproposerShmemState *walprop_shared; +static WalproposerShmemState * walprop_shared; /* Prototypes for private functions */ static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId); @@ -138,7 +140,7 @@ static void RecvAcceptorGreeting(Safekeeper *sk); static void SendVoteRequest(Safekeeper *sk); static void RecvVoteResponse(Safekeeper *sk); static void HandleElectedProposer(void); -static term_t GetHighestTerm(TermHistory *th); +static term_t GetHighestTerm(TermHistory * th); static term_t GetEpoch(Safekeeper *sk); static void DetermineEpochStartLsn(void); static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); @@ -155,7 +157,7 @@ static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); static void HandleSafekeeperResponse(void); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); -static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); @@ -175,7 +177,8 @@ static void walproposer_shmem_request(void); #endif -void pg_init_walproposer(void) +void +pg_init_walproposer(void) { if (!process_shared_preload_libraries_in_progress) return; @@ -194,50 +197,53 @@ void pg_init_walproposer(void) WalProposerStart = &WalProposerStartImpl; } -static void nwp_register_gucs(void) +static void +nwp_register_gucs(void) { DefineCustomStringVariable( - "neon.safekeepers", - "List of Neon WAL acceptors (host:port)", - NULL, /* long_desc */ - &wal_acceptors_list, /* valueAddr */ - "", /* bootValue */ - 
PGC_POSTMASTER, - GUC_LIST_INPUT, /* extensions can't use GUC_LIST_QUOTE */ - NULL, NULL, NULL - ); + "neon.safekeepers", + "List of Neon WAL acceptors (host:port)", + NULL, /* long_desc */ + &wal_acceptors_list, /* valueAddr */ + "", /* bootValue */ + PGC_POSTMASTER, + GUC_LIST_INPUT, /* extensions can't use + * GUC_LIST_QUOTE */ + NULL, NULL, NULL + ); DefineCustomIntVariable( - "neon.safekeeper_reconnect_timeout", - "Timeout for reconnecting to offline wal acceptor.", - NULL, - &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, /* default, min, max */ - PGC_SIGHUP, /* context */ - GUC_UNIT_MS, /* flags */ - NULL, NULL, NULL - ); + "neon.safekeeper_reconnect_timeout", + "Timeout for reconnecting to offline wal acceptor.", + NULL, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, /* default, min, max */ + PGC_SIGHUP, /* context */ + GUC_UNIT_MS, /* flags */ + NULL, NULL, NULL + ); DefineCustomIntVariable( - "neon.safekeeper_connect_timeout", - "Timeout after which give up connection attempt to safekeeper.", - NULL, - &wal_acceptor_connect_timeout, - 5000, 0, INT_MAX, - PGC_SIGHUP, - GUC_UNIT_MS, - NULL, NULL, NULL - ); + "neon.safekeeper_connect_timeout", + "Timeout after which give up connection attempt to safekeeper.", + NULL, + &wal_acceptor_connect_timeout, + 5000, 0, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MS, + NULL, NULL, NULL + ); } /* shmem handling */ -static void nwp_prepare_shmem(void) +static void +nwp_prepare_shmem(void) { #if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = walproposer_shmem_request; + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = walproposer_shmem_request; #else RequestAddinShmemSpace(WalproposerShmemSize()); #endif @@ -260,7 +266,8 @@ walproposer_shmem_request(void) } #endif -static void nwp_shmem_startup_hook(void) +static void +nwp_shmem_startup_hook(void) { if (prev_shmem_startup_hook_type) prev_shmem_startup_hook_type(); @@ -275,7 +282,7 @@ void 
WalProposerMain(Datum main_arg) { #if PG_VERSION_NUM >= 150000 - TimeLineID tli; + TimeLineID tli; #endif /* Establish signal handlers. */ @@ -286,7 +293,7 @@ WalProposerMain(Datum main_arg) BackgroundWorkerUnblockSignals(); #if PG_VERSION_NUM >= 150000 - // FIXME pass proper tli to WalProposerInit ? + /* FIXME pass proper tli to WalProposerInit ? */ GetXLogReplayRecPtr(&tli); WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier()); #else @@ -339,7 +346,7 @@ WalProposerPoll(void) { while (true) { - Safekeeper *sk; + Safekeeper *sk; int rc; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); @@ -356,8 +363,8 @@ WalProposerPoll(void) AdvancePollState(sk, event.events); /* - * If the timeout expired, attempt to reconnect to any safekeepers that - * we dropped + * If the timeout expired, attempt to reconnect to any safekeepers + * that we dropped */ ReconnectSafekeepers(); @@ -371,7 +378,7 @@ WalProposerPoll(void) ResetLatch(MyLatch); break; } - if (rc == 0) /* timeout expired: poll state */ + if (rc == 0) /* timeout expired: poll state */ { TimestampTz now; @@ -390,12 +397,12 @@ WalProposerPoll(void) now = GetCurrentTimestamp(); for (int i = 0; i < n_safekeepers; i++) { - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; if ((sk->state == SS_CONNECTING_WRITE || - sk->state == SS_CONNECTING_READ) && + sk->state == SS_CONNECTING_READ) && TimestampDifferenceExceeds(sk->startedConnAt, now, - wal_acceptor_connect_timeout)) + wal_acceptor_connect_timeout)) { elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", sk->host, sk->port, wal_acceptor_connect_timeout); @@ -472,7 +479,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) */ safekeeper[n_safekeepers].conninfo[0] = '\0'; initStringInfo(&safekeeper[n_safekeepers].outbuf); - safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); + 
safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); if (safekeeper[n_safekeepers].xlogreader == NULL) elog(FATAL, "Failed to allocate xlog reader"); safekeeper[n_safekeepers].flushWrite = false; @@ -504,7 +511,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); #if PG_VERSION_NUM >= 150000 -// FIXME don't use hardcoded timeline id +/* FIXME don't use hardcoded timeline id */ greetRequest.timeline = 1; #else greetRequest.timeline = ThisTimeLineID; @@ -589,7 +596,7 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove) for (int i = 0; i < n_safekeepers; i++) { uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; sk->eventPos = -1; @@ -647,12 +654,21 @@ ResetConnection(Safekeeper *sk) */ if (sk->conninfo[0] == '\0') { - int written = 0; + int written = 0; + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); - // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, - // so it is better to be defensive and check that everything aligns well + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + + /* + * currently connection string is not that long, but once we pass + * something like jwt we might overflow the buffer, + */ + + /* + * so it is better to be defensive and check that everything aligns + * well + */ if (written > MAXCONNINFO || written < 0) elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } @@ -762,8 +778,8 @@ static void 
AdvancePollState(Safekeeper *sk, uint32 events) { /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. + * Sanity check. We assume further down that the operations don't block + * because the socket is ready. */ AssertEventsOkForState(events, sk); @@ -777,12 +793,12 @@ AdvancePollState(Safekeeper *sk, uint32 events) case SS_OFFLINE: elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", sk->host, sk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ /* - * Both connecting states run the same logic. The only - * difference is the events they're expecting + * Both connecting states run the same logic. The only difference + * is the events they're expecting */ case SS_CONNECTING_READ: case SS_CONNECTING_WRITE: @@ -797,20 +813,22 @@ AdvancePollState(Safekeeper *sk, uint32 events) break; /* - * Finish handshake comms: receive information about the safekeeper. + * Finish handshake comms: receive information about the + * safekeeper. */ case SS_HANDSHAKE_RECV: RecvAcceptorGreeting(sk); break; /* - * Voting is an idle state - we don't expect any events to trigger. - * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are - * transferred from SS_VOTING to sending actual vote requests. + * Voting is an idle state - we don't expect any events to + * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see how + * nodes are transferred from SS_VOTING to sending actual vote + * requests. */ case SS_VOTING: elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk->state)); ResetConnection(sk); return; @@ -824,8 +842,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) /* * AsyncFlush ensures we only move on to SS_ACTIVE once the flush - * completes. 
If we still have more to do, we'll wait until the next - * poll comes along. + * completes. If we still have more to do, we'll wait until the + * next poll comes along. */ if (!AsyncFlush(sk)) return; @@ -839,7 +857,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_IDLE: elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk->state)); ResetConnection(sk); return; @@ -864,19 +882,17 @@ HandleConnectionEvent(Safekeeper *sk) { case WP_CONN_POLLING_OK: elog(LOG, "connected with node %s:%s", sk->host, - sk->port); + sk->port); /* - * We have to pick some event to update event set. - * We'll eventually need the socket to be readable, - * so we go with that. + * We have to pick some event to update event set. We'll + * eventually need the socket to be readable, so we go with that. */ new_events = WL_SOCKET_READABLE; break; /* - * If we need to poll to finish connecting, - * continue doing that + * If we need to poll to finish connecting, continue doing that */ case WP_CONN_POLLING_READING: sk->state = SS_CONNECTING_READ; @@ -889,13 +905,12 @@ HandleConnectionEvent(Safekeeper *sk) case WP_CONN_POLLING_FAILED: elog(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to + * If connecting failed, we don't want to restart the connection + * because that might run us into a loop. Instead, shut it down -- + * it'll naturally restart at a slower interval on calls to * ReconnectSafekeepers. 
*/ ShutdownConnection(sk); @@ -903,9 +918,8 @@ HandleConnectionEvent(Safekeeper *sk) } /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. + * Because PQconnectPoll can change the socket, we have to un-register the + * old event and re-register an event on the new socket. */ HackyRemoveWalProposerEvent(sk); sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); @@ -926,7 +940,7 @@ SendStartWALPush(Safekeeper *sk) if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); ShutdownConnection(sk); return; } @@ -940,8 +954,7 @@ RecvStartWALPushResult(Safekeeper *sk) switch (walprop_get_query_result(sk->conn)) { /* - * Successful result, move on to starting the - * handshake + * Successful result, move on to starting the handshake */ case WP_EXEC_SUCCESS_COPYBOTH: @@ -949,31 +962,31 @@ RecvStartWALPushResult(Safekeeper *sk) break; /* - * Needs repeated calls to finish. Wait until the - * socket is readable + * Needs repeated calls to finish. 
Wait until the socket is + * readable */ case WP_EXEC_NEEDS_INPUT: /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set + * SS_WAIT_EXEC_RESULT is always reached through an event, so we + * don't need to update the event set */ break; case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); ShutdownConnection(sk); return; /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" + * Unexpected result -- funamdentally an error, but we want to + * produce a custom message, rather than a generic "something went + * wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -988,8 +1001,8 @@ static void SendProposerGreeting(Safekeeper *sk) { /* - * On failure, logging & resetting the connection is handled. - * We just need to handle the control flow. + * On failure, logging & resetting the connection is handled. We just need + * to handle the control flow. */ BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); } @@ -998,12 +1011,12 @@ static void RecvAcceptorGreeting(Safekeeper *sk) { /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. + * If our reading doesn't immediately succeed, any necessary error + * handling or state setting is taken care of. We can leave any other work + * until later. 
*/ sk->greetResponse.apm.tag = 'g'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse)) return; /* Protocol is all good, move to voting. */ @@ -1033,37 +1046,34 @@ RecvAcceptorGreeting(Safekeeper *sk) { /* Another compute with higher term is running. */ elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, propTerm); + sk->host, sk->port, + sk->greetResponse.term, propTerm); } /* - * Check if we have quorum. If there aren't enough safekeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. + * Check if we have quorum. If there aren't enough safekeepers, wait and + * do nothing. We'll eventually get a task when the election starts. * * If we do have quorum, we can start an election. */ if (n_connected < quorum) { /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. + * SS_VOTING is an idle state; read-ready indicates the connection + * closed. */ UpdateEventSet(sk, WL_SOCKET_READABLE); } else { /* - * Now send voting request to the cohort and wait - * responses + * Now send voting request to the cohort and wait responses */ for (int j = 0; j < n_safekeepers; j++) { /* * Remember: SS_VOTING indicates that the safekeeper is - * participating in voting, but hasn't sent anything - * yet. + * participating in voting, but hasn't sent anything yet. 
*/ if (safekeeper[j].state == SS_VOTING) SendVoteRequest(&safekeeper[j]); @@ -1087,28 +1097,27 @@ static void RecvVoteResponse(Safekeeper *sk) { sk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->voteResponse)) return; elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. + * In case of acceptor rejecting our vote, bail out, but only if either it + * already lives in strictly higher term (concurrent compute spotted) or + * we are not elected yet and thus need the vote. 
*/ if ((!sk->voteResponse.voteGiven) && (sk->voteResponse.term > propTerm || n_votes < quorum)) { elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, propTerm); + sk->host, sk->port, + sk->voteResponse.term, propTerm); } Assert(sk->voteResponse.term == propTerm); @@ -1116,7 +1125,7 @@ RecvVoteResponse(Safekeeper *sk) n_votes++; if (n_votes < quorum) { - sk->state = SS_IDLE; /* can't do much yet, no quorum */ + sk->state = SS_IDLE; /* can't do much yet, no quorum */ } else if (n_votes > quorum) { @@ -1146,16 +1155,16 @@ HandleElectedProposer(void) DetermineEpochStartLsn(); /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them + * Check if not all safekeepers are up-to-date, we need to download WAL + * needed to synchronize them */ if (truncateLsn < propEpochStartLsn) { elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); @@ -1175,18 +1184,17 @@ HandleElectedProposer(void) /* * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. + * after this point. There will be no safekeeper with state SS_IDLE also, + * because that state is used only for quorum waiting. 
*/ if (syncSafekeepers) { /* - * Send empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. + * Send empty message to enforce receiving feedback even from nodes + * who are fully recovered; this is required to learn they switched + * epoch which finishes sync-safeekepers who doesn't generate any real + * new records. Will go away once we switch to async acks. */ BroadcastAppendRequest(); @@ -1200,7 +1208,7 @@ HandleElectedProposer(void) /* latest term in TermHistory, or 0 is there is no entries */ static term_t -GetHighestTerm(TermHistory *th) +GetHighestTerm(TermHistory * th) { return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; } @@ -1276,8 +1284,8 @@ DetermineEpochStartLsn(void) } /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was - * committed yet. Start streaming then from the basebackup LSN. + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing + * was committed yet. Start streaming then from the basebackup LSN. */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { @@ -1322,24 +1330,24 @@ DetermineEpochStartLsn(void) ); /* - * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since - * which we are going to write according to the consensus. If not, we must - * bail out, as clog and other non rel data is inconsistent. + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN + * since which we are going to write according to the consensus. If not, + * we must bail out, as clog and other non rel data is inconsistent. */ if (!syncSafekeepers) { /* - * Basebackup LSN always points to the beginning of the record (not the - * page), as StartupXLOG most probably wants it this way. 
Safekeepers - * don't skip header as they need continious stream of data, so - * correct LSN for comparison. + * Basebackup LSN always points to the beginning of the record (not + * the page), as StartupXLOG most probably wants it this way. + * Safekeepers don't skip header as they need continious stream of + * data, so correct LSN for comparison. */ if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) { /* - * However, allow to proceed if previously elected leader was me; plain - * restart of walproposer not intervened by concurrent compute (who could - * generate WAL) is ok. + * However, allow to proceed if previously elected leader was me; + * plain restart of walproposer not intervened by concurrent + * compute (who could generate WAL) is ok. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == walprop_shared->mineLastElectedTerm))) @@ -1407,7 +1415,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec { Assert(buf[0] == 'w' || buf[0] == 'k'); if (buf[0] == 'k') - continue; /* keepalive */ + continue; /* keepalive */ memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); rec_start_lsn = pg_ntoh64(rec_start_lsn); @@ -1457,18 +1465,20 @@ SendProposerElected(Safekeeper *sk) { ProposerElected msg; TermHistory *th; - term_t lastCommonTerm; - int i; + term_t lastCommonTerm; + int i; /* - * Determine start LSN by comparing safekeeper's log term switch history and - * proposer's, searching for the divergence point. + * Determine start LSN by comparing safekeeper's log term switch history + * and proposer's, searching for the divergence point. * * Note: there is a vanishingly small chance of no common point even if * there is some WAL on safekeeper, if immediately after bootstrap compute - * wrote some WAL on single sk and died; we stream since the beginning then. + * wrote some WAL on single sk and died; we stream since the beginning + * then. 
*/ th = &sk->voteResponse.termHistory; + /* * If any WAL is present on the sk, it must be authorized by some term. * OTOH, without any WAL there are no term swiches in the log. @@ -1485,7 +1495,7 @@ SendProposerElected(Safekeeper *sk) /* term must begin everywhere at the same point */ Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); } - i--; /* step back to the last common term */ + i--; /* step back to the last common term */ if (i < 0) { /* safekeeper is empty or no common point, start from the beginning */ @@ -1500,17 +1510,17 @@ SendProposerElected(Safekeeper *sk) * to the truncateLsn before, but now current safekeeper tells * otherwise. * - * Also we have a special condition here, which is empty safekeeper - * with no history. In combination with a gap, that can happen when - * we introduce a new safekeeper to the cluster. This is a rare case, - * which is triggered manually for now, and should be treated with - * care. + * Also we have a special condition here, which is empty + * safekeeper with no history. In combination with a gap, that can + * happen when we introduce a new safekeeper to the cluster. This + * is a rare case, which is triggered manually for now, and should + * be treated with care. */ /* - * truncateLsn will not change without ack from current safekeeper, - * and it's aligned to the WAL record, so we can safely start - * streaming from this point. + * truncateLsn will not change without ack from current + * safekeeper, and it's aligned to the WAL record, so we can + * safely start streaming from this point. */ sk->startStreamingAt = truncateLsn; @@ -1533,9 +1543,10 @@ SendProposerElected(Safekeeper *sk) } else { - XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - sk->voteResponse.flushLsn); + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? 
th->entries[i + 1].lsn : + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); } } @@ -1595,8 +1606,8 @@ static void StartStreaming(Safekeeper *sk) { /* - * This is the only entrypoint to state SS_ACTIVE. It's executed - * exactly once for a connection. + * This is the only entrypoint to state SS_ACTIVE. It's executed exactly + * once for a connection. */ sk->state = SS_ACTIVE; sk->streamingAt = sk->startStreamingAt; @@ -1617,7 +1628,10 @@ SendMessageToNode(Safekeeper *sk) { Assert(sk->state == SS_ACTIVE); - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + /* + * Note: we always send everything to the safekeeper until WOULDBLOCK or + * nothing left to send + */ HandleActiveState(sk, WL_SOCKET_WRITEABLE); } @@ -1633,7 +1647,7 @@ BroadcastAppendRequest() } static void -PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +PrepareAppendRequest(AppendRequestHeader * req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); req->tag = 'a'; @@ -1652,7 +1666,7 @@ PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr e static void HandleActiveState(Safekeeper *sk, uint32 events) { - uint32 newEvents = WL_SOCKET_READABLE; + uint32 newEvents = WL_SOCKET_READABLE; if (events & WL_SOCKET_WRITEABLE) if (!SendAppendRequests(sk)) @@ -1666,10 +1680,10 @@ HandleActiveState(Safekeeper *sk, uint32 events) * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. * - * LSN comparison checks if we have pending unsent messages. This check isn't - * necessary now, because we always send append messages immediately after - * arrival. But it's good to have it here in case we change this behavior - * in the future. + * LSN comparison checks if we have pending unsent messages. This check + * isn't necessary now, because we always send append messages immediately + * after arrival. 
But it's good to have it here in case we change this + * behavior in the future. */ if (sk->streamingAt != availableLsn || sk->flushWrite) newEvents |= WL_SOCKET_WRITEABLE; @@ -1689,15 +1703,16 @@ HandleActiveState(Safekeeper *sk, uint32 events) static bool SendAppendRequests(Safekeeper *sk) { - XLogRecPtr endLsn; + XLogRecPtr endLsn; AppendRequestHeader *req; PGAsyncWriteResult writeResult; WALReadError errinfo; - bool sentAnything = false; + bool sentAnything = false; if (sk->flushWrite) { if (!AsyncFlush(sk)) + /* * AsyncFlush failed, that could happen if the socket is closed or * we have nothing to write and should wait for writeable socket. @@ -1716,7 +1731,8 @@ SendAppendRequests(Safekeeper *sk) endLsn += MAX_SEND_SIZE; /* if we went beyond available WAL, back off */ - if (endLsn > availableLsn) { + if (endLsn > availableLsn) + { endLsn = availableLsn; } @@ -1734,21 +1750,21 @@ SendAppendRequests(Safekeeper *sk) resetStringInfo(&sk->outbuf); /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); /* write the WAL itself */ enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); if (!WALRead(sk->xlogreader, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn, - #if PG_VERSION_NUM >= 150000 - // FIXME don't use hardcoded timelineid here - 1, - #else - ThisTimeLineID, - #endif - &errinfo)) + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, +#if PG_VERSION_NUM >= 150000 + /* FIXME don't use hardcoded timelineid here */ + 1, +#else + ThisTimeLineID, +#endif + &errinfo)) { WALReadRaiseError(&errinfo); } @@ -1766,17 +1782,19 @@ SendAppendRequests(Safekeeper *sk) break; case PG_ASYNC_WRITE_TRY_FLUSH: + /* * We still need to call PQflush some more to finish the job. - * Caller function will handle this by setting right event set. 
+ * Caller function will handle this by setting right event + * set. */ sk->flushWrite = true; return true; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); ShutdownConnection(sk); return false; default: @@ -1800,17 +1818,17 @@ static bool RecvAppendResponses(Safekeeper *sk) { XLogRecPtr minQuorumLsn; - bool readAnything = false; + bool readAnything = false; while (true) { /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. + * If our reading doesn't immediately succeed, any necessary error + * handling or state setting is taken care of. We can leave any other + * work until later. */ sk->appendResponse.apm.tag = 'a'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->appendResponse)) break; ereport(DEBUG2, @@ -1824,8 +1842,8 @@ RecvAppendResponses(Safekeeper *sk) { /* Another compute with higher term is running. 
*/ elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, propTerm); + sk->host, sk->port, + sk->appendResponse.term, propTerm); } readAnything = true; @@ -1851,11 +1869,11 @@ RecvAppendResponses(Safekeeper *sk) /* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ void -ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf) { - uint8 nkeys; - int i; - int32 len; + uint8 nkeys; + int i; + int32 len; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1863,54 +1881,65 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *r for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); } else if (strcmp(key, "ps_flushlsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length 
rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); } else if (strcmp(key, "ps_applylsn") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length + pq_getmsgint(reply_message, sizeof(int32)); + //read value length rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); } else if (strcmp(key, "ps_replytime") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_replytime = pq_getmsgint64(reply_message); + pq_getmsgint(reply_message, sizeof(int32)); + //read value length + rf->ps_replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); + rf->ps_replytime, replyTimeStr); pfree(replyTimeStr); } } else { - len = pq_getmsgint(reply_message, sizeof(int32)); // read value length - // Skip unknown keys to support backward compatibile protocol changes - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + len = pq_getmsgint(reply_message, sizeof(int32)); + //read value length + + /* + * Skip unknown keys to support backward compatibile protocol + * changes + */ + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1952,9 +1981,10 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) static XLogRecPtr CalculateMinFlushLsn(void) { - XLogRecPtr lsn = n_safekeepers > 0 - ? 
safekeeper[0].appendResponse.flushLsn - : InvalidXLogRecPtr; + XLogRecPtr lsn = n_safekeepers > 0 + ? safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + for (int i = 1; i < n_safekeepers; i++) { lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); @@ -2006,8 +2036,8 @@ WalproposerShmemInit(void) LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); walprop_shared = ShmemInitStruct("Walproposer shared state", - sizeof(WalproposerShmemState), - &found); + sizeof(WalproposerShmemState), + &found); if (!found) { @@ -2021,7 +2051,7 @@ WalproposerShmemInit(void) } void -replication_feedback_set(ReplicationFeedback *rf) +replication_feedback_set(ReplicationFeedback * rf) { SpinLockAcquire(&walprop_shared->mutex); memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); @@ -2044,10 +2074,11 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe * Get ReplicationFeedback fields from the most advanced safekeeper */ static void -GetLatestZentihFeedback(ReplicationFeedback *rf) +GetLatestZentihFeedback(ReplicationFeedback * rf) { - int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) { if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) @@ -2064,12 +2095,12 @@ GetLatestZentihFeedback(ReplicationFeedback *rf) rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + 
LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); replication_feedback_set(rf); } @@ -2080,7 +2111,7 @@ HandleSafekeeperResponse(void) HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; XLogRecPtr diskConsistentLsn; - XLogRecPtr minFlushLsn; + XLogRecPtr minFlushLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); @@ -2088,7 +2119,7 @@ HandleSafekeeperResponse(void) if (!syncSafekeepers) { - // Get ReplicationFeedback fields from the most advanced safekeeper + /* Get ReplicationFeedback fields from the most advanced safekeeper */ GetLatestZentihFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } @@ -2102,11 +2133,15 @@ HandleSafekeeperResponse(void) /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( - // write_lsn - This is what durably stored in WAL service. + /* write_lsn - This is what durably stored in WAL service. */ quorumFeedback.flushLsn, - //flush_lsn - This is what durably stored in WAL service. + /* flush_lsn - This is what durably stored in WAL service. */ quorumFeedback.flushLsn, - //apply_lsn - This is what processed and durably saved at pageserver. + + /* + * apply_lsn - This is what processed and durably saved at + * pageserver. + */ quorumFeedback.rf.ps_flushlsn, GetCurrentTimestamp(), false); } @@ -2128,15 +2163,14 @@ HandleSafekeeperResponse(void) * flushed to all safekeepers. We must always start streaming from the * beginning of the record, which simplifies decoding on the far end. * - * Advanced truncateLsn should be not further than nearest commitLsn. 
- * This prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on record boundaries. + * Advanced truncateLsn should be not further than nearest commitLsn. This + * prevents surprising violation of truncateLsn <= commitLsn invariant + * which might occur because 1) truncateLsn can be advanced immediately + * once chunk is broadcast to all safekeepers, and commitLsn generally + * can't be advanced based on feedback from safekeeper who is still in the + * previous epoch (similar to 'leader can't commit entries from previous + * term' in Raft); 2) chunks we read from WAL and send are plain sheets of + * bytes, but safekeepers ack only on record boundaries. */ minFlushLsn = CalculateMinFlushLsn(); if (minFlushLsn > truncateLsn) @@ -2144,8 +2178,8 @@ HandleSafekeeperResponse(void) truncateLsn = minFlushLsn; /* - * Advance the replication slot to free up old WAL files. Note - * that slot doesn't exist if we are in syncSafekeepers mode. + * Advance the replication slot to free up old WAL files. Note that + * slot doesn't exist if we are in syncSafekeepers mode. 
*/ if (MyReplicationSlot) PhysicalConfirmReceivedLocation(truncateLsn); @@ -2170,7 +2204,7 @@ HandleSafekeeperResponse(void) n_synced = 0; for (int i = 0; i < n_safekeepers; i++) { - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &safekeeper[i]; bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; /* alive safekeeper which is not synced yet; wait for it */ @@ -2225,11 +2259,11 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * failed, a warning is emitted and the connection is reset. */ static bool -AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) { - char *buf; - int buf_size; - uint64 tag; + char *buf; + int buf_size; + uint64 tag; StringInfoData s; if (!(AsyncRead(sk, &buf, &buf_size))) @@ -2252,54 +2286,56 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) switch (tag) { case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + 
msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); - pq_getmsgend(&s); - return true; - } + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } default: - { - Assert(false); - return false; - } + { + Assert(false); + return false; + } } } @@ -2367,7 +2403,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta ShutdownConnection(sk); return false; default: - Assert(false); + Assert(false); return false; } } @@ -2409,19 +2445,19 @@ AsyncFlush(Safekeeper *sk) } } -// Check if we need to suspend inserts because of lagging replication. +/* Check if we need to suspend inserts because of lagging replication. 
*/ static uint64 backpressure_lag_impl(void) { if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; #if PG_VERSION_NUM >= 150000 - XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); + XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); #else - XLogRecPtr myFlushLsn = GetFlushRecPtr(); + XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024*1024) @@ -2434,23 +2470,23 @@ backpressure_lag_impl(void) if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag*MB)) + && myFlushLsn > writePtr + max_replication_write_lag * MB)) { - return (myFlushLsn - writePtr - max_replication_write_lag*MB); + return (myFlushLsn - writePtr - max_replication_write_lag * MB); } if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) { - return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); } if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) { - return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); } } return 0; @@ -2458,24 +2494,26 @@ backpressure_lag_impl(void) #define BACK_PRESSURE_DELAY 10000L // 0.01 sec -static bool backpressure_throttling_impl(void) +static bool +backpressure_throttling_impl(void) { - int64 lag; - TimestampTz start, stop; - bool retry = PrevProcessInterruptsCallback - ? 
PrevProcessInterruptsCallback() - : false; + int64 lag; + TimestampTz start, + stop; + bool retry = PrevProcessInterruptsCallback + ? PrevProcessInterruptsCallback() + : false; - // Don't throttle read only transactions and wal sender. + /* Don't throttle read only transactions and wal sender. */ if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) return retry; - // Calculate replicas lag + /* Calculate replicas lag */ lag = backpressure_lag_impl(); if (lag == 0) return retry; - // Suspend writers until replicas catch up + /* Suspend writers until replicas catch up */ set_ps_display("backpressure throttling"); elog(DEBUG2, "backpressure throttling: lag %lu", lag); diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 75167163f3..59e70f33bf 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -14,10 +14,13 @@ #define SK_PROTOCOL_VERSION 2 #define MAX_SAFEKEEPERS 32 -#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ -#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ -#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single + * WAL message */ +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender + * message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender + * message header */ /* * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, @@ -25,12 +28,12 @@ */ #define WL_NO_EVENTS 0 -extern char* wal_acceptors_list; -extern int wal_acceptor_reconnect_timeout; -extern int wal_acceptor_connect_timeout; -extern bool am_wal_proposer; +extern char *wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int 
wal_acceptor_connect_timeout; +extern bool am_wal_proposer; -struct WalProposerConn; /* Defined in libpqwalproposer */ +struct WalProposerConn; /* Defined in libpqwalproposer */ typedef struct WalProposerConn WalProposerConn; struct WalMessage; @@ -44,21 +47,26 @@ typedef enum { /* The full read was successful. buf now points to the data */ PG_ASYNC_READ_SUCCESS, - /* The read is ongoing. Wait until the connection is read-ready, then try - * again. */ + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ PG_ASYNC_READ_TRY_AGAIN, /* Reading failed. Check PQerrorMessage(conn) */ PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; +} PGAsyncReadResult; /* Possible return values from WritePGAsync */ typedef enum { /* The write fully completed */ PG_ASYNC_WRITE_SUCCESS, - /* The write started, but you'll need to call PQflush some more times - * to finish it off. We just tried, so it's best to wait until the - * connection is read- or write-ready to try again. + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. * * If it becomes read-ready, call PQconsumeInput and flush again. If it * becomes write-ready, just call PQflush. @@ -66,7 +74,7 @@ typedef enum PG_ASYNC_WRITE_TRY_FLUSH, /* Writing failed. Check PQerrorMessage(conn) */ PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; +} PGAsyncWriteResult; /* * WAL safekeeper state, which is used to wait for some event. @@ -79,8 +87,8 @@ typedef enum typedef enum { /* - * Does not have an active connection and will stay that way until - * further notice. + * Does not have an active connection and will stay that way until further + * notice. * * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. */ @@ -105,8 +113,8 @@ typedef enum SS_WAIT_EXEC_RESULT, /* - * Executing the receiving half of the handshake. After receiving, moves to - * SS_VOTING. 
+ * Executing the receiving half of the handshake. After receiving, moves + * to SS_VOTING. */ SS_HANDSHAKE_RECV, @@ -120,8 +128,9 @@ typedef enum SS_VOTING, /* - * Already sent voting information, waiting to receive confirmation from the - * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. + * Already sent voting information, waiting to receive confirmation from + * the node. After receiving, moves to SS_IDLE, if the quorum isn't + * reached yet. */ SS_WAIT_VERDICT, @@ -141,7 +150,7 @@ typedef enum * to read. */ SS_ACTIVE, -} SafekeeperState; +} SafekeeperState; /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -156,21 +165,21 @@ typedef uint64 NNodeId; /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting { - uint64 tag; /* message tag */ - uint32 protocolVersion; /* proposer-safekeeper protocol version */ - uint32 pgVersion; - pg_uuid_t proposerId; - uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - uint8 ztenantid[16]; - TimeLineID timeline; - uint32 walSegSize; -} ProposerGreeting; + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; typedef struct AcceptorProposerMessage { - uint64 tag; -} AcceptorProposerMessage; + uint64 tag; +} AcceptorProposerMessage; /* * Acceptor -> Proposer initial response: the highest term acceptor voted for. @@ -180,7 +189,7 @@ typedef struct AcceptorGreeting AcceptorProposerMessage apm; term_t term; NNodeId nodeId; -} AcceptorGreeting; +} AcceptorGreeting; /* * Proposer -> Acceptor vote request. 
@@ -189,36 +198,39 @@ typedef struct VoteRequest { uint64 tag; term_t term; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; /* Element of term switching chain. */ typedef struct TermSwitchEntry { - term_t term; - XLogRecPtr lsn; -} TermSwitchEntry; + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; typedef struct TermHistory { - uint32 n_entries; + uint32 n_entries; TermSwitchEntry *entries; -} TermHistory; +} TermHistory; /* Vote itself, sent from safekeeper to proposer */ -typedef struct VoteResponse { +typedef struct VoteResponse +{ AcceptorProposerMessage apm; - term_t term; - uint64 voteGiven; + term_t term; + uint64 voteGiven; + /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow - * proposer to choose the most advanced one. + * proposer to choose the most advanced one. */ - XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ + XLogRecPtr flushLsn; + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for + * recovery of some safekeeper */ TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -} VoteResponse; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +} VoteResponse; /* * Proposer -> Acceptor message announcing proposer is elected and communicating @@ -226,60 +238,62 @@ typedef struct VoteResponse { */ typedef struct ProposerElected { - uint64 tag; - term_t term; + uint64 tag; + term_t term; /* proposer will send since this point */ - XLogRecPtr startStreamingAt; + XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; -} ProposerElected; + XLogRecPtr timelineStartLsn; +} ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. 
*/ typedef struct AppendRequestHeader { - uint64 tag; - term_t term; /* term of the proposer */ + uint64 tag; + term_t term; /* term of the proposer */ + /* * LSN since which current proposer appends WAL (begin_lsn of its first * record); determines epoch switch point. */ - XLogRecPtr epochStartLsn; - XLogRecPtr beginLsn; /* start position of message in WAL */ - XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + XLogRecPtr epochStartLsn; + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + /* - * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last chunk streamed to everyone) + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) */ - XLogRecPtr truncateLsn; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; /* * Hot standby feedback received from replica */ typedef struct HotStandbyFeedback { - TimestampTz ts; + TimestampTz ts; FullTransactionId xmin; FullTransactionId catalog_xmin; -} HotStandbyFeedback; +} HotStandbyFeedback; -typedef struct ReplicationFeedback +typedef struct ReplicationFeedback { - // current size of the timeline on pageserver - uint64 currentClusterSize; - // standby_status_update fields that safekeeper received from pageserver - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; + /* current size of the timeline on pageserver */ + uint64 currentClusterSize; + /* standby_status_update fields that safekeeper received from pageserver */ + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; TimestampTz ps_replytime; -} ReplicationFeedback; +} 
ReplicationFeedback; typedef struct WalproposerShmemState @@ -288,7 +302,7 @@ typedef struct WalproposerShmemState ReplicationFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; -} WalproposerShmemState; +} WalproposerShmemState; /* * Report safekeeper state to proposer @@ -296,25 +310,26 @@ typedef struct WalproposerShmemState typedef struct AppendResponse { AcceptorProposerMessage apm; + /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. */ - term_t term; - // TODO: add comment - XLogRecPtr flushLsn; - // Safekeeper reports back his awareness about which WAL is committed, as - // this is a criterion for walproposer --sync mode exit - XLogRecPtr commitLsn; + term_t term; + /* TODO: add comment */ + XLogRecPtr flushLsn; + /* Safekeeper reports back his awareness about which WAL is committed, as */ + /* this is a criterion for walproposer --sync mode exit */ + XLogRecPtr commitLsn; HotStandbyFeedback hs; - // Feedback recieved from pageserver includes standby_status_update fields - // and custom zenith feedback. - // This part of the message is extensible. + /* Feedback recieved from pageserver includes standby_status_update fields */ + /* and custom zenith feedback. */ + /* This part of the message is extensible. 
*/ ReplicationFeedback rf; -} AppendResponse; +} AppendResponse; -// ReplicationFeedback is extensible part of the message that is parsed separately -// Other fields are fixed part +/* ReplicationFeedback is extensible part of the message that is parsed separately */ +/* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) @@ -323,9 +338,10 @@ typedef struct AppendResponse */ typedef struct Safekeeper { - char const* host; - char const* port; - char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ + char const *host; + char const *port; + char conninfo[MAXCONNINFO]; /* connection info for + * connecting/reconnecting */ /* * postgres protocol connection to the WAL acceptor @@ -333,46 +349,50 @@ typedef struct Safekeeper * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we * reach SS_ACTIVE; not before. */ - WalProposerConn* conn; + WalProposerConn *conn; + /* * Temporary buffer for the message being sent to the safekeeper. */ StringInfoData outbuf; + /* * WAL reader, allocated for each safekeeper. */ - XLogReaderState* xlogreader; + XLogReaderState *xlogreader; /* * Streaming will start here; must be record boundary. */ - XLogRecPtr startStreamingAt; + XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ - XLogRecPtr streamingAt; /* current streaming position */ - AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + bool flushWrite; /* set to true if we need to call AsyncFlush, + * to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ - int eventPos; /* position in wait event set. 
Equal to -1 if no event */ - SafekeeperState state; /* safekeeper state machine state */ - TimestampTz startedConnAt; /* when connection attempt started */ - AcceptorGreeting greetResponse; /* acceptor greeting */ - VoteResponse voteResponse; /* the vote */ - AppendResponse appendResponse; /* feedback for master */ + int eventPos; /* position in wait event set. Equal to -1 if + * no event */ + SafekeeperState state; /* safekeeper state machine state */ + TimestampTz startedConnAt; /* when connection attempt started */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ } Safekeeper; extern PGDLLIMPORT void WalProposerMain(Datum main_arg); -void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); -void WalProposerPoll(void); -void WalProposerRegister(void); -void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); +void WalProposerPoll(void); +void WalProposerRegister(void); +void ParseReplicationFeedbackMessage(StringInfo reply_message, + ReplicationFeedback * rf); extern void StartProposerReplication(StartReplicationCmd *cmd); -Size WalproposerShmemSize(void); -bool WalproposerShmemInit(void); -void replication_feedback_set(ReplicationFeedback *rf); -void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); +void replication_feedback_set(ReplicationFeedback * rf); +void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ @@ -383,29 +403,37 @@ typedef enum WP_CONN_POLLING_READING, WP_CONN_POLLING_WRITING, WP_CONN_POLLING_OK, + /* * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. * We've removed it here to avoid clutter. 
*/ -} WalProposerConnectPollStatusType; +} WalProposerConnectPollStatusType; /* Re-exported and modified ExecStatusType */ typedef enum { /* We received a single CopyBoth result */ WP_EXEC_SUCCESS_COPYBOTH, - /* Any success result other than a single CopyBoth was received. The specifics of the result - * were already logged, but it may be useful to provide an error message indicating which - * safekeeper messed up. + + /* + * Any success result other than a single CopyBoth was received. The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. * - * Do not expect PQerrorMessage to be appropriately set. */ + * Do not expect PQerrorMessage to be appropriately set. + */ WP_EXEC_UNEXPECTED_SUCCESS, - /* No result available at this time. Wait until read-ready, then call again. Internally, this is - * returned when PQisBusy indicates that PQgetResult would block. */ + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ WP_EXEC_NEEDS_INPUT, /* Catch-all failure. Check PQerrorMessage. */ WP_EXEC_FAILED, -} WalProposerExecStatusType; +} WalProposerExecStatusType; /* Re-exported ConnStatusType */ typedef enum @@ -414,40 +442,39 @@ typedef enum WP_CONNECTION_BAD, /* - * The original ConnStatusType has many more tags, but requests that - * they not be relied upon (except for displaying to the user). We - * don't need that extra functionality, so we collect them into a - * single tag here. + * The original ConnStatusType has many more tags, but requests that they + * not be relied upon (except for displaying to the user). We don't need + * that extra functionality, so we collect them into a single tag here. 
*/ WP_CONNECTION_IN_PROGRESS, -} WalProposerConnStatusType; +} WalProposerConnStatusType; /* Re-exported PQerrorMessage */ -typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); +typedef char *(*walprop_error_message_fn) (WalProposerConn * conn); /* Re-exported PQstatus */ -typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); +typedef WalProposerConnStatusType(*walprop_status_fn) (WalProposerConn * conn); /* Re-exported PQconnectStart */ -typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); +typedef WalProposerConn * (*walprop_connect_start_fn) (char *conninfo); /* Re-exported PQconectPoll */ -typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); +typedef WalProposerConnectPollStatusType(*walprop_connect_poll_fn) (WalProposerConn * conn); /* Blocking wrapper around PQsendQuery */ -typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); +typedef bool (*walprop_send_query_fn) (WalProposerConn * conn, char *query); /* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ -typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); +typedef WalProposerExecStatusType(*walprop_get_query_result_fn) (WalProposerConn * conn); /* Re-exported PQsocket */ -typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); +typedef pgsocket (*walprop_socket_fn) (WalProposerConn * conn); /* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn); +typedef int (*walprop_flush_fn) (WalProposerConn * conn); /* Re-exported PQfinish */ -typedef void (*walprop_finish_fn) (WalProposerConn* conn); +typedef void (*walprop_finish_fn) (WalProposerConn * conn); /* * Ergonomic wrapper around PGgetCopyData @@ -463,9 +490,9 @@ typedef void (*walprop_finish_fn) (WalProposerConn* conn); * performs a bit of extra checking work that's always required and is normally * somewhat verbose. 
*/ -typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, - char** buf, - int* amount); +typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn, + char **buf, + int *amount); /* * Ergonomic wrapper around PQputCopyData + PQflush @@ -474,33 +501,33 @@ typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, * * For information on the meaning of return codes, refer to PGAsyncWriteResult. */ -typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, - void const* buf, - size_t size); +typedef PGAsyncWriteResult(*walprop_async_write_fn) (WalProposerConn * conn, + void const *buf, + size_t size); /* * Blocking equivalent to walprop_async_write_fn * * Returns 'true' if successful, 'false' on failure. */ -typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); +typedef bool (*walprop_blocking_write_fn) (WalProposerConn * conn, void const *buf, size_t size); /* All libpqwalproposer exported functions collected together. 
*/ typedef struct WalProposerFunctionsType { - walprop_error_message_fn walprop_error_message; - walprop_status_fn walprop_status; - walprop_connect_start_fn walprop_connect_start; - walprop_connect_poll_fn walprop_connect_poll; - walprop_send_query_fn walprop_send_query; - walprop_get_query_result_fn walprop_get_query_result; - walprop_socket_fn walprop_socket; - walprop_flush_fn walprop_flush; - walprop_finish_fn walprop_finish; - walprop_async_read_fn walprop_async_read; - walprop_async_write_fn walprop_async_write; - walprop_blocking_write_fn walprop_blocking_write; -} WalProposerFunctionsType; + walprop_error_message_fn walprop_error_message; + walprop_status_fn walprop_status; + walprop_connect_start_fn walprop_connect_start; + walprop_connect_poll_fn walprop_connect_poll; + walprop_send_query_fn walprop_send_query; + walprop_get_query_result_fn walprop_get_query_result; + walprop_socket_fn walprop_socket; + walprop_flush_fn walprop_flush; + walprop_finish_fn walprop_finish; + walprop_async_read_fn walprop_async_read; + walprop_async_write_fn walprop_async_write; + walprop_blocking_write_fn walprop_blocking_write; +} WalProposerFunctionsType; /* Allow the above functions to be "called" with normal syntax */ #define walprop_error_message(conn) \ @@ -536,8 +563,8 @@ typedef struct WalProposerFunctionsType * This pointer is set by the initializer in libpqwalproposer, so that we * can use it later. 
*/ -extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; +extern PGDLLIMPORT WalProposerFunctionsType * WalProposerFunctions; extern uint64 BackpressureThrottlingTime(void); -#endif /* __NEON_WALPROPOSER_H__ */ +#endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index 417a8c4586..e1dcaa081d 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -127,10 +127,10 @@ CompareLsn(const void *a, const void *b) * * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); */ -char* +char * FormatSafekeeperState(SafekeeperState state) { - char* return_val = NULL; + char *return_val = NULL; switch (state) { @@ -171,27 +171,30 @@ FormatSafekeeperState(SafekeeperState state) /* Asserts that the provided events are expected for given safekeeper's state */ void -AssertEventsOkForState(uint32 events, Safekeeper* sk) +AssertEventsOkForState(uint32 events, Safekeeper *sk) { - uint32 expected = SafekeeperStateDesiredEvents(sk->state); + uint32 expected = SafekeeperStateDesiredEvents(sk->state); - /* The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. - * (b) if we are expecting something, there's overlap - * (i.e. `events & expected != 0`) + /* + * The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. (b) if we are expecting something, there's + * overlap (i.e. 
`events & expected != 0`) */ - bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + bool events_ok_for_state; /* long name so the `Assert` is more + * clear later */ if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); else events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { - /* To give a descriptive message in the case of failure, we use elog and - * then an assertion that's guaranteed to fail. */ + /* + * To give a descriptive message in the case of failure, we use elog + * and then an assertion that's guaranteed to fail. + */ elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); Assert(events_ok_for_state); @@ -204,12 +207,12 @@ AssertEventsOkForState(uint32 events, Safekeeper* sk) uint32 SafekeeperStateDesiredEvents(SafekeeperState state) { - uint32 result = WL_NO_EVENTS; + uint32 result = WL_NO_EVENTS; /* If the state doesn't have a modifier, we can check the base state */ switch (state) { - /* Connecting states say what they want in the name */ + /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: result = WL_SOCKET_READABLE; break; @@ -217,33 +220,35 @@ SafekeeperStateDesiredEvents(SafekeeperState state) result = WL_SOCKET_WRITEABLE; break; - /* Reading states need the socket to be read-ready to continue */ + /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: result = WL_SOCKET_READABLE; break; - /* Idle states use read-readiness as a sign that the connection has been - * disconnected. */ + /* + * Idle states use read-readiness as a sign that the connection + * has been disconnected. 
+ */ case SS_VOTING: case SS_IDLE: result = WL_SOCKET_READABLE; break; - /* - * Flush states require write-ready for flushing. - * Active state does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should - * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ + /* + * Flush states require write-ready for flushing. Active state + * does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We + * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ case SS_SEND_ELECTED_FLUSH: case SS_ACTIVE: result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; break; - /* The offline state expects no events. */ + /* The offline state expects no events. */ case SS_OFFLINE: result = WL_NO_EVENTS; break; @@ -263,27 +268,30 @@ SafekeeperStateDesiredEvents(SafekeeperState state) * * The string should not be freed. It should also not be expected to remain the same between * function calls. */ -char* +char * FormatEvents(uint32 events) { static char return_str[8]; /* Helper variable to check if there's extra bits */ - uint32 all_flags = WL_LATCH_SET - | WL_SOCKET_READABLE - | WL_SOCKET_WRITEABLE - | WL_TIMEOUT - | WL_POSTMASTER_DEATH - | WL_EXIT_ON_PM_DEATH - | WL_SOCKET_CONNECTED; + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; - /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an - * sense of what events have been triggered without needing to remember your powers of two. */ + /* + * The formatting here isn't supposed to be *particularly* useful -- it's + * just to give an sense of what events have been triggered without + * needing to remember your powers of two. + */ - return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; - return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; + return_str[0] = (events & WL_LATCH_SET) ? 
'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_'; return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; - return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; + return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_'; return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; @@ -291,7 +299,7 @@ FormatEvents(uint32 events) if (events & (~all_flags)) { elog(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); + events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } @@ -407,21 +415,21 @@ XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) if (walpropFile < 0) { - #if PG_VERSION_NUM >= 150000 - // FIXME Is it ok to use hardcoded value here? - TimeLineID tli = 1; - #else +#if PG_VERSION_NUM >= 150000 + /* FIXME Is it ok to use hardcoded value here? */ + TimeLineID tli = 1; +#else bool use_existent = true; - #endif +#endif /* Create/use new log file */ XLByteToSeg(recptr, walpropSegNo, wal_segment_size); - #if PG_VERSION_NUM >= 150000 +#if PG_VERSION_NUM >= 150000 walpropFile = XLogFileInit(walpropSegNo, tli); walpropFileTLI = tli; - #else +#else walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); walpropFileTLI = ThisTimeLineID; - #endif +#endif } /* Calculate the start offset of the received logs */ @@ -483,6 +491,7 @@ XLogWalPropClose(XLogRecPtr recptr) if (close(walpropFile) != 0) { char xlogfname[MAXFNAMELEN]; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); ereport(PANIC, @@ -508,12 +517,12 @@ StartProposerReplication(StartReplicationCmd *cmd) XLogRecPtr FlushPtr; TimeLineID currTLI; - #if PG_VERSION_NUM < 150000 +#if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); - #endif + 
errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); +#endif /* create xlogreader for physical replication */ xlogreader = @@ -525,7 +534,7 @@ StartProposerReplication(StartReplicationCmd *cmd) if (!xlogreader) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + errmsg("out of memory"))); /* * We assume here that we're logging enough information in the WAL for @@ -542,7 +551,7 @@ StartProposerReplication(StartReplicationCmd *cmd) if (SlotIsLogical(MyReplicationSlot)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot use a logical replication slot for physical replication"))); + errmsg("cannot use a logical replication slot for physical replication"))); /* * We don't need to verify the slot's restart_lsn here; instead we @@ -630,9 +639,9 @@ StartProposerReplication(StartReplicationCmd *cmd) (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", LSN_FORMAT_ARGS(cmd->startpoint), cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); + errdetail("This server's history forked from timeline %u at %X/%X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint)))); } sendTimeLineValidUpto = switchpoint; } @@ -869,14 +878,14 @@ WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, errno = save_errno; ereport(ERROR, (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - xlogfname))); + errmsg("requested WAL segment %s has already been removed", + xlogfname))); } else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); + errmsg("could not open file \"%s\": %m", + path))); } @@ -943,7 +952,7 @@ XLogSendPhysical(void) XLogRecPtr startptr; XLogRecPtr endptr; Size nbytes PG_USED_FOR_ASSERTS_ONLY; - TimeLineID currTLI; + TimeLineID currTLI; /* If requested switch the WAL sender to the stopping state. 
*/ if (got_STOPPING) @@ -1004,8 +1013,8 @@ XLogSendPhysical(void) { /* * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? currTLI was - * updated by the GetStandbyFlushRecPtr() call above. + * still the one recovery is recovering from? currTLI was updated + * by the GetStandbyFlushRecPtr() call above. */ if (sendTimeLine != currTLI) becameHistoric = true; @@ -1043,11 +1052,11 @@ XLogSendPhysical(void) * primary: if the primary subsequently crashes and restarts, standbys * must not have applied any WAL that got lost on the primary. */ - #if PG_VERSION_NUM >= 150000 +#if PG_VERSION_NUM >= 150000 SendRqstPtr = GetFlushRecPtr(NULL); - #else +#else SendRqstPtr = GetFlushRecPtr(); - #endif +#endif } /* @@ -1180,4 +1189,3 @@ XLogSendPhysical(void) set_ps_display(activitymsg); } } - diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h index 4771d3ff82..aa5df5fa43 100644 --- a/pgxn/neon/walproposer_utils.h +++ b/pgxn/neon/walproposer_utils.h @@ -3,17 +3,17 @@ #include "walproposer.h" -int CompareLsn(const void *a, const void *b); -char* FormatSafekeeperState(SafekeeperState state); -void AssertEventsOkForState(uint32 events, Safekeeper* sk); -uint32 SafekeeperStateDesiredEvents(SafekeeperState state); -char* FormatEvents(uint32 events); -bool HexDecodeString(uint8 *result, char *input, int nbytes); -uint32 pq_getmsgint32_le(StringInfo msg); -uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); -void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); -void XLogWalPropClose(XLogRecPtr recptr); +int CompareLsn(const void *a, const void *b); +char *FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper *sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); +char *FormatEvents(uint32 events); +bool HexDecodeString(uint8 *result, char *input, int 
nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); -#endif /* __NEON_WALPROPOSER_UTILS_H__ */ +#endif /* __NEON_WALPROPOSER_UTILS_H__ */ diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 3e30065cd3..07bd7bdd28 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -39,8 +39,8 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); * Linkage to functions in zenith module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +typedef void (*zenith_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; @@ -136,8 +136,8 @@ clear_buffer_cache(PG_FUNCTION_ARGS) /* * Pin the buffer, and release it again. Because we have - * zenith_test_evict==true, this will evict the page from - * the buffer cache if no one else is holding a pin on it. + * zenith_test_evict==true, this will evict the page from the + * buffer cache if no one else is holding a pin on it. */ if (isvalid) { @@ -177,8 +177,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *forkname; uint32 blkno; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(3); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -262,7 +262,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to use raw page functions"))); + errmsg("must be superuser to use raw page functions"))); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || PG_ARGISNULL(3) || PG_ARGISNULL(4)) @@ -271,19 +271,20 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { RelFileNode rnode = { .spcNode = PG_GETARG_OID(0), - .dbNode = PG_GETARG_OID(1), + .dbNode = PG_GETARG_OID(1), .relNode = PG_GETARG_OID(2) }; - ForkNumber forknum = PG_GETARG_UINT32(3); + ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); /* Initialize buffer to copy to */ - bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); @@ -298,7 +299,8 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogFlush(lsn); PG_RETURN_VOID(); } diff --git a/pgxn/typedefs.list b/pgxn/typedefs.list new file mode 100644 index 0000000000..760f384212 --- /dev/null +++ b/pgxn/typedefs.list @@ -0,0 +1,3776 @@ +ACCESS_ALLOWED_ACE +ACL +ACL_SIZE_INFORMATION +AFFIX +ASN1_INTEGER +ASN1_OBJECT +ASN1_STRING +AV +A_ArrayExpr +A_Const +A_Expr +A_Expr_Kind +A_Indices +A_Indirection +A_Star +AbsoluteTime +AccessMethodInfo +AccessPriv +Acl +AclItem +AclMaskHow +AclMode +AclResult +AcquireSampleRowsFunc +ActionList +ActiveSnapshotElt +AddForeignUpdateTargets_function +AffixNode +AffixNodeData +AfterTriggerEvent +AfterTriggerEventChunk +AfterTriggerEventData +AfterTriggerEventList +AfterTriggerShared +AfterTriggerSharedData +AfterTriggersData +AfterTriggersQueryData +AfterTriggersTableData +AfterTriggersTransData +Agg +AggClauseCosts +AggInfo +AggPath +AggSplit +AggState +AggStatePerAgg +AggStatePerGroup +AggStatePerHash +AggStatePerPhase +AggStatePerTrans +AggStrategy +AggTransInfo +Aggref +AggregateInstrumentation +AlenState +Alias +AllocBlock +AllocChunk +AllocPointer +AllocSet +AllocSetContext +AllocSetFreeList +AllocateDesc +AllocateDescKind +AlterCollationStmt +AlterDatabaseSetStmt +AlterDatabaseStmt +AlterDefaultPrivilegesStmt +AlterDomainStmt +AlterEnumStmt +AlterEventTrigStmt +AlterExtensionContentsStmt +AlterExtensionStmt +AlterFdwStmt +AlterForeignServerStmt +AlterFunctionStmt +AlterObjectDependsStmt +AlterObjectSchemaStmt +AlterOpFamilyStmt +AlterOperatorStmt +AlterOwnerStmt +AlterPolicyStmt +AlterPublicationStmt +AlterRoleSetStmt 
+AlterRoleStmt +AlterSeqStmt +AlterStatsStmt +AlterSubscriptionStmt +AlterSubscriptionType +AlterSystemStmt +AlterTSConfigType +AlterTSConfigurationStmt +AlterTSDictionaryStmt +AlterTableCmd +AlterTableMoveAllStmt +AlterTableSpaceOptionsStmt +AlterTableStmt +AlterTableType +AlterTableUtilityContext +AlterTypeRecurseParams +AlterTypeStmt +AlterUserMappingStmt +AlteredTableInfo +AlternativeSubPlan +AmcheckOptions +AnalyzeAttrComputeStatsFunc +AnalyzeAttrFetchFunc +AnalyzeForeignTable_function +AnlExprData +AnlIndexData +AnyArrayType +Append +AppendPath +AppendRelInfo +AppendState +ApplyExecutionData +ApplySubXactData +Archive +ArchiveEntryPtrType +ArchiveFormat +ArchiveHandle +ArchiveMode +ArchiveOpts +ArchiverOutput +ArchiverStage +ArrayAnalyzeExtraData +ArrayBuildState +ArrayBuildStateAny +ArrayBuildStateArr +ArrayCoerceExpr +ArrayConstIterState +ArrayExpr +ArrayExprIterState +ArrayIOData +ArrayIterator +ArrayMapState +ArrayMetaState +ArrayParseState +ArraySubWorkspace +ArrayType +AsyncQueueControl +AsyncQueueEntry +AsyncRequest +AttInMetadata +AttStatsSlot +AttoptCacheEntry +AttoptCacheKey +AttrDefInfo +AttrDefault +AttrMap +AttrMissing +AttrNumber +AttributeOpts +AuthRequest +AutoPrewarmSharedState +AutoVacOpts +AutoVacuumShmemStruct +AutoVacuumWorkItem +AutoVacuumWorkItemType +AuxProcType +BF_ctx +BF_key +BF_word +BF_word_signed +BIGNUM +BIO +BIO_METHOD +BITVECP +BMS_Comparison +BMS_Membership +BN_CTX +BOOL +BOOLEAN +BOX +BTArrayKeyInfo +BTBuildState +BTCycleId +BTDedupInterval +BTDedupState +BTDedupStateData +BTDeletedPageData +BTIndexStat +BTInsertState +BTInsertStateData +BTLeader +BTMetaPageData +BTOneVacInfo +BTOptions +BTPS_State +BTPageOpaque +BTPageOpaqueData +BTPageStat +BTPageState +BTParallelScanDesc +BTPendingFSM +BTScanInsert +BTScanInsertData +BTScanOpaque +BTScanOpaqueData +BTScanPos +BTScanPosData +BTScanPosItem +BTShared +BTSortArrayContext +BTSpool +BTStack +BTStackData +BTVacInfo +BTVacState +BTVacuumPosting +BTVacuumPostingData +BTWriteState 
+BUF_MEM +BYTE +BY_HANDLE_FILE_INFORMATION +Backend +BackendId +BackendParameters +BackendState +BackendType +BackgroundWorker +BackgroundWorkerArray +BackgroundWorkerHandle +BackgroundWorkerSlot +Barrier +BaseBackupCmd +BeginDirectModify_function +BeginForeignInsert_function +BeginForeignModify_function +BeginForeignScan_function +BeginSampleScan_function +BernoulliSamplerData +BgWorkerStartTime +BgwHandleStatus +BinaryArithmFunc +BindParamCbData +BipartiteMatchState +BitmapAnd +BitmapAndPath +BitmapAndState +BitmapHeapPath +BitmapHeapScan +BitmapHeapScanState +BitmapIndexScan +BitmapIndexScanState +BitmapOr +BitmapOrPath +BitmapOrState +Bitmapset +BlobInfo +Block +BlockId +BlockIdData +BlockInfoRecord +BlockNumber +BlockSampler +BlockSamplerData +BlockedProcData +BlockedProcsData +BloomBuildState +BloomFilter +BloomMetaPageData +BloomOpaque +BloomOptions +BloomPageOpaque +BloomPageOpaqueData +BloomScanOpaque +BloomScanOpaqueData +BloomSignatureWord +BloomState +BloomTuple +BlowfishContext +BoolAggState +BoolExpr +BoolExprType +BoolTestType +BooleanTest +BpChar +BrinBuildState +BrinDesc +BrinMemTuple +BrinMetaPageData +BrinOpaque +BrinOpcInfo +BrinOptions +BrinRevmap +BrinSpecialSpace +BrinStatsData +BrinTuple +BrinValues +BtreeCheckState +BtreeLevel +Bucket +BufFile +Buffer +BufferAccessStrategy +BufferAccessStrategyType +BufferCachePagesContext +BufferCachePagesRec +BufferDesc +BufferDescPadded +BufferHeapTupleTableSlot +BufferLookupEnt +BufferStrategyControl +BufferTag +BufferUsage +BuildAccumulator +BuiltinScript +BulkInsertState +BulkInsertStateData +CACHESIGN +CAC_state +CCFastEqualFN +CCHashFN +CEOUC_WAIT_MODE +CFuncHashTabEntry +CHAR +CHECKPOINT +CHKVAL +CIRCLE +CMPDAffix +CONTEXT +COP +CRITICAL_SECTION +CRSSnapshotAction +CState +CTECycleClause +CTEMaterialize +CTESearchClause +CV +CachedExpression +CachedPlan +CachedPlanSource +CallContext +CallStmt +CancelRequestPacket +CaseExpr +CaseTestExpr +CaseWhen +Cash +CastInfo +CatCList +CatCTup +CatCache 
+CatCacheHeader +CatalogId +CatalogIndexState +ChangeVarNodes_context +CheckPoint +CheckPointStmt +CheckpointStatsData +CheckpointerRequest +CheckpointerShmemStruct +Chromosome +CkptSortItem +CkptTsStatus +ClientAuthentication_hook_type +ClientCertMode +ClientCertName +ClientData +ClonePtrType +ClosePortalStmt +ClosePtrType +Clump +ClusterInfo +ClusterParams +ClusterStmt +CmdType +CoalesceExpr +CoerceParamHook +CoerceToDomain +CoerceToDomainValue +CoerceViaIO +CoercionContext +CoercionForm +CoercionPathType +CollAliasData +CollInfo +CollateClause +CollateExpr +CollateStrength +CollectedATSubcmd +CollectedCommand +CollectedCommandType +ColorTrgm +ColorTrgmInfo +ColumnCompareData +ColumnDef +ColumnIOData +ColumnRef +ColumnsHashData +CombinationGenerator +ComboCidEntry +ComboCidEntryData +ComboCidKey +ComboCidKeyData +Command +CommandDest +CommandId +CommandTag +CommandTagBehavior +CommentItem +CommentStmt +CommitTimestampEntry +CommitTimestampShared +CommonEntry +CommonTableExpr +CompareScalarsContext +CompiledExprState +CompositeIOData +CompositeTypeStmt +CompoundAffixFlag +CompressionAlgorithm +CompressorState +ComputeXidHorizonsResult +ConditionVariable +ConditionVariableMinimallyPadded +ConditionalStack +ConfigData +ConfigVariable +ConnCacheEntry +ConnCacheKey +ConnParams +ConnStatusType +ConnType +ConnectionStateEnum +ConnsAllowedState +ConsiderSplitContext +Const +ConstrCheck +ConstrType +Constraint +ConstraintCategory +ConstraintInfo +ConstraintsSetStmt +ControlData +ControlFileData +ConvInfo +ConvProcInfo +ConversionLocation +ConvertRowtypeExpr +CookedConstraint +CopyDest +CopyFormatOptions +CopyFromState +CopyFromStateData +CopyInsertMethod +CopyMultiInsertBuffer +CopyMultiInsertInfo +CopySource +CopyStmt +CopyToState +CopyToStateData +Cost +CostSelector +Counters +CoverExt +CoverPos +CreateAmStmt +CreateCastStmt +CreateConversionStmt +CreateDomainStmt +CreateEnumStmt +CreateEventTrigStmt +CreateExtensionStmt +CreateFdwStmt +CreateForeignServerStmt 
+CreateForeignTableStmt +CreateFunctionStmt +CreateOpClassItem +CreateOpClassStmt +CreateOpFamilyStmt +CreatePLangStmt +CreatePolicyStmt +CreatePublicationStmt +CreateRangeStmt +CreateReplicationSlotCmd +CreateRoleStmt +CreateSchemaStmt +CreateSchemaStmtContext +CreateSeqStmt +CreateStatsStmt +CreateStmt +CreateStmtContext +CreateSubscriptionStmt +CreateTableAsStmt +CreateTableSpaceStmt +CreateTransformStmt +CreateTrigStmt +CreateUserMappingStmt +CreatedbStmt +CredHandle +CteItem +CteScan +CteScanState +CteState +CtlCommand +CtxtHandle +CurrentOfExpr +CustomExecMethods +CustomOutPtrType +CustomPath +CustomScan +CustomScanMethods +CustomScanState +CycleCtr +DBState +DCHCacheEntry +DEADLOCK_INFO +DECountItem +DH +DIR +DNSServiceErrorType +DNSServiceRef +DR_copy +DR_intorel +DR_printtup +DR_sqlfunction +DR_transientrel +DSA +DWORD +DataDumperPtr +DataPageDeleteStack +DatabaseInfo +DateADT +Datum +DatumTupleFields +DbInfo +DbInfoArr +DeClonePtrType +DeadLockState +DeallocateStmt +DeclareCursorStmt +DecodedBkpBlock +DecodingOutputState +DefElem +DefElemAction +DefaultACLInfo +DefineStmt +DeleteStmt +DependencyGenerator +DependencyGeneratorData +DependencyType +DestReceiver +DictISpell +DictInt +DictSimple +DictSnowball +DictSubState +DictSyn +DictThesaurus +DimensionInfo +DirectoryMethodData +DirectoryMethodFile +DisableTimeoutParams +DiscardMode +DiscardStmt +DistanceValue +DistinctExpr +DoStmt +DocRepresentation +DomainConstraintCache +DomainConstraintRef +DomainConstraintState +DomainConstraintType +DomainIOData +DropBehavior +DropOwnedStmt +DropReplicationSlotCmd +DropRoleStmt +DropStmt +DropSubscriptionStmt +DropTableSpaceStmt +DropUserMappingStmt +DropdbStmt +DumpComponents +DumpId +DumpOptions +DumpSignalInformation +DumpableObject +DumpableObjectType +DynamicFileList +DynamicZoneAbbrev +EC_KEY +EDGE +ENGINE +EOM_flatten_into_method +EOM_get_flat_size_method +EPQState +EPlan +EState +EVP_CIPHER +EVP_CIPHER_CTX +EVP_MD +EVP_MD_CTX +EVP_PKEY +EachState +Edge 
+EditableObjectType +ElementsState +EnableTimeoutParams +EndBlobPtrType +EndBlobsPtrType +EndDataPtrType +EndDirectModify_function +EndForeignInsert_function +EndForeignModify_function +EndForeignScan_function +EndSampleScan_function +EnumItem +EolType +EphemeralNameRelationType +EphemeralNamedRelation +EphemeralNamedRelationData +EphemeralNamedRelationMetadata +EphemeralNamedRelationMetadataData +EquivalenceClass +EquivalenceMember +ErrorContextCallback +ErrorData +EstimateDSMForeignScan_function +EstimationInfo +EventTriggerCacheEntry +EventTriggerCacheItem +EventTriggerCacheStateType +EventTriggerData +EventTriggerEvent +EventTriggerInfo +EventTriggerQueryState +ExceptionLabelMap +ExceptionMap +ExclusiveBackupState +ExecAuxRowMark +ExecEvalBoolSubroutine +ExecEvalSubroutine +ExecForeignBatchInsert_function +ExecForeignDelete_function +ExecForeignInsert_function +ExecForeignTruncate_function +ExecForeignUpdate_function +ExecParallelEstimateContext +ExecParallelInitializeDSMContext +ExecPhraseData +ExecProcNodeMtd +ExecRowMark +ExecScanAccessMtd +ExecScanRecheckMtd +ExecStatus +ExecStatusType +ExecuteStmt +ExecutorCheckPerms_hook_type +ExecutorEnd_hook_type +ExecutorFinish_hook_type +ExecutorRun_hook_type +ExecutorStart_hook_type +ExpandedArrayHeader +ExpandedObjectHeader +ExpandedObjectMethods +ExpandedRange +ExpandedRecordFieldInfo +ExpandedRecordHeader +ExplainDirectModify_function +ExplainForeignModify_function +ExplainForeignScan_function +ExplainFormat +ExplainOneQuery_hook_type +ExplainState +ExplainStmt +ExplainWorkersState +ExportedSnapshot +Expr +ExprContext +ExprContextCallbackFunction +ExprContext_CB +ExprDoneCond +ExprEvalOp +ExprEvalOpLookup +ExprEvalRowtypeCache +ExprEvalStep +ExprState +ExprStateEvalFunc +ExtensibleNode +ExtensibleNodeEntry +ExtensibleNodeMethods +ExtensionControlFile +ExtensionInfo +ExtensionMemberId +ExtensionVersionInfo +FDWCollateState +FD_SET +FILE +FILETIME +FILE_INFORMATION_CLASS +FILE_STANDARD_INFORMATION +FSMAddress 
+FSMPage +FSMPageData +FakeRelCacheEntry +FakeRelCacheEntryData +FastPathStrongRelationLockData +FdwInfo +FdwRoutine +FetchDirection +FetchStmt +FieldSelect +FieldStore +File +FileFdwExecutionState +FileFdwPlanState +FileNameMap +FileTag +FinalPathExtraData +FindColsContext +FindSplitData +FindSplitStrat +FixedParallelExecutorState +FixedParallelState +FixedParamState +FlagMode +FlushPosition +FmgrBuiltin +FmgrHookEventType +FmgrInfo +ForBothCellState +ForBothState +ForEachState +ForFiveState +ForFourState +ForThreeState +ForeignAsyncConfigureWait_function +ForeignAsyncNotify_function +ForeignAsyncRequest_function +ForeignDataWrapper +ForeignKeyCacheInfo +ForeignKeyOptInfo +ForeignPath +ForeignScan +ForeignScanState +ForeignServer +ForeignServerInfo +ForeignTable +ForeignTruncateInfo +ForkNumber +FormData_pg_aggregate +FormData_pg_am +FormData_pg_amop +FormData_pg_amproc +FormData_pg_attrdef +FormData_pg_attribute +FormData_pg_auth_members +FormData_pg_authid +FormData_pg_cast +FormData_pg_class +FormData_pg_collation +FormData_pg_constraint +FormData_pg_conversion +FormData_pg_database +FormData_pg_default_acl +FormData_pg_depend +FormData_pg_enum +FormData_pg_event_trigger +FormData_pg_extension +FormData_pg_foreign_data_wrapper +FormData_pg_foreign_server +FormData_pg_foreign_table +FormData_pg_index +FormData_pg_inherits +FormData_pg_language +FormData_pg_largeobject +FormData_pg_largeobject_metadata +FormData_pg_namespace +FormData_pg_opclass +FormData_pg_operator +FormData_pg_opfamily +FormData_pg_partitioned_table +FormData_pg_policy +FormData_pg_proc +FormData_pg_publication +FormData_pg_publication_rel +FormData_pg_range +FormData_pg_replication_origin +FormData_pg_rewrite +FormData_pg_sequence +FormData_pg_sequence_data +FormData_pg_shdepend +FormData_pg_statistic +FormData_pg_statistic_ext +FormData_pg_subscription +FormData_pg_subscription_rel +FormData_pg_tablespace +FormData_pg_transform +FormData_pg_trigger +FormData_pg_ts_config 
+FormData_pg_ts_config_map +FormData_pg_ts_dict +FormData_pg_ts_parser +FormData_pg_ts_template +FormData_pg_type +FormData_pg_user_mapping +Form_pg_aggregate +Form_pg_am +Form_pg_amop +Form_pg_amproc +Form_pg_attrdef +Form_pg_attribute +Form_pg_auth_members +Form_pg_authid +Form_pg_cast +Form_pg_class +Form_pg_collation +Form_pg_constraint +Form_pg_conversion +Form_pg_database +Form_pg_default_acl +Form_pg_depend +Form_pg_enum +Form_pg_event_trigger +Form_pg_extension +Form_pg_foreign_data_wrapper +Form_pg_foreign_server +Form_pg_foreign_table +Form_pg_index +Form_pg_inherits +Form_pg_language +Form_pg_largeobject +Form_pg_largeobject_metadata +Form_pg_namespace +Form_pg_opclass +Form_pg_operator +Form_pg_opfamily +Form_pg_partitioned_table +Form_pg_policy +Form_pg_proc +Form_pg_publication +Form_pg_publication_rel +Form_pg_range +Form_pg_replication_origin +Form_pg_rewrite +Form_pg_sequence +Form_pg_sequence_data +Form_pg_shdepend +Form_pg_statistic +Form_pg_statistic_ext +Form_pg_subscription +Form_pg_subscription_rel +Form_pg_tablespace +Form_pg_transform +Form_pg_trigger +Form_pg_ts_config +Form_pg_ts_config_map +Form_pg_ts_dict +Form_pg_ts_parser +Form_pg_ts_template +Form_pg_type +Form_pg_user_mapping +FormatNode +FreeBlockNumberArray +FreeListData +FreePageBtree +FreePageBtreeHeader +FreePageBtreeInternalKey +FreePageBtreeLeafKey +FreePageBtreeSearchResult +FreePageManager +FreePageSpanLeader +FromCharDateMode +FromExpr +FullTransactionId +FuncCall +FuncCallContext +FuncCandidateList +FuncDetailCode +FuncExpr +FuncInfo +FuncLookupError +FunctionCallInfo +FunctionCallInfoBaseData +FunctionParameter +FunctionParameterMode +FunctionScan +FunctionScanPerFuncState +FunctionScanState +FuzzyAttrMatchState +GBT_NUMKEY +GBT_NUMKEY_R +GBT_VARKEY +GBT_VARKEY_R +GENERAL_NAME +GISTBuildBuffers +GISTBuildState +GISTDeletedPageContents +GISTENTRY +GISTInsertStack +GISTInsertState +GISTIntArrayBigOptions +GISTIntArrayOptions +GISTNodeBuffer +GISTNodeBufferPage 
+GISTPageOpaque +GISTPageOpaqueData +GISTPageSplitInfo +GISTSTATE +GISTScanOpaque +GISTScanOpaqueData +GISTSearchHeapItem +GISTSearchItem +GISTTYPE +GIST_SPLITVEC +GMReaderTupleBuffer +GV +Gather +GatherMerge +GatherMergePath +GatherMergeState +GatherPath +GatherState +Gene +GeneratePruningStepsContext +GenerationBlock +GenerationChunk +GenerationContext +GenerationPointer +GenericCosts +GenericXLogState +GeqoPrivateData +GetForeignJoinPaths_function +GetForeignModifyBatchSize_function +GetForeignPaths_function +GetForeignPlan_function +GetForeignRelSize_function +GetForeignRowMarkType_function +GetForeignUpperPaths_function +GetState +GiSTOptions +GinBtree +GinBtreeData +GinBtreeDataLeafInsertData +GinBtreeEntryInsertData +GinBtreeStack +GinBuildState +GinChkVal +GinEntries +GinEntryAccumulator +GinIndexStat +GinMetaPageData +GinNullCategory +GinOptions +GinPageOpaque +GinPageOpaqueData +GinPlaceToPageRC +GinPostingList +GinQualCounts +GinScanEntry +GinScanKey +GinScanOpaque +GinScanOpaqueData +GinState +GinStatsData +GinTernaryValue +GinTupleCollector +GinVacuumState +GistBuildMode +GistEntryVector +GistHstoreOptions +GistInetKey +GistNSN +GistOptBufferingMode +GistSortedBuildPageState +GistSplitUnion +GistSplitVector +GistTsVectorOptions +GistVacState +GlobalTransaction +GlobalVisState +GrantRoleStmt +GrantStmt +GrantTargetType +Group +GroupClause +GroupPath +GroupPathExtraData +GroupResultPath +GroupState +GroupVarInfo +GroupingFunc +GroupingSet +GroupingSetData +GroupingSetKind +GroupingSetsPath +GucAction +GucBoolAssignHook +GucBoolCheckHook +GucContext +GucEnumAssignHook +GucEnumCheckHook +GucIntAssignHook +GucIntCheckHook +GucRealAssignHook +GucRealCheckHook +GucShowHook +GucSource +GucStack +GucStackState +GucStringAssignHook +GucStringCheckHook +HANDLE +HASHACTION +HASHBUCKET +HASHCTL +HASHELEMENT +HASHHDR +HASHSEGMENT +HASH_SEQ_STATUS +HCRYPTPROV +HE +HEntry +HIST_ENTRY +HKEY +HLOCAL +HMAC_CTX +HMODULE +HOldEntry +HRESULT +HSParser +HSpool +HStore +HTAB 
+HTSV_Result +HV +Hash +HashAggBatch +HashAggSpill +HashAllocFunc +HashBuildState +HashCompareFunc +HashCopyFunc +HashIndexStat +HashInstrumentation +HashJoin +HashJoinState +HashJoinTable +HashJoinTuple +HashMemoryChunk +HashMetaPage +HashMetaPageData +HashOptions +HashPageOpaque +HashPageOpaqueData +HashPageStat +HashPath +HashScanOpaque +HashScanOpaqueData +HashScanPosData +HashScanPosItem +HashSkewBucket +HashState +HashTapeInfo +HashValueFunc +HbaLine +HbaToken +HeadlineJsonState +HeadlineParsedText +HeadlineWordEntry +HeapCheckContext +HeapScanDesc +HeapTuple +HeapTupleData +HeapTupleFields +HeapTupleForceOption +HeapTupleHeader +HeapTupleHeaderData +HeapTupleTableSlot +HistControl +HotStandbyState +I32 +ICU_Convert_Func +ID +INFIX +INT128 +INTERFACE_INFO +IOFuncSelector +IO_STATUS_BLOCK +IPCompareMethod +ITEM +IV +IdentLine +IdentifierLookup +IdentifySystemCmd +IfStackElem +ImportForeignSchemaStmt +ImportForeignSchemaType +ImportForeignSchema_function +ImportQual +InProgressEnt +IncludeWal +InclusionOpaque +IncrementVarSublevelsUp_context +IncrementalSort +IncrementalSortExecutionStatus +IncrementalSortGroupInfo +IncrementalSortInfo +IncrementalSortPath +IncrementalSortState +Index +IndexAMProperty +IndexAmRoutine +IndexArrayKeyInfo +IndexAttachInfo +IndexAttrBitmapKind +IndexBuildCallback +IndexBuildResult +IndexBulkDeleteCallback +IndexBulkDeleteResult +IndexClause +IndexClauseSet +IndexDeleteCounts +IndexDeletePrefetchState +IndexElem +IndexFetchHeapData +IndexFetchTableData +IndexInfo +IndexList +IndexOnlyScan +IndexOnlyScanState +IndexOptInfo +IndexOrderByDistance +IndexPath +IndexRuntimeKeyInfo +IndexScan +IndexScanDesc +IndexScanState +IndexStateFlagsAction +IndexStmt +IndexTuple +IndexTupleData +IndexUniqueCheck +IndexVacuumInfo +IndxInfo +InferClause +InferenceElem +InfoItem +InhInfo +InheritableSocket +InitSampleScan_function +InitializeDSMForeignScan_function +InitializeWorkerForeignScan_function +InlineCodeBlock +InsertStmt +Instrumentation 
+Int128AggState +Int8TransTypeData +IntRBTreeNode +IntegerSet +InternalDefaultACL +InternalGrant +Interval +IntoClause +InvalidationChunk +InvalidationListHeader +IpcMemoryId +IpcMemoryKey +IpcMemoryState +IpcSemaphoreId +IpcSemaphoreKey +IsForeignPathAsyncCapable_function +IsForeignRelUpdatable_function +IsForeignScanParallelSafe_function +IsoConnInfo +IspellDict +Item +ItemId +ItemIdData +ItemPointer +ItemPointerData +IterateDirectModify_function +IterateForeignScan_function +IterateJsonStringValuesState +JEntry +JHashState +JOBOBJECTINFOCLASS +JOBOBJECT_BASIC_LIMIT_INFORMATION +JOBOBJECT_BASIC_UI_RESTRICTIONS +JOBOBJECT_SECURITY_LIMIT_INFORMATION +JitContext +JitInstrumentation +JitProviderCallbacks +JitProviderCompileExprCB +JitProviderInit +JitProviderReleaseContextCB +JitProviderResetAfterErrorCB +Join +JoinCostWorkspace +JoinExpr +JoinHashEntry +JoinPath +JoinPathExtraData +JoinState +JoinType +JsObject +JsValue +JsonAggState +JsonBaseObjectInfo +JsonHashEntry +JsonIterateStringValuesAction +JsonLexContext +JsonLikeRegexContext +JsonManifestFileField +JsonManifestParseContext +JsonManifestParseState +JsonManifestSemanticState +JsonManifestWALRangeField +JsonParseContext +JsonParseErrorType +JsonPath +JsonPathBool +JsonPathExecContext +JsonPathExecResult +JsonPathGinAddPathItemFunc +JsonPathGinContext +JsonPathGinExtractNodesFunc +JsonPathGinNode +JsonPathGinNodeType +JsonPathGinPath +JsonPathGinPathItem +JsonPathItem +JsonPathItemType +JsonPathKeyword +JsonPathParseItem +JsonPathParseResult +JsonPathPredicateCallback +JsonPathString +JsonSemAction +JsonTokenType +JsonTransformStringValuesAction +JsonTypeCategory +JsonValueList +JsonValueListIterator +Jsonb +JsonbAggState +JsonbContainer +JsonbInState +JsonbIterState +JsonbIterator +JsonbIteratorToken +JsonbPair +JsonbParseState +JsonbSubWorkspace +JsonbTypeCategory +JsonbValue +JumbleState +JunkFilter +KeyArray +KeySuffix +KeyWord +LARGE_INTEGER +LDAP +LDAPMessage +LDAPURLDesc +LDAP_TIMEVAL +LINE 
+LLVMAttributeRef +LLVMBasicBlockRef +LLVMBuilderRef +LLVMIntPredicate +LLVMJitContext +LLVMJitHandle +LLVMMemoryBufferRef +LLVMModuleRef +LLVMOrcJITStackRef +LLVMOrcModuleHandle +LLVMOrcTargetAddress +LLVMPassManagerBuilderRef +LLVMPassManagerRef +LLVMSharedModuleRef +LLVMTargetMachineRef +LLVMTargetRef +LLVMTypeRef +LLVMValueRef +LOCALLOCK +LOCALLOCKOWNER +LOCALLOCKTAG +LOCALPREDICATELOCK +LOCK +LOCKMASK +LOCKMETHODID +LOCKMODE +LOCKTAG +LONG +LONG_PTR +LOOP +LPBYTE +LPCTSTR +LPCWSTR +LPDWORD +LPSECURITY_ATTRIBUTES +LPSERVICE_STATUS +LPSTR +LPTHREAD_START_ROUTINE +LPTSTR +LPVOID +LPWSTR +LSEG +LUID +LVDeadTuples +LVPagePruneState +LVParallelState +LVRelState +LVSavedErrInfo +LVShared +LVSharedIndStats +LWLock +LWLockHandle +LWLockMode +LWLockPadded +LabelProvider +LagTracker +LargeObjectDesc +LastAttnumInfo +Latch +LerpFunc +LexDescr +LexemeEntry +LexemeHashKey +LexemeInfo +LexemeKey +LexizeData +LibraryInfo +Limit +LimitOption +LimitPath +LimitState +LimitStateCond +List +ListCell +ListDictionary +ListParsedLex +ListenAction +ListenActionKind +ListenStmt +LoadStmt +LocalBufferLookupEnt +LocalPgBackendStatus +LocalTransactionId +LocationIndex +LocationLen +LockAcquireResult +LockClauseStrength +LockData +LockInfoData +LockInstanceData +LockMethod +LockMethodData +LockRelId +LockRows +LockRowsPath +LockRowsState +LockStmt +LockTagType +LockTupleMode +LockViewRecurse_context +LockWaitPolicy +LockingClause +LogOpts +LogStmtLevel +LogicalDecodeBeginCB +LogicalDecodeBeginPrepareCB +LogicalDecodeChangeCB +LogicalDecodeCommitCB +LogicalDecodeCommitPreparedCB +LogicalDecodeFilterByOriginCB +LogicalDecodeFilterPrepareCB +LogicalDecodeMessageCB +LogicalDecodePrepareCB +LogicalDecodeRollbackPreparedCB +LogicalDecodeShutdownCB +LogicalDecodeStartupCB +LogicalDecodeStreamAbortCB +LogicalDecodeStreamChangeCB +LogicalDecodeStreamCommitCB +LogicalDecodeStreamMessageCB +LogicalDecodeStreamPrepareCB +LogicalDecodeStreamStartCB +LogicalDecodeStreamStopCB 
+LogicalDecodeStreamTruncateCB +LogicalDecodeTruncateCB +LogicalDecodingContext +LogicalErrorCallbackState +LogicalOutputPluginInit +LogicalOutputPluginWriterPrepareWrite +LogicalOutputPluginWriterUpdateProgress +LogicalOutputPluginWriterWrite +LogicalRepBeginData +LogicalRepCommitData +LogicalRepCtxStruct +LogicalRepMsgType +LogicalRepPartMapEntry +LogicalRepRelId +LogicalRepRelMapEntry +LogicalRepRelation +LogicalRepTupleData +LogicalRepTyp +LogicalRepWorker +LogicalRewriteMappingData +LogicalTape +LogicalTapeSet +LtreeGistOptions +LtreeSignature +MAGIC +MBuf +MCVItem +MCVList +MEMORY_BASIC_INFORMATION +MINIDUMPWRITEDUMP +MINIDUMP_TYPE +MJEvalResult +MTTargetRelLookup +MVDependencies +MVDependency +MVNDistinct +MVNDistinctItem +Material +MaterialPath +MaterialState +MdfdVec +Memoize +MemoizeEntry +MemoizeInstrumentation +MemoizeKey +MemoizePath +MemoizeState +MemoizeTuple +MemoryContext +MemoryContextCallback +MemoryContextCallbackFunction +MemoryContextCounters +MemoryContextData +MemoryContextMethods +MemoryStatsPrintFunc +MergeAppend +MergeAppendPath +MergeAppendState +MergeJoin +MergeJoinClause +MergeJoinState +MergePath +MergeScanSelCache +MetaCommand +MinMaxAggInfo +MinMaxAggPath +MinMaxExpr +MinMaxMultiOptions +MinMaxOp +MinimalTuple +MinimalTupleData +MinimalTupleTableSlot +MinmaxMultiOpaque +MinmaxOpaque +ModifyTable +ModifyTablePath +ModifyTableState +MorphOpaque +MsgType +MultiAssignRef +MultiSortSupport +MultiSortSupportData +MultiXactId +MultiXactMember +MultiXactOffset +MultiXactStateData +MultiXactStatus +MultirangeIOData +MultirangeParseState +MultirangeType +NDBOX +NODE +NTSTATUS +NUMCacheEntry +NUMDesc +NUMProc +NV +Name +NameData +NameHashEntry +NamedArgExpr +NamedLWLockTranche +NamedLWLockTrancheRequest +NamedTuplestoreScan +NamedTuplestoreScanState +NamespaceInfo +NestLoop +NestLoopParam +NestLoopState +NestPath +NewColumnValue +NewConstraint +NextSampleBlock_function +NextSampleTuple_function +NextValueExpr +Node +NodeTag +NonEmptyRange 
+Notification +NotificationHash +NotificationList +NotifyStmt +Nsrt +NullIfExpr +NullTest +NullTestType +NullableDatum +Numeric +NumericAggState +NumericDigit +NumericSortSupport +NumericSumAccum +NumericVar +OM_uint32 +OP +OSAPerGroupState +OSAPerQueryState +OSInfo +OSSLCipher +OSSLDigest +OVERLAPPED +ObjectAccessDrop +ObjectAccessNamespaceSearch +ObjectAccessPostAlter +ObjectAccessPostCreate +ObjectAccessType +ObjectAddress +ObjectAddressAndFlags +ObjectAddressExtra +ObjectAddressStack +ObjectAddresses +ObjectClass +ObjectPropertyType +ObjectType +ObjectWithArgs +Offset +OffsetNumber +OffsetVarNodes_context +Oid +OidOptions +OkeysState +OldSnapshotControlData +OldSnapshotTimeMapping +OldToNewMapping +OldToNewMappingData +OnCommitAction +OnCommitItem +OnConflictAction +OnConflictClause +OnConflictExpr +OnConflictSetState +OpBtreeInterpretation +OpClassCacheEnt +OpExpr +OpFamilyMember +OpFamilyOpFuncGroup +OpclassInfo +Operator +OperatorElement +OpfamilyInfo +OprCacheEntry +OprCacheKey +OprInfo +OprProofCacheEntry +OprProofCacheKey +OutputContext +OutputPluginCallbacks +OutputPluginOptions +OutputPluginOutputType +OverrideSearchPath +OverrideStackEntry +OverridingKind +PACE_HEADER +PACL +PATH +PBOOL +PCtxtHandle +PFN +PFN_NTQUERYINFORMATIONFILE +PGAlignedBlock +PGAlignedXLogBlock +PGAsyncStatusType +PGCALL2 +PGChecksummablePage +PGContextVisibility +PGEvent +PGEventConnDestroy +PGEventConnReset +PGEventId +PGEventProc +PGEventRegister +PGEventResultCopy +PGEventResultCreate +PGEventResultDestroy +PGFInfoFunction +PGFileType +PGFunction +PGLZ_HistEntry +PGLZ_Strategy +PGMessageField +PGModuleMagicFunction +PGNoticeHooks +PGOutputData +PGPROC +PGP_CFB +PGP_Context +PGP_MPI +PGP_PubKey +PGP_S2K +PGPing +PGQueryClass +PGRUsage +PGSemaphore +PGSemaphoreData +PGShmemHeader +PGTargetServerType +PGTernaryBool +PGTransactionStatusType +PGVerbosity +PG_Locale_Strategy +PG_Lock_Status +PG_init_t +PGcancel +PGcmdQueueEntry +PGconn +PGdataValue +PGlobjfuncs +PGnotify 
+PGpipelineStatus +PGresAttDesc +PGresAttValue +PGresParamDesc +PGresult +PGresult_data +PHANDLE +PIO_STATUS_BLOCK +PLAINTREE +PLAssignStmt +PLUID_AND_ATTRIBUTES +PLcword +PLpgSQL_case_when +PLpgSQL_condition +PLpgSQL_datum +PLpgSQL_datum_type +PLpgSQL_diag_item +PLpgSQL_exception +PLpgSQL_exception_block +PLpgSQL_execstate +PLpgSQL_expr +PLpgSQL_func_hashkey +PLpgSQL_function +PLpgSQL_getdiag_kind +PLpgSQL_if_elsif +PLpgSQL_label_type +PLpgSQL_nsitem +PLpgSQL_nsitem_type +PLpgSQL_plugin +PLpgSQL_promise_type +PLpgSQL_raise_option +PLpgSQL_raise_option_type +PLpgSQL_rec +PLpgSQL_recfield +PLpgSQL_resolve_option +PLpgSQL_row +PLpgSQL_stmt +PLpgSQL_stmt_assert +PLpgSQL_stmt_assign +PLpgSQL_stmt_block +PLpgSQL_stmt_call +PLpgSQL_stmt_case +PLpgSQL_stmt_close +PLpgSQL_stmt_commit +PLpgSQL_stmt_dynexecute +PLpgSQL_stmt_dynfors +PLpgSQL_stmt_execsql +PLpgSQL_stmt_exit +PLpgSQL_stmt_fetch +PLpgSQL_stmt_forc +PLpgSQL_stmt_foreach_a +PLpgSQL_stmt_fori +PLpgSQL_stmt_forq +PLpgSQL_stmt_fors +PLpgSQL_stmt_getdiag +PLpgSQL_stmt_if +PLpgSQL_stmt_loop +PLpgSQL_stmt_open +PLpgSQL_stmt_perform +PLpgSQL_stmt_raise +PLpgSQL_stmt_return +PLpgSQL_stmt_return_next +PLpgSQL_stmt_return_query +PLpgSQL_stmt_rollback +PLpgSQL_stmt_type +PLpgSQL_stmt_while +PLpgSQL_trigtype +PLpgSQL_type +PLpgSQL_type_type +PLpgSQL_var +PLpgSQL_variable +PLwdatum +PLword +PLyArrayToOb +PLyCursorObject +PLyDatumToOb +PLyDatumToObFunc +PLyExceptionEntry +PLyExecutionContext +PLyObToArray +PLyObToDatum +PLyObToDatumFunc +PLyObToDomain +PLyObToScalar +PLyObToTransform +PLyObToTuple +PLyObject_AsString_t +PLyPlanObject +PLyProcedure +PLyProcedureEntry +PLyProcedureKey +PLyResultObject +PLySRFState +PLySavedArgs +PLyScalarToOb +PLySubtransactionData +PLySubtransactionObject +PLyTransformToOb +PLyTupleToOb +PLyUnicode_FromStringAndSize_t +PLy_elog_impl_t +PMINIDUMP_CALLBACK_INFORMATION +PMINIDUMP_EXCEPTION_INFORMATION +PMINIDUMP_USER_STREAM_INFORMATION +PMSignalData +PMSignalReason +PMState +POLYGON +PQArgBlock 
+PQEnvironmentOption +PQExpBuffer +PQExpBufferData +PQcommMethods +PQconninfoOption +PQnoticeProcessor +PQnoticeReceiver +PQprintOpt +PQsslKeyPassHook_OpenSSL_type +PREDICATELOCK +PREDICATELOCKTAG +PREDICATELOCKTARGET +PREDICATELOCKTARGETTAG +PROCESS_INFORMATION +PROCLOCK +PROCLOCKTAG +PROC_HDR +PROC_QUEUE +PSID +PSID_AND_ATTRIBUTES +PSQL_COMP_CASE +PSQL_ECHO +PSQL_ECHO_HIDDEN +PSQL_ERROR_ROLLBACK +PTEntryArray +PTIterationArray +PTOKEN_PRIVILEGES +PTOKEN_USER +PUTENVPROC +PVOID +PX_Alias +PX_Cipher +PX_Combo +PX_HMAC +PX_MD +Page +PageData +PageGistNSN +PageHeader +PageHeaderData +PageXLogRecPtr +PagetableEntry +Pairs +ParallelAppendState +ParallelBitmapHeapState +ParallelBlockTableScanDesc +ParallelBlockTableScanWorker +ParallelBlockTableScanWorkerData +ParallelCompletionPtr +ParallelContext +ParallelExecutorInfo +ParallelHashGrowth +ParallelHashJoinBatch +ParallelHashJoinBatchAccessor +ParallelHashJoinState +ParallelIndexScanDesc +ParallelReadyList +ParallelSlot +ParallelSlotArray +ParallelSlotResultHandler +ParallelState +ParallelTableScanDesc +ParallelTableScanDescData +ParallelWorkerContext +ParallelWorkerInfo +Param +ParamCompileHook +ParamExecData +ParamExternData +ParamFetchHook +ParamKind +ParamListInfo +ParamPathInfo +ParamRef +ParamsErrorCbData +ParentMapEntry +ParseCallbackState +ParseExprKind +ParseNamespaceColumn +ParseNamespaceItem +ParseParamRefHook +ParseState +ParsedLex +ParsedScript +ParsedText +ParsedWord +ParserSetupHook +ParserState +PartClauseInfo +PartClauseMatchStatus +PartClauseTarget +PartitionBoundInfo +PartitionBoundInfoData +PartitionBoundSpec +PartitionCmd +PartitionDesc +PartitionDescData +PartitionDirectory +PartitionDirectoryEntry +PartitionDispatch +PartitionElem +PartitionHashBound +PartitionKey +PartitionListValue +PartitionMap +PartitionPruneCombineOp +PartitionPruneContext +PartitionPruneInfo +PartitionPruneState +PartitionPruneStep +PartitionPruneStepCombine +PartitionPruneStepOp +PartitionPruningData +PartitionRangeBound 
+PartitionRangeDatum +PartitionRangeDatumKind +PartitionScheme +PartitionSpec +PartitionTupleRouting +PartitionedRelPruneInfo +PartitionedRelPruningData +PartitionwiseAggregateType +PasswordType +Path +PathClauseUsage +PathCostComparison +PathHashStack +PathKey +PathKeysComparison +PathTarget +PatternInfo +PatternInfoArray +Pattern_Prefix_Status +Pattern_Type +PendingFsyncEntry +PendingRelDelete +PendingRelSync +PendingUnlinkEntry +PendingWriteback +PerlInterpreter +Perl_check_t +Perl_ppaddr_t +Permutation +PermutationStep +PermutationStepBlocker +PermutationStepBlockerType +PgArchData +PgBackendGSSStatus +PgBackendSSLStatus +PgBackendStatus +PgBenchExpr +PgBenchExprLink +PgBenchExprList +PgBenchExprType +PgBenchFunction +PgBenchValue +PgBenchValueType +PgChecksumMode +PgFdwAnalyzeState +PgFdwConnState +PgFdwDirectModifyState +PgFdwModifyState +PgFdwOption +PgFdwPathExtraData +PgFdwRelationInfo +PgFdwScanState +PgIfAddrCallback +PgStat_ArchiverStats +PgStat_BackendFunctionEntry +PgStat_Counter +PgStat_FunctionCallUsage +PgStat_FunctionCounts +PgStat_FunctionEntry +PgStat_GlobalStats +PgStat_Msg +PgStat_MsgAnalyze +PgStat_MsgAnlAncestors +PgStat_MsgArchiver +PgStat_MsgAutovacStart +PgStat_MsgBgWriter +PgStat_MsgChecksumFailure +PgStat_MsgConnect +PgStat_MsgDeadlock +PgStat_MsgDisconnect +PgStat_MsgDropdb +PgStat_MsgDummy +PgStat_MsgFuncpurge +PgStat_MsgFuncstat +PgStat_MsgHdr +PgStat_MsgInquiry +PgStat_MsgRecoveryConflict +PgStat_MsgReplSlot +PgStat_MsgResetcounter +PgStat_MsgResetreplslotcounter +PgStat_MsgResetsharedcounter +PgStat_MsgResetsinglecounter +PgStat_MsgResetslrucounter +PgStat_MsgSLRU +PgStat_MsgTabpurge +PgStat_MsgTabstat +PgStat_MsgTempFile +PgStat_MsgVacuum +PgStat_MsgWal +PgStat_SLRUStats +PgStat_Shared_Reset_Target +PgStat_Single_Reset_Type +PgStat_StatDBEntry +PgStat_StatFuncEntry +PgStat_StatReplSlotEntry +PgStat_StatTabEntry +PgStat_SubXactStatus +PgStat_TableCounts +PgStat_TableEntry +PgStat_TableStatus +PgStat_TableXactStatus +PgStat_WalStats 
+PgXmlErrorContext +PgXmlStrictness +Pg_finfo_record +Pg_magic_struct +PipeProtoChunk +PipeProtoHeader +PlaceHolderInfo +PlaceHolderVar +Plan +PlanDirectModify_function +PlanForeignModify_function +PlanInvalItem +PlanRowMark +PlanState +PlannedStmt +PlannerGlobal +PlannerInfo +PlannerParamItem +Point +Pointer +PolicyInfo +PolyNumAggState +Pool +PopulateArrayContext +PopulateArrayState +PopulateRecordCache +PopulateRecordsetState +Port +Portal +PortalHashEnt +PortalStatus +PortalStrategy +PostParseColumnRefHook +PostgresPollingStatusType +PostingItem +PostponedQual +PreParseColumnRefHook +PredClass +PredIterInfo +PredIterInfoData +PredXactList +PredXactListElement +PredicateLockData +PredicateLockTargetType +PrefetchBufferResult +PrepParallelRestorePtrType +PrepareStmt +PreparedStatement +PresortedKeyData +PrewarmType +PrintExtraTocPtrType +PrintTocDataPtrType +PrintfArgType +PrintfArgValue +PrintfTarget +PrinttupAttrInfo +PrivTarget +PrivateRefCountEntry +ProcArrayStruct +ProcLangInfo +ProcSignalBarrierType +ProcSignalHeader +ProcSignalReason +ProcSignalSlot +ProcState +ProcWaitStatus +ProcessUtilityContext +ProcessUtility_hook_type +ProcessingMode +ProgressCommandType +ProjectSet +ProjectSetPath +ProjectSetState +ProjectionInfo +ProjectionPath +ProtocolVersion +PrsStorage +PruneState +PruneStepResult +PsqlScanCallbacks +PsqlScanQuoteType +PsqlScanResult +PsqlScanState +PsqlScanStateData +PsqlSettings +Publication +PublicationActions +PublicationInfo +PublicationPartOpt +PublicationRelInfo +PullFilter +PullFilterOps +PushFilter +PushFilterOps +PushFunction +PyCFunction +PyCodeObject +PyMappingMethods +PyMethodDef +PyModuleDef +PyObject +PySequenceMethods +PyTypeObject +Py_ssize_t +QPRS_STATE +QTN2QTState +QTNode +QUERYTYPE +QUERY_SECURITY_CONTEXT_TOKEN_FN +QualCost +QualItem +Query +QueryCompletion +QueryDesc +QueryEnvironment +QueryInfo +QueryItem +QueryItemType +QueryMode +QueryOperand +QueryOperator +QueryRepresentation +QueryRepresentationOperand +QuerySource 
+QueueBackendStatus +QueuePosition +QuitSignalReason +RBTNode +RBTOrderControl +RBTree +RBTreeIterator +REPARSE_JUNCTION_DATA_BUFFER +RIX +RI_CompareHashEntry +RI_CompareKey +RI_ConstraintInfo +RI_QueryHashEntry +RI_QueryKey +RTEKind +RWConflict +RWConflictPoolHeader +RandomState +Range +RangeBound +RangeBox +RangeFunction +RangeIOData +RangeQueryClause +RangeSubselect +RangeTableFunc +RangeTableFuncCol +RangeTableSample +RangeTblEntry +RangeTblFunction +RangeTblRef +RangeType +RangeVar +RangeVarGetRelidCallback +Ranges +RawColumnDefault +RawParseMode +RawStmt +ReInitializeDSMForeignScan_function +ReScanForeignScan_function +ReadBufPtrType +ReadBufferMode +ReadBytePtrType +ReadExtraTocPtrType +ReadFunc +ReassignOwnedStmt +RecheckForeignScan_function +RecordCacheEntry +RecordCompareData +RecordIOData +RecoveryLockListsEntry +RecoveryPauseState +RecoveryState +RecoveryTargetTimeLineGoal +RecoveryTargetType +RectBox +RecursionContext +RecursiveUnion +RecursiveUnionPath +RecursiveUnionState +RefetchForeignRow_function +RefreshMatViewStmt +RegProcedure +Regis +RegisNode +RegisteredBgWorker +ReindexErrorInfo +ReindexIndexInfo +ReindexObjectType +ReindexParams +ReindexStmt +ReindexType +RelFileNode +RelFileNodeBackend +RelIdCacheEnt +RelInfo +RelInfoArr +RelMapFile +RelMapping +RelOptInfo +RelOptKind +RelSizeEntry +RelTag +RelToCheck +RelToCluster +RelabelType +Relation +RelationData +RelationInfo +RelationPtr +RelationSyncEntry +RelcacheCallbackFunction +RelfilenodeMapEntry +RelfilenodeMapKey +Relids +RelocationBufferInfo +RelptrFreePageBtree +RelptrFreePageManager +RelptrFreePageSpanLeader +RenameStmt +ReopenPtrType +ReorderBuffer +ReorderBufferApplyChangeCB +ReorderBufferApplyTruncateCB +ReorderBufferBeginCB +ReorderBufferChange +ReorderBufferCommitCB +ReorderBufferCommitPreparedCB +ReorderBufferDiskChange +ReorderBufferIterTXNEntry +ReorderBufferIterTXNState +ReorderBufferMessageCB +ReorderBufferPrepareCB +ReorderBufferRollbackPreparedCB +ReorderBufferStreamAbortCB 
+ReorderBufferStreamChangeCB +ReorderBufferStreamCommitCB +ReorderBufferStreamMessageCB +ReorderBufferStreamPrepareCB +ReorderBufferStreamStartCB +ReorderBufferStreamStopCB +ReorderBufferStreamTruncateCB +ReorderBufferTXN +ReorderBufferTXNByIdEnt +ReorderBufferToastEnt +ReorderBufferTupleBuf +ReorderBufferTupleCidEnt +ReorderBufferTupleCidKey +ReorderTuple +RepOriginId +ReparameterizeForeignPathByChild_function +ReplaceVarsFromTargetList_context +ReplaceVarsNoMatchOption +ReplicaIdentityStmt +ReplicationKind +ReplicationSlot +ReplicationSlotCtlData +ReplicationSlotOnDisk +ReplicationSlotPersistency +ReplicationSlotPersistentData +ReplicationState +ReplicationStateCtl +ReplicationStateOnDisk +ResTarget +ReservoirState +ReservoirStateData +ResourceArray +ResourceOwner +ResourceReleaseCallback +ResourceReleaseCallbackItem +ResourceReleasePhase +RestoreOptions +RestorePass +RestrictInfo +Result +ResultRelInfo +ResultState +ReturnSetInfo +ReturnStmt +RevmapContents +RewriteMappingDataEntry +RewriteMappingFile +RewriteRule +RewriteState +RmgrData +RmgrDescData +RmgrId +RmgrIds +RoleSpec +RoleSpecType +RoleStmtType +RollupData +RowCompareExpr +RowCompareType +RowExpr +RowIdentityVarInfo +RowMarkClause +RowMarkType +RowSecurityDesc +RowSecurityPolicy +RuleInfo +RuleLock +RuleStmt +RunningTransactions +RunningTransactionsData +SC_HANDLE +SECURITY_ATTRIBUTES +SECURITY_STATUS +SEG +SERIALIZABLEXACT +SERIALIZABLEXID +SERIALIZABLEXIDTAG +SERVICE_STATUS +SERVICE_STATUS_HANDLE +SERVICE_TABLE_ENTRY +SHM_QUEUE +SID_AND_ATTRIBUTES +SID_IDENTIFIER_AUTHORITY +SID_NAME_USE +SISeg +SIZE_T +SMgrRelation +SMgrRelationData +SMgrSortArray +SOCKADDR +SOCKET +SPELL +SPICallbackArg +SPIExecuteOptions +SPIParseOpenOptions +SPIPlanPtr +SPIPrepareOptions +SPITupleTable +SPLITCOST +SPNode +SPNodeData +SPPageDesc +SQLCmd +SQLDropObject +SQLFunctionCache +SQLFunctionCachePtr +SQLFunctionParseInfo +SQLFunctionParseInfoPtr +SQLValueFunction +SQLValueFunctionOp +SSL +SSLExtensionInfoContext +SSL_CTX 
+STARTUPINFO +STRLEN +SV +SYNCHRONIZATION_BARRIER +SampleScan +SampleScanGetSampleSize_function +SampleScanState +SamplerRandomState +ScalarArrayOpExpr +ScalarArrayOpExprHashEntry +ScalarArrayOpExprHashTable +ScalarIOData +ScalarItem +ScalarMCVItem +Scan +ScanDirection +ScanKey +ScanKeyData +ScanKeywordHashFunc +ScanKeywordList +ScanState +ScanTypeControl +ScannerCallbackState +SchemaQuery +SecBuffer +SecBufferDesc +SecLabelItem +SecLabelStmt +SeenRelsEntry +SelectLimit +SelectStmt +Selectivity +SemTPadded +SemiAntiJoinFactors +SeqScan +SeqScanState +SeqTable +SeqTableData +SerCommitSeqNo +SerialControl +SerializableXactHandle +SerializedActiveRelMaps +SerializedRanges +SerializedReindexState +SerializedSnapshotData +SerializedTransactionState +Session +SessionBackupState +SessionEndType +SetConstraintState +SetConstraintStateData +SetConstraintTriggerData +SetExprState +SetFunctionReturnMode +SetOp +SetOpCmd +SetOpPath +SetOpState +SetOpStatePerGroup +SetOpStrategy +SetOperation +SetOperationStmt +SetQuantifier +SetToDefault +SetupWorkerPtrType +ShDependObjectInfo +SharedAggInfo +SharedBitmapState +SharedDependencyObjectType +SharedDependencyType +SharedExecutorInstrumentation +SharedFileSet +SharedHashInfo +SharedIncrementalSortInfo +SharedInvalCatalogMsg +SharedInvalCatcacheMsg +SharedInvalRelcacheMsg +SharedInvalRelmapMsg +SharedInvalSmgrMsg +SharedInvalSnapshotMsg +SharedInvalidationMessage +SharedJitInstrumentation +SharedMemoizeInfo +SharedRecordTableEntry +SharedRecordTableKey +SharedRecordTypmodRegistry +SharedSortInfo +SharedTuplestore +SharedTuplestoreAccessor +SharedTuplestoreChunk +SharedTuplestoreParticipant +SharedTypmodTableEntry +Sharedsort +ShellTypeInfo +ShippableCacheEntry +ShippableCacheKey +ShmemIndexEnt +ShutdownForeignScan_function +ShutdownInformation +ShutdownMode +SignTSVector +SimpleActionList +SimpleActionListCell +SimpleEcontextStackEntry +SimpleOidList +SimpleOidListCell +SimplePtrList +SimplePtrListCell +SimpleStats +SimpleStringList 
+SimpleStringListCell +SingleBoundSortItem +Size +SkipPages +SlabBlock +SlabChunk +SlabContext +SlabSlot +SlotErrCallbackArg +SlotNumber +SlruCtl +SlruCtlData +SlruErrorCause +SlruPageStatus +SlruScanCallback +SlruShared +SlruSharedData +SlruWriteAll +SlruWriteAllData +SnapBuild +SnapBuildOnDisk +SnapBuildState +Snapshot +SnapshotData +SnapshotType +SockAddr +Sort +SortBy +SortByDir +SortByNulls +SortCoordinate +SortGroupClause +SortItem +SortPath +SortShimExtra +SortState +SortSupport +SortSupportData +SortTuple +SortTupleComparator +SortedPoint +SpGistBuildState +SpGistCache +SpGistDeadTuple +SpGistDeadTupleData +SpGistInnerTuple +SpGistInnerTupleData +SpGistLUPCache +SpGistLastUsedPage +SpGistLeafTuple +SpGistLeafTupleData +SpGistMetaPageData +SpGistNodeTuple +SpGistNodeTupleData +SpGistOptions +SpGistPageOpaque +SpGistPageOpaqueData +SpGistScanOpaque +SpGistScanOpaqueData +SpGistSearchItem +SpGistState +SpGistTypeDesc +SpecialJoinInfo +SpinDelayStatus +SplitInterval +SplitLR +SplitPoint +SplitTextOutputData +SplitVar +SplitedPageLayout +StackElem +StartBlobPtrType +StartBlobsPtrType +StartDataPtrType +StartReplicationCmd +StartupStatusEnum +StatEntry +StatExtEntry +StatMsgType +StateFileChunk +StatisticExtInfo +Stats +StatsBuildData +StatsData +StatsElem +StatsExtInfo +StdAnalyzeData +StdRdOptIndexCleanup +StdRdOptions +Step +StopList +StrategyNumber +StreamCtl +StreamXidHash +StringInfo +StringInfoData +StripnullState +SubLink +SubLinkType +SubPlan +SubPlanState +SubRemoveRels +SubTransactionId +SubXactCallback +SubXactCallbackItem +SubXactEvent +SubXactInfo +SubqueryScan +SubqueryScanPath +SubqueryScanState +SubscriptExecSetup +SubscriptExecSteps +SubscriptRoutines +SubscriptTransform +SubscriptingRef +SubscriptingRefState +Subscription +SubscriptionInfo +SubscriptionRelState +SupportRequestCost +SupportRequestIndexCondition +SupportRequestRows +SupportRequestSelectivity +SupportRequestSimplify +Syn +SyncOps +SyncRepConfigData +SyncRepStandbyData 
+SyncRequestHandler +SyncRequestType +SysFKRelationship +SysScanDesc +SyscacheCallbackFunction +SystemRowsSamplerData +SystemSamplerData +SystemTimeSamplerData +TAR_MEMBER +TBMIterateResult +TBMIteratingState +TBMIterator +TBMSharedIterator +TBMSharedIteratorState +TBMStatus +TBlockState +TIDBitmap +TM_FailureData +TM_IndexDelete +TM_IndexDeleteOp +TM_IndexStatus +TM_Result +TOKEN_DEFAULT_DACL +TOKEN_INFORMATION_CLASS +TOKEN_PRIVILEGES +TOKEN_USER +TParser +TParserCharTest +TParserPosition +TParserSpecial +TParserState +TParserStateAction +TParserStateActionItem +TQueueDestReceiver +TRGM +TSAnyCacheEntry +TSConfigCacheEntry +TSConfigInfo +TSDictInfo +TSDictionaryCacheEntry +TSExecuteCallback +TSLexeme +TSParserCacheEntry +TSParserInfo +TSQuery +TSQueryData +TSQueryParserState +TSQuerySign +TSReadPointer +TSTemplateInfo +TSTernaryValue +TSTokenTypeStorage +TSVector +TSVectorBuildState +TSVectorData +TSVectorParseState +TSVectorStat +TState +TStoreState +TXNEntryFile +TYPCATEGORY +T_Action +T_WorkerStatus +TabStatHashEntry +TabStatusArray +TableAmRoutine +TableAttachInfo +TableDataInfo +TableFunc +TableFuncRoutine +TableFuncScan +TableFuncScanState +TableInfo +TableLikeClause +TableSampleClause +TableScanDesc +TableScanDescData +TableSpaceCacheEntry +TableSpaceOpts +TablespaceList +TablespaceListCell +TapeBlockTrailer +TapeShare +TarMethodData +TarMethodFile +TargetEntry +TclExceptionNameMap +Tcl_DString +Tcl_FileProc +Tcl_HashEntry +Tcl_HashTable +Tcl_Interp +Tcl_NotifierProcs +Tcl_Obj +Tcl_Time +TempNamespaceStatus +TestDecodingData +TestDecodingTxnData +TestSpec +TextFreq +TextPositionState +TheLexeme +TheSubstitute +TidExpr +TidExprType +TidHashKey +TidOpExpr +TidPath +TidRangePath +TidRangeScan +TidRangeScanState +TidScan +TidScanState +TimeADT +TimeLineHistoryCmd +TimeLineHistoryEntry +TimeLineID +TimeOffset +TimeStamp +TimeTzADT +TimeZoneAbbrevTable +TimeoutId +TimeoutType +Timestamp +TimestampTz +TmFromChar +TmToChar +ToastAttrInfo +ToastCompressionId 
+ToastTupleContext +ToastedAttribute +TocEntry +TokenAuxData +TokenizedLine +TrackItem +TransInvalidationInfo +TransState +TransactionId +TransactionState +TransactionStateData +TransactionStmt +TransactionStmtKind +TransformInfo +TransformJsonStringValuesState +TransitionCaptureState +TrgmArc +TrgmArcInfo +TrgmBound +TrgmColor +TrgmColorInfo +TrgmGistOptions +TrgmNFA +TrgmPackArcInfo +TrgmPackedArc +TrgmPackedGraph +TrgmPackedState +TrgmPrefix +TrgmState +TrgmStateKey +TrieChar +Trigger +TriggerData +TriggerDesc +TriggerEvent +TriggerFlags +TriggerInfo +TriggerTransition +TruncateStmt +TsmRoutine +TupOutputState +TupSortStatus +TupStoreStatus +TupleConstr +TupleConversionMap +TupleDesc +TupleHashEntry +TupleHashEntryData +TupleHashIterator +TupleHashTable +TupleQueueReader +TupleTableSlot +TupleTableSlotOps +TuplesortInstrumentation +TuplesortMethod +TuplesortSpaceType +Tuplesortstate +Tuplestorestate +TwoPhaseCallback +TwoPhaseFileHeader +TwoPhaseLockRecord +TwoPhasePgStatRecord +TwoPhasePredicateLockRecord +TwoPhasePredicateRecord +TwoPhasePredicateRecordType +TwoPhasePredicateXactRecord +TwoPhaseRecordOnDisk +TwoPhaseRmgrId +TwoPhaseStateData +Type +TypeCacheEntry +TypeCacheEnumData +TypeCast +TypeCat +TypeFuncClass +TypeInfo +TypeName +U +U32 +U8 +UChar +UCharIterator +UColAttribute +UColAttributeValue +UCollator +UConverter +UErrorCode +UINT +ULARGE_INTEGER +ULONG +ULONG_PTR +UV +UVersionInfo +UnicodeNormalizationForm +UnicodeNormalizationQC +Unique +UniquePath +UniquePathMethod +UniqueState +UnlistenStmt +UnpackTarState +UnresolvedTup +UnresolvedTupData +UpdateStmt +UpperRelationKind +UpperUniquePath +UserAuth +UserMapping +UserOpts +VacAttrStats +VacAttrStatsP +VacErrPhase +VacOptValue +VacuumParams +VacuumRelation +VacuumStmt +ValidateIndexState +Value +ValuesScan +ValuesScanState +Var +VarBit +VarChar +VarParamState +VarString +VarStringSortSupport +Variable +VariableAssignHook +VariableCache +VariableCacheData +VariableSetKind +VariableSetStmt 
+VariableShowStmt +VariableSpace +VariableStatData +VariableSubstituteHook +VersionedQuery +Vfd +ViewCheckOption +ViewOptCheckOption +ViewOptions +ViewStmt +VirtualTransactionId +VirtualTupleTableSlot +VolatileFunctionStatus +Vsrt +WAIT_ORDER +WALAvailability +WALInsertLock +WALInsertLockPadded +WALOpenSegment +WALReadError +WALSegmentCloseCB +WALSegmentContext +WALSegmentOpenCB +WCHAR +WCOKind +WFW_WaitOption +WIDGET +WORD +WORKSTATE +WSABUF +WSADATA +WSANETWORKEVENTS +WSAPROTOCOL_INFO +WaitEvent +WaitEventActivity +WaitEventClient +WaitEventIO +WaitEventIPC +WaitEventSet +WaitEventTimeout +WaitPMResult +WalCloseMethod +WalLevel +Safekeeper +WalMessage +WalRcvData +WalRcvExecResult +WalRcvExecStatus +WalRcvState +WalRcvStreamOptions +WalReceiverConn +WalReceiverFunctionsType +WalSnd +WalSndCtlData +WalSndSendDataCallback +WalSndState +WalTimeSample +WalUsage +WalWriteMethod +Walfile +WindowAgg +WindowAggPath +WindowAggState +WindowClause +WindowClauseSortData +WindowDef +WindowFunc +WindowFuncExprState +WindowFuncLists +WindowObject +WindowObjectData +WindowStatePerAgg +WindowStatePerAggData +WindowStatePerFunc +WithCheckOption +WithClause +WordEntry +WordEntryIN +WordEntryPos +WordEntryPosVector +WordEntryPosVector1 +WorkTableScan +WorkTableScanState +WorkerInfo +WorkerInfoData +WorkerInstrumentation +WorkerJobDumpPtrType +WorkerJobRestorePtrType +Working_State +WriteBufPtrType +WriteBytePtrType +WriteDataCallback +WriteDataPtrType +WriteExtraTocPtrType +WriteFunc +WriteManifestState +WriteTarState +WritebackContext +X509 +X509_EXTENSION +X509_NAME +X509_NAME_ENTRY +X509_STORE +X509_STORE_CTX +XLTW_Oper +XLogCtlData +XLogCtlInsert +XLogDumpConfig +XLogDumpPrivate +XLogDumpStats +XLogLongPageHeader +XLogLongPageHeaderData +XLogPageHeader +XLogPageHeaderData +XLogPageReadCB +XLogPageReadPrivate +XLogReaderRoutine +XLogReaderState +XLogRecData +XLogRecPtr +XLogRecord +XLogRecordBlockCompressHeader +XLogRecordBlockHeader +XLogRecordBlockImageHeader +XLogRecordBuffer 
+XLogRedoAction +XLogSegNo +XLogSource +XLogwrtResult +XLogwrtRqst +XPVIV +XPVMG +XactCallback +XactCallbackItem +XactEvent +XactLockTableWaitInfo +XidBoundsViolation +XidCacheStatus +XidCommitStatus +XidStatus +XmlExpr +XmlExprOp +XmlOptionType +XmlSerialize +XmlTableBuilderData +YYLTYPE +YYSTYPE +YY_BUFFER_STATE +ZenithErrorResponse +ZenithExistsRequest +ZenithExistsResponse +ZenithGetPageRequest +ZenithGetPageResponse +ZenithMessage +ZenithMessageTag +ZenithNblocksRequest +ZenithNblocksResponse +ZenithRequest +ZenithResponse +_SPI_connection +_SPI_plan +__AssignProcessToJobObject +__CreateJobObject +__CreateRestrictedToken +__IsProcessInJob +__QueryInformationJobObject +__SetInformationJobObject +__time64_t +_dev_t +_ino_t +_resultmap +_stringlist +acquireLocksOnSubLinks_context +adjust_appendrel_attrs_context +aff_regex_struct +allocfunc +amadjustmembers_function +ambeginscan_function +ambuild_function +ambuildempty_function +ambuildphasename_function +ambulkdelete_function +amcanreturn_function +amcostestimate_function +amendscan_function +amestimateparallelscan_function +amgetbitmap_function +amgettuple_function +aminitparallelscan_function +aminsert_function +ammarkpos_function +amoptions_function +amparallelrescan_function +amproperty_function +amrescan_function +amrestrpos_function +amvacuumcleanup_function +amvalidate_function +array_iter +array_unnest_fctx +assign_collations_context +autovac_table +av_relation +avl_dbase +avl_node +avl_tree +avw_dbase +backslashResult +backup_manifest_info +backup_manifest_option +base_yy_extra_type +basebackup_options +bgworker_main_type +binaryheap +binaryheap_comparator +bitmapword +bits16 +bits32 +bits8 +bloom_filter +brin_column_state +brin_serialize_callback_type +bytea +cached_re_str +cashKEY +cfp +check_agg_arguments_context +check_function_callback +check_network_data +check_object_relabel_type +check_password_hook_type +check_ungrouped_columns_context +chr +clock_t +cmpEntriesArg +cmpfunc +codes_t +coercion 
+collation_cache_entry +color +colormaprange +compare_context +config_var_value +contain_aggs_of_level_context +convert_testexpr_context +copy_data_source_cb +core_YYSTYPE +core_yy_extra_type +core_yyscan_t +corrupt_items +cost_qual_eval_context +cp_hash_func +create_upper_paths_hook_type +createdb_failure_params +crosstab_HashEnt +crosstab_cat_desc +datapagemap_iterator_t +datapagemap_t +dateKEY +datetkn +dce_uuid_t +decimal +deparse_columns +deparse_context +deparse_expr_cxt +deparse_namespace +destructor +dev_t +digit +disassembledLeaf +dlist_head +dlist_iter +dlist_mutable_iter +dlist_node +ds_state +dsa_area +dsa_area_control +dsa_area_pool +dsa_area_span +dsa_handle +dsa_pointer +dsa_pointer_atomic +dsa_segment_header +dsa_segment_index +dsa_segment_map +dshash_compare_function +dshash_hash +dshash_hash_function +dshash_parameters +dshash_partition +dshash_table +dshash_table_control +dshash_table_handle +dshash_table_item +dsm_control_header +dsm_control_item +dsm_handle +dsm_op +dsm_segment +dsm_segment_detach_callback +eLogType +ean13 +eary +ec_matches_callback_type +ec_member_foreign_arg +ec_member_matches_arg +emit_log_hook_type +eval_const_expressions_context +exec_thread_arg +execution_state +explain_get_index_name_hook_type +f_smgr +fd_set +fe_scram_state +fe_scram_state_enum +fetch_range_request +file_action_t +file_entry_t +file_type_t +filehash_hash +filehash_iterator +filemap_t +fill_string_relopt +finalize_primnode_context +find_dependent_phvs_context +find_expr_references_context +fix_join_expr_context +fix_scan_expr_context +fix_upper_expr_context +flatten_join_alias_vars_context +float4 +float4KEY +float8 +float8KEY +floating_decimal_32 +floating_decimal_64 +fmAggrefPtr +fmExprContextCallbackFunction +fmNodePtr +fmStringInfo +fmgr_hook_type +foreign_glob_cxt +foreign_loc_cxt +freeaddrinfo_ptr_t +freefunc +fsec_t +gbt_vsrt_arg +gbtree_ninfo +gbtree_vinfo +generate_series_fctx +generate_series_numeric_fctx +generate_series_timestamp_fctx 
+generate_series_timestamptz_fctx +generate_subscripts_fctx +get_attavgwidth_hook_type +get_index_stats_hook_type +get_relation_info_hook_type +get_relation_stats_hook_type +getaddrinfo_ptr_t +getnameinfo_ptr_t +gid_t +gin_leafpage_items_state +ginxlogCreatePostingTree +ginxlogDeleteListPages +ginxlogDeletePage +ginxlogInsert +ginxlogInsertDataInternal +ginxlogInsertEntry +ginxlogInsertListPage +ginxlogRecompressDataLeaf +ginxlogSplit +ginxlogUpdateMeta +ginxlogVacuumDataLeafPage +gistxlogDelete +gistxlogPage +gistxlogPageDelete +gistxlogPageReuse +gistxlogPageSplit +gistxlogPageUpdate +grouping_sets_data +gseg_picksplit_item +gss_buffer_desc +gss_cred_id_t +gss_ctx_id_t +gss_name_t +gtrgm_consistent_cache +gzFile +hashfunc +hbaPort +heap_page_items_state +help_handler +hlCheck +hstoreCheckKeyLen_t +hstoreCheckValLen_t +hstorePairs_t +hstoreUniquePairs_t +hstoreUpgrade_t +hyperLogLogState +ifState +ilist +import_error_callback_arg +indexed_tlist +inet +inetKEY +inet_struct +init_function +inline_cte_walker_context +inline_error_callback_arg +ino_t +inquiry +instr_time +int128 +int16 +int16KEY +int2vector +int32 +int32KEY +int32_t +int64 +int64KEY +int8 +internalPQconninfoOption +intptr_t +intset_internal_node +intset_leaf_node +intset_node +intvKEY +itemIdCompact +itemIdCompactData +iterator +jmp_buf +join_search_hook_type +json_aelem_action +json_manifest_error_callback +json_manifest_perfile_callback +json_manifest_perwalrange_callback +json_ofield_action +json_scalar_action +json_struct_action +keyEntryData +key_t +lclContext +lclTocEntry +leafSegmentInfo +leaf_item +libpq_source +line_t +lineno_t +list_sort_comparator +local_relopt +local_relopts +local_source +locale_t +locate_agg_of_level_context +locate_var_of_level_context +locate_windowfunc_context +logstreamer_param +lquery +lquery_level +lquery_variant +ltree +ltree_gist +ltree_level +ltxtquery +mXactCacheEnt +mac8KEY +macKEY +macaddr +macaddr8 +macaddr_sortsupport_state +manifest_file 
+manifest_files_hash +manifest_files_iterator +manifest_wal_range +map_variable_attnos_context +max_parallel_hazard_context +mb2wchar_with_len_converter +mbchar_verifier +mbcharacter_incrementer +mbdisplaylen_converter +mblen_converter +mbstr_verifier +memoize_hash +memoize_iterator +metastring +mix_data_t +mixedStruct +mode_t +movedb_failure_params +mp_digit +mp_int +mp_result +mp_sign +mp_size +mp_small +mp_usmall +mp_word +mpz_t +multirange_bsearch_comparison +mxact +mxtruncinfo +needs_fmgr_hook_type +network_sortsupport_state +nodeitem +normal_rand_fctx +ntile_context +numeric +object_access_hook_type +off_t +oidKEY +oidvector +on_dsm_detach_callback +on_exit_nicely_callback +openssl_tls_init_hook_typ +ossl_EVP_cipher_func +other +output_type +pagetable_hash +pagetable_iterator +pairingheap +pairingheap_comparator +pairingheap_node +parallel_worker_main_type +parse_error_callback_arg +parser_context +partition_method_t +pendingPosition +pgParameterStatus +pg_atomic_flag +pg_atomic_uint32 +pg_atomic_uint64 +pg_checksum_context +pg_checksum_raw_context +pg_checksum_type +pg_conn_host +pg_conn_host_type +pg_conv_map +pg_crc32 +pg_crc32c +pg_cryptohash_ctx +pg_cryptohash_type +pg_ctype_cache +pg_enc +pg_enc2gettext +pg_enc2name +pg_encname +pg_funcptr_t +pg_gssinfo +pg_hmac_ctx +pg_int64 +pg_local_to_utf_combined +pg_locale_t +pg_mb_radix_tree +pg_md5_ctx +pg_on_exit_callback +pg_re_flags +pg_saslprep_rc +pg_sha1_ctx +pg_sha224_ctx +pg_sha256_ctx +pg_sha384_ctx +pg_sha512_ctx +pg_snapshot +pg_stack_base_t +pg_time_t +pg_time_usec_t +pg_tz +pg_tz_cache +pg_tzenum +pg_unicode_decompinfo +pg_unicode_decomposition +pg_unicode_norminfo +pg_unicode_normprops +pg_unicode_recompinfo +pg_utf_to_local_combined +pg_uuid_t +pg_wc_probefunc +pg_wchar +pg_wchar_tbl +pgp_armor_headers_state +pgpid_t +pgsocket +pgsql_thing_t +pgssEntry +pgssGlobalStats +pgssHashKey +pgssSharedState +pgssStoreKind +pgssVersion +pgstat_page +pgstattuple_type +pgthreadlock_t +pid_t +pivot_field 
+planner_hook_type +plperl_array_info +plperl_call_data +plperl_interp_desc +plperl_proc_desc +plperl_proc_key +plperl_proc_ptr +plperl_query_desc +plperl_query_entry +plpgsql_CastHashEntry +plpgsql_CastHashKey +plpgsql_HashEnt +pltcl_call_state +pltcl_interp_desc +pltcl_proc_desc +pltcl_proc_key +pltcl_proc_ptr +pltcl_query_desc +pointer +polymorphic_actuals +pos_trgm +post_parse_analyze_hook_type +postprocess_result_function +pqbool +pqsigfunc +printQueryOpt +printTableContent +printTableFooter +printTableOpt +printTextFormat +printTextLineFormat +printTextLineWrap +printTextRule +printfunc +priv_map +process_file_callback_t +process_sublinks_context +proclist_head +proclist_mutable_iter +proclist_node +promptStatus_t +pthread_barrier_t +pthread_cond_t +pthread_key_t +pthread_mutex_t +pthread_once_t +pthread_t +ptrdiff_t +pull_var_clause_context +pull_varattnos_context +pull_varnos_context +pull_vars_context +pullup_replace_vars_context +pushdown_safety_info +qc_hash_func +qsort_arg_comparator +qsort_comparator +query_pathkeys_callback +radius_attribute +radius_packet +rangeTableEntry_used_context +rank_context +rbt_allocfunc +rbt_combiner +rbt_comparator +rbt_freefunc +reduce_outer_joins_state +reference +regex_arc_t +regex_t +regexp +regexp_matches_ctx +registered_buffer +regmatch_t +regoff_t +regproc +relopt_bool +relopt_enum +relopt_enum_elt_def +relopt_gen +relopt_int +relopt_kind +relopt_parse_elt +relopt_real +relopt_string +relopt_type +relopt_value +relopts_validator +remoteConn +remoteConnHashEnt +remoteDep +rendezvousHashEntry +replace_rte_variables_callback +replace_rte_variables_context +ret_type +rewind_source +rewrite_event +rijndael_ctx +rm_detail_t +role_auth_extra +row_security_policy_hook_type +rsv_callback +saophash_hash +save_buffer +scram_state +scram_state_enum +sem_t +sequence_magic +set_join_pathlist_hook_type +set_rel_pathlist_hook_type +shm_mq +shm_mq_handle +shm_mq_iovec +shm_mq_result +shm_toc +shm_toc_entry +shm_toc_estimator 
+shmem_startup_hook_type +sig_atomic_t +sigjmp_buf +signedbitmapword +sigset_t +size_t +slist_head +slist_iter +slist_mutable_iter +slist_node +slock_t +socket_set +spgBulkDeleteState +spgChooseIn +spgChooseOut +spgChooseResultType +spgConfigIn +spgConfigOut +spgInnerConsistentIn +spgInnerConsistentOut +spgLeafConsistentIn +spgLeafConsistentOut +spgNodePtr +spgPickSplitIn +spgPickSplitOut +spgVacPendingItem +spgxlogAddLeaf +spgxlogAddNode +spgxlogMoveLeafs +spgxlogPickSplit +spgxlogSplitTuple +spgxlogState +spgxlogVacuumLeaf +spgxlogVacuumRedirect +spgxlogVacuumRoot +split_pathtarget_context +split_pathtarget_item +sql_error_callback_arg +sqlparseInfo +sqlparseState +ss_lru_item_t +ss_scan_location_t +ss_scan_locations_t +ssize_t +standard_qp_extra +stemmer_module +stmtCacheEntry +storeInfo +storeRes_func +stream_stop_callback +string +substitute_actual_parameters_context +substitute_actual_srf_parameters_context +substitute_phv_relids_context +svtype +symbol +tablespaceinfo +teSection +temp_tablespaces_extra +test_re_flags +test_regex_ctx +test_shm_mq_header +test_spec +test_start_function +text +timeKEY +time_t +timeout_handler_proc +timeout_params +timerCA +tlist_vinfo +toast_compress_header +transferMode +transfer_thread_arg +trgm +trgm_mb_char +trivalue +tsKEY +ts_parserstate +ts_tokenizer +ts_tokentype +tsearch_readline_state +tuplehash_hash +tuplehash_iterator +type +tzEntry +u1byte +u4byte +u_char +u_int +uchr +uid_t +uint128 +uint16 +uint16_t +uint32 +uint32_t +uint64 +uint64_t +uint8 +uint8_t +uintptr_t +unicodeStyleBorderFormat +unicodeStyleColumnFormat +unicodeStyleFormat +unicodeStyleRowFormat +unicode_linestyle +unit_conversion +unlogged_relation_entry +utf_local_conversion_func +uuidKEY +uuid_rc_t +uuid_sortsupport_state +uuid_t +va_list +vacuumingOptions +validate_string_relopt +varatt_expanded +varattrib_1b +varattrib_1b_e +varattrib_4b +vbits +verifier_context +walrcv_check_conninfo_fn +walrcv_connect_fn +walrcv_create_slot_fn 
+walrcv_disconnect_fn +walrcv_endstreaming_fn +walrcv_exec_fn +walrcv_get_backend_pid_fn +walrcv_get_conninfo_fn +walrcv_get_senderinfo_fn +walrcv_identify_system_fn +walrcv_readtimelinehistoryfile_fn +walrcv_receive_fn +walrcv_send_fn +walrcv_server_version_fn +walrcv_startstreaming_fn +wchar2mb_with_len_converter +wchar_t +win32_deadchild_waitinfo +wint_t +worker_state +worktable +wrap +xl_brin_createidx +xl_brin_desummarize +xl_brin_insert +xl_brin_revmap_extend +xl_brin_samepage_update +xl_brin_update +xl_btree_dedup +xl_btree_delete +xl_btree_insert +xl_btree_mark_page_halfdead +xl_btree_metadata +xl_btree_newroot +xl_btree_reuse_page +xl_btree_split +xl_btree_unlink_page +xl_btree_update +xl_btree_vacuum +xl_clog_truncate +xl_commit_ts_truncate +xl_dbase_create_rec +xl_dbase_drop_rec +xl_end_of_recovery +xl_hash_add_ovfl_page +xl_hash_delete +xl_hash_init_bitmap_page +xl_hash_init_meta_page +xl_hash_insert +xl_hash_move_page_contents +xl_hash_split_allocate_page +xl_hash_split_complete +xl_hash_squeeze_page +xl_hash_update_meta_page +xl_hash_vacuum_one_page +xl_heap_confirm +xl_heap_delete +xl_heap_freeze_page +xl_heap_freeze_tuple +xl_heap_header +xl_heap_inplace +xl_heap_insert +xl_heap_lock +xl_heap_lock_updated +xl_heap_multi_insert +xl_heap_new_cid +xl_heap_prune +xl_heap_rewrite_mapping +xl_heap_truncate +xl_heap_update +xl_heap_vacuum +xl_heap_visible +xl_invalid_page +xl_invalid_page_key +xl_invalidations +xl_logical_message +xl_multi_insert_tuple +xl_multixact_create +xl_multixact_truncate +xl_overwrite_contrecord +xl_parameter_change +xl_relmap_update +xl_replorigin_drop +xl_replorigin_set +xl_restore_point +xl_running_xacts +xl_seq_rec +xl_smgr_create +xl_smgr_truncate +xl_standby_lock +xl_standby_locks +xl_tblspc_create_rec +xl_tblspc_drop_rec +xl_xact_abort +xl_xact_assignment +xl_xact_commit +xl_xact_dbinfo +xl_xact_invals +xl_xact_origin +xl_xact_parsed_abort +xl_xact_parsed_commit +xl_xact_parsed_prepare +xl_xact_prepare +xl_xact_relfilenodes 
+xl_xact_subxacts +xl_xact_twophase +xl_xact_xinfo +xmlBuffer +xmlBufferPtr +xmlChar +xmlDocPtr +xmlErrorPtr +xmlExternalEntityLoader +xmlGenericErrorFunc +xmlNodePtr +xmlNodeSetPtr +xmlParserCtxtPtr +xmlParserInputPtr +xmlStructuredErrorFunc +xmlTextWriter +xmlTextWriterPtr +xmlXPathCompExprPtr +xmlXPathContextPtr +xmlXPathObjectPtr +xmltype +xpath_workspace +xsltSecurityPrefsPtr +xsltStylesheetPtr +xsltTransformContextPtr +yy_parser +yy_size_t +yyscan_t +z_stream +z_streamp +zic_t From b8eb908a3df34f437b4f123461b14b599be4a8b4 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 15:43:53 +0300 Subject: [PATCH 056/166] Rename old project name references --- Cargo.lock | 8 +- Cargo.toml | 2 +- Dockerfile | 16 +- compute_tools/Cargo.toml | 4 +- control_plane/Cargo.toml | 2 +- control_plane/simple.conf | 2 +- control_plane/src/bin/neon_local.rs | 34 +- control_plane/src/compute.rs | 20 +- control_plane/src/local_env.rs | 54 +-- control_plane/src/postgresql_conf.rs | 2 +- control_plane/src/safekeeper.rs | 8 +- control_plane/src/storage.rs | 36 +- docs/authentication.md | 4 +- docs/multitenancy.md | 18 +- docs/pageserver-services.md | 2 +- docs/pageserver-storage.md | 10 +- docs/pageserver-tenant-migration.md | 4 +- docs/rfcs/013-term-history.md | 2 +- docs/rfcs/cluster-size-limits.md | 8 +- docs/sourcetree.md | 11 +- libs/etcd_broker/src/subscription_key.rs | 26 +- libs/postgres_ffi/Cargo.toml | 2 +- libs/postgres_ffi/wal_craft/Cargo.toml | 2 +- libs/utils/Cargo.toml | 4 +- libs/utils/benches/benchmarks.rs | 4 +- libs/utils/src/auth.rs | 14 +- libs/utils/src/http/endpoint.rs | 6 +- libs/utils/src/http/mod.rs | 2 +- libs/utils/src/{zid.rs => id.rs} | 88 ++-- libs/utils/src/lib.rs | 2 +- libs/utils/src/postgres_backend.rs | 12 +- libs/utils/src/postgres_backend_async.rs | 4 +- pageserver/Cargo.toml | 8 +- pageserver/src/bin/dump_layerfile.rs | 2 +- pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/bin/update_metadata.rs | 2 +- 
pageserver/src/config.rs | 16 +- pageserver/src/http/models.rs | 24 +- pageserver/src/http/routes.rs | 42 +- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 10 +- pageserver/src/metrics.rs | 6 +- pageserver/src/page_cache.rs | 14 +- pageserver/src/page_service.rs | 76 ++-- pageserver/src/pgdatadir_mapping.rs | 8 +- pageserver/src/repository.rs | 4 +- pageserver/src/storage_sync.rs | 72 ++-- pageserver/src/storage_sync/delete.rs | 8 +- pageserver/src/storage_sync/download.rs | 40 +- pageserver/src/storage_sync/index.rs | 42 +- pageserver/src/storage_sync/upload.rs | 12 +- pageserver/src/task_mgr.rs | 18 +- pageserver/src/tenant.rs | 100 ++--- pageserver/src/tenant/delta_layer.rs | 84 ++-- pageserver/src/tenant/ephemeral_file.rs | 36 +- pageserver/src/tenant/image_layer.rs | 84 ++-- pageserver/src/tenant/inmemory_layer.rs | 38 +- pageserver/src/tenant/layer_map.rs | 2 +- pageserver/src/tenant/metadata.rs | 20 +- pageserver/src/tenant/storage_layer.rs | 10 +- pageserver/src/tenant/timeline.rs | 16 +- pageserver/src/tenant_config.rs | 6 +- pageserver/src/tenant_mgr.rs | 36 +- pageserver/src/tenant_tasks.rs | 12 +- pageserver/src/timelines.rs | 14 +- pageserver/src/virtual_file.rs | 38 +- pageserver/src/walingest.rs | 26 +- .../src/walreceiver/connection_manager.rs | 18 +- .../src/walreceiver/walreceiver_connection.rs | 8 +- pageserver/src/walrecord.rs | 16 +- pageserver/src/walredo.rs | 80 ++-- pgxn/neon/inmem_smgr.c | 2 +- pgxn/neon/libpagestore.c | 49 ++- pgxn/neon/neon.c | 2 - pgxn/neon/pagestore_client.h | 153 ++++--- pgxn/neon/pagestore_smgr.c | 408 +++++++++--------- pgxn/neon/relsize_cache.c | 6 +- pgxn/neon/walproposer.c | 114 ++--- pgxn/neon/walproposer.h | 38 +- pgxn/neon_test_utils/neontest.c | 32 +- proxy/Cargo.toml | 2 +- pyproject.toml | 2 +- safekeeper/Cargo.toml | 6 +- safekeeper/src/bin/safekeeper.rs | 6 +- safekeeper/src/broker.rs | 10 +- safekeeper/src/control_file.rs | 18 +- safekeeper/src/control_file_upgrade.rs | 25 +- 
safekeeper/src/handler.rs | 30 +- safekeeper/src/http/models.rs | 4 +- safekeeper/src/http/routes.rs | 14 +- safekeeper/src/json_ctrl.rs | 4 +- safekeeper/src/lib.rs | 6 +- safekeeper/src/metrics.rs | 4 +- safekeeper/src/receive_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 36 +- safekeeper/src/send_wal.rs | 8 +- safekeeper/src/timeline.rs | 49 ++- safekeeper/src/wal_backup.rs | 14 +- safekeeper/src/wal_storage.rs | 8 +- scripts/generate_and_push_perf_report.sh | 8 +- scripts/perf_report_template.html | 4 +- test_runner/README.md | 2 +- test_runner/fixtures/benchmark_fixture.py | 6 +- test_runner/fixtures/neon_fixtures.py | 132 +++--- test_runner/fixtures/types.py | 14 +- test_runner/performance/README.md | 2 +- test_runner/regress/test_ancestor_branch.py | 8 +- test_runner/regress/test_auth.py | 4 +- test_runner/regress/test_branch_behind.py | 4 +- test_runner/regress/test_broken_timeline.py | 4 +- test_runner/regress/test_fullbackup.py | 4 +- test_runner/regress/test_gc_aggressive.py | 8 +- test_runner/regress/test_import.py | 12 +- test_runner/regress/test_neon_cli.py | 8 +- test_runner/regress/test_old_request_lsn.py | 4 +- test_runner/regress/test_pageserver_api.py | 24 +- test_runner/regress/test_pitr_gc.py | 4 +- test_runner/regress/test_remote_storage.py | 8 +- test_runner/regress/test_tenant_detach.py | 8 +- test_runner/regress/test_tenant_relocation.py | 22 +- test_runner/regress/test_tenant_tasks.py | 10 +- test_runner/regress/test_tenants.py | 4 +- .../test_tenants_with_remote_storage.py | 8 +- test_runner/regress/test_timeline_delete.py | 6 +- test_runner/regress/test_timeline_size.py | 8 +- test_runner/regress/test_wal_acceptor.py | 64 ++- .../regress/test_wal_acceptor_async.py | 10 +- test_runner/regress/test_wal_restore.py | 4 +- 128 files changed, 1428 insertions(+), 1495 deletions(-) rename libs/utils/src/{zid.rs => id.rs} (76%) diff --git a/Cargo.lock b/Cargo.lock index e9ebcdc5ac..d4234d2b00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2048,7 
+2048,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.2" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", @@ -2061,7 +2061,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "base64", "byteorder", @@ -2079,7 +2079,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.3" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", @@ -3295,7 +3295,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.6" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 1936b261f7..bc2a705558 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,4 +70,4 @@ lto = true # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
[patch.crates-io] -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } diff --git a/Dockerfile b/Dockerfile index 3e173f4d5b..eacb88d168 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ RUN set -e \ && rm -rf pg_install/v15/build \ && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . -# Build zenith binaries +# Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local @@ -60,12 +60,12 @@ RUN set -e \ openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ - && useradd -d /data zenith \ - && chown -R zenith:zenith /data + && useradd -d /data neon \ + && chown -R neon:neon /data -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin # v14 is default for now COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ @@ -73,7 +73,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. 
-RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \ +RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ -c "broker_endpoints=['http://etcd:2379']" \ @@ -82,7 +82,7 @@ RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \ -c "listen_http_addr='0.0.0.0:9898'" VOLUME ["/data"] -USER zenith +USER neon EXPOSE 6400 EXPOSE 9898 CMD ["/bin/bash"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 78b85d0e79..b13f7f191d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,12 +10,12 @@ clap = "3.0" env_logger = "0.9" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 8a79a6e566..ab9df8534c 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -8,7 +8,7 @@ clap = "3.0" comfy-table = "5.0.1" git-version = "0.3.5" tar = "0.4.38" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" 
} serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" toml = "0.5" diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 925e2f14ee..ae60657400 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -1,4 +1,4 @@ -# Minimal zenith environment with one safekeeper. This is equivalent to the built-in +# Minimal neon environment with one safekeeper. This is equivalent to the built-in # defaults that you get with no --config [pageserver] listen_pg_addr = '127.0.0.1:64000' diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e3160db53b..e16fd8764a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -27,10 +27,10 @@ use std::process::exit; use std::str::FromStr; use utils::{ auth::{Claims, Scope}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, project_git_version, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; // Default id of a safekeeper node, if not specified on the command line. @@ -72,7 +72,7 @@ struct TimelineTreeEl { /// Name, recovered from neon config mappings pub name: Option, /// Holds all direct children of this timeline referenced using `timeline_id`. 
- pub children: BTreeSet, + pub children: BTreeSet, } // Main entry point for the 'neon_local' CLI utility @@ -321,7 +321,7 @@ fn main() -> Result<()> { /// fn print_timelines_tree( timelines: Vec, - mut timeline_name_mappings: HashMap, + mut timeline_name_mappings: HashMap, ) -> Result<()> { let mut timelines_hash = timelines .iter() @@ -332,7 +332,7 @@ fn print_timelines_tree( info: t.clone(), children: BTreeSet::new(), name: timeline_name_mappings - .remove(&ZTenantTimelineId::new(t.tenant_id, t.timeline_id)), + .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)), }, ) }) @@ -374,7 +374,7 @@ fn print_timeline( nesting_level: usize, is_last: &[bool], timeline: &TimelineTreeEl, - timelines: &HashMap, + timelines: &HashMap, ) -> Result<()> { let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) { (None, None) => unreachable!("in this case no info for a timeline is found"), @@ -452,8 +452,8 @@ fn print_timeline( /// Connects to the pageserver to query this information. fn get_timeline_infos( env: &local_env::LocalEnv, - tenant_id: &ZTenantId, -) -> Result> { + tenant_id: &TenantId, +) -> Result> { Ok(PageServerNode::from_env(env) .timeline_list(tenant_id)? 
.into_iter() @@ -462,7 +462,7 @@ fn get_timeline_infos( } // Helper function to parse --tenant_id option, or get the default from config file -fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { +fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { tenant_id_from_arguments } else if let Some(default_id) = env.default_tenant_id { @@ -472,18 +472,18 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } } -fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { +fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .value_of("tenant-id") - .map(ZTenantId::from_str) + .map(TenantId::from_str) .transpose() .context("Failed to parse tenant id from the argument string") } -fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { +fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .value_of("timeline-id") - .map(ZTimelineId::from_str) + .map(TimelineId::from_str) .transpose() .context("Failed to parse timeline id from the argument string") } @@ -504,9 +504,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { let mut env = LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; env.init().context("Failed to initialize neon repository")?; - - // default_tenantid was generated by the `env.init()` call above - let initial_tenant_id = env.default_tenant_id.unwrap(); + let initial_tenant_id = env + .default_tenant_id + .expect("default_tenant_id should be generated by the `env.init()` call above"); // Initialize pageserver, create initial tenant and timeline. 
let pageserver = PageServerNode::from_env(&env); @@ -759,7 +759,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }; let branch_name = timeline_name_mappings - .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) + .get(&TenantTimelineId::new(tenant_id, node.timeline_id)) .map(|name| name.as_str()) .unwrap_or("?"); @@ -810,7 +810,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane.nodes.get(&(tenant_id, node_name.to_owned())); - let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) { + let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 57b5e1e10a..b678d620df 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -13,9 +13,9 @@ use std::time::Duration; use anyhow::{Context, Result}; use utils::{ connstring::connection_host_port, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - zid::{ZTenantId, ZTimelineId}, }; use crate::local_env::LocalEnv; @@ -28,7 +28,7 @@ use crate::storage::PageServerNode; pub struct ComputeControlPlane { base_port: u16, pageserver: Arc, - pub nodes: BTreeMap<(ZTenantId, String), Arc>, + pub nodes: BTreeMap<(TenantId, String), Arc>, env: LocalEnv, } @@ -76,9 +76,9 @@ impl ComputeControlPlane { pub fn new_node( &mut self, - tenant_id: ZTenantId, + tenant_id: TenantId, name: &str, - timeline_id: ZTimelineId, + timeline_id: TimelineId, lsn: Option, port: Option, ) -> Result> { @@ -114,9 +114,9 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub lsn: Option, // if it's a read-only node. 
None for primary - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, uses_wal_proposer: bool, } @@ -148,8 +148,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?; - let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?; + let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; + let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); // parse recovery_target_lsn, if any @@ -292,7 +292,7 @@ impl PostgresNode { // variable during compute pg startup. It is done this way because // otherwise user will be able to retrieve the value using SHOW // command or pg_settings - let password = if let AuthType::ZenithJWT = auth_type { + let password = if let AuthType::NeonJWT = auth_type { "$ZENITH_AUTH_TOKEN" } else { "" @@ -301,7 +301,7 @@ impl PostgresNode { // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN // We parse this string and build it back with token from env var, and for simplicity rebuild // uses only needed variables namely host, port, user, password. 
- format!("postgresql://no_user:{}@{}:{}", password, host, port) + format!("postgresql://no_user:{password}@{host}:{port}") }; conf.append("shared_preload_libraries", "neon"); conf.append_line(""); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index c4a61dbd7b..7afaad26dc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -14,8 +14,8 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use utils::{ auth::{encode_from_key_file, Claims, Scope}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, postgres_backend::AuthType, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use crate::safekeeper::SafekeeperNode; @@ -48,13 +48,13 @@ pub struct LocalEnv { // Path to pageserver binary. #[serde(default)] - pub zenith_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, - // Default tenant ID to use with the 'zenith' command line utility, when - // --tenantid is not explicitly specified. + // Default tenant ID to use with the 'neon_local' command line utility, when + // --tenant_id is not explicitly specified. #[serde(default)] #[serde_as(as = "Option")] - pub default_tenant_id: Option, + pub default_tenant_id: Option, // used to issue tokens during e.g pg start #[serde(default)] @@ -69,11 +69,11 @@ pub struct LocalEnv { /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. #[serde(default)] - // A `HashMap>` would be more appropriate here, + // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")] - branch_name_mappings: HashMap>, + branch_name_mappings: HashMap>, } /// Etcd broker config for cluster internal communication. 
@@ -204,20 +204,20 @@ impl LocalEnv { } pub fn pageserver_bin(&self) -> anyhow::Result { - Ok(self.zenith_distrib_dir.join("pageserver")) + Ok(self.neon_distrib_dir.join("pageserver")) } pub fn safekeeper_bin(&self) -> anyhow::Result { - Ok(self.zenith_distrib_dir.join("safekeeper")) + Ok(self.neon_distrib_dir.join("safekeeper")) } pub fn pg_data_dirs_path(&self) -> PathBuf { self.base_data_dir.join("pgdatadirs").join("tenants") } - pub fn pg_data_dir(&self, tenantid: &ZTenantId, branch_name: &str) -> PathBuf { + pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf { self.pg_data_dirs_path() - .join(tenantid.to_string()) + .join(tenant_id.to_string()) .join(branch_name) } @@ -233,8 +233,8 @@ impl LocalEnv { pub fn register_branch_mapping( &mut self, branch_name: String, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> anyhow::Result<()> { let existing_values = self .branch_name_mappings @@ -260,22 +260,22 @@ impl LocalEnv { pub fn get_branch_timeline_id( &self, branch_name: &str, - tenant_id: ZTenantId, - ) -> Option { + tenant_id: TenantId, + ) -> Option { self.branch_name_mappings .get(branch_name)? .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) - .map(ZTimelineId::from) + .map(TimelineId::from) } - pub fn timeline_name_mappings(&self) -> HashMap { + pub fn timeline_name_mappings(&self) -> HashMap { self.branch_name_mappings .iter() .flat_map(|(name, tenant_timelines)| { tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { - (ZTenantTimelineId::new(tenant_id, timeline_id), name.clone()) + (TenantTimelineId::new(tenant_id, timeline_id), name.clone()) }) }) .collect() @@ -299,14 +299,14 @@ impl LocalEnv { } } - // Find zenith binaries. - if env.zenith_distrib_dir == Path::new("") { - env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); + // Find neon binaries. 
+ if env.neon_distrib_dir == Path::new("") { + env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } // If no initial tenant ID was given, generate it. if env.default_tenant_id.is_none() { - env.default_tenant_id = Some(ZTenantId::generate()); + env.default_tenant_id = Some(TenantId::generate()); } env.base_data_dir = base_path(); @@ -320,12 +320,12 @@ impl LocalEnv { if !repopath.exists() { bail!( - "Zenith config is not found in {}. You need to run 'neon_local init' first", + "Neon config is not found in {}. You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } - // TODO: check that it looks like a zenith repository + // TODO: check that it looks like a neon repository // load and parse file let config = fs::read_to_string(repopath.join("config"))?; @@ -404,10 +404,10 @@ impl LocalEnv { ); } for binary in ["pageserver", "safekeeper"] { - if !self.zenith_distrib_dir.join(binary).exists() { + if !self.neon_distrib_dir.join(binary).exists() { bail!( - "Can't find binary '{binary}' in zenith distrib dir '{}'", - self.zenith_distrib_dir.display() + "Can't find binary '{binary}' in neon distrib dir '{}'", + self.neon_distrib_dir.display() ); } } diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index a71108da01..34dc769e78 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -2,7 +2,7 @@ /// Module for parsing postgresql.conf file. /// /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just -/// enough to extract a few settings we need in Zenith, assuming you don't do +/// enough to extract a few settings we need in Neon, assuming you don't do /// funny stuff like include-directives or funny escaping. 
use anyhow::{bail, Context, Result}; use once_cell::sync::Lazy; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 2cc1ae7853..600a9ffe05 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -17,7 +17,7 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, - zid::{NodeId, ZTenantId, ZTimelineId}, + id::{NodeId, TenantId, TimelineId}, }; use crate::local_env::{LocalEnv, SafekeeperConf}; @@ -269,7 +269,7 @@ impl SafekeeperNode { fn http_request(&self, method: Method, url: U) -> RequestBuilder { // TODO: authentication - //if self.env.auth_type == AuthType::ZenithJWT { + //if self.env.auth_type == AuthType::NeonJWT { // builder = builder.bearer_auth(&self.env.safekeeper_auth_token) //} self.http_client.request(method, url) @@ -284,8 +284,8 @@ impl SafekeeperNode { pub fn timeline_create( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, peer_ids: Vec, ) -> Result<()> { Ok(self diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 9fdab5f88c..d2cc5e096c 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -21,9 +21,9 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - zid::{ZTenantId, ZTimelineId}, }; use crate::local_env::LocalEnv; @@ -83,7 +83,7 @@ pub struct PageServerNode { impl PageServerNode { pub fn from_env(env: &LocalEnv) -> PageServerNode { - let password = if env.pageserver.auth_type == AuthType::ZenithJWT { + let password = if env.pageserver.auth_type == AuthType::NeonJWT { &env.pageserver.auth_token } else { "" @@ -109,10 +109,10 @@ impl PageServerNode { pub fn initialize( &self, - create_tenant: Option, - initial_timeline_id: Option, + create_tenant: Option, + initial_timeline_id: Option, config_overrides: &[&str], - ) -> 
anyhow::Result { + ) -> anyhow::Result { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = @@ -173,9 +173,9 @@ impl PageServerNode { fn try_init_timeline( &self, - new_tenant_id: Option, - new_timeline_id: Option, - ) -> anyhow::Result { + new_tenant_id: Option, + new_timeline_id: Option, + ) -> anyhow::Result { let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; let initial_timeline_info = self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; @@ -345,7 +345,7 @@ impl PageServerNode { fn http_request(&self, method: Method, url: U) -> RequestBuilder { let mut builder = self.http_client.request(method, url); - if self.env.pageserver.auth_type == AuthType::ZenithJWT { + if self.env.pageserver.auth_type == AuthType::NeonJWT { builder = builder.bearer_auth(&self.env.pageserver.auth_token) } builder @@ -368,9 +368,9 @@ impl PageServerNode { pub fn tenant_create( &self, - new_tenant_id: Option, + new_tenant_id: Option, settings: HashMap<&str, &str>, - ) -> anyhow::Result { + ) -> anyhow::Result { self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { new_tenant_id, @@ -422,7 +422,7 @@ impl PageServerNode { }) } - pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> { + pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> { self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url)) .json(&TenantConfigRequest { tenant_id, @@ -471,7 +471,7 @@ impl PageServerNode { Ok(()) } - pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result> { + pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { let timeline_infos: Vec = self .http_request( Method::GET, @@ -486,10 +486,10 @@ impl PageServerNode { pub fn timeline_create( &self, - tenant_id: ZTenantId, - 
new_timeline_id: Option, + tenant_id: TenantId, + new_timeline_id: Option, ancestor_start_lsn: Option, - ancestor_timeline_id: Option, + ancestor_timeline_id: Option, ) -> anyhow::Result { self.http_request( Method::POST, @@ -524,8 +524,8 @@ impl PageServerNode { /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) pub fn timeline_import( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, ) -> anyhow::Result<()> { diff --git a/docs/authentication.md b/docs/authentication.md index 7200ffc62f..9748a7ab0d 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -2,14 +2,14 @@ ### Overview -Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `zenith init`. Using following openssl commands: +Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `neon_local init`. Using following openssl commands: ```bash openssl genrsa -out private_key.pem 2048 openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem ``` -CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `ZenithJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file. 
+CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `NeonJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file. Currently there is no authentication between compute and safekeepers, because this communication layer is under heavy refactoring. After this refactoring support for authentication will be added there too. Now safekeeper supports "hardcoded" token passed via environment variable to be able to use callmemaybe command in pageserver. diff --git a/docs/multitenancy.md b/docs/multitenancy.md index c697ae93cd..35c69e69a1 100644 --- a/docs/multitenancy.md +++ b/docs/multitenancy.md @@ -2,26 +2,26 @@ ### Overview -Zenith supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via zenith CLI. During page server setup tenant can be created using ```zenith init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. This can be done using the following cli command: ```zenith tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So zenith tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. +Neon supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via neon_local CLI. During page server setup tenant can be created using ```neon_local init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. 
This can be done using the following cli command: ```neon_local tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So neon_local tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. ### Tenants in other commands -By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. +By default during `neon_local init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenant_id=` is provided. So generally tenant_id more frequently appears in internal pageserver interface. Its commands take tenant_id argument to distinguish to which tenant operation should be applied. CLI supports creation of new tenants. Examples for cli: ```sh -zenith tenant list +neon_local tenant list -zenith tenant create // generates new id +neon_local tenant create // generates new id -zenith tenant create ee6016ec31116c1b7c33dfdfca38892f +neon_local tenant create ee6016ec31116c1b7c33dfdfca38892f -zenith pg create main // default tenant from zenith init +neon_local pg create main // default tenant from neon_local init -zenith pg create main --tenantid=ee6016ec31116c1b7c33dfdfca38892f +neon_local pg create main --tenant_id=ee6016ec31116c1b7c33dfdfca38892f -zenith branch --tenantid=ee6016ec31116c1b7c33dfdfca38892f +neon_local branch --tenant_id=ee6016ec31116c1b7c33dfdfca38892f ``` ### Data layout @@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline.
Tenant id ### Safety -For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline). +For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenant_id, timeline_id) pair so there can only be one writer for particular (tenant_id, timeline_id). diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index 07a91f543d..fc259c8a5f 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -109,7 +109,7 @@ Repository The repository stores all the page versions, or WAL records needed to reconstruct them. Each tenant has a separate Repository, which is -stored in the .neon/tenants/<tenantid> directory. +stored in the .neon/tenants/<tenant_id> directory. Repository is an abstract trait, defined in `repository.rs`. It is implemented by the LayeredRepository object in diff --git a/docs/pageserver-storage.md b/docs/pageserver-storage.md index 8d03e68ac7..77e7ff35bc 100644 --- a/docs/pageserver-storage.md +++ b/docs/pageserver-storage.md @@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and a range of LSNs (or a single LSN, in case of image layers). You can think of it as a rectangle in the two-dimensional key-LSN space. The layer files for each timeline are stored in the timeline's subdirectory under -`.neon/tenants/<tenantid>/timelines`. +`.neon/tenants/<tenant_id>/timelines`. There are two kind of layer files: images, and delta layers. An image file contains a snapshot of all keys at a particular LSN, whereas a delta file @@ -351,7 +351,7 @@ branch. Note: It doesn't make any difference if the child branch is created when the end of the main branch was at LSN 250, or later when the tip of the main branch had already moved on. The latter case, creating a -branch at a historic LSN, is how we support PITR in Zenith.
+branch at a historic LSN, is how we support PITR in Neon. # Garbage collection @@ -396,9 +396,9 @@ table: main/orders_200_300 DELETE main/orders_300 STILL NEEDED BY orders_300_400 main/orders_300_400 KEEP, NEWER THAN GC HORIZON - main/orders_400 .. - main/orders_400_500 .. - main/orders_500 .. + main/orders_400 .. + main/orders_400_500 .. + main/orders_500 .. main/customers_100 DELETE main/customers_100_200 DELETE main/customers_200 KEEP, NO NEWER VERSION diff --git a/docs/pageserver-tenant-migration.md b/docs/pageserver-tenant-migration.md index a846213ab2..5fb2097030 100644 --- a/docs/pageserver-tenant-migration.md +++ b/docs/pageserver-tenant-migration.md @@ -9,7 +9,7 @@ This feature allows to migrate a timeline from one pageserver to another by util Pageserver implements two new http handlers: timeline attach and timeline detach. Timeline migration is performed in a following way: 1. Timeline attach is called on a target pageserver. This asks pageserver to download latest checkpoint uploaded to s3. -2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049)) +2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/neondatabase/neon/issues/997)/[#1049](https://github.com/neondatabase/neon/issues/1049)) 3. Replication state can be tracked via timeline detail pageserver call. 4. Compute node should be restarted with new pageserver connection string. 
Issue with multiple compute nodes for one timeline is handled on the safekeeper consensus level. So this is not a problem here.Currently responsibility for rescheduling the compute with updated config lies on external coordinator (console). 5. Timeline is detached from old pageserver. On disk data is removed. @@ -18,5 +18,5 @@ Timeline migration is performed in a following way: ### Implementation details Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code: -* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). +* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/neondatabase/neon/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). * We need to track which pageserver is the primary. This is needed to avoid reconnections to non primary pageservers. Because we shouldn't reconnect to them when they decide to stop their walreceiver. I e this can appear when there is a load on the compute and we are trying to detach timeline from old pageserver. In this case callmemaybe will try to reconnect to it because replication termination condition is not met (page server with active compute could never catch up to the latest lsn, so there is always some wal tail) diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 59833526c5..7e815abf73 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -70,7 +70,7 @@ two options. ...start sending WAL conservatively since the horizon (1.1), and truncate obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is -reached, i.e. 
2.3 transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes. +reached, i.e. 2.3 transferred -- that's what https://github.com/neondatabase/neon/pull/505 proposes. Then the following is possible: diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md index bd4cb9ef32..4ef006d9a6 100644 --- a/docs/rfcs/cluster-size-limits.md +++ b/docs/rfcs/cluster-size-limits.md @@ -15,7 +15,7 @@ The stateless compute node that performs validation is separate from the storage Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future). First of all, this is needed to control our free tier production costs. -Another reason to limit resources is risk management — we haven't (fully) tested and optimized zenith for big clusters, +Another reason to limit resources is risk management — we haven't (fully) tested and optimized neon for big clusters, so we don't want to give users access to the functionality that we don't think is ready. ## Components @@ -43,20 +43,20 @@ Then this size should be reported to compute node. `current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.` -(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037). +(PR about protocol changes https://github.com/neondatabase/neon/pull/1037). This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`. Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable. -And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. +And then every neon_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. 
(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html)) TODO: We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`. It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level. See issues https://github.com/neondatabase/neon/issues/1245 -https://github.com/zenithdb/zenith/issues/1445 +https://github.com/neondatabase/neon/issues/1445 TODO: We should warn users if the limit is soon to be reached. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 339a90e0ba..c1a860f126 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -10,7 +10,7 @@ Intended to be used in integration tests and in CLI tools for local installation `/docs`: -Documentation of the Zenith features and concepts. +Documentation of the Neon features and concepts. Now it is mostly dev documentation. `/monitoring`: @@ -19,7 +19,7 @@ TODO `/pageserver`: -Zenith storage service. +Neon storage service. The pageserver has a few different duties: - Store and manage the data. @@ -54,7 +54,7 @@ PostgreSQL extension that contains functions needed for testing and debugging. `/safekeeper`: -The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. +The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. It acts as a holding area and redistribution center for recently generated WAL. For more detailed info, see [walservice.md](./walservice.md) @@ -64,11 +64,6 @@ The workspace_hack crate exists only to pin down some dependencies. We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation. -`/zenith` - -Main entry point for the 'zenith' CLI utility. -TODO: Doesn't it belong to control_plane? - `/libs`: Unites granular neon helper crates under the hood. 
diff --git a/libs/etcd_broker/src/subscription_key.rs b/libs/etcd_broker/src/subscription_key.rs index 8f8579f4e5..a11d2ab106 100644 --- a/libs/etcd_broker/src/subscription_key.rs +++ b/libs/etcd_broker/src/subscription_key.rs @@ -11,7 +11,7 @@ use std::{fmt::Display, str::FromStr}; use once_cell::sync::Lazy; use regex::{Captures, Regex}; -use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId}; /// The subscription kind to the timeline updates from safekeeper. #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -30,13 +30,13 @@ pub enum SubscriptionKind { /// Get every update in etcd. All, /// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind. - TenantTimelines(ZTenantId), + TenantTimelines(TenantId), /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind. - Timeline(ZTenantTimelineId), + Timeline(TenantTimelineId), /// Get etcd timeline updates, specific to a certain node kind. - Node(ZTenantTimelineId, NodeKind), + Node(TenantTimelineId, NodeKind), /// Get etcd timeline updates for a certain operation on specific nodes. - Operation(ZTenantTimelineId, NodeKind, OperationKind), + Operation(TenantTimelineId, NodeKind, OperationKind), } /// All kinds of nodes, able to write into etcd. @@ -67,7 +67,7 @@ static SUBSCRIPTION_FULL_KEY_REGEX: Lazy = Lazy::new(|| { /// No other etcd keys are considered during system's work. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct SubscriptionFullKey { - pub id: ZTenantTimelineId, + pub id: TenantTimelineId, pub node_kind: NodeKind, pub operation: OperationKind, pub node_id: NodeId, @@ -83,7 +83,7 @@ impl SubscriptionKey { } /// Subscribes to a given timeline info updates from safekeepers. 
- pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self { + pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self { Self { cluster_prefix, kind: SubscriptionKind::Operation( @@ -97,7 +97,7 @@ impl SubscriptionKey { /// Subscribes to all timeine updates during specific operations, running on the corresponding nodes. pub fn operation( cluster_prefix: String, - timeline: ZTenantTimelineId, + timeline: TenantTimelineId, node_kind: NodeKind, operation: OperationKind, ) -> Self { @@ -175,7 +175,7 @@ impl FromStr for SubscriptionFullKey { }; Ok(Self { - id: ZTenantTimelineId::new( + id: TenantTimelineId::new( parse_capture(&key_captures, 1)?, parse_capture(&key_captures, 2)?, ), @@ -247,7 +247,7 @@ impl FromStr for SkOperationKind { #[cfg(test)] mod tests { - use utils::zid::ZTimelineId; + use utils::id::TimelineId; use super::*; @@ -256,9 +256,9 @@ mod tests { let prefix = "neon"; let node_kind = NodeKind::Safekeeper; let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup); - let tenant_id = ZTenantId::generate(); - let timeline_id = ZTimelineId::generate(); - let id = ZTenantTimelineId::new(tenant_id, timeline_id); + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let id = TenantTimelineId::new(tenant_id, timeline_id); let node_id = NodeId(1); let timeline_subscription_keys = [ diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 5b9ecb7394..2b453fa0dc 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } wal_craft = { path = "wal_craft" } 
[build-dependencies] diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 114f08113b..f848ac1273 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -11,6 +11,6 @@ clap = "3.0" env_logger = "0.9" log = "0.4" once_cell = "1.13.0" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres_ffi = { path = "../" } tempfile = "3.2" diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ce55277f29..ef2aa8b305 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -10,8 +10,8 @@ bincode = "1.3" bytes = "1.0.1" hyper = { version = "0.14.7", features = ["full"] } pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } routerify = "3" serde = { version = "1.0", features = ["derive"] } serde_json = "1" diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 0339939934..badcb5774e 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,11 +1,11 @@ #![allow(unused)] use criterion::{criterion_group, criterion_main, Criterion}; -use utils::zid; +use utils::id; pub fn bench_zid_stringify(c: &mut Criterion) { // Can only use public methods. 
- let ztl = zid::ZTenantTimelineId::generate(); + let ztl = id::TenantTimelineId::generate(); c.bench_function("zid.to_string", |b| { b.iter(|| { diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 3bdabacad4..b190b0d1c5 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -14,7 +14,7 @@ use jsonwebtoken::{ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use crate::zid::ZTenantId; +use crate::id::TenantId; const JWT_ALGORITHM: Algorithm = Algorithm::RS256; @@ -30,23 +30,23 @@ pub enum Scope { pub struct Claims { #[serde(default)] #[serde_as(as = "Option")] - pub tenant_id: Option, + pub tenant_id: Option, pub scope: Scope, } impl Claims { - pub fn new(tenant_id: Option, scope: Scope) -> Self { + pub fn new(tenant_id: Option, scope: Scope) -> Self { Self { tenant_id, scope } } } -pub fn check_permission(claims: &Claims, tenantid: Option) -> Result<()> { - match (&claims.scope, tenantid) { +pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<()> { + match (&claims.scope, tenant_id) { (Scope::Tenant, None) => { bail!("Attempt to access management api with tenant scope. Permission denied") } - (Scope::Tenant, Some(tenantid)) => { - if claims.tenant_id.unwrap() != tenantid { + (Scope::Tenant, Some(tenant_id)) => { + if claims.tenant_id.unwrap() != tenant_id { bail!("Tenant id mismatch. 
Permission denied") } Ok(()) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 69bf5ef87a..4066791e2b 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,6 +1,6 @@ use crate::auth::{self, Claims, JwtAuth}; use crate::http::error; -use crate::zid::ZTenantId; +use crate::id::TenantId; use anyhow::anyhow; use hyper::header::AUTHORIZATION; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; @@ -137,9 +137,9 @@ pub fn auth_middleware( }) } -pub fn check_permission(req: &Request, tenantid: Option) -> Result<(), ApiError> { +pub fn check_permission(req: &Request, tenant_id: Option) -> Result<(), ApiError> { match req.context::() { - Some(claims) => Ok(auth::check_permission(&claims, tenantid) + Some(claims) => Ok(auth::check_permission(&claims, tenant_id) .map_err(|err| ApiError::Forbidden(err.to_string()))?), None => Ok(()), // claims is None because auth is disabled } diff --git a/libs/utils/src/http/mod.rs b/libs/utils/src/http/mod.rs index 0bb53ef51d..74ed6bb5b2 100644 --- a/libs/utils/src/http/mod.rs +++ b/libs/utils/src/http/mod.rs @@ -3,6 +3,6 @@ pub mod error; pub mod json; pub mod request; -/// Current fast way to apply simple http routing in various Zenith binaries. +/// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; diff --git a/libs/utils/src/zid.rs b/libs/utils/src/id.rs similarity index 76% rename from libs/utils/src/zid.rs rename to libs/utils/src/id.rs index 6da5355f61..059ce69ca4 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/id.rs @@ -4,7 +4,7 @@ use hex::FromHex; use rand::Rng; use serde::{Deserialize, Serialize}; -/// Zenith ID is a 128-bit random ID. +/// Neon ID is a 128-bit random ID. /// Used to represent various identifiers. 
Provides handy utility methods and impls. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look @@ -13,13 +13,13 @@ use serde::{Deserialize, Serialize}; /// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. /// Check the `serde_with::serde_as` documentation for options for more complex types. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -struct ZId([u8; 16]); +struct Id([u8; 16]); -impl ZId { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId { +impl Id { + pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id { let mut arr = [0u8; 16]; buf.copy_to_slice(&mut arr); - ZId::from(arr) + Id::from(arr) } pub fn as_arr(&self) -> [u8; 16] { @@ -29,7 +29,7 @@ impl ZId { pub fn generate() -> Self { let mut tli_buf = [0u8; 16]; rand::thread_rng().fill(&mut tli_buf); - ZId::from(tli_buf) + Id::from(tli_buf) } fn hex_encode(&self) -> String { @@ -44,54 +44,54 @@ impl ZId { } } -impl FromStr for ZId { +impl FromStr for Id { type Err = hex::FromHexError; - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { Self::from_hex(s) } } -// this is needed for pretty serialization and deserialization of ZId's using serde integration with hex crate -impl FromHex for ZId { +// this is needed for pretty serialization and deserialization of Id's using serde integration with hex crate +impl FromHex for Id { type Error = hex::FromHexError; fn from_hex>(hex: T) -> Result { let mut buf: [u8; 16] = [0u8; 16]; hex::decode_to_slice(hex, &mut buf)?; - Ok(ZId(buf)) + Ok(Id(buf)) } } -impl AsRef<[u8]> for ZId { +impl AsRef<[u8]> for Id { fn as_ref(&self) -> &[u8] { &self.0 } } -impl From<[u8; 16]> for ZId { +impl From<[u8; 16]> for Id { fn from(b: [u8; 16]) -> Self { - ZId(b) + Id(b) } } -impl fmt::Display for ZId { +impl fmt::Display for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) 
} } -impl fmt::Debug for ZId { +impl fmt::Debug for Id { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.hex_encode()) } } -macro_rules! zid_newtype { +macro_rules! id_newtype { ($t:ident) => { impl $t { pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t { - $t(ZId::get_from_buf(buf)) + $t(Id::get_from_buf(buf)) } pub fn as_arr(&self) -> [u8; 16] { @@ -99,11 +99,11 @@ macro_rules! zid_newtype { } pub fn generate() -> Self { - $t(ZId::generate()) + $t(Id::generate()) } pub const fn from_array(b: [u8; 16]) -> Self { - $t(ZId(b)) + $t(Id(b)) } } @@ -111,14 +111,14 @@ macro_rules! zid_newtype { type Err = hex::FromHexError; fn from_str(s: &str) -> Result<$t, Self::Err> { - let value = ZId::from_str(s)?; + let value = Id::from_str(s)?; Ok($t(value)) } } impl From<[u8; 16]> for $t { fn from(b: [u8; 16]) -> Self { - $t(ZId::from(b)) + $t(Id::from(b)) } } @@ -126,7 +126,7 @@ macro_rules! zid_newtype { type Error = hex::FromHexError; fn from_hex>(hex: T) -> Result { - Ok($t(ZId::from_hex(hex)?)) + Ok($t(Id::from_hex(hex)?)) } } @@ -150,7 +150,7 @@ macro_rules! zid_newtype { }; } -/// Zenith timeline IDs are different from PostgreSQL timeline +/// Neon timeline IDs are different from PostgreSQL timeline /// IDs. They serve a similar purpose though: they differentiate /// between different "histories" of the same cluster. However, /// PostgreSQL timeline IDs are a bit cumbersome, because they are only @@ -158,7 +158,7 @@ macro_rules! zid_newtype { /// timeline history. Those limitations mean that we cannot generate a /// new PostgreSQL timeline ID by just generating a random number. And /// that in turn is problematic for the "pull/push" workflow, where you -/// have a local copy of a zenith repository, and you periodically sync +/// have a local copy of a Neon repository, and you periodically sync /// the local changes with a remote server. 
When you work "detached" /// from the remote server, you cannot create a PostgreSQL timeline ID /// that's guaranteed to be different from all existing timelines in @@ -168,55 +168,55 @@ macro_rules! zid_newtype { /// branches? If they pick the same one, and later try to push the /// branches to the same remote server, they will get mixed up. /// -/// To avoid those issues, Zenith has its own concept of timelines that +/// To avoid those issues, Neon has its own concept of timelines that /// is separate from PostgreSQL timelines, and doesn't have those -/// limitations. A zenith timeline is identified by a 128-bit ID, which +/// limitations. A Neon timeline is identified by a 128-bit ID, which /// is usually printed out as a hex string. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`ZId`] for alternative ways to serialize it. +/// See [`Id`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] -pub struct ZTimelineId(ZId); +pub struct TimelineId(Id); -zid_newtype!(ZTimelineId); +id_newtype!(TimelineId); -/// Zenith Tenant Id represents identifiar of a particular tenant. +/// Neon Tenant Id represents identifiar of a particular tenant. /// Is used for distinguishing requests and data belonging to different users. /// /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`ZId`] for alternative ways to serialize it. +/// See [`Id`] for alternative ways to serialize it. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ZTenantId(ZId); +pub struct TenantId(Id); -zid_newtype!(ZTenantId); +id_newtype!(TenantId); -// A pair uniquely identifying Zenith instance. 
+// A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub struct ZTenantTimelineId { - pub tenant_id: ZTenantId, - pub timeline_id: ZTimelineId, +pub struct TenantTimelineId { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, } -impl ZTenantTimelineId { - pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self { - ZTenantTimelineId { +impl TenantTimelineId { + pub fn new(tenant_id: TenantId, timeline_id: TimelineId) -> Self { + TenantTimelineId { tenant_id, timeline_id, } } pub fn generate() -> Self { - Self::new(ZTenantId::generate(), ZTimelineId::generate()) + Self::new(TenantId::generate(), TimelineId::generate()) } pub fn empty() -> Self { - Self::new(ZTenantId::from([0u8; 16]), ZTimelineId::from([0u8; 16])) + Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16])) } } -impl fmt::Display for ZTenantTimelineId { +impl fmt::Display for TenantTimelineId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}/{}", self.tenant_id, self.timeline_id) } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index caa7ac6c09..2c80556446 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -29,7 +29,7 @@ pub mod crashsafe_dir; pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. 
-pub mod zid; +pub mod id; // http endpoint utils pub mod http; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 604eb75aaf..0498e0887b 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -63,7 +63,7 @@ pub enum AuthType { Trust, MD5, // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT - ZenithJWT, + NeonJWT, } impl FromStr for AuthType { @@ -73,8 +73,8 @@ impl FromStr for AuthType { match s { "Trust" => Ok(Self::Trust), "MD5" => Ok(Self::MD5), - "ZenithJWT" => Ok(Self::ZenithJWT), - _ => bail!("invalid value \"{}\" for auth type", s), + "NeonJWT" => Ok(Self::NeonJWT), + _ => bail!("invalid value \"{s}\" for auth type"), } } } @@ -84,7 +84,7 @@ impl fmt::Display for AuthType { f.write_str(match self { AuthType::Trust => "Trust", AuthType::MD5 => "MD5", - AuthType::ZenithJWT => "ZenithJWT", + AuthType::NeonJWT => "NeonJWT", }) } } @@ -376,7 +376,7 @@ impl PostgresBackend { ))?; self.state = ProtoState::Authentication; } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; } @@ -403,7 +403,7 @@ impl PostgresBackend { bail!("auth failed: {}", e); } } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index 383ad3742f..87e4478a99 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -346,7 +346,7 @@ impl PostgresBackend { ))?; self.state = ProtoState::Authentication; } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; } @@ -374,7 +374,7 @@ impl PostgresBackend { 
bail!("auth failed: {}", e); } } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e73c73bd9c..11d2d94906 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -27,10 +27,10 @@ clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } tokio-util = { version = "0.7.3", features = ["io", "io-util"] } -postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } anyhow = { version = "1.0", features = ["backtrace"] } crc32c = "0.6.0" thiserror = "1.0" diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs index 7e766ce859..f5247ee609 100644 --- a/pageserver/src/bin/dump_layerfile.rs +++ b/pageserver/src/bin/dump_layerfile.rs @@ -12,7 +12,7 @@ use utils::project_git_version; project_git_version!(GIT_VERSION); fn main() -> Result<()> 
{ - let arg_matches = App::new("Zenith dump_layerfile utility") + let arg_matches = App::new("Neon dump_layerfile utility") .about("Dump contents of one layer file, for debugging") .version(GIT_VERSION) .arg( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 679c6f76e7..92d5eab379 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -40,7 +40,7 @@ fn version() -> String { } fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Zenith page server") + let arg_matches = App::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) .arg( @@ -293,7 +293,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // initialize authentication for incoming connections let auth = match &conf.auth_type { AuthType::Trust | AuthType::MD5 => None, - AuthType::ZenithJWT => { + AuthType::NeonJWT => { // unwrap is ok because check is performed when creating config, so path is set and file exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); Some(JwtAuth::from_key_path(key_path)?.into()) diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 3339564b0f..16359c2532 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -11,7 +11,7 @@ use utils::{lsn::Lsn, project_git_version}; project_git_version!(GIT_VERSION); fn main() -> Result<()> { - let arg_matches = App::new("Zenith update metadata utility") + let arg_matches = App::new("Neon update metadata utility") .about("Dump or update metadata file") .version(GIT_VERSION) .arg( diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 56171f46e3..75c71b09d2 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -15,8 +15,8 @@ use toml_edit; use toml_edit::{Document, Item}; use url::Url; use utils::{ + id::{NodeId, TenantId, 
TimelineId}, postgres_backend::AuthType, - zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::tenant::TIMELINES_SEGMENT_NAME; @@ -342,16 +342,16 @@ impl PageServerConf { self.workdir.join("tenants") } - pub fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenants_path().join(tenantid.to_string()) + pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenants_path().join(tenant_id.to_string()) } - pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME) + pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } - pub fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf { - self.timelines_path(tenantid).join(timelineid.to_string()) + pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf { + self.timelines_path(tenant_id).join(timeline_id.to_string()) } // @@ -419,7 +419,7 @@ impl PageServerConf { let mut conf = builder.build().context("invalid config")?; - if conf.auth_type == AuthType::ZenithJWT { + if conf.auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path .get_or_insert_with(|| workdir.join("auth_public_key.pem")); diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 0ccf23776c..c0dc5b9677 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -3,8 +3,8 @@ use std::num::NonZeroU64; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ + id::{NodeId, TenantId, TimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::tenant::TenantState; @@ -14,10 +14,10 @@ use crate::tenant::TenantState; pub struct TimelineCreateRequest { #[serde(default)] #[serde_as(as = "Option")] - pub new_timeline_id: Option, + pub new_timeline_id: Option, #[serde(default)] #[serde_as(as = "Option")] 
- pub ancestor_timeline_id: Option, + pub ancestor_timeline_id: Option, #[serde(default)] #[serde_as(as = "Option")] pub ancestor_start_lsn: Option, @@ -28,7 +28,7 @@ pub struct TimelineCreateRequest { pub struct TenantCreateRequest { #[serde(default)] #[serde_as(as = "Option")] - pub new_tenant_id: Option, + pub new_tenant_id: Option, pub checkpoint_distance: Option, pub checkpoint_timeout: Option, pub compaction_target_size: Option, @@ -46,7 +46,7 @@ pub struct TenantCreateRequest { #[serde_as] #[derive(Serialize, Deserialize)] #[serde(transparent)] -pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId); +pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId); #[derive(Serialize)] pub struct StatusResponse { @@ -54,7 +54,7 @@ pub struct StatusResponse { } impl TenantCreateRequest { - pub fn new(new_tenant_id: Option) -> TenantCreateRequest { + pub fn new(new_tenant_id: Option) -> TenantCreateRequest { TenantCreateRequest { new_tenant_id, ..Default::default() @@ -65,7 +65,7 @@ impl TenantCreateRequest { #[serde_as] #[derive(Serialize, Deserialize)] pub struct TenantConfigRequest { - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, #[serde(default)] #[serde_as(as = "Option")] pub checkpoint_distance: Option, @@ -83,7 +83,7 @@ pub struct TenantConfigRequest { } impl TenantConfigRequest { - pub fn new(tenant_id: ZTenantId) -> TenantConfigRequest { + pub fn new(tenant_id: TenantId) -> TenantConfigRequest { TenantConfigRequest { tenant_id, checkpoint_distance: None, @@ -106,7 +106,7 @@ impl TenantConfigRequest { #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] - pub id: ZTenantId, + pub id: TenantId, pub state: TenantState, pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, @@ -116,7 +116,7 @@ pub struct TenantInfo { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct 
LocalTimelineInfo { #[serde_as(as = "Option")] - pub ancestor_timeline_id: Option, + pub ancestor_timeline_id: Option, #[serde_as(as = "Option")] pub ancestor_lsn: Option, #[serde_as(as = "DisplayFromStr")] @@ -154,9 +154,9 @@ pub struct RemoteTimelineInfo { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { #[serde_as(as = "DisplayFromStr")] - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, #[serde_as(as = "DisplayFromStr")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub local: Option, pub remote: Option, } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 36ba2e9b66..2e49429f38 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -25,8 +25,8 @@ use utils::{ request::parse_request_param, RequestExt, RouterBuilder, }, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; struct State { @@ -128,10 +128,10 @@ fn local_timeline_info_from_timeline( } fn list_local_timelines( - tenant_id: ZTenantId, + tenant_id: TenantId, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, -) -> Result> { +) -> Result> { let tenant = tenant_mgr::get_tenant(tenant_id, true)?; let timelines = tenant.list_timelines(); @@ -156,7 +156,7 @@ async fn status_handler(request: Request) -> Result, ApiErr } async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_id))?; @@ -164,8 +164,8 @@ async fn timeline_create_handler(mut request: Request) -> Result { @@ -193,7 +193,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, 
"tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = @@ -229,7 +229,7 @@ async fn timeline_list_handler(request: Request) -> Result, .remote_index .read() .await - .timeline_entry(&ZTenantTimelineId { + .timeline_entry(&TenantTimelineId { tenant_id, timeline_id, }) @@ -257,8 +257,8 @@ fn query_param_present(request: &Request, param: &str) -> bool { } async fn timeline_detail_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = @@ -289,7 +289,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; info!("Handling tenant attach {tenant_id}"); @@ -402,8 +402,8 @@ async fn tenant_attach_handler(request: Request) -> Result, /// for details see comment to `storage_sync::gather_tenant_timelines_index_parts` async fn gather_tenant_timelines_index_parts( state: &State, - tenant_id: ZTenantId, -) -> anyhow::Result>> { + tenant_id: TenantId, +) -> anyhow::Result>> { let index_parts = match state.remote_storage.as_ref() { Some(storage) => { storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await @@ -425,8 +425,8 @@ async fn gather_tenant_timelines_index_parts( } async fn 
timeline_delete_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; let state = get_state(&request); @@ -436,7 +436,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let state = get_state(&request); @@ -479,7 +479,7 @@ async fn tenant_list_handler(request: Request) -> Result, A } async fn tenant_status(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; // if tenant is in progress of downloading it can be absent in global tenant map @@ -588,8 +588,8 @@ async fn tenant_create_handler(mut request: Request) -> Result(HashMap>); +pub struct TenantTimelineValues(HashMap>); impl TenantTimelineValues { fn new() -> Self { @@ -187,8 +187,8 @@ mod tests { #[test] fn tenant_timeline_value_mapping() { - let first_tenant = ZTenantId::generate(); - let second_tenant = ZTenantId::generate(); + let first_tenant = TenantId::generate(); + let second_tenant = TenantId::generate(); assert_ne!(first_tenant, second_tenant); let mut initial = TenantTimelineValues::new(); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ada0bbd359..2f03943429 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -5,7 +5,7 @@ use metrics::{ IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, 
UIntGaugeVec, }; use once_cell::sync::Lazy; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) that capture the majority of /// latencies in the microsecond range but also extend far enough up to distinguish @@ -327,7 +327,7 @@ pub struct TimelineMetrics { } impl TimelineMetrics { - pub fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); let reconstruct_time_histo = RECONSTRUCT_TIME @@ -414,6 +414,6 @@ impl Drop for TimelineMetrics { } } -pub fn remove_tenant_metrics(tenant_id: &ZTenantId) { +pub fn remove_tenant_metrics(tenant_id: &TenantId) { let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 15c3c22dd6..d2fe06697e 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -49,8 +49,8 @@ use anyhow::Context; use once_cell::sync::OnceCell; use tracing::error; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::repository::Key; @@ -109,8 +109,8 @@ enum CacheKey { #[derive(Debug, PartialEq, Eq, Hash, Clone)] struct MaterializedPageHashKey { - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: Key, } @@ -308,8 +308,8 @@ impl PageCache { /// returned page. 
pub fn lookup_materialized_page( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: &Key, lsn: Lsn, ) -> Option<(Lsn, PageReadGuard)> { @@ -338,8 +338,8 @@ impl PageCache { /// pub fn memorize_materialized_page( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key: Key, lsn: Lsn, img: &[u8], diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 388f40f916..b06814c557 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -23,12 +23,12 @@ use tokio_util::io::SyncIoBridge; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, + id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, postgres_backend_async::{self, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, simple_rcu::RcuReadGuard, - zid::{ZTenantId, ZTimelineId}, }; use crate::basebackup; @@ -123,7 +123,7 @@ impl PagestreamFeMessage { fn parse(mut body: Bytes) -> anyhow::Result { // TODO these gets can fail - // these correspond to the ZenithMessageTag enum in pagestore_client.h + // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
@@ -370,7 +370,7 @@ struct PageRequestMetrics { } impl PageRequestMetrics { - fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); @@ -415,8 +415,8 @@ impl PageServerHandler { async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> anyhow::Result<()> { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association @@ -452,11 +452,11 @@ impl PageServerHandler { None => break, // client disconnected }; - trace!("query: {:?}", copy_data_bytes); + trace!("query: {copy_data_bytes:?}"); - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - let response = match zenith_fe_msg { + let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.get_rel_exists.start_timer(); self.handle_get_rel_exists_request(&timeline, &req).await @@ -494,8 +494,8 @@ impl PageServerHandler { async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, ) -> anyhow::Result<()> { @@ -557,8 +557,8 @@ impl PageServerHandler { async fn handle_import_wal( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, ) -> anyhow::Result<()> { @@ -750,8 +750,8 @@ impl PageServerHandler { async fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, @@ -792,7 +792,7 @@ impl 
PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -815,7 +815,7 @@ impl postgres_backend_async::Handler for PageServerHandler { _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> anyhow::Result<()> { - // this unwrap is never triggered, because check_auth_jwt only called when auth_type is ZenithJWT + // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data = self .auth @@ -853,8 +853,8 @@ impl postgres_backend_async::Handler for PageServerHandler { params.len() == 2, "invalid param number for pagestream command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; self.check_permission(Some(tenant_id))?; @@ -869,8 +869,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for basebackup command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; self.check_permission(Some(tenant_id))?; @@ -895,8 +895,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for get_last_record_rlsn command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; self.check_permission(Some(tenant_id))?; let timeline = 
get_local_timeline(tenant_id, timeline_id)?; @@ -923,8 +923,8 @@ impl postgres_backend_async::Handler for PageServerHandler { "invalid param number for fullbackup command" ); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { @@ -959,8 +959,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; @@ -984,8 +984,8 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); ensure!(params.len() == 4); - let tenant_id = ZTenantId::from_str(params[0])?; - let timeline_id = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; let start_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; @@ -1035,7 +1035,7 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); - let tenant_id = ZTenantId::from_str(params[0])?; + let tenant_id = TenantId::from_str(params[0])?; let tenant = tenant_mgr::get_tenant(tenant_id, true)?; 
pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), @@ -1087,8 +1087,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let tenant = tenant_mgr::get_tenant(tenant_id, true)?; @@ -1131,8 +1131,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("Invalid compact: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; timeline.compact()?; @@ -1148,8 +1148,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). 
@@ -1166,8 +1166,8 @@ impl postgres_backend_async::Handler for PageServerHandler { .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; + let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; + let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; let timeline = get_local_timeline(tenant_id, timeline_id)?; let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; @@ -1192,7 +1192,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } } -fn get_local_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Result> { +fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result> { tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id)) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 2454b6f54f..9d4b438dc4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,7 +10,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::tenant::Timeline; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; use postgres_ffi::v14::pg_constants; @@ -570,7 +570,7 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); @@ -583,7 +583,7 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { self.put( slru_block_to_key(kind, segno, blknum), @@ 
-1401,7 +1401,7 @@ fn is_slru_block_key(key: Key) -> bool { #[cfg(test)] pub fn create_test_timeline( tenant: &crate::tenant::Tenant, - timeline_id: utils::zid::ZTimelineId, + timeline_id: utils::id::TimelineId, ) -> Result> { let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; let mut m = tline.begin_modification(Lsn(8)); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c3b08c93de..f6ea9d8c5d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,4 +1,4 @@ -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use bytes::Bytes; @@ -157,7 +157,7 @@ pub enum Value { /// replayed get the full value. Replaying the WAL record /// might need a previous version of the value (if will_init() /// returns false), or it may be replayed stand-alone (true). - WalRecord(ZenithWalRecord), + WalRecord(NeonWalRecord), } impl Value { diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index c104dba298..9d259bf1e2 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -68,7 +68,7 @@ //! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure. //! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. -//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], +//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`TenantId`] and [`TimelineId`], //! 
we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. @@ -183,7 +183,7 @@ use crate::{ TenantTimelineValues, }; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; use self::download::download_index_parts; pub use self::download::gather_tenant_timelines_index_parts; @@ -227,7 +227,7 @@ pub struct SyncStartupData { struct SyncQueue { max_timelines_per_batch: NonZeroUsize, - queue: Mutex>, + queue: Mutex>, condvar: Condvar, } @@ -241,7 +241,7 @@ impl SyncQueue { } /// Queue a new task - fn push(&self, sync_id: ZTenantTimelineId, new_task: SyncTask) { + fn push(&self, sync_id: TenantTimelineId, new_task: SyncTask) { let mut q = self.queue.lock().unwrap(); q.push_back((sync_id, new_task)); @@ -254,7 +254,7 @@ impl SyncQueue { /// A timeline has to care to not to delete certain layers from the remote storage before the corresponding uploads happen. /// Other than that, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. /// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading). - fn next_task_batch(&self) -> (HashMap, usize) { + fn next_task_batch(&self) -> (HashMap, usize) { // Wait for the first task in blocking fashion let mut q = self.queue.lock().unwrap(); while q.is_empty() { @@ -488,8 +488,8 @@ struct LayersDeletion { /// /// Ensure that the loop is started otherwise the task is never processed. 
pub fn schedule_layer_upload( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, layers_to_upload: HashSet, metadata: Option, ) { @@ -501,7 +501,7 @@ pub fn schedule_layer_upload( } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -519,8 +519,8 @@ pub fn schedule_layer_upload( /// /// Ensure that the loop is started otherwise the task is never processed. pub fn schedule_layer_delete( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, layers_to_delete: HashSet, ) { let sync_queue = match SYNC_QUEUE.get() { @@ -531,7 +531,7 @@ pub fn schedule_layer_delete( } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -551,7 +551,7 @@ pub fn schedule_layer_delete( /// On any failure, the task gets retried, omitting already downloaded layers. /// /// Ensure that the loop is started otherwise the task is never processed. 
-pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { +pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) { debug!("Scheduling layer download for tenant {tenant_id}, timeline {timeline_id}"); let sync_queue = match SYNC_QUEUE.get() { Some(queue) => queue, @@ -561,7 +561,7 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { } }; sync_queue.push( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, }, @@ -604,7 +604,7 @@ pub fn spawn_storage_sync_task( let _ = empty_tenants.0.entry(tenant_id).or_default(); } else { for (timeline_id, timeline_data) in timeline_data { - let id = ZTenantTimelineId::new(tenant_id, timeline_id); + let id = TenantTimelineId::new(tenant_id, timeline_id); keys_for_index_part_downloads.insert(id); timelines_to_sync.insert(id, timeline_data); } @@ -766,9 +766,9 @@ async fn process_batches( max_sync_errors: NonZeroU32, storage: GenericRemoteStorage, index: &RemoteIndex, - batched_tasks: HashMap, + batched_tasks: HashMap, sync_queue: &SyncQueue, -) -> HashSet { +) -> HashSet { let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { @@ -808,7 +808,7 @@ async fn process_sync_task_batch( conf: &'static PageServerConf, (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, batch: SyncTaskBatch, ) -> DownloadStatus { let sync_start = Instant::now(); @@ -949,7 +949,7 @@ async fn download_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, new_download_data: SyncData, sync_start: Instant, task_name: &str, @@ -999,7 +999,7 @@ async fn download_timeline_data( async fn update_local_metadata( conf: &'static PageServerConf, - sync_id: 
ZTenantTimelineId, + sync_id: TenantTimelineId, remote_timeline: Option<&RemoteTimeline>, ) -> anyhow::Result<()> { let remote_metadata = match remote_timeline { @@ -1031,7 +1031,7 @@ async fn update_local_metadata( info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}"); // clone because spawn_blocking requires static lifetime let cloned_metadata = remote_metadata.to_owned(); - let ZTenantTimelineId { + let TenantTimelineId { tenant_id, timeline_id, } = sync_id; @@ -1061,7 +1061,7 @@ async fn update_local_metadata( async fn delete_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, task_name: &str, @@ -1104,7 +1104,7 @@ async fn upload_timeline_data( conf: &'static PageServerConf, (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, new_upload_data: SyncData, sync_start: Instant, task_name: &str, @@ -1163,7 +1163,7 @@ async fn update_remote_data( conf: &'static PageServerConf, storage: &GenericRemoteStorage, index: &RemoteIndex, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, update: RemoteDataUpdate<'_>, ) -> anyhow::Result<()> { let updated_remote_timeline = { @@ -1261,7 +1261,7 @@ async fn validate_task_retries( fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, - local_timeline_files: HashMap)>, + local_timeline_files: HashMap)>, ) -> TenantTimelineValues { let mut local_timeline_init_statuses = TenantTimelineValues::new(); @@ -1331,8 +1331,8 @@ fn schedule_first_sync_tasks( /// bool in return value stands for awaits_download fn compare_local_and_remote_timeline( - new_sync_tasks: &mut 
VecDeque<(ZTenantTimelineId, SyncTask)>, - sync_id: ZTenantTimelineId, + new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>, + sync_id: TenantTimelineId, local_metadata: TimelineMetadata, local_files: HashSet, remote_entry: &RemoteTimeline, @@ -1377,7 +1377,7 @@ fn compare_local_and_remote_timeline( } fn register_sync_status( - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, sync_start: Instant, sync_name: &str, sync_status: Option, @@ -1409,7 +1409,7 @@ mod test_utils { pub(super) async fn create_local_timeline( harness: &TenantHarness<'_>, - timeline_id: ZTimelineId, + timeline_id: TimelineId, filenames: &[&str], metadata: TimelineMetadata, ) -> anyhow::Result { @@ -1454,8 +1454,8 @@ mod tests { use super::*; - const TEST_SYNC_ID: ZTenantTimelineId = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("11223344556677881122334455667788")), + const TEST_SYNC_ID: TenantTimelineId = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("11223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; @@ -1464,12 +1464,12 @@ mod tests { let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); assert_eq!(sync_queue.len(), 0); - let sync_id_2 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + let sync_id_2 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; - let sync_id_3 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("33223344556677881122334455667788")), + let sync_id_3 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("33223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; assert!(sync_id_2 != TEST_SYNC_ID); @@ -1591,8 +1591,8 @@ mod tests { layers_to_skip: HashSet::from([PathBuf::from("sk4")]), }; - let sync_id_2 = ZTenantTimelineId { - tenant_id: ZTenantId::from_array(hex!("22223344556677881122334455667788")), + let sync_id_2 = TenantTimelineId { + tenant_id: 
TenantId::from_array(hex!("22223344556677881122334455667788")), timeline_id: TIMELINE_ID, }; assert!(sync_id_2 != TEST_SYNC_ID); diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 945f5fded8..21a3372e70 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -8,7 +8,7 @@ use tracing::{debug, error, info}; use crate::storage_sync::{SyncQueue, SyncTask}; use remote_storage::GenericRemoteStorage; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use super::{LayersDeletion, SyncData}; @@ -17,7 +17,7 @@ use super::{LayersDeletion, SyncData}; pub(super) async fn delete_timeline_layers( storage: &GenericRemoteStorage, sync_queue: &SyncQueue, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut delete_data: SyncData, ) -> bool { if !delete_data.data.deletion_registered { @@ -123,7 +123,7 @@ mod tests { async fn delete_timeline_negative() -> anyhow::Result<()> { let harness = TenantHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), @@ -157,7 +157,7 @@ mod tests { let harness = TenantHarness::create("delete_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_path_buf(), diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 32f228b447..80d5ca5994 100644 --- a/pageserver/src/storage_sync/download.rs +++ 
b/pageserver/src/storage_sync/download.rs @@ -20,7 +20,7 @@ use crate::{ config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, TEMP_FILE_SUFFIX, }; -use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::{ index::{IndexPart, RemoteTimeline}, @@ -33,14 +33,14 @@ use super::{ // When data is received succesfully without errors Present variant is used. pub enum TenantIndexParts { Poisoned { - present: HashMap, - missing: HashSet, + present: HashMap, + missing: HashSet, }, - Present(HashMap), + Present(HashMap), } impl TenantIndexParts { - fn add_poisoned(&mut self, timeline_id: ZTimelineId) { + fn add_poisoned(&mut self, timeline_id: TimelineId) { match self { TenantIndexParts::Poisoned { missing, .. } => { missing.insert(timeline_id); @@ -64,9 +64,9 @@ impl Default for TenantIndexParts { pub async fn download_index_parts( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - keys: HashSet, -) -> HashMap { - let mut index_parts: HashMap = HashMap::new(); + keys: HashSet, +) -> HashMap { + let mut index_parts: HashMap = HashMap::new(); let mut part_downloads = keys .into_iter() @@ -112,8 +112,8 @@ pub async fn download_index_parts( pub async fn gather_tenant_timelines_index_parts( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - tenant_id: ZTenantId, -) -> anyhow::Result> { + tenant_id: TenantId, +) -> anyhow::Result> { let tenant_path = conf.timelines_path(&tenant_id); let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id) .await @@ -135,7 +135,7 @@ pub async fn gather_tenant_timelines_index_parts( async fn download_index_part( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, ) -> Result { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); @@ -197,7 +197,7 @@ pub(super) async 
fn download_timeline_layers<'a>( storage: &'a GenericRemoteStorage, sync_queue: &'a SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut download_data: SyncData, ) -> DownloadedTimeline { let remote_timeline = match remote_timeline { @@ -335,7 +335,7 @@ pub(super) async fn download_timeline_layers<'a>( } // fsync timeline directory which is a parent directory for downloaded files - let ZTenantTimelineId { + let TenantTimelineId { tenant_id, timeline_id, } = &sync_id; @@ -366,8 +366,8 @@ pub(super) async fn download_timeline_layers<'a>( async fn get_timeline_sync_ids( storage: &GenericRemoteStorage, tenant_path: &Path, - tenant_id: ZTenantId, -) -> anyhow::Result> { + tenant_id: TenantId, +) -> anyhow::Result> { let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| { format!( "Failed to get tenant storage path for local path '{}'", @@ -395,11 +395,11 @@ async fn get_timeline_sync_ids( anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") })?; - let timeline_id: ZTimelineId = object_name.parse().with_context(|| { + let timeline_id: TimelineId = object_name.parse().with_context(|| { format!("failed to parse object name into timeline id '{object_name}'") })?; - sync_ids.insert(ZTenantTimelineId { + sync_ids.insert(TenantTimelineId { tenant_id, timeline_id, }); @@ -439,7 +439,7 @@ mod tests { let harness = TenantHarness::create("download_timeline")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), @@ -539,7 +539,7 @@ mod tests { async fn download_timeline_negatives() -> anyhow::Result<()> { let harness = 
TenantHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), harness.conf.workdir.clone(), @@ -597,7 +597,7 @@ mod tests { #[tokio::test] async fn test_download_index_part() -> anyhow::Result<()> { let harness = TenantHarness::create("test_download_index_part")?; - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index cff14cde49..13495ffefe 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -17,8 +17,8 @@ use tracing::log::warn; use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata}; use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::download::TenantIndexParts; @@ -49,7 +49,7 @@ impl RelativePath { } #[derive(Debug, Clone, Default)] -pub struct TenantEntry(HashMap); +pub struct TenantEntry(HashMap); impl TenantEntry { pub fn has_in_progress_downloads(&self) -> bool { @@ -59,7 +59,7 @@ impl TenantEntry { } impl Deref for TenantEntry { - type Target = HashMap; + type Target = HashMap; fn deref(&self) -> &Self::Target { &self.0 @@ -72,8 +72,8 @@ impl DerefMut for TenantEntry { } } -impl From> for TenantEntry { - fn from(inner: HashMap) -> Self { +impl From> for TenantEntry { + fn from(inner: HashMap) -> Self { Self(inner) } } @@ -81,7 +81,7 @@ impl From> for TenantEntry { /// An index to track tenant files that exist on the remote storage. 
#[derive(Debug, Clone, Default)] pub struct RemoteTimelineIndex { - entries: HashMap, + entries: HashMap, } /// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. @@ -91,9 +91,9 @@ pub struct RemoteIndex(Arc>); impl RemoteIndex { pub fn from_parts( conf: &'static PageServerConf, - index_parts: HashMap, + index_parts: HashMap, ) -> anyhow::Result { - let mut entries: HashMap = HashMap::new(); + let mut entries: HashMap = HashMap::new(); for (tenant_id, index_parts) in index_parts { match index_parts { @@ -136,30 +136,30 @@ impl Clone for RemoteIndex { impl RemoteTimelineIndex { pub fn timeline_entry( &self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: &ZTenantTimelineId, + }: &TenantTimelineId, ) -> Option<&RemoteTimeline> { self.entries.get(tenant_id)?.get(timeline_id) } pub fn timeline_entry_mut( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: &ZTenantTimelineId, + }: &TenantTimelineId, ) -> Option<&mut RemoteTimeline> { self.entries.get_mut(tenant_id)?.get_mut(timeline_id) } pub fn add_timeline_entry( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, entry: RemoteTimeline, ) { self.entries @@ -170,10 +170,10 @@ impl RemoteTimelineIndex { pub fn remove_timeline_entry( &mut self, - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, ) -> Option { self.entries .entry(tenant_id) @@ -181,25 +181,25 @@ impl RemoteTimelineIndex { .remove(&timeline_id) } - pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> { + pub fn tenant_entry(&self, tenant_id: &TenantId) -> Option<&TenantEntry> { self.entries.get(tenant_id) } - pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> { + pub fn tenant_entry_mut(&mut self, tenant_id: &TenantId) -> Option<&mut 
TenantEntry> { self.entries.get_mut(tenant_id) } - pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry { + pub fn add_tenant_entry(&mut self, tenant_id: TenantId) -> &mut TenantEntry { self.entries.entry(tenant_id).or_default() } - pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option { + pub fn remove_tenant_entry(&mut self, tenant_id: &TenantId) -> Option { self.entries.remove(tenant_id) } pub fn set_awaits_download( &mut self, - id: &ZTenantTimelineId, + id: &TenantTimelineId, awaits_download: bool, ) -> anyhow::Result<()> { self.timeline_entry_mut(id) diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index bd09e6b898..aa5a2232cf 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -8,7 +8,7 @@ use remote_storage::GenericRemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use super::{ index::{IndexPart, RemoteTimeline}, @@ -21,7 +21,7 @@ use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::me pub(super) async fn upload_index_part( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, index_part: IndexPart, ) -> anyhow::Result<()> { let index_part_bytes = serde_json::to_vec(&index_part) @@ -58,7 +58,7 @@ pub(super) async fn upload_timeline_layers<'a>( storage: &'a GenericRemoteStorage, sync_queue: &SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, - sync_id: ZTenantTimelineId, + sync_id: TenantTimelineId, mut upload_data: SyncData, ) -> UploadedTimeline { let upload = &mut upload_data.data; @@ -213,7 +213,7 @@ mod tests { async fn regular_layer_upload() -> anyhow::Result<()> { let harness = TenantHarness::create("regular_layer_upload")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = 
ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; let storage = GenericRemoteStorage::new(LocalFs::new( @@ -301,7 +301,7 @@ mod tests { async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { let harness = TenantHarness::create("layer_upload_after_local_fs_update")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; let storage = GenericRemoteStorage::new(LocalFs::new( @@ -395,7 +395,7 @@ mod tests { #[tokio::test] async fn test_upload_index_part() -> anyhow::Result<()> { let harness = TenantHarness::create("test_upload_index_part")?; - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let storage = GenericRemoteStorage::new(LocalFs::new( tempdir()?.path().to_owned(), diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 2aa803d119..dad6e0039d 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -51,7 +51,7 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; use crate::shutdown_pageserver; @@ -210,8 +210,8 @@ pub enum TaskKind { #[derive(Default)] struct MutableTaskState { /// Tenant and timeline that this task is associated with. - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, /// Handle for waiting for the task to exit. It can be None, if the /// the task has already exited. 
@@ -238,8 +238,8 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, name: &str, shutdown_process_on_error: bool, future: F, @@ -371,7 +371,7 @@ async fn task_finish( } // expected to be called from the task of the given id. -pub fn associate_with(tenant_id: Option, timeline_id: Option) { +pub fn associate_with(tenant_id: Option, timeline_id: Option) { CURRENT_TASK.with(|ct| { let mut task_mut = ct.mutable.lock().unwrap(); task_mut.tenant_id = tenant_id; @@ -391,12 +391,12 @@ pub fn associate_with(tenant_id: Option, timeline_id: Option, - tenant_id: Option, - timeline_id: Option, + tenant_id: Option, + timeline_id: Option, ) { let mut victim_tasks = Vec::new(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4ef810faba..41fd98ec07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4,7 +4,7 @@ //! The functions here are responsible for locating the correct layer for the //! get/put call, walking back the timeline branching history as needed. //! -//! The files are stored in the .neon/tenants//timelines/ +//! The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. //! In addition to the layer files, there is a metadata file in the same //! directory that contains information about the timeline, in particular its @@ -48,8 +48,8 @@ use crate::CheckpointConfig; use toml_edit; use utils::{ crashsafe_dir, + id::{TenantId, TimelineId}, lsn::{Lsn, RecordLsn}, - zid::{ZTenantId, ZTimelineId}, }; mod blob_io; @@ -80,7 +80,7 @@ pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; -/// Parts of the `.neon/tenants//timelines/` directory prefix. +/// Parts of the `.neon/tenants//timelines/` directory prefix. 
pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// @@ -98,8 +98,8 @@ pub struct Tenant { // This is necessary to allow global config updates. tenant_conf: Arc>, - tenant_id: ZTenantId, - timelines: Mutex>>, + tenant_id: TenantId, + timelines: Mutex>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration (especially with enforced checkpoint) @@ -134,7 +134,7 @@ pub enum TenantState { impl Tenant { /// Get Timeline handle for given zenith timeline ID. /// This function is idempotent. It doesn't change internal state in any way. - pub fn get_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result> { + pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { self.timelines .lock() .unwrap() @@ -151,7 +151,7 @@ impl Tenant { /// Lists timelines the tenant contains. /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. - pub fn list_timelines(&self) -> Vec<(ZTimelineId, Arc)> { + pub fn list_timelines(&self) -> Vec<(TimelineId, Arc)> { self.timelines .lock() .unwrap() @@ -164,7 +164,7 @@ impl Tenant { /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. pub fn create_empty_timeline( &self, - new_timeline_id: ZTimelineId, + new_timeline_id: TimelineId, initdb_lsn: Lsn, ) -> Result> { // XXX: keep the lock to avoid races during timeline creation @@ -207,8 +207,8 @@ impl Tenant { /// Branch a timeline pub fn branch_timeline( &self, - src: ZTimelineId, - dst: ZTimelineId, + src: TimelineId, + dst: TimelineId, start_lsn: Option, ) -> Result> { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn @@ -302,14 +302,14 @@ impl Tenant { /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. 
/// - /// 'timelineid' specifies the timeline to GC, or None for all. + /// 'target_timeline_id' specifies the timeline to GC, or None for all. /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC /// to make tests more deterministic. /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? pub fn gc_iteration( &self, - target_timeline_id: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, @@ -337,13 +337,13 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) + .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone())) .collect::>(); drop(timelines); - for (timelineid, timeline) in &timelines_to_compact { + for (timeline_id, timeline) in &timelines_to_compact { let _entered = - info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered(); + info_span!("compact", timeline = %timeline_id, tenant = %self.tenant_id).entered(); timeline.compact()?; } @@ -362,13 +362,13 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); let timelines_to_compact = timelines .iter() - .map(|(timelineid, timeline)| (*timelineid, Arc::clone(timeline))) + .map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline))) .collect::>(); drop(timelines); - for (timelineid, timeline) in &timelines_to_compact { + for (timeline_id, timeline) in &timelines_to_compact { let _entered = - info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenant_id) + info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id) .entered(); timeline.checkpoint(CheckpointConfig::Flush)?; } @@ -377,7 +377,7 @@ impl Tenant { } /// Removes timeline-related in-memory data - pub fn delete_timeline(&self, timeline_id: ZTimelineId) -> 
anyhow::Result<()> { + pub fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { // in order to be retriable detach needs to be idempotent // (or at least to a point that each time the detach is called it can make progress) let mut timelines = self.timelines.lock().unwrap(); @@ -416,7 +416,7 @@ impl Tenant { pub fn init_attach_timelines( &self, - timelines: HashMap, + timelines: HashMap, ) -> anyhow::Result<()> { let sorted_timelines = if timelines.len() == 1 { timelines.into_iter().collect() @@ -505,13 +505,13 @@ impl Tenant { /// perform a topological sort, so that the parent of each timeline comes /// before the children. fn tree_sort_timelines( - timelines: HashMap, -) -> Result> { + timelines: HashMap, +) -> Result> { let mut result = Vec::with_capacity(timelines.len()); let mut now = Vec::with_capacity(timelines.len()); // (ancestor, children) - let mut later: HashMap> = + let mut later: HashMap> = HashMap::with_capacity(timelines.len()); for (timeline_id, metadata) in timelines { @@ -636,9 +636,9 @@ impl Tenant { fn initialize_new_timeline( &self, - new_timeline_id: ZTimelineId, + new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard>>, + timelines: &mut MutexGuard>>, ) -> anyhow::Result> { let ancestor = match new_metadata.ancestor_timeline() { Some(ancestor_timeline_id) => Some( @@ -680,7 +680,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_conf: TenantConfOpt, walredo_mgr: Arc, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: RemoteIndex, upload_layers: bool, ) -> Tenant { @@ -701,7 +701,7 @@ impl Tenant { /// Locate and load config pub fn load_tenant_config( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result { let target_config_path = TenantConf::path(conf, tenant_id); let target_config_display = target_config_path.display(); @@ -830,7 +830,7 @@ impl Tenant { // we do. 
fn gc_iteration_internal( &self, - target_timeline_id: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, @@ -848,7 +848,7 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); + let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new(); let timeline_ids = { if let Some(target_timeline_id) = target_timeline_id.as_ref() { if timelines.get(target_timeline_id).is_none() { @@ -861,11 +861,11 @@ impl Tenant { .map(|(timeline_id, timeline_entry)| { // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. - // Somewhat related: https://github.com/zenithdb/zenith/issues/999 + // Somewhat related: https://github.com/neondatabase/neon/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timeline_id { - if ancestor_timeline_id == &timelineid { + if let Some(timeline_id) = target_timeline_id { + if ancestor_timeline_id == &timeline_id { all_branchpoints.insert(( *ancestor_timeline_id, timeline_entry.get_ancestor_lsn(), @@ -895,8 +895,8 @@ impl Tenant { .with_context(|| format!("Timeline {timeline_id} was not found"))?; // If target_timeline is specified, ignore all other timelines - if let Some(target_timelineid) = target_timeline_id { - if timeline_id != target_timelineid { + if let Some(target_timeline_id) = target_timeline_id { + if timeline_id != target_timeline_id { continue; } } @@ -952,7 +952,7 @@ impl Tenant { Ok(totals) } - pub fn tenant_id(&self) -> ZTenantId { + pub fn tenant_id(&self) -> TenantId { self.tenant_id } } @@ -998,7 +998,7 @@ pub mod harness { config::PageServerConf, repository::Key, tenant::Tenant, - 
walrecord::ZenithWalRecord, + walrecord::NeonWalRecord, walredo::{WalRedoError, WalRedoManager}, }; @@ -1006,12 +1006,12 @@ pub mod harness { use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::zid::{ZTenantId, ZTimelineId}; + use utils::id::{TenantId, TimelineId}; - pub const TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const NEW_TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + pub const NEW_TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("AA223344556677881122334455667788")); /// Convenience function to create a page image with given string as the only content #[allow(non_snake_case)] @@ -1047,7 +1047,7 @@ pub mod harness { pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, - pub tenant_id: ZTenantId, + pub tenant_id: TenantId, pub lock_guard: ( Option>, @@ -1080,7 +1080,7 @@ pub mod harness { let tenant_conf = TenantConf::dummy_conf(); - let tenant_id = ZTenantId::generate(); + let tenant_id = TenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; fs::create_dir_all(conf.timelines_path(&tenant_id))?; @@ -1113,7 +1113,7 @@ pub mod harness { .expect("should be able to read timelines dir") { let timeline_dir_entry = timeline_dir_entry?; - let timeline_id: ZTimelineId = timeline_dir_entry + let timeline_id: TimelineId = timeline_dir_entry .path() .file_name() .unwrap() @@ -1128,15 +1128,15 @@ pub mod harness { Ok(tenant) } - pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { + pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf { self.conf.timeline_path(timeline_id, &self.tenant_id) } } fn load_metadata( conf: &'static PageServerConf, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, + timeline_id: 
TimelineId, + tenant_id: TenantId, ) -> anyhow::Result { let metadata_path = metadata_path(conf, timeline_id, tenant_id); let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { @@ -1162,7 +1162,7 @@ pub mod harness { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result { let s = format!( "redo for {} to get to {}, with {} and {} records", @@ -1747,7 +1747,7 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { - let new_tline_id = ZTimelineId::generate(); + let new_tline_id = TimelineId::generate(); tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = tenant .get_timeline(new_tline_id) @@ -1808,7 +1808,7 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { - let new_tline_id = ZTimelineId::generate(); + let new_tline_id = TimelineId::generate(); tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = tenant .get_timeline(new_tline_id) diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index ff6d3652f9..892000c20b 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -7,7 +7,7 @@ //! must be page images or WAL records with the 'will_init' flag set, so that //! they can be replayed without referring to an older page version. //! -//! The delta files are stored in timelines/ directory. Currently, +//! The delta files are stored in timelines/ directory. Currently, //! there are no subdirectories, and each delta file is named like this: //! //! 
-__-, lsn_range: Range, @@ -81,8 +81,8 @@ impl From<&DeltaLayer> for Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: layer.tenantid, - timelineid: layer.timelineid, + tenant_id: layer.tenant_id, + timeline_id: layer.timeline_id, key_range: layer.key_range.clone(), lsn_range: layer.lsn_range.clone(), @@ -173,8 +173,8 @@ impl DeltaKey { pub struct DeltaLayer { path_or_conf: PathOrConf, - pub tenantid: ZTenantId, - pub timelineid: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub key_range: Range, pub lsn_range: Range, @@ -194,12 +194,12 @@ pub struct DeltaLayerInner { } impl Layer for DeltaLayer { - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -344,8 +344,8 @@ impl Layer for DeltaLayer { fn dump(&self, verbose: bool) -> Result<()> { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenantid, - self.timelineid, + self.tenant_id, + self.timeline_id, self.key_range.start, self.key_range.end, self.lsn_range.start, @@ -419,22 +419,22 @@ impl Layer for DeltaLayer { impl DeltaLayer { fn path_for( path_or_conf: &PathOrConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &DeltaFileName, ) -> PathBuf { match path_or_conf { PathOrConf::Path(path) => path.clone(), PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(fname.to_string()), } } fn temp_path_for( conf: &PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: &Range, ) -> PathBuf { @@ -444,7 +444,7 @@ impl DeltaLayer { .map(char::from) .collect(); - conf.timeline_path(&timelineid, 
&tenantid).join(format!( + conf.timeline_path(&timeline_id, &tenant_id).join(format!( "{}-XXX__{:016X}-{:016X}.{}.{}", key_start, u64::from(lsn_range.start), @@ -535,14 +535,14 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, filename: &DeltaFileName, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: filename.key_range.clone(), lsn_range: filename.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { @@ -568,8 +568,8 @@ impl DeltaLayer { Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timelineid: summary.timelineid, - tenantid: summary.tenantid, + timeline_id: summary.timeline_id, + tenant_id: summary.tenant_id, key_range: summary.key_range, lsn_range: summary.lsn_range, inner: RwLock::new(DeltaLayerInner { @@ -592,8 +592,8 @@ impl DeltaLayer { pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &self.layer_name(), ) } @@ -613,8 +613,8 @@ impl DeltaLayer { pub struct DeltaLayerWriter { conf: &'static PageServerConf, path: PathBuf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: Range, @@ -630,8 +630,8 @@ impl DeltaLayerWriter { /// pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_start: Key, lsn_range: Range, ) -> Result { @@ -641,7 +641,7 @@ impl DeltaLayerWriter { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? 
- let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range); + let path = DeltaLayer::temp_path_for(conf, timeline_id, tenant_id, key_start, &lsn_range); let mut file = VirtualFile::create(&path)?; // make room for the header block @@ -656,8 +656,8 @@ impl DeltaLayerWriter { Ok(DeltaLayerWriter { conf, path, - timelineid, - tenantid, + timeline_id, + tenant_id, key_start, lsn_range, tree: tree_builder, @@ -718,8 +718,8 @@ impl DeltaLayerWriter { let summary = Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), index_start_blk, @@ -733,8 +733,8 @@ impl DeltaLayerWriter { // set inner.file here. The first read will have to re-open it. let layer = DeltaLayer { path_or_conf: PathOrConf::Conf(self.conf), - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), inner: RwLock::new(DeltaLayerInner { @@ -753,8 +753,8 @@ impl DeltaLayerWriter { // FIXME: throw an error instead? 
let final_path = DeltaLayer::path_for( &PathOrConf::Conf(self.conf), - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &DeltaFileName { key_range: self.key_start..key_end, lsn_range: self.lsn_range, diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index c675e4e778..0774fa42a6 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -17,7 +17,7 @@ use std::ops::DerefMut; use std::path::PathBuf; use std::sync::{Arc, RwLock}; use tracing::*; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; use std::os::unix::fs::FileExt; @@ -39,8 +39,8 @@ pub struct EphemeralFiles { pub struct EphemeralFile { file_id: u64, - _tenantid: ZTenantId, - _timelineid: ZTimelineId, + _tenant_id: TenantId, + _timeline_id: TimelineId, file: Arc, pub size: u64, @@ -49,15 +49,15 @@ pub struct EphemeralFile { impl EphemeralFile { pub fn create( conf: &PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> Result { let mut l = EPHEMERAL_FILES.write().unwrap(); let file_id = l.next_file_id; l.next_file_id += 1; let filename = conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(PathBuf::from(format!("ephemeral-{}", file_id))); let file = VirtualFile::open_with_options( @@ -69,8 +69,8 @@ impl EphemeralFile { Ok(EphemeralFile { file_id, - _tenantid: tenantid, - _timelineid: timelineid, + _tenant_id: tenant_id, + _timeline_id: timeline_id, file: file_rc, size: 0, }) @@ -338,7 +338,7 @@ mod tests { fn harness( test_name: &str, - ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> { + ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); let conf = PageServerConf::dummy_conf(repo_dir); @@ -346,11 +346,11 @@ mod tests 
{ // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap(); - let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?; + let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); + fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; - Ok((conf, tenantid, timelineid)) + Ok((conf, tenant_id, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -368,9 +368,9 @@ mod tests { #[test] fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = harness("ephemeral_files")?; + let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; - let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; + let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -381,7 +381,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(conf, tenantid, timelineid)?; + let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -399,9 +399,9 @@ mod tests { #[test] fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenantid, timelineid) = harness("ephemeral_blobs")?; + let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index 518643241d..92bf022fee 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -4,7 +4,7 @@ //! but does not exist in the layer, does not exist. //! //! An image layer is stored in a file on disk. The file is stored in -//! timelines/ directory. Currently, there are no +//! timelines/ directory. Currently, there are no //! subdirectories, and each image layer file is named like this: //! //! -__ @@ -44,8 +44,8 @@ use tracing::*; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; /// @@ -56,12 +56,12 @@ use utils::{ /// #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct Summary { - /// Magic value to identify this as a zenith image file. Always IMAGE_FILE_MAGIC. + /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC. 
magic: u16, format_version: u16, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, key_range: Range, lsn: Lsn, @@ -77,8 +77,8 @@ impl From<&ImageLayer> for Summary { Self { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: layer.tenantid, - timelineid: layer.timelineid, + tenant_id: layer.tenant_id, + timeline_id: layer.timeline_id, key_range: layer.key_range.clone(), lsn: layer.lsn, @@ -97,8 +97,8 @@ impl From<&ImageLayer> for Summary { /// pub struct ImageLayer { path_or_conf: PathOrConf, - pub tenantid: ZTenantId, - pub timelineid: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub key_range: Range, // This entry contains an image of all pages as of this LSN @@ -128,12 +128,12 @@ impl Layer for ImageLayer { Some(self.path()) } - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -202,7 +202,7 @@ impl Layer for ImageLayer { fn dump(&self, verbose: bool) -> Result<()> { println!( "----- image layer for ten {} tli {} key {}-{} at {} ----", - self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn + self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn ); if !verbose { @@ -228,22 +228,22 @@ impl Layer for ImageLayer { impl ImageLayer { fn path_for( path_or_conf: &PathOrConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &ImageFileName, ) -> PathBuf { match path_or_conf { PathOrConf::Path(path) => path.to_path_buf(), PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) + .timeline_path(&timeline_id, &tenant_id) .join(fname.to_string()), } } fn temp_path_for( conf: &PageServerConf, - timelineid: ZTimelineId, 
- tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, fname: &ImageFileName, ) -> PathBuf { let rand_string: String = rand::thread_rng() @@ -252,7 +252,7 @@ impl ImageLayer { .map(char::from) .collect(); - conf.timeline_path(&timelineid, &tenantid) + conf.timeline_path(&timeline_id, &tenant_id) .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } @@ -336,14 +336,14 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, filename: &ImageFileName, ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: filename.key_range.clone(), lsn: filename.lsn, inner: RwLock::new(ImageLayerInner { @@ -369,8 +369,8 @@ impl ImageLayer { Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timelineid: summary.timelineid, - tenantid: summary.tenantid, + timeline_id: summary.timeline_id, + tenant_id: summary.tenant_id, key_range: summary.key_range, lsn: summary.lsn, inner: RwLock::new(ImageLayerInner { @@ -393,8 +393,8 @@ impl ImageLayer { pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &self.layer_name(), ) } @@ -414,8 +414,8 @@ impl ImageLayer { pub struct ImageLayerWriter { conf: &'static PageServerConf, path: PathBuf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_range: Range, lsn: Lsn, @@ -426,8 +426,8 @@ pub struct ImageLayerWriter { impl ImageLayerWriter { pub fn new( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, key_range: &Range, lsn: Lsn, ) -> anyhow::Result { @@ -435,8 +435,8 @@ impl ImageLayerWriter { // We'll atomically rename it to the final 
name when we're done. let path = ImageLayer::temp_path_for( conf, - timelineid, - tenantid, + timeline_id, + tenant_id, &ImageFileName { key_range: key_range.clone(), lsn, @@ -458,8 +458,8 @@ impl ImageLayerWriter { let writer = ImageLayerWriter { conf, path, - timelineid, - tenantid, + timeline_id, + tenant_id, key_range: key_range.clone(), lsn, tree: tree_builder, @@ -502,8 +502,8 @@ impl ImageLayerWriter { let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenantid: self.tenantid, - timelineid: self.timelineid, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, key_range: self.key_range.clone(), lsn: self.lsn, index_start_blk, @@ -517,8 +517,8 @@ impl ImageLayerWriter { // set inner.file here. The first read will have to re-open it. let layer = ImageLayer { path_or_conf: PathOrConf::Conf(self.conf), - timelineid: self.timelineid, - tenantid: self.tenantid, + timeline_id: self.timeline_id, + tenant_id: self.tenant_id, key_range: self.key_range.clone(), lsn: self.lsn, inner: RwLock::new(ImageLayerInner { @@ -538,8 +538,8 @@ impl ImageLayerWriter { // FIXME: throw an error instead? let final_path = ImageLayer::path_for( &PathOrConf::Conf(self.conf), - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, &ImageFileName { key_range: self.key_range.clone(), lsn: self.lsn, diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs index 0e7b215b1e..9aa33a72ca 100644 --- a/pageserver/src/tenant/inmemory_layer.rs +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -18,9 +18,9 @@ use std::collections::HashMap; use tracing::*; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, vec_map::VecMap, - zid::{ZTenantId, ZTimelineId}, }; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods @@ -37,8 +37,8 @@ thread_local! 
{ pub struct InMemoryLayer { conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, /// /// This layer contains all the changes from 'start_lsn'. The @@ -94,12 +94,12 @@ impl Layer for InMemoryLayer { None } - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id } fn get_key_range(&self) -> Range { @@ -197,7 +197,7 @@ impl Layer for InMemoryLayer { println!( "----- in-memory layer for tli {} LSNs {}-{} ----", - self.timelineid, self.start_lsn, end_str, + self.timeline_id, self.start_lsn, end_str, ); if !verbose { @@ -251,22 +251,18 @@ impl InMemoryLayer { /// pub fn create( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, start_lsn: Lsn, ) -> Result { - trace!( - "initializing new empty InMemoryLayer for writing on timeline {} at {}", - timelineid, - start_lsn - ); + trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenantid, timelineid)?; + let file = EphemeralFile::create(conf, tenant_id, timeline_id)?; Ok(InMemoryLayer { conf, - timelineid, - tenantid, + timeline_id, + tenant_id, start_lsn, inner: RwLock::new(InMemoryLayerInner { end_lsn: None, @@ -281,7 +277,7 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. 
/// Adds the page version to the in-memory tree pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); @@ -344,8 +340,8 @@ impl InMemoryLayer { let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, - self.timelineid, - self.tenantid, + self.timeline_id, + self.tenant_id, Key::MIN, self.start_lsn..inner.end_lsn.unwrap(), )?; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c24e3976fb..8abeebf54c 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -2,7 +2,7 @@ //! The layer map tracks what layers exist in a timeline. //! //! When the timeline is first accessed, the server lists of all layer files -//! in the timelines/ directory, and populates this map with +//! in the timelines/ directory, and populates this map with //! ImageLayer and DeltaLayer structs corresponding to each file. When the first //! new WAL record is received, we create an InMemoryLayer to hold the incoming //! records. 
Now and then, in the checkpoint() function, the in-memory layer is diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 4ea2b7d55b..ace4dc91e9 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -15,8 +15,8 @@ use serde::{Deserialize, Serialize}; use tracing::info_span; use utils::{ bin_ser::BeSer, + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; @@ -63,7 +63,7 @@ struct TimelineMetadataBody { // doing a clean shutdown, so that there is no more WAL beyond // 'disk_consistent_lsn' prev_record_lsn: Option, - ancestor_timeline: Option, + ancestor_timeline: Option, ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, @@ -73,7 +73,7 @@ impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, prev_record_lsn: Option, - ancestor_timeline: Option, + ancestor_timeline: Option, ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, @@ -149,7 +149,7 @@ impl TimelineMetadata { self.body.prev_record_lsn } - pub fn ancestor_timeline(&self) -> Option { + pub fn ancestor_timeline(&self) -> Option { self.body.ancestor_timeline } @@ -170,23 +170,23 @@ impl TimelineMetadata { /// where certain timeline's metadata file should be located. 
pub fn metadata_path( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, ) -> PathBuf { - conf.timeline_path(&timelineid, &tenantid) + conf.timeline_path(&timeline_id, &tenant_id) .join(METADATA_FILE_NAME) } /// Save timeline metadata to file pub fn save_metadata( conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, data: &TimelineMetadata, first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timelineid, tenantid); + let path = metadata_path(conf, timeline_id, tenant_id); // use OpenOptions to ensure file presence is consistent with first_save let mut file = VirtualFile::open_with_options( &path, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e10330bdd3..8dafcab124 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -3,15 +3,15 @@ //! 
use crate::repository::{Key, Value}; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; use std::ops::Range; use std::path::PathBuf; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; pub fn range_overlaps(a: &Range, b: &Range) -> bool @@ -50,7 +50,7 @@ where /// #[derive(Debug)] pub struct ValueReconstructState { - pub records: Vec<(Lsn, ZenithWalRecord)>, + pub records: Vec<(Lsn, NeonWalRecord)>, pub img: Option<(Lsn, Bytes)>, } @@ -84,10 +84,10 @@ pub enum ValueReconstructResult { /// LSN /// pub trait Layer: Send + Sync { - fn get_tenant_id(&self) -> ZTenantId; + fn get_tenant_id(&self) -> TenantId; /// Identify the timeline this layer belongs to - fn get_timeline_id(&self) -> ZTimelineId; + fn get_timeline_id(&self) -> TimelineId; /// Range of keys that this layer covers fn get_key_range(&self) -> Range; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c96ad99909..e821ef1b9a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -39,10 +39,10 @@ use crate::tenant_config::TenantConfOpt; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ + id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, simple_rcu::{Rcu, RcuReadGuard}, - zid::{ZTenantId, ZTimelineId}, }; use crate::repository::GcResult; @@ -58,8 +58,8 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - pub tenant_id: ZTenantId, - pub timeline_id: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub layers: RwLock, @@ -312,7 +312,7 @@ impl Timeline { } /// Get the ancestor's timeline id - pub fn get_ancestor_timeline_id(&self) -> Option { + pub fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id) @@ -531,8 +531,8 @@ impl Timeline { tenant_conf: Arc>, metadata: TimelineMetadata, 
ancestor: Option>, - timeline_id: ZTimelineId, - tenant_id: ZTenantId, + timeline_id: TimelineId, + tenant_id: TenantId, walredo_mgr: Arc, upload_layers: bool, ) -> Timeline { @@ -1250,7 +1250,7 @@ impl Timeline { None }; - let ancestor_timelineid = self + let ancestor_timeline_id = self .ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id); @@ -1258,7 +1258,7 @@ impl Timeline { let metadata = TimelineMetadata::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timelineid, + ancestor_timeline_id, self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 73bf3636d2..4448ffc456 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; use std::path::PathBuf; use std::time::Duration; -use utils::zid::ZTenantId; +use utils::id::TenantId; pub const TENANT_CONFIG_NAME: &str = "config"; @@ -217,8 +217,8 @@ impl TenantConf { /// Points to a place in pageserver's local directory, /// where certain tenant's tenantconf file should be located. 
- pub fn path(conf: &'static PageServerConf, tenantid: ZTenantId) -> PathBuf { - conf.tenant_path(&tenantid).join(TENANT_CONFIG_NAME) + pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf { + conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) } #[cfg(test)] diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index a8a9926c77..d6fa843305 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -27,7 +27,7 @@ use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; use utils::crashsafe_dir; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::{TenantId, TimelineId}; mod tenants_state { use once_cell::sync::Lazy; @@ -35,20 +35,20 @@ mod tenants_state { collections::HashMap, sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; - use utils::zid::ZTenantId; + use utils::id::TenantId; use crate::tenant::Tenant; - static TENANTS: Lazy>>> = + static TENANTS: Lazy>>> = Lazy::new(|| RwLock::new(HashMap::new())); - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { + pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { TENANTS .read() .expect("Failed to read() tenants lock, it got poisoned") } - pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { + pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { TENANTS .write() .expect("Failed to write() tenants lock, it got poisoned") @@ -159,7 +159,7 @@ pub fn attach_local_tenants( fn load_local_tenant( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: &RemoteIndex, ) -> Arc { let tenant = Arc::new(Tenant::new( @@ -225,7 +225,7 @@ pub async fn shutdown_all_tenants() { fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { let target_tenant_directory = conf.tenant_path(&tenant_id); anyhow::ensure!( @@ -310,9 
+310,9 @@ fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyho pub fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, remote_index: RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result> { match tenants_state::write_tenants().entry(tenant_id) { hash_map::Entry::Occupied(_) => { debug!("tenant {tenant_id} already exists"); @@ -339,7 +339,7 @@ pub fn create_tenant( pub fn update_tenant_config( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); @@ -349,7 +349,7 @@ pub fn update_tenant_config( /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. 
-pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result> { +pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) @@ -361,7 +361,7 @@ pub fn get_tenant(tenant_id: ZTenantId, active_only: bool) -> anyhow::Result anyhow::Result<()> { +pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { // Start with the shutdown of timeline tasks (this shuts down the walreceiver) // It is important that we do not take locks here, and do not check whether the timeline exists // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join @@ -398,7 +398,7 @@ pub async fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> pub async fn detach_tenant( conf: &'static PageServerConf, - tenant_id: ZTenantId, + tenant_id: TenantId, ) -> anyhow::Result<()> { let tenant = match { let mut tenants_accessor = tenants_state::write_tenants(); @@ -565,14 +565,14 @@ fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, ) -> anyhow::Result<( - ZTenantId, - HashMap)>, + TenantId, + HashMap)>, )> { let tenant_id = tenant_path .file_name() .and_then(OsStr::to_str) .unwrap_or_default() - .parse::() + .parse::() .context("Could not parse tenant id out of the tenant dir name")?; let timelines_dir = config.timelines_path(&tenant_id); @@ -644,7 +644,7 @@ fn collect_timelines_for_tenant( // NOTE: ephemeral files are excluded from the list fn collect_timeline_files( timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet)> { +) -> anyhow::Result<(TimelineId, TimelineMetadata, HashSet)> { let mut timeline_files = HashSet::new(); let mut timeline_metadata_path = None; @@ -652,7 +652,7 @@ fn collect_timeline_files( .file_name() .and_then(OsStr::to_str) .unwrap_or_default() - .parse::() + .parse::() .context("Could not parse timeline id out of the timeline 
dir name")?; let timeline_dir_entries = fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index 3ef54838af..c543a0ecb1 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -10,9 +10,9 @@ use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::{Tenant, TenantState}; use crate::tenant_mgr; use tracing::*; -use utils::zid::ZTenantId; +use utils::id::TenantId; -pub fn start_background_loops(tenant_id: ZTenantId) { +pub fn start_background_loops(tenant_id: TenantId) { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -42,9 +42,8 @@ pub fn start_background_loops(tenant_id: ZTenantId) { /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: ZTenantId) { +async fn compaction_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting compaction loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { @@ -90,9 +89,8 @@ async fn compaction_loop(tenant_id: ZTenantId) { /// /// GC task's main loop /// -async fn gc_loop(tenant_id: ZTenantId) { +async fn gc_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting gc loop for {tenant_id}"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { @@ -138,7 +136,7 @@ async fn gc_loop(tenant_id: ZTenantId) { } async fn wait_for_active_tenant( - tenant_id: ZTenantId, + tenant_id: TenantId, wait: Duration, ) -> ControlFlow<(), Arc> { let tenant = loop { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 69d14babf0..88b26e18f4 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -14,8 +14,8 @@ use tracing::*; use remote_storage::path_with_suffix_extension; use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; use crate::config::PageServerConf; @@ -61,8 
+61,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // fn bootstrap_timeline( conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, tenant: &Tenant, ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` @@ -115,12 +115,12 @@ fn bootstrap_timeline( /// pub(crate) async fn create_timeline( conf: &'static PageServerConf, - tenant_id: ZTenantId, - new_timeline_id: Option, - ancestor_timeline_id: Option, + tenant_id: TenantId, + new_timeline_id: Option, + ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, ) -> Result>> { - let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); + let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); let tenant = tenant_mgr::get_tenant(tenant_id, true)?; if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 7a2c699b44..896c2603a2 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -53,8 +53,8 @@ pub struct VirtualFile { pub path: PathBuf, open_options: OpenOptions, - tenantid: String, - timelineid: String, + tenant_id: String, + timeline_id: String, } #[derive(Debug, PartialEq, Clone, Copy)] @@ -149,7 +149,7 @@ impl OpenFiles { // old file. // if let Some(old_file) = slot_guard.file.take() { - // We do not have information about tenantid/timelineid of evicted file. + // We do not have information about tenant_id/timeline_id of evicted file. // It is possible to store path together with file or use filepath crate, // but as far as close() is not expected to be fast, it is not so critical to gather // precise per-tenant statistic here. 
@@ -197,18 +197,18 @@ impl VirtualFile { ) -> Result { let path_str = path.to_string_lossy(); let parts = path_str.split('/').collect::>(); - let tenantid; - let timelineid; + let tenant_id; + let timeline_id; if parts.len() > 5 && parts[parts.len() - 5] == "tenants" { - tenantid = parts[parts.len() - 4].to_string(); - timelineid = parts[parts.len() - 2].to_string(); + tenant_id = parts[parts.len() - 4].to_string(); + timeline_id = parts[parts.len() - 2].to_string(); } else { - tenantid = "*".to_string(); - timelineid = "*".to_string(); + tenant_id = "*".to_string(); + timeline_id = "*".to_string(); } let (handle, mut slot_guard) = get_open_files().find_victim_slot(); let file = STORAGE_IO_TIME - .with_label_values(&["open", &tenantid, &timelineid]) + .with_label_values(&["open", &tenant_id, &timeline_id]) .observe_closure_duration(|| open_options.open(path))?; // Strip all options other than read and write. @@ -226,8 +226,8 @@ impl VirtualFile { pos: 0, path: path.to_path_buf(), open_options: reopen_options, - tenantid, - timelineid, + tenant_id, + timeline_id, }; slot_guard.file.replace(file); @@ -267,7 +267,7 @@ impl VirtualFile { // Found a cached file descriptor. slot.recently_used.store(true, Ordering::Relaxed); return Ok(STORAGE_IO_TIME - .with_label_values(&[op, &self.tenantid, &self.timelineid]) + .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(file))); } } @@ -294,7 +294,7 @@ impl VirtualFile { // Open the physical file let file = STORAGE_IO_TIME - .with_label_values(&["open", &self.tenantid, &self.timelineid]) + .with_label_values(&["open", &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| self.open_options.open(&self.path))?; // Perform the requested operation on it @@ -308,7 +308,7 @@ impl VirtualFile { // may deadlock on subsequent read calls. // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. 
let result = STORAGE_IO_TIME - .with_label_values(&[op, &self.tenantid, &self.timelineid]) + .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(&file)); // Store the File in the slot and update the handle in the VirtualFile @@ -333,11 +333,11 @@ impl Drop for VirtualFile { if slot_guard.tag == handle.tag { slot.recently_used.store(false, Ordering::Relaxed); // Unlike files evicted by replacement algorithm, here - // we group close time by tenantid/timelineid. + // we group close time by tenant_id/timeline_id. // At allows to compare number/time of "normal" file closes // with file eviction. STORAGE_IO_TIME - .with_label_values(&["close", &self.tenantid, &self.timelineid]) + .with_label_values(&["close", &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| slot_guard.file.take()); } } @@ -399,7 +399,7 @@ impl FileExt for VirtualFile { let result = self.with_file("read", |file| file.read_at(buf, offset))?; if let Ok(size) = result { STORAGE_IO_SIZE - .with_label_values(&["read", &self.tenantid, &self.timelineid]) + .with_label_values(&["read", &self.tenant_id, &self.timeline_id]) .add(size as i64); } result @@ -409,7 +409,7 @@ impl FileExt for VirtualFile { let result = self.with_file("write", |file| file.write_at(buf, offset))?; if let Ok(size) = result { STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenantid, &self.timelineid]) + .with_label_values(&["write", &self.tenant_id, &self.timeline_id]) .add(size as i64); } result diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 45d0916dec..bede4ac13e 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1,5 +1,5 @@ //! -//! Parse PostgreSQL WAL records and store them in a zenith Timeline. +//! Parse PostgreSQL WAL records and store them in a neon Timeline. //! //! The pipeline for ingesting WAL looks like this: //! @@ -9,7 +9,7 @@ //! and decodes it to individual WAL records. It feeds the WAL records //! 
to WalIngest, which parses them and stores them in the Repository. //! -//! The zenith Repository can store page versions in two formats: as +//! The neon Repository can store page versions in two formats: as //! page images, or a WAL records. WalIngest::ingest_record() extracts //! page images out of some WAL records, but most it stores as WAL //! records. If a WAL record modifies multiple pages, WalIngest @@ -315,7 +315,7 @@ impl<'a> WalIngest<'a> { assert_eq!(image.len(), BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { - let rec = ZenithWalRecord::Postgres { + let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; @@ -428,7 +428,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, new_vm_blk.unwrap(), - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -442,7 +442,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, new_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -454,7 +454,7 @@ impl<'a> WalIngest<'a> { modification, vm_rel, old_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno: None, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, @@ -642,12 +642,12 @@ impl<'a> WalIngest<'a> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { + NeonWalRecord::ClogSetCommitted { xids: page_xids, timestamp: parsed.xact_time, } } else { - ZenithWalRecord::ClogSetAborted { xids: page_xids } + NeonWalRecord::ClogSetAborted { xids: page_xids } }, )?; page_xids = Vec::new(); @@ -662,12 +662,12 @@ impl<'a> WalIngest<'a> { segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { + NeonWalRecord::ClogSetCommitted { xids: 
page_xids, timestamp: parsed.xact_time, } } else { - ZenithWalRecord::ClogSetAborted { xids: page_xids } + NeonWalRecord::ClogSetAborted { xids: page_xids } }, )?; @@ -760,7 +760,7 @@ impl<'a> WalIngest<'a> { SlruKind::MultiXactOffsets, segno, rpageno, - ZenithWalRecord::MultixactOffsetCreate { + NeonWalRecord::MultixactOffsetCreate { mid: xlrec.mid, moff: xlrec.moff, }, @@ -794,7 +794,7 @@ impl<'a> WalIngest<'a> { SlruKind::MultiXactMembers, pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, pageno % pg_constants::SLRU_PAGES_PER_SEGMENT, - ZenithWalRecord::MultixactMembersCreate { + NeonWalRecord::MultixactMembersCreate { moff: offset, members: this_page_members, }, @@ -901,7 +901,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, - rec: ZenithWalRecord, + rec: NeonWalRecord, ) -> Result<()> { self.handle_rel_extend(modification, rel, blknum)?; modification.put_rel_wal_record(rel, blknum, rec)?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 69e400f291..1e4b4e7d52 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -34,8 +34,8 @@ use crate::{ DEFAULT_MAX_BACKOFF_SECONDS, }; use utils::{ + id::{NodeId, TenantTimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantTimelineId}, }; use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; @@ -101,7 +101,7 @@ async fn connection_manager_loop_step( etcd_client: &mut Client, walreceiver_state: &mut WalreceiverState, ) { - let id = ZTenantTimelineId { + let id = TenantTimelineId { tenant_id: walreceiver_state.timeline.tenant_id, timeline_id: walreceiver_state.timeline.timeline_id, }; @@ -230,7 +230,7 @@ fn cleanup_broker_connection( async fn subscribe_for_timeline_updates( etcd_client: &mut Client, broker_prefix: &str, - id: ZTenantTimelineId, + id: TenantTimelineId, ) -> BrokerSubscription { let mut attempt = 0; loop { @@ 
-266,7 +266,7 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { - id: ZTenantTimelineId, + id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, @@ -331,7 +331,7 @@ impl WalreceiverState { lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, ) -> Self { - let id = ZTenantTimelineId { + let id = TenantTimelineId { tenant_id: timeline.tenant_id, timeline_id: timeline.timeline_id, }; @@ -746,10 +746,10 @@ enum ReconnectReason { } fn wal_stream_connection_string( - ZTenantTimelineId { + TenantTimelineId { tenant_id, timeline_id, - }: ZTenantTimelineId, + }: TenantTimelineId, listen_pg_addr_str: &str, ) -> anyhow::Result { let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); @@ -760,7 +760,7 @@ fn wal_stream_connection_string( })?; let (host, port) = utils::connstring::connection_host_port(&me_conf); Ok(format!( - "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + "host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" )) } @@ -1355,7 +1355,7 @@ mod tests { fn dummy_state(harness: &TenantHarness) -> WalreceiverState { WalreceiverState { - id: ZTenantTimelineId { + id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 6f1fbc2c9d..29c4cea882 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -30,7 +30,7 @@ use crate::{ walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use utils::zid::ZTenantTimelineId; +use utils::id::TenantTimelineId; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; /// Status of 
the connection. @@ -288,7 +288,7 @@ pub async fn handle_walreceiver_connection( .await // here we either do not have this timeline in remote index // or there were no checkpoints for it yet - .timeline_entry(&ZTenantTimelineId { + .timeline_entry(&TenantTimelineId { tenant_id, timeline_id, }) @@ -316,7 +316,7 @@ pub async fn handle_walreceiver_connection( }; *timeline.last_received_wal.lock().unwrap() = Some(last_received_wal); - // Send zenith feedback message. + // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let status_update = ReplicationFeedback { current_timeline_size: timeline @@ -328,7 +328,7 @@ pub async fn handle_walreceiver_connection( ps_replytime: ts, }; - debug!("zenith_status_update {status_update:?}"); + debug!("neon_status_update {status_update:?}"); let mut data = BytesMut::new(); status_update.serialize(&mut data)?; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index c718a4c30c..dbf9bf9d33 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -13,10 +13,10 @@ use serde::{Deserialize, Serialize}; use tracing::*; use utils::bin_ser::DeserializeError; -/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper -/// around a PostgreSQL WAL record, or a custom zenith-specific "record". +/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper +/// around a PostgreSQL WAL record, or a custom neon-specific "record". #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum ZenithWalRecord { +pub enum NeonWalRecord { /// Native PostgreSQL WAL record Postgres { will_init: bool, rec: Bytes }, @@ -45,14 +45,14 @@ pub enum ZenithWalRecord { }, } -impl ZenithWalRecord { +impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? 
pub fn will_init(&self) -> bool { match self { - ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init, + NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - // None of the special zenith record types currently initialize the page + // None of the special neon record types currently initialize the page _ => false, } } @@ -767,9 +767,9 @@ pub fn decode_wal_record( /// Build a human-readable string to describe a WAL record /// /// For debugging purposes -pub fn describe_wal_record(rec: &ZenithWalRecord) -> Result { +pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { match rec { - ZenithWalRecord::Postgres { will_init, rec } => Ok(format!( + NeonWalRecord::Postgres { will_init, rec } => Ok(format!( "will_init: {}, {}", will_init, describe_postgres_wal_record(rec)? diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index dd946659bb..9faabfebda 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -36,7 +36,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId}; +use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, @@ -44,7 +44,7 @@ use crate::metrics::{ use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; -use crate::walrecord::ZenithWalRecord; +use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, @@ -81,7 +81,7 @@ pub trait WalRedoManager: Send + Sync { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result; } @@ -93,20 +93,20 @@ pub trait 
WalRedoManager: Send + Sync { /// records. /// pub struct PostgresRedoManager { - tenantid: ZTenantId, + tenant_id: TenantId, conf: &'static PageServerConf, process: Mutex>, } -/// Can this request be served by zenith redo functions +/// Can this request be served by neon redo functions /// or we need to pass it to wal-redo postgres process? -fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { +fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { // Currently, we don't have bespoken Rust code to replay any - // Postgres WAL records. But everything else is handled in zenith. + // Postgres WAL records. But everything else is handled in neon. #[allow(clippy::match_like_matches_macro)] match rec { - ZenithWalRecord::Postgres { + NeonWalRecord::Postgres { will_init: _, rec: _, } => false, @@ -143,7 +143,7 @@ impl WalRedoManager for PostgresRedoManager { key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, ) -> Result { if records.is_empty() { error!("invalid WAL redo request with no records"); @@ -151,14 +151,14 @@ impl WalRedoManager for PostgresRedoManager { } let mut img: Option = base_img; - let mut batch_zenith = can_apply_in_zenith(&records[0].1); + let mut batch_neon = can_apply_in_neon(&records[0].1); let mut batch_start = 0; for i in 1..records.len() { - let rec_zenith = can_apply_in_zenith(&records[i].1); + let rec_neon = can_apply_in_neon(&records[i].1); - if rec_zenith != batch_zenith { - let result = if batch_zenith { - self.apply_batch_zenith(key, lsn, img, &records[batch_start..i]) + if rec_neon != batch_neon { + let result = if batch_neon { + self.apply_batch_neon(key, lsn, img, &records[batch_start..i]) } else { self.apply_batch_postgres( key, @@ -170,13 +170,13 @@ impl WalRedoManager for PostgresRedoManager { }; img = Some(result?); - batch_zenith = rec_zenith; + batch_neon = rec_neon; batch_start = i; } } // last batch - if batch_zenith { - self.apply_batch_zenith(key, lsn, img, 
&records[batch_start..]) + if batch_neon { + self.apply_batch_neon(key, lsn, img, &records[batch_start..]) } else { self.apply_batch_postgres( key, @@ -193,10 +193,10 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager { + pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { - tenantid, + tenant_id, conf, process: Mutex::new(None), } @@ -210,7 +210,7 @@ impl PostgresRedoManager { key: Key, lsn: Lsn, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; @@ -222,7 +222,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenantid)?; + let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -263,14 +263,14 @@ impl PostgresRedoManager { } /// - /// Process a batch of WAL records using bespoken Zenith code. + /// Process a batch of WAL records using bespoken Neon code. /// - fn apply_batch_zenith( + fn apply_batch_neon( &self, key: Key, lsn: Lsn, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], ) -> Result { let start_time = Instant::now(); @@ -280,13 +280,13 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. 
- error!("invalid zenith WAL redo request with no base image"); + error!("invalid neon WAL redo request with no base image"); return Err(WalRedoError::InvalidRequest); } // Apply all the WAL records in the batch for (record_lsn, record) in records.iter() { - self.apply_record_zenith(key, &mut page, *record_lsn, record)?; + self.apply_record_neon(key, &mut page, *record_lsn, record)?; } // Success! let end_time = Instant::now(); @@ -294,7 +294,7 @@ impl PostgresRedoManager { WAL_REDO_TIME.observe(duration.as_secs_f64()); debug!( - "zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}", + "neon applied {} WAL records in {} ms to reconstruct page image at LSN {}", records.len(), duration.as_micros(), lsn @@ -303,22 +303,22 @@ impl PostgresRedoManager { Ok(page.freeze()) } - fn apply_record_zenith( + fn apply_record_neon( &self, key: Key, page: &mut BytesMut, _record_lsn: Lsn, - record: &ZenithWalRecord, + record: &NeonWalRecord, ) -> Result<(), WalRedoError> { match record { - ZenithWalRecord::Postgres { + NeonWalRecord::Postgres { will_init: _, rec: _, } => { - error!("tried to pass postgres wal record to zenith WAL redo"); + error!("tried to pass postgres wal record to neon WAL redo"); return Err(WalRedoError::InvalidRequest); } - ZenithWalRecord::ClearVisibilityMapFlags { + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, flags, @@ -360,7 +360,7 @@ impl PostgresRedoManager { } // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. 
- ZenithWalRecord::ClogSetCommitted { xids, timestamp } => { + NeonWalRecord::ClogSetCommitted { xids, timestamp } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -410,7 +410,7 @@ impl PostgresRedoManager { ); } } - ZenithWalRecord::ClogSetAborted { xids } => { + NeonWalRecord::ClogSetAborted { xids } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -441,7 +441,7 @@ impl PostgresRedoManager { transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); } } - ZenithWalRecord::MultixactOffsetCreate { mid, moff } => { + NeonWalRecord::MultixactOffsetCreate { mid, moff } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -474,7 +474,7 @@ impl PostgresRedoManager { LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); } - ZenithWalRecord::MultixactMembersCreate { moff, members } => { + NeonWalRecord::MultixactMembersCreate { moff, members } => { let (slru_kind, segno, blknum) = key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; assert_eq!( @@ -570,7 +570,7 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenant_id: &ZTenantId) -> Result { + fn launch(conf: &PageServerConf, tenant_id: &TenantId) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. @@ -686,7 +686,7 @@ impl PostgresRedoProcess { &mut self, tag: BufferTag, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { // Serialize all the messages to send the WAL redo process first. 
@@ -700,7 +700,7 @@ impl PostgresRedoProcess { build_push_page_msg(tag, &img, &mut writebuf); } for (lsn, rec) in records.iter() { - if let ZenithWalRecord::Postgres { + if let NeonWalRecord::Postgres { will_init: _, rec: postgres_rec, } = rec @@ -709,7 +709,7 @@ impl PostgresRedoProcess { } else { return Err(Error::new( ErrorKind::Other, - "tried to pass zenith wal record to postgres WAL redo", + "tried to pass neon wal record to postgres WAL redo", )); } } diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c index 4926d759e8..bc0ee352b8 100644 --- a/pgxn/neon/inmem_smgr.c +++ b/pgxn/neon/inmem_smgr.c @@ -86,7 +86,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum) } /* - * inmem_create() -- Create a new relation on zenithd storage + * inmem_create() -- Create a new relation on neon storage * * If isRedo is true, it's okay for the relation to exist already. */ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 55285a6345..296865838d 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -30,13 +30,12 @@ #include "walproposer.h" #include "walproposer_utils.h" - #define PageStoreTrace DEBUG5 #define NEON_TAG "[NEON_SMGR] " -#define neon_log(tag, fmt, ...) ereport(tag, \ - (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ - errhidestmt(true), errhidecontext(true))) +#define neon_log(tag, fmt, ...) 
ereport(tag, \ + (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) bool connected = false; PGconn *pageserver_conn = NULL; @@ -65,7 +64,7 @@ pageserver_connect() errdetail_internal("%s", msg))); } - query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); ret = PQsendQuery(pageserver_conn, query); if (ret != 1) { @@ -169,7 +168,7 @@ pageserver_disconnect(void) } static void -pageserver_send(ZenithRequest *request) +pageserver_send(NeonRequest * request) { StringInfoData req_buff; @@ -205,18 +204,18 @@ pageserver_send(ZenithRequest *request) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) request); + char *msg = zm_to_string((NeonMessage *) request); neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); } } -static ZenithResponse * +static NeonResponse * pageserver_receive(void) { StringInfoData resp_buff; - ZenithResponse *resp; + NeonResponse *resp; PG_TRY(); { @@ -236,7 +235,7 @@ pageserver_receive(void) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) resp); + char *msg = zm_to_string((NeonMessage *) resp); neon_log(PageStoreTrace, "got response: %s", msg); pfree(msg); @@ -249,7 +248,7 @@ pageserver_receive(void) } PG_END_TRY(); - return (ZenithResponse *) resp; + return (NeonResponse *) resp; } @@ -265,8 +264,8 @@ pageserver_flush(void) } } -static ZenithResponse * -pageserver_call(ZenithRequest *request) +static NeonResponse * +pageserver_call(NeonRequest * request) { pageserver_send(request); pageserver_flush(); @@ -281,7 +280,7 @@ page_server_api api = { }; static bool -check_zenith_id(char **newval, void **extra, GucSource source) +check_neon_id(char **newval, void **extra, GucSource source) { uint8 zid[16]; @@ -403,22 +402,22 @@ pg_init_libpagestore(void) NULL, NULL, NULL); DefineCustomStringVariable("neon.timeline_id", - "Zenith 
timelineid the server is running on", + "Neon timeline_id the server is running on", NULL, - &zenith_timeline, + &neon_timeline, "", PGC_POSTMASTER, 0, /* no flags required */ - check_zenith_id, NULL, NULL); + check_neon_id, NULL, NULL); DefineCustomStringVariable("neon.tenant_id", - "Neon tenantid the server is running on", + "Neon tenant_id the server is running on", NULL, - &zenith_tenant, + &neon_tenant, "", PGC_POSTMASTER, 0, /* no flags required */ - check_zenith_id, NULL, NULL); + check_neon_id, NULL, NULL); DefineCustomBoolVariable("neon.wal_redo", "start in wal-redo mode", @@ -450,8 +449,8 @@ pg_init_libpagestore(void) page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); /* Is there more correct way to pass CustomGUC to postgres code? */ - zenith_timeline_walproposer = zenith_timeline; - zenith_tenant_walproposer = zenith_tenant; + neon_timeline_walproposer = neon_timeline; + neon_tenant_walproposer = neon_tenant; if (wal_redo) { @@ -462,8 +461,8 @@ pg_init_libpagestore(void) else if (page_server_connstring && page_server_connstring[0]) { neon_log(PageStoreTrace, "set neon_smgr hook"); - smgr_hook = smgr_zenith; - smgr_init_hook = smgr_init_zenith; - dbsize_hook = zenith_dbsize; + smgr_hook = smgr_neon; + smgr_init_hook = smgr_init_neon; + dbsize_hook = neon_dbsize; } } diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 5346680b0b..2a2a163ee8 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -28,7 +28,6 @@ PG_MODULE_MAGIC; void _PG_init(void); - void _PG_init(void) { @@ -56,7 +55,6 @@ pg_cluster_size(PG_FUNCTION_ARGS) PG_RETURN_INT64(size); } - Datum backpressure_lsns(PG_FUNCTION_ARGS) { diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7dc38c13fb..633c7b465c 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -28,31 +28,29 @@ typedef enum { /* pagestore_client -> pagestore */ - T_ZenithExistsRequest = 0, - T_ZenithNblocksRequest, - T_ZenithGetPageRequest, - 
T_ZenithDbSizeRequest, + T_NeonExistsRequest = 0, + T_NeonNblocksRequest, + T_NeonGetPageRequest, + T_NeonDbSizeRequest, /* pagestore -> pagestore_client */ - T_ZenithExistsResponse = 100, - T_ZenithNblocksResponse, - T_ZenithGetPageResponse, - T_ZenithErrorResponse, - T_ZenithDbSizeResponse, -} ZenithMessageTag; - - + T_NeonExistsResponse = 100, + T_NeonNblocksResponse, + T_NeonGetPageResponse, + T_NeonErrorResponse, + T_NeonDbSizeResponse, +} NeonMessageTag; /* base struct for c-style inheritance */ typedef struct { - ZenithMessageTag tag; -} ZenithMessage; + NeonMessageTag tag; +} NeonMessage; -#define messageTag(m) (((const ZenithMessage *)(m))->tag) +#define messageTag(m) (((const NeonMessage *)(m))->tag) /* - * supertype of all the Zenith*Request structs below + * supertype of all the Neon*Request structs below * * If 'latest' is true, we are requesting the latest page version, and 'lsn' * is just a hint to the server that we know there are no versions of the page @@ -60,81 +58,79 @@ typedef struct */ typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; bool latest; /* if true, request latest page version */ XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; +} NeonRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; -} ZenithExistsRequest; +} NeonExistsRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; -} ZenithNblocksRequest; - +} NeonNblocksRequest; typedef struct { - ZenithRequest req; + NeonRequest req; Oid dbNode; -} ZenithDbSizeRequest; - +} NeonDbSizeRequest; typedef struct { - ZenithRequest req; + NeonRequest req; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; -} ZenithGetPageRequest; +} NeonGetPageRequest; -/* supertype of all the Zenith*Response structs below */ +/* supertype of all the Neon*Response structs below */ typedef struct { - ZenithMessageTag tag; -} ZenithResponse; + NeonMessageTag tag; +} 
NeonResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; bool exists; -} ZenithExistsResponse; +} NeonExistsResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; uint32 n_blocks; -} ZenithNblocksResponse; +} NeonNblocksResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; char page[FLEXIBLE_ARRAY_MEMBER]; -} ZenithGetPageResponse; +} NeonGetPageResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; int64 db_size; -} ZenithDbSizeResponse; +} NeonDbSizeResponse; typedef struct { - ZenithMessageTag tag; + NeonMessageTag tag; char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error * message */ -} ZenithErrorResponse; +} NeonErrorResponse; -extern StringInfoData zm_pack_request(ZenithRequest *msg); -extern ZenithResponse *zm_unpack_response(StringInfo s); -extern char *zm_to_string(ZenithMessage *msg); +extern StringInfoData zm_pack_request(NeonRequest * msg); +extern NeonResponse * zm_unpack_response(StringInfo s); +extern char *zm_to_string(NeonMessage * msg); /* * API @@ -142,57 +138,57 @@ extern char *zm_to_string(ZenithMessage *msg); typedef struct { - ZenithResponse *(*request) (ZenithRequest *request); - void (*send) (ZenithRequest *request); - ZenithResponse *(*receive) (void); + NeonResponse *(*request) (NeonRequest * request); + void (*send) (NeonRequest * request); + NeonResponse *(*receive) (void); void (*flush) (void); } page_server_api; extern page_server_api * page_server; extern char *page_server_connstring; -extern char *zenith_timeline; -extern char *zenith_tenant; +extern char *neon_timeline; +extern char *neon_tenant; extern bool wal_redo; extern int32 max_cluster_size; -extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); -extern void smgr_init_zenith(void); +extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode); +extern void smgr_init_neon(void); extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); extern void 
smgr_init_inmem(void); extern void smgr_shutdown_inmem(void); -/* zenith storage manager functionality */ +/* Neon storage manager functionality */ -extern void zenith_init(void); -extern void zenith_open(SMgrRelation reln); -extern void zenith_close(SMgrRelation reln, ForkNumber forknum); -extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); -extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void zenith_reset_prefetch(SMgrRelation reln); -extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); +extern void neon_init(void); +extern void neon_open(SMgrRelation reln); +extern void neon_close(SMgrRelation reln, ForkNumber forknum); +extern void neon_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool neon_exists(SMgrRelation reln, ForkNumber forknum); +extern void neon_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void neon_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void neon_reset_prefetch(SMgrRelation reln); +extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); -extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +extern void neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); -extern void zenith_write(SMgrRelation reln, ForkNumber forknum, - 
BlockNumber blocknum, char *buffer, bool skipFsync); -extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); -extern int64 zenith_dbsize(Oid dbNode); -extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); +extern void neon_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); +extern int64 neon_dbsize(Oid dbNode); +extern void neon_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum); -/* zenith wal-redo storage manager functionality */ +/* neon wal-redo storage manager functionality */ extern void inmem_init(void); extern void inmem_open(SMgrRelation reln); @@ -215,8 +211,7 @@ extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - -/* utils for zenith relsize cache */ +/* utils for neon relsize cache */ extern void relsize_hash_init(void); extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 504ae60d4a..24adee019f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -96,9 +96,9 @@ page_server_api *page_server; /* GUCs */ char *page_server_connstring; -//with substituted password -char *zenith_timeline; -char *zenith_tenant; +/*with substituted password*/ +char *neon_timeline; +char *neon_tenant; 
bool wal_redo = false; int32 max_cluster_size; @@ -143,7 +143,7 @@ consume_prefetch_responses(void) { for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++) { - ZenithResponse *resp = page_server->receive(); + NeonResponse *resp = page_server->receive(); pfree(resp); } @@ -151,16 +151,16 @@ consume_prefetch_responses(void) n_prefetch_responses = 0; } -static ZenithResponse * +static NeonResponse * page_server_request(void const *req) { consume_prefetch_responses(); - return page_server->request((ZenithRequest *) req); + return page_server->request((NeonRequest *) req); } StringInfoData -zm_pack_request(ZenithRequest *msg) +zm_pack_request(NeonRequest * msg) { StringInfoData s; @@ -170,9 +170,9 @@ zm_pack_request(ZenithRequest *msg) switch (messageTag(msg)) { /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: + case T_NeonExistsRequest: { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -183,9 +183,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithNblocksRequest: + case T_NeonNblocksRequest: { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -196,9 +196,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithDbSizeRequest: + case T_NeonDbSizeRequest: { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -206,9 +206,9 @@ zm_pack_request(ZenithRequest *msg) break; } - case T_ZenithGetPageRequest: + case T_NeonGetPageRequest: { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; pq_sendbyte(&s, 
msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); @@ -222,91 +222,91 @@ zm_pack_request(ZenithRequest *msg) } /* pagestore -> pagestore_client. We never need to create these. */ - case T_ZenithExistsResponse: - case T_ZenithNblocksResponse: - case T_ZenithGetPageResponse: - case T_ZenithErrorResponse: - case T_ZenithDbSizeResponse: + case T_NeonExistsResponse: + case T_NeonNblocksResponse: + case T_NeonGetPageResponse: + case T_NeonErrorResponse: + case T_NeonDbSizeResponse: default: - elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag); break; } return s; } -ZenithResponse * +NeonResponse * zm_unpack_response(StringInfo s) { - ZenithMessageTag tag = pq_getmsgbyte(s); - ZenithResponse *resp = NULL; + NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse *resp = NULL; switch (tag) { /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: + case T_NeonExistsResponse: { - ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); msg_resp->tag = tag; msg_resp->exists = pq_getmsgbyte(s); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithNblocksResponse: + case T_NeonNblocksResponse: { - ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); msg_resp->tag = tag; msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithGetPageResponse: + case T_NeonGetPageResponse: { - ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + NeonGetPageResponse *msg_resp = palloc0(offsetof(NeonGetPageResponse, page) + BLCKSZ); msg_resp->tag = tag; /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), 
BLCKSZ); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithDbSizeResponse: + case T_NeonDbSizeResponse: { - ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); msg_resp->tag = tag; msg_resp->db_size = pq_getmsgint64(s); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } - case T_ZenithErrorResponse: + case T_NeonErrorResponse: { - ZenithErrorResponse *msg_resp; + NeonErrorResponse *msg_resp; size_t msglen; const char *msgtext; msgtext = pq_getmsgrawstring(s); msglen = strlen(msgtext); - msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); msg_resp->tag = tag; memcpy(msg_resp->message, msgtext, msglen + 1); pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; + resp = (NeonResponse *) msg_resp; break; } @@ -315,12 +315,12 @@ zm_unpack_response(StringInfo s) * * We create these ourselves, and don't need to decode them. 
*/ - case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithGetPageRequest: - case T_ZenithDbSizeRequest: + case T_NeonExistsRequest: + case T_NeonNblocksRequest: + case T_NeonGetPageRequest: + case T_NeonDbSizeRequest: default: - elog(ERROR, "unexpected zenith message tag 0x%02x", tag); + elog(ERROR, "unexpected neon message tag 0x%02x", tag); break; } @@ -329,7 +329,7 @@ zm_unpack_response(StringInfo s) /* dump to json for debugging / error reporting purposes */ char * -zm_to_string(ZenithMessage *msg) +zm_to_string(NeonMessage * msg) { StringInfoData s; @@ -338,11 +338,11 @@ zm_to_string(ZenithMessage *msg) switch (messageTag(msg)) { /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: + case T_NeonExistsRequest: { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -354,11 +354,11 @@ zm_to_string(ZenithMessage *msg) break; } - case T_ZenithNblocksRequest: + case T_NeonNblocksRequest: { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -370,11 +370,11 @@ zm_to_string(ZenithMessage *msg) break; } - case T_ZenithGetPageRequest: + case T_NeonGetPageRequest: { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); 
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", msg_req->rnode.spcNode, msg_req->rnode.dbNode, @@ -386,11 +386,11 @@ zm_to_string(ZenithMessage *msg) appendStringInfoChar(&s, '}'); break; } - case T_ZenithDbSizeRequest: + case T_NeonDbSizeRequest: { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); @@ -398,61 +398,57 @@ zm_to_string(ZenithMessage *msg) break; } - /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: + case T_NeonExistsResponse: { - ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists - ); + msg_resp->exists); appendStringInfoChar(&s, '}'); break; } - case T_ZenithNblocksResponse: + case T_NeonNblocksResponse: { - ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks - ); + msg_resp->n_blocks); appendStringInfoChar(&s, '}'); break; } - case T_ZenithGetPageResponse: + case T_NeonGetPageResponse: { #if 0 - ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; + NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; #endif - appendStringInfoString(&s, "{\"type\": 
\"ZenithGetPageResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); appendStringInfo(&s, ", \"page\": \"XXX\"}"); appendStringInfoChar(&s, '}'); break; } - case T_ZenithErrorResponse: + case T_NeonErrorResponse: { - ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); appendStringInfoChar(&s, '}'); break; } - case T_ZenithDbSizeResponse: + case T_NeonDbSizeResponse: { - ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; + NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); appendStringInfo(&s, ", \"db_size\": %ld}", - msg_resp->db_size - ); + msg_resp->db_size); appendStringInfoChar(&s, '}'); break; @@ -494,7 +490,7 @@ PageIsEmptyHeapPage(char *buffer) } static void -zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { XLogRecPtr lsn = PageGetLSN(buffer); @@ -551,8 +547,8 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { /* * When PostgreSQL extends a relation, it calls smgrextend() with an - * all-zeros pages, and we can just ignore that in Zenith. We do need - * to remember the new size, though, so that smgrnblocks() returns the + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the * right answer after the rel has been extended. We rely on the * relsize cache for that. 
* @@ -616,12 +612,11 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum); } - /* - * zenith_init() -- Initialize private state + * neon_init() -- Initialize private state */ void -zenith_init(void) +neon_init(void) { /* noop */ #ifdef DEBUG_COMPARE_LOCAL @@ -658,7 +653,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) +neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr lsn; @@ -666,14 +661,14 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc { *latest = false; lsn = GetXLogReplayRecPtr(NULL); - elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); } else if (am_walsender) { *latest = true; lsn = InvalidXLogRecPtr; - elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); + elog(DEBUG1, "am walsender neon_get_request_lsn lsn 0 "); } else { @@ -687,7 +682,7 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc *latest = true; lsn = GetLastWrittenLSN(rnode, forknum, blkno); Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ", + elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); @@ -717,15 +712,14 @@ zenith_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, Bloc return lsn; } - /* - * zenith_exists() -- Does the physical file exist? + * neon_exists() -- Does the physical file exist? 
*/ bool -zenith_exists(SMgrRelation reln, ForkNumber forkNum) +neon_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; - ZenithResponse *resp; + NeonResponse *resp; BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -777,26 +771,25 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithExistsRequest request = { - .req.tag = T_ZenithExistsRequest, + NeonExistsRequest request = { + .req.tag = T_NeonExistsRequest, .req.latest = latest, .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, - .forknum = forkNum - }; + .forknum = forkNum}; resp = page_server_request(&request); } switch (resp->tag) { - case T_ZenithExistsResponse: - exists = ((ZenithExistsResponse *) resp)->exists; + case T_NeonExistsResponse: + exists = ((NeonExistsResponse *) resp)->exists; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -806,7 +799,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: @@ -817,12 +810,12 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) } /* - * zenith_create() -- Create a new relation on zenithd storage + * neon_create() -- Create a new relation on neon storage * * If isRedo is true, it's okay for the relation to exist already.
*/ void -zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { switch (reln->smgr_relpersistence) { @@ -866,7 +859,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) } /* - * zenith_unlink() -- Unlink a relation. + * neon_unlink() -- Unlink a relation. * * Note that we're passed a RelFileNodeBackend --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. @@ -884,7 +877,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * we are usually not in a transaction anymore when this is called. */ void -zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { /* * Might or might not exist locally, depending on whether it's an unlogged @@ -899,7 +892,7 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) } /* - * zenith_extend() -- Add a block to the specified relation. + * neon_extend() -- Add a block to the specified relation. * * The semantics are nearly the same as mdwrite(): write at the * specified position. However, this is to be used for the case of @@ -908,8 +901,8 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * causes intervening file space to become filled with zeroes. 
*/ void -zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer, bool skipFsync) +neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer, bool skipFsync) { XLogRecPtr lsn; @@ -951,7 +944,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, errhint("This limit is defined by neon.max_cluster_size GUC"))); } - zenith_wallog_page(reln, forkNum, blkno, buffer); + neon_wallog_page(reln, forkNum, blkno, buffer); set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); @@ -971,10 +964,10 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } /* - * zenith_open() -- Initialize newly-opened relation. + * neon_open() -- Initialize newly-opened relation. */ void -zenith_open(SMgrRelation reln) +neon_open(SMgrRelation reln) { /* * We don't have anything special to do here. Call mdopen() to let md.c @@ -985,14 +978,14 @@ zenith_open(SMgrRelation reln) mdopen(reln); /* no work */ - elog(SmgrTrace, "[ZENITH_SMGR] open noop"); + elog(SmgrTrace, "[NEON_SMGR] open noop"); } /* - * zenith_close() -- Close the specified relation, if it isn't closed already. + * neon_close() -- Close the specified relation, if it isn't closed already. */ void -zenith_close(SMgrRelation reln, ForkNumber forknum) +neon_close(SMgrRelation reln, ForkNumber forknum) { /* * Let md.c close it, if it had it open. 
Doesn't hurt to do this even for @@ -1003,19 +996,19 @@ zenith_close(SMgrRelation reln, ForkNumber forknum) /* - * zenith_reset_prefetch() -- reoe all previously rgistered prefeth requests + * neon_reset_prefetch() -- reoe all previously rgistered prefeth requests */ void -zenith_reset_prefetch(SMgrRelation reln) +neon_reset_prefetch(SMgrRelation reln) { n_prefetch_requests = 0; } /* - * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ bool -zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { switch (reln->smgr_relpersistence) { @@ -1046,14 +1039,14 @@ zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) } /* - * zenith_writeback() -- Tell the kernel to write pages back to storage. + * neon_writeback() -- Tell the kernel to write pages back to storage. * * This accepts a range of blocks because flushing several pages at once is * considerably more efficient than doing so individually. */ void -zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) +neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) { switch (reln->smgr_relpersistence) { @@ -1075,7 +1068,7 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); + elog(SmgrTrace, "[NEON_SMGR] writeback noop"); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1084,14 +1077,14 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* - * While function is defined in the zenith extension it's used within neon_test_utils directly. + * While function is defined in the neon extension it's used within neon_test_utils directly. 
* To avoid breaking tests in the runtime please keep function signature in sync. */ void -zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) +neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) { - ZenithResponse *resp; + NeonResponse *resp; int i; /* @@ -1103,12 +1096,12 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, for (i = n_prefetched_buffers; i < n_prefetch_responses; i++) { resp = page_server->receive(); - if (resp->tag == T_ZenithGetPageResponse && + if (resp->tag == T_NeonGetPageResponse && RelFileNodeEquals(prefetch_responses[i].rnode, rnode) && prefetch_responses[i].forkNum == forkNum && prefetch_responses[i].blockNum == blkno) { - char *page = ((ZenithGetPageResponse *) resp)->page; + char *page = ((NeonGetPageResponse *) resp)->page; /* * Check if prefetched page is still relevant. 
If it is updated by @@ -1135,8 +1128,8 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, n_prefetch_responses = 0; n_prefetch_misses += 1; { - ZenithGetPageRequest request = { - .req.tag = T_ZenithGetPageRequest, + NeonGetPageRequest request = { + .req.tag = T_NeonGetPageRequest, .req.latest = request_latest, .req.lsn = request_lsn, .rnode = rnode, @@ -1147,14 +1140,14 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, if (n_prefetch_requests > 0) { /* Combine all prefetch requests with primary request */ - page_server->send((ZenithRequest *) &request); + page_server->send((NeonRequest *) & request); for (i = 0; i < n_prefetch_requests; i++) { request.rnode = prefetch_requests[i].rnode; request.forknum = prefetch_requests[i].forkNum; request.blkno = prefetch_requests[i].blockNum; prefetch_responses[i] = prefetch_requests[i]; - page_server->send((ZenithRequest *) &request); + page_server->send((NeonRequest *) & request); } page_server->flush(); n_prefetch_responses = n_prefetch_requests; @@ -1164,16 +1157,16 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, } else { - resp = page_server->request((ZenithRequest *) &request); + resp = page_server->request((NeonRequest *) & request); } } switch (resp->tag) { - case T_ZenithGetPageResponse: - memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -1184,7 +1177,7 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; 
default: @@ -1195,11 +1188,11 @@ zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, } /* - * zenith_read() -- Read the specified block from a relation. + * neon_read() -- Read the specified block from a relation. */ void -zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) { bool latest; XLogRecPtr request_lsn; @@ -1221,8 +1214,8 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); - zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); + neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -1328,15 +1321,15 @@ hexdump_page(char *page) #endif /* - * zenith_write() -- Write the supplied block at the appropriate location. + * neon_write() -- Write the supplied block at the appropriate location. * * This is to be used only for updating already-existing blocks of a * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). 
*/ void -zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) +neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) { XLogRecPtr lsn; @@ -1372,7 +1365,7 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - zenith_wallog_page(reln, forknum, blocknum, buffer); + neon_wallog_page(reln, forknum, blocknum, buffer); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", @@ -1389,12 +1382,12 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * zenith_nblocks() -- Get the number of blocks stored in a relation. + * neon_nblocks() -- Get the number of blocks stored in a relation. */ BlockNumber -zenith_nblocks(SMgrRelation reln, ForkNumber forknum) +neon_nblocks(SMgrRelation reln, ForkNumber forknum) { - ZenithResponse *resp; + NeonResponse *resp; BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -1426,10 +1419,10 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithNblocksRequest request = { - .req.tag = T_ZenithNblocksRequest, + NeonNblocksRequest request = { + .req.tag = T_NeonNblocksRequest, .req.latest = latest, .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, @@ -1441,11 +1434,11 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) switch (resp->tag) { - case T_ZenithNblocksResponse: - n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; + case T_NeonNblocksResponse: + n_blocks = ((NeonNblocksResponse *) resp)->n_blocks; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, 
(errcode(ERRCODE_IO_ERROR), errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -1455,7 +1448,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: @@ -1463,7 +1456,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) } update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); - elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, @@ -1476,21 +1469,21 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) } /* - * zenith_db_size() -- Get the size of the database in bytes. + * neon_db_size() -- Get the size of the database in bytes. 
*/ int64 -zenith_dbsize(Oid dbNode) +neon_dbsize(Oid dbNode) { - ZenithResponse *resp; + NeonResponse *resp; int64 db_size; XLogRecPtr request_lsn; bool latest; RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; - request_lsn = zenith_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { - ZenithDbSizeRequest request = { - .req.tag = T_ZenithDbSizeRequest, + NeonDbSizeRequest request = { + .req.tag = T_NeonDbSizeRequest, .req.latest = latest, .req.lsn = request_lsn, .dbNode = dbNode, @@ -1501,25 +1494,25 @@ zenith_dbsize(Oid dbNode) switch (resp->tag) { - case T_ZenithDbSizeResponse: - db_size = ((ZenithDbSizeResponse *) resp)->db_size; + case T_NeonDbSizeResponse: + db_size = ((NeonDbSizeResponse *) resp)->db_size; break; - case T_ZenithErrorResponse: + case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read db size of db %u from page server at lsn %X/%08X", dbNode, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); + ((NeonErrorResponse *) resp)->message))); break; default: elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); } - elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", + elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", dbNode, (uint32) (request_lsn >> 32), (uint32) request_lsn, db_size); @@ -1529,10 +1522,10 @@ zenith_dbsize(Oid dbNode) } /* - * zenith_truncate() -- Truncate relation to specified number of blocks. + * neon_truncate() -- Truncate relation to specified number of blocks. 
*/ void -zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { XLogRecPtr lsn; @@ -1591,7 +1584,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } /* - * zenith_immedsync() -- Immediately sync a relation to stable storage. + * neon_immedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows * nothing of dirty buffers that may exist inside the buffer manager. We @@ -1602,7 +1595,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * segment may survive recovery, reintroducing unwanted data into the table. */ void -zenith_immedsync(SMgrRelation reln, ForkNumber forknum) +neon_immedsync(SMgrRelation reln, ForkNumber forknum) { switch (reln->smgr_relpersistence) { @@ -1622,7 +1615,7 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); + elog(SmgrTrace, "[NEON_SMGR] immedsync noop"); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1631,16 +1624,16 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) } /* - * zenith_start_unlogged_build() -- Starting build operation on a rel. + * neon_start_unlogged_build() -- Starting build operation on a rel. * * Some indexes are built in two phases, by first populating the table with * regular inserts, using the shared buffer cache but skipping WAL-logging, - * and WAL-logging the whole relation after it's done. Zenith relies on the + * and WAL-logging the whole relation after it's done. Neon relies on the * WAL to reconstruct pages, so we cannot use the page server in the * first phase when the changes are not logged. 
*/ static void -zenith_start_unlogged_build(SMgrRelation reln) +neon_start_unlogged_build(SMgrRelation reln) { /* * Currently, there can be only one unlogged relation build operation in @@ -1692,13 +1685,13 @@ zenith_start_unlogged_build(SMgrRelation reln) } /* - * zenith_finish_unlogged_build_phase_1() + * neon_finish_unlogged_build_phase_1() * * Call this after you have finished populating a relation in unlogged mode, * before you start WAL-logging it. */ static void -zenith_finish_unlogged_build_phase_1(SMgrRelation reln) +neon_finish_unlogged_build_phase_1(SMgrRelation reln) { Assert(unlogged_build_rel == reln); @@ -1718,7 +1711,7 @@ zenith_finish_unlogged_build_phase_1(SMgrRelation reln) } /* - * zenith_end_unlogged_build() -- Finish an unlogged rel build. + * neon_end_unlogged_build() -- Finish an unlogged rel build. * * Call this after you have finished WAL-logging an relation that was * first populated without WAL-logging. @@ -1727,7 +1720,7 @@ zenith_finish_unlogged_build_phase_1(SMgrRelation reln) * WAL-logged and is present in the page server. 
*/ static void -zenith_end_unlogged_build(SMgrRelation reln) +neon_end_unlogged_build(SMgrRelation reln) { Assert(unlogged_build_rel == reln); @@ -1769,7 +1762,7 @@ zenith_end_unlogged_build(SMgrRelation reln) } static void -AtEOXact_zenith(XactEvent event, void *arg) +AtEOXact_neon(XactEvent event, void *arg) { switch (event) { @@ -1802,47 +1795,46 @@ AtEOXact_zenith(XactEvent event, void *arg) } } -static const struct f_smgr zenith_smgr = +static const struct f_smgr neon_smgr = { - .smgr_init = zenith_init, + .smgr_init = neon_init, .smgr_shutdown = NULL, - .smgr_open = zenith_open, - .smgr_close = zenith_close, - .smgr_create = zenith_create, - .smgr_exists = zenith_exists, - .smgr_unlink = zenith_unlink, - .smgr_extend = zenith_extend, - .smgr_prefetch = zenith_prefetch, - .smgr_reset_prefetch = zenith_reset_prefetch, - .smgr_read = zenith_read, - .smgr_write = zenith_write, - .smgr_writeback = zenith_writeback, - .smgr_nblocks = zenith_nblocks, - .smgr_truncate = zenith_truncate, - .smgr_immedsync = zenith_immedsync, + .smgr_open = neon_open, + .smgr_close = neon_close, + .smgr_create = neon_create, + .smgr_exists = neon_exists, + .smgr_unlink = neon_unlink, + .smgr_extend = neon_extend, + .smgr_prefetch = neon_prefetch, + .smgr_reset_prefetch = neon_reset_prefetch, + .smgr_read = neon_read, + .smgr_write = neon_write, + .smgr_writeback = neon_writeback, + .smgr_nblocks = neon_nblocks, + .smgr_truncate = neon_truncate, + .smgr_immedsync = neon_immedsync, - .smgr_start_unlogged_build = zenith_start_unlogged_build, - .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, - .smgr_end_unlogged_build = zenith_end_unlogged_build, + .smgr_start_unlogged_build = neon_start_unlogged_build, + .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, + .smgr_end_unlogged_build = neon_end_unlogged_build, }; - const f_smgr * -smgr_zenith(BackendId backend, RelFileNode rnode) +smgr_neon(BackendId backend, RelFileNode rnode) { /* Don't 
use page server for temp relations */ if (backend != InvalidBackendId) return smgr_standard(backend, rnode); else - return &zenith_smgr; + return &neon_smgr; } void -smgr_init_zenith(void) +smgr_init_neon(void) { - RegisterXactCallback(AtEOXact_zenith, NULL); + RegisterXactCallback(AtEOXact_neon, NULL); smgr_init_standard(); - zenith_init(); + neon_init(); } diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 31021f3e41..d4262c730a 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -56,7 +56,7 @@ static void relsize_shmem_request(void); #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) static void -zenith_smgr_shmem_startup(void) +neon_smgr_shmem_startup(void) { static HASHCTL info; @@ -174,14 +174,14 @@ relsize_hash_init(void) #endif prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = zenith_smgr_shmem_startup; + shmem_startup_hook = neon_smgr_shmem_startup; } } #if PG_VERSION_NUM >= 150000 /* * shmem_request hook: request additional shared resources. We'll allocate or - * attach to the shared resources in zenith_smgr_shmem_startup(). + * attach to the shared resources in neon_smgr_shmem_startup(). 
*/ static void relsize_shmem_request(void) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 05257ced4c..fc0b660a64 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -71,14 +71,13 @@ #include "walproposer_utils.h" #include "replication/walpropshim.h" - char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; int wal_acceptor_connect_timeout; bool am_wal_proposer; -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; +char *neon_timeline_walproposer = NULL; +char *neon_tenant_walproposer = NULL; /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ WalProposerFunctionsType *WalProposerFunctions = NULL; @@ -89,7 +88,7 @@ static int n_safekeepers = 0; static int quorum = 0; static Safekeeper safekeeper[MAX_SAFEKEEPERS]; static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to* * safekeepers */ static ProposerGreeting greetRequest; static VoteRequest voteRequest; /* Vote request for safekeeper */ @@ -162,7 +161,6 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); - static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void nwp_prepare_shmem(void); @@ -176,7 +174,6 @@ static shmem_request_hook_type prev_shmem_request_hook = NULL; static void walproposer_shmem_request(void); #endif - void pg_init_walproposer(void) { @@ -207,10 +204,9 @@ nwp_register_gucs(void) &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ PGC_POSTMASTER, - GUC_LIST_INPUT, /* extensions can't use + GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ - NULL, NULL, NULL - ); + NULL, NULL, NULL); DefineCustomIntVariable( 
"neon.safekeeper_reconnect_timeout", @@ -220,8 +216,7 @@ nwp_register_gucs(void) 1000, 0, INT_MAX, /* default, min, max */ PGC_SIGHUP, /* context */ GUC_UNIT_MS, /* flags */ - NULL, NULL, NULL - ); + NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_connect_timeout", @@ -231,9 +226,7 @@ nwp_register_gucs(void) 5000, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, - NULL, NULL, NULL - ); - + NULL, NULL, NULL); } /* shmem handling */ @@ -499,19 +492,19 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) greetRequest.pgVersion = PG_VERSION_NUM; pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); greetRequest.systemId = systemId; - if (!zenith_timeline_walproposer) + if (!neon_timeline_walproposer) elog(FATAL, "neon.timeline_id is not provided"); - if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) + if (*neon_timeline_walproposer != '\0' && + !HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer); + if (!neon_tenant_walproposer) elog(FATAL, "neon.tenant_id is not provided"); - if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); + if (*neon_tenant_walproposer != '\0' && + !HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer); #if PG_VERSION_NUM >= 150000 -/* FIXME don't use hardcoded timeline id */ + /* FIXME don't use hardcoded timeline id */ greetRequest.timeline = 1; #else greetRequest.timeline = ThisTimeLineID; @@ -657,8 +650,8 @@ ResetConnection(Safekeeper *sk) int written = 0; written = snprintf((char *) 
&sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer); /* * currently connection string is not that long, but once we pass @@ -1326,8 +1319,7 @@ DetermineEpochStartLsn(void) propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), safekeeper[donor].host, safekeeper[donor].port, - LSN_FORMAT_ARGS(truncateLsn) - ); + LSN_FORMAT_ARGS(truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN @@ -1373,8 +1365,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec WalReceiverConn *wrconn; WalRcvStreamOptions options; - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + safekeeper[donor].host, safekeeper[donor].port, neon_timeline_walproposer, neon_tenant_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { @@ -1544,8 +1536,7 @@ SendProposerElected(Safekeeper *sk) else { XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - sk->voteResponse.flushLsn); + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? 
th->entries[i + 1].lsn : sk->voteResponse.flushLsn); sk->startStreamingAt = Min(propEndLsn, skEndLsn); } @@ -1759,7 +1750,7 @@ SendAppendRequests(Safekeeper *sk) req->beginLsn, req->endLsn - req->beginLsn, #if PG_VERSION_NUM >= 150000 - /* FIXME don't use hardcoded timelineid here */ + /* FIXME don't use hardcoded timeline_id here */ 1, #else ThisTimeLineID, @@ -1784,9 +1775,9 @@ SendAppendRequests(Safekeeper *sk) case PG_ASYNC_WRITE_TRY_FLUSH: /* - * We still need to call PQflush some more to finish the job. - * Caller function will handle this by setting right event - * set. + * * We still need to call PQflush some more to finish the + * job. Caller function will handle this by setting right + * event* set. */ sk->flushWrite = true; return true; @@ -1885,40 +1876,40 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * if (strcmp(key, "current_timeline_size") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->currentClusterSize = pq_getmsgint64(reply_message); + /* read value length */ + rf->currentClusterSize = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", rf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_writelsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_writelsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", LSN_FORMAT_ARGS(rf->ps_writelsn)); } else if (strcmp(key, "ps_flushlsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_flushlsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_flushlsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", LSN_FORMAT_ARGS(rf->ps_flushlsn)); } else if (strcmp(key, "ps_applylsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - 
//read value length - rf->ps_applylsn = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_applylsn = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", LSN_FORMAT_ARGS(rf->ps_applylsn)); } else if (strcmp(key, "ps_replytime") == 0) { pq_getmsgint(reply_message, sizeof(int32)); - //read value length - rf->ps_replytime = pq_getmsgint64(reply_message); + /* read value length */ + rf->ps_replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; @@ -1933,13 +1924,13 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * else { len = pq_getmsgint(reply_message, sizeof(int32)); - //read value length + /* read value length */ /* * Skip unknown keys to support backward compatibile protocol * changes */ - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1973,7 +1964,6 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) } } - /* * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the * last WAL record that can be safely discarded. @@ -2009,8 +1999,7 @@ GetAcknowledgedByQuorumWALPosition(void) * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? - safekeeper[i].appendResponse.flushLsn : 0; + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? 
safekeeper[i].appendResponse.flushLsn : 0; } qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); @@ -2058,7 +2047,6 @@ replication_feedback_set(ReplicationFeedback * rf) SpinLockRelease(&walprop_shared->mutex); } - void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { @@ -2069,12 +2057,11 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe SpinLockRelease(&walprop_shared->mutex); } - /* * Get ReplicationFeedback fields from the most advanced safekeeper */ static void -GetLatestZentihFeedback(ReplicationFeedback * rf) +GetLatestNeonFeedback(ReplicationFeedback * rf) { int latest_safekeeper = 0; XLogRecPtr ps_writelsn = InvalidXLogRecPtr; @@ -2094,7 +2081,7 @@ GetLatestZentihFeedback(ReplicationFeedback * rf) rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; - elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", rf->currentClusterSize, LSN_FORMAT_ARGS(rf->ps_writelsn), @@ -2113,14 +2100,13 @@ HandleSafekeeperResponse(void) XLogRecPtr diskConsistentLsn; XLogRecPtr minFlushLsn; - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; if (!syncSafekeepers) { /* Get ReplicationFeedback fields from the most advanced safekeeper */ - GetLatestZentihFeedback(&quorumFeedback.rf); + GetLatestNeonFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } @@ -2139,7 +2125,7 @@ HandleSafekeeperResponse(void) quorumFeedback.flushLsn, /* - * apply_lsn - This is what processed and durably saved at + * apply_lsn - This is what processed and durably saved at* * pageserver. 
*/ quorumFeedback.rf.ps_flushlsn, @@ -2460,7 +2446,7 @@ backpressure_lag_impl(void) XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024*1024) +#define MB ((XLogRecPtr)1024 * 1024) elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), @@ -2468,23 +2454,17 @@ backpressure_lag_impl(void) LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr)); - if ((writePtr != InvalidXLogRecPtr - && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag * MB)) + if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB)) { return (myFlushLsn - writePtr - max_replication_write_lag * MB); } - if ((flushPtr != InvalidXLogRecPtr - && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) + if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) { return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); } - if ((applyPtr != InvalidXLogRecPtr - && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) + if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) { return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 59e70f33bf..051c7c02a6 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,16 +10,16 @@ #include "utils/uuid.h" #include "replication/walreceiver.h" -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 -#define MAX_SAFEKEEPERS 32 -#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single - * WAL message */ -#define XLOG_HDR_SIZE 
(1+8*3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL + * message */ +#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ -#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender +#define XLOG_HDR_END_POS (1 + 8) /* offset of end position in wal sender* * message header */ /* @@ -39,8 +39,8 @@ typedef struct WalProposerConn WalProposerConn; struct WalMessage; typedef struct WalMessage WalMessage; -extern char *zenith_timeline_walproposer; -extern char *zenith_tenant_walproposer; +extern char *neon_timeline_walproposer; +extern char *neon_tenant_walproposer; /* Possible return values from ReadPGAsync */ typedef enum @@ -170,8 +170,8 @@ typedef struct ProposerGreeting uint32 pgVersion; pg_uuid_t proposerId; uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - uint8 ztenantid[16]; + uint8 timeline_id[16]; /* Neon timeline id */ + uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; } ProposerGreeting; @@ -226,7 +226,7 @@ typedef struct VoteResponse * proposer to choose the most advanced one. 
*/ XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ @@ -283,7 +283,6 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; - typedef struct ReplicationFeedback { /* current size of the timeline on pageserver */ @@ -295,7 +294,6 @@ typedef struct ReplicationFeedback TimestampTz ps_replytime; } ReplicationFeedback; - typedef struct WalproposerShmemState { slock_t mutex; @@ -323,7 +321,7 @@ typedef struct AppendResponse XLogRecPtr commitLsn; HotStandbyFeedback hs; /* Feedback recieved from pageserver includes standby_status_update fields */ - /* and custom zenith feedback. */ + /* and custom neon feedback. */ /* This part of the message is extensible. */ ReplicationFeedback rf; } AppendResponse; @@ -332,7 +330,6 @@ typedef struct AppendResponse /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) - /* * Descriptor of safekeeper */ @@ -340,7 +337,7 @@ typedef struct Safekeeper { char const *host; char const *port; - char conninfo[MAXCONNINFO]; /* connection info for + char conninfo[MAXCONNINFO]; /* connection info for* * connecting/reconnecting */ /* @@ -366,12 +363,12 @@ typedef struct Safekeeper */ XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush, + bool flushWrite; /* set to true if we need to call AsyncFlush,* * to flush pending messages */ XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ - int eventPos; /* position in wait event set. Equal to -1 if + int eventPos; /* position in wait event set. 
Equal to -1 if* * no event */ SafekeeperState state; /* safekeeper state machine state */ TimestampTz startedConnAt; /* when connection attempt started */ @@ -380,7 +377,6 @@ typedef struct Safekeeper AppendResponse appendResponse; /* feedback for master */ } Safekeeper; - extern PGDLLIMPORT void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); void WalProposerPoll(void); diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 07bd7bdd28..e0cea4177b 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -36,13 +36,13 @@ PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); /* - * Linkage to functions in zenith module. + * Linkage to functions in neon module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -typedef void (*zenith_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); +typedef void (*neon_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); -static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; +static neon_read_at_lsn_type neon_read_at_lsn_ptr; /* * Module initialize function: fetch function pointers for cross-module calls. 
@@ -51,13 +51,13 @@ void _PG_init(void) { /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); - zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) - load_external_function("$libdir/neon", "zenith_read_at_lsn", + AssertVariableIsOfType(&neon_read_at_lsn, neon_read_at_lsn_type); + neon_read_at_lsn_ptr = (neon_read_at_lsn_type) + load_external_function("$libdir/neon", "neon_read_at_lsn", true, NULL); } -#define zenith_read_at_lsn zenith_read_at_lsn_ptr +#define neon_read_at_lsn neon_read_at_lsn_ptr /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. @@ -96,7 +96,7 @@ test_consume_xids(PG_FUNCTION_ARGS) Datum clear_buffer_cache(PG_FUNCTION_ARGS) { - bool save_zenith_test_evict; + bool save_neon_test_evict; /* * Temporarily set the zenith_test_evict GUC, so that when we pin and @@ -104,7 +104,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) * buffers, as there is no explicit "evict this buffer" function in the * buffer manager. 
*/ - save_zenith_test_evict = zenith_test_evict; + save_neon_test_evict = zenith_test_evict; zenith_test_evict = true; PG_TRY(); { @@ -149,14 +149,13 @@ clear_buffer_cache(PG_FUNCTION_ARGS) PG_FINALLY(); { /* restore the GUC */ - zenith_test_evict = save_zenith_test_evict; + zenith_test_evict = save_neon_test_evict; } PG_END_TRY(); PG_RETURN_VOID(); } - /* * Reads the page from page server without buffer cache * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN @@ -232,7 +231,6 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); - forknum = forkname_to_number(text_to_cstring(forkname)); /* Initialize buffer to copy to */ @@ -240,7 +238,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); relation_close(rel, AccessShareLock); @@ -272,8 +270,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) RelFileNode rnode = { .spcNode = PG_GETARG_OID(0), .dbNode = PG_GETARG_OID(1), - .relNode = PG_GETARG_OID(2) - }; + .relNode = PG_GETARG_OID(2)}; ForkNumber forknum = PG_GETARG_UINT32(3); @@ -281,14 +278,13 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) bool request_latest = PG_ARGISNULL(5); uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); - /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5a450793f1..5417f4f2b3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -32,7 +32,7 @@ sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" git-version = "0.3.5" diff --git a/pyproject.toml b/pyproject.toml index ec166ea7cd..9c2aa39c7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "zenith" +name = "neon" version = "0.1.0" description = "" authors = [] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 4ed30413e2..cae095c3c2 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -14,8 +14,8 @@ tracing = "0.1.27" clap = "3.0" daemonize = "0.4.1" tokio = { version = "1.17", features = ["macros", "fs"] } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } 
anyhow = "1.0" crc32c = "0.6.0" humantime = "2.1.0" @@ -25,7 +25,7 @@ serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" hex = "0.4.3" const_format = "0.2.21" -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } git-version = "0.3.5" async-trait = "0.1" once_cell = "1.13.0" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 244c793250..d518ac01cc 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -30,8 +30,8 @@ use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use utils::auth::JwtAuth; use utils::{ - http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, - zid::NodeId, + http::endpoint, id::NodeId, logging, project_git_version, shutdown::exit_now, signals, + tcp_listener, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; @@ -39,7 +39,7 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Zenith safekeeper") + let arg_matches = App::new("Neon safekeeper") .about("Store WAL stream to local file system and push it to WAL receivers") .version(GIT_VERSION) .arg( diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index ce66131700..f276fad613 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -22,7 +22,7 @@ use etcd_broker::{ subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, Client, PutOptions, }; -use utils::zid::{NodeId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; @@ -45,7 +45,7 @@ pub fn thread_main(conf: SafeKeeperConf) { /// Key to per timeline per safekeeper data. 
fn timeline_safekeeper_path( broker_etcd_prefix: String, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, sk_id: NodeId, ) -> String { format!( @@ -162,12 +162,12 @@ pub fn get_candiate_name(system_id: NodeId) -> String { } async fn push_sk_info( - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, mut client: Client, key: String, sk_info: SkTimelineInfo, mut lease: Lease, -) -> anyhow::Result<(ZTenantTimelineId, Lease)> { +) -> anyhow::Result<(TenantTimelineId, Lease)> { let put_opts = PutOptions::new().with_lease(lease.id); client .put( @@ -202,7 +202,7 @@ struct Lease { /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; - let mut leases: HashMap = HashMap::new(); + let mut leases: HashMap = HashMap::new(); let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); loop { diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 7fc75246e1..ff23f0360f 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -14,7 +14,7 @@ use tracing::*; use crate::control_file_upgrade::upgrade_control_file; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; -use utils::{bin_ser::LeSer, zid::ZTenantTimelineId}; +use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -55,7 +55,7 @@ pub struct FileStorage { } impl FileStorage { - pub fn restore_new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> Result { + pub fn restore_new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { let timeline_dir = conf.timeline_dir(zttid); let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); @@ -72,7 +72,7 @@ impl FileStorage { } pub fn create_new( - zttid: &ZTenantTimelineId, + zttid: 
&TenantTimelineId, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { @@ -115,7 +115,7 @@ impl FileStorage { // Load control file for given zttid at path specified by conf. pub fn load_control_file_conf( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result { let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); Self::load_control_file(path) @@ -252,7 +252,7 @@ mod test { use crate::{safekeeper::SafeKeeperState, SafeKeeperConf}; use anyhow::Result; use std::fs; - use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + use utils::{id::TenantTimelineId, lsn::Lsn}; fn stub_conf() -> SafeKeeperConf { let workdir = tempfile::tempdir().unwrap().into_path(); @@ -264,7 +264,7 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); Ok(( @@ -275,7 +275,7 @@ mod test { fn create( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); @@ -286,7 +286,7 @@ mod test { #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); - let zttid = ZTenantTimelineId::generate(); + let zttid = TenantTimelineId::generate(); { let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); // change something @@ -301,7 +301,7 @@ mod test { #[test] fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); - let zttid = ZTenantTimelineId::generate(); + let zttid = TenantTimelineId::generate(); { let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state"); diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 91d2f61c10..87204d6b49 100644 --- 
a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -7,9 +7,9 @@ use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ bin_ser::LeSer, + id::{TenantId, TimelineId}, lsn::Lsn, pq_proto::SystemId, - zid::{ZTenantId, ZTimelineId}, }; /// Persistent consensus state of the acceptor. @@ -45,9 +45,8 @@ pub struct ServerInfoV2 { /// Postgres server version pub pg_version: u32, pub system_id: SystemId, - pub tenant_id: ZTenantId, - /// Zenith timelineid - pub ztli: ZTimelineId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub wal_seg_size: u32, } @@ -76,10 +75,9 @@ pub struct ServerInfoV3 { pub pg_version: u32, pub system_id: SystemId, #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub wal_seg_size: u32, } @@ -106,10 +104,9 @@ pub struct SafeKeeperStateV3 { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperStateV4 { #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -154,7 +151,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }; return Ok(SafeKeeperState { tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, + timeline_id: oldstate.server.timeline_id, acceptor_state: ac, server: ServerInfo { pg_version: oldstate.server.pg_version, @@ -181,7 +178,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }; return Ok(SafeKeeperState { tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, + timeline_id: oldstate.server.timeline_id, acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, @@ -193,9 +190,9 @@ pub fn 
upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); - // migrate to moving ztenantid/ztli to the top and adding some lsns + // migrate to moving tenant_id/timeline_id to the top and adding some lsns } else if version == 3 { - info!("reading safekeeper control file version {}", version); + info!("reading safekeeper control file version {version}"); let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?; let server = ServerInfo { pg_version: oldstate.server.pg_version, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 3e301259ed..41b9ad66e1 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -14,10 +14,10 @@ use regex::Regex; use std::sync::Arc; use tracing::info; use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, postgres_backend::{self, PostgresBackend}, pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, }; /// Safekeeper handler of postgres commands @@ -25,8 +25,8 @@ pub struct SafekeeperPostgresHandler { pub conf: SafeKeeperConf, /// assigned application name pub appname: Option, - pub ztenantid: Option, - pub ztimelineid: Option, + pub tenant_id: Option, + pub timeline_id: Option, pub timeline: Option>, } @@ -63,17 +63,17 @@ fn parse_cmd(cmd: &str) -> Result { } impl postgres_backend::Handler for SafekeeperPostgresHandler { - // ztenant id and ztimeline id are passed in connection string params + // tenant_id and timeline_id are passed in connection string params fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { if let FeStartupPacket::StartupMessage { params, .. 
} = sm { if let Some(options) = params.options_raw() { for opt in options { match opt.split_once('=') { - Some(("ztenantid", value)) => { - self.ztenantid = Some(value.parse()?); + Some(("tenant_id", value)) => { + self.tenant_id = Some(value.parse()?); } - Some(("ztimelineid", value)) => { - self.ztimelineid = Some(value.parse()?); + Some(("timeline_id", value)) => { + self.timeline_id = Some(value.parse()?); } _ => continue, } @@ -95,18 +95,18 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { info!( "got query {:?} in timeline {:?}", - query_string, self.ztimelineid + query_string, self.timeline_id ); let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); - let tenantid = self.ztenantid.context("tenantid is required")?; - let timelineid = self.ztimelineid.context("timelineid is required")?; + let tenant_id = self.tenant_id.context("tenant_id is required")?; + let timeline_id = self.timeline_id.context("timeline_id is required")?; if self.timeline.is_none() { self.timeline.set( &self.conf, - ZTenantTimelineId::new(tenantid, timelineid), + TenantTimelineId::new(tenant_id, timeline_id), create, )?; } @@ -121,7 +121,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } - .context(format!("timeline {timelineid}"))?; + .context(format!("timeline {timeline_id}"))?; Ok(()) } @@ -132,8 +132,8 @@ impl SafekeeperPostgresHandler { SafekeeperPostgresHandler { conf, appname: None, - ztenantid: None, - ztimelineid: None, + tenant_id: None, + timeline_id: None, timeline: None, } } diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index 4b3ae7798e..e13ea50eaf 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,8 +1,8 @@ use serde::{Deserialize, 
Serialize}; -use utils::zid::{NodeId, ZTimelineId}; +use utils::id::{NodeId, TimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, pub peer_ids: Vec, } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 13356c5921..14c9414c09 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -21,8 +21,8 @@ use utils::{ request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::models::TimelineCreateRequest; @@ -68,9 +68,9 @@ struct AcceptorStateStatus { #[derive(Debug, Serialize)] struct TimelineStatus { #[serde(serialize_with = "display_serialize")] - tenant_id: ZTenantId, + tenant_id: TenantId, #[serde(serialize_with = "display_serialize")] - timeline_id: ZTimelineId, + timeline_id: TimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] flush_lsn: Lsn, @@ -90,7 +90,7 @@ struct TimelineStatus { /// Report info about timeline. 
async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); @@ -125,7 +125,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, ApiError> { let request_data: TimelineCreateRequest = json_request(&mut request).await?; - let zttid = ZTenantTimelineId { + let zttid = TenantTimelineId { tenant_id: parse_request_param(&request, "tenant_id")?, timeline_id: request_data.timeline_id, }; @@ -146,7 +146,7 @@ async fn timeline_create_handler(mut request: Request) -> Result, ) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); @@ -181,7 +181,7 @@ async fn tenant_delete_force_handler( /// Used only in tests to hand craft required data. async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( + let zttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 16c1d36131..00fc43521b 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -97,8 +97,8 @@ fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { pg_version: 0, // unknown proposer_id: [0u8; 16], system_id: 0, - ztli: spg.ztimelineid.unwrap(), - tenant_id: spg.ztenantid.unwrap(), + timeline_id: spg.timeline_id.unwrap(), + tenant_id: spg.tenant_id.unwrap(), tli: 0, wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests }); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 0335d61d3f..b466d5aab5 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use 
std::time::Duration; use url::Url; -use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId}; pub mod broker; pub mod control_file; @@ -61,11 +61,11 @@ pub struct SafeKeeperConf { } impl SafeKeeperConf { - pub fn tenant_dir(&self, tenant_id: &ZTenantId) -> PathBuf { + pub fn tenant_dir(&self, tenant_id: &TenantId) -> PathBuf { self.workdir.join(tenant_id.to_string()) } - pub fn timeline_dir(&self, zttid: &ZTenantTimelineId) -> PathBuf { + pub fn timeline_dir(&self, zttid: &TenantTimelineId) -> PathBuf { self.tenant_dir(&zttid.tenant_id) .join(zttid.timeline_id.to_string()) } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index c693035dd3..3fa3916266 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ Gauge, IntGaugeVec, }; use postgres_ffi::XLogSegNo; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, @@ -16,7 +16,7 @@ use crate::{ }; pub struct FullTimelineInfo { - pub zttid: ZTenantTimelineId, + pub zttid: TenantTimelineId, pub replicas: Vec, pub wal_backup_active: bool, pub timeline_is_active: bool, diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index af4cfb6ba4..b0b6a73621 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -53,7 +53,7 @@ impl<'pg> ReceiveWalConn<'pg> { /// Receive WAL from wal_proposer pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let _enter = info_span!("WAL acceptor", timeline = %spg.ztimelineid.unwrap()).entered(); + let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered(); // Notify the libpq client that it's allowed to send `CopyData` messages self.pg_backend diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index a2bdcb55e7..fa045eed90 100644 --- 
a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -19,9 +19,9 @@ use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; use utils::{ bin_ser::LeSer, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, pq_proto::{ReplicationFeedback, SystemId}, - zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; pub const SK_MAGIC: u32 = 0xcafeceefu32; @@ -166,10 +166,9 @@ pub struct Peers(pub Vec<(NodeId, PeerInfo)>); #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperState { #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid + pub tenant_id: TenantId, #[serde(with = "hex")] - pub timeline_id: ZTimelineId, + pub timeline_id: TimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -219,7 +218,7 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new(zttid: &TenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { tenant_id: zttid.tenant_id, timeline_id: zttid.timeline_id, @@ -245,7 +244,7 @@ impl SafeKeeperState { #[cfg(test)] pub fn empty() -> Self { - SafeKeeperState::new(&ZTenantTimelineId::empty(), vec![]) + SafeKeeperState::new(&TenantTimelineId::empty(), vec![]) } } @@ -260,9 +259,8 @@ pub struct ProposerGreeting { pub pg_version: u32, pub proposer_id: PgUuid, pub system_id: SystemId, - /// Zenith timelineid - pub ztli: ZTimelineId, - pub tenant_id: ZTenantId, + pub timeline_id: TimelineId, + pub tenant_id: TenantId, pub tli: TimeLineID, pub wal_seg_size: u32, } @@ -507,13 +505,13 @@ where { // constructor pub fn new( - ztli: ZTimelineId, + timeline_id: TimelineId, state: CTRL, mut wal_store: WAL, node_id: NodeId, ) -> Result> { - if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { - bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, 
state.timeline_id); + if state.timeline_id != TimelineId::from([0u8; 16]) && timeline_id != state.timeline_id { + bail!("Calling SafeKeeper::new with inconsistent timeline_id ({}) and SafeKeeperState.server.timeline_id ({})", timeline_id, state.timeline_id); } // initialize wal_store, if state is already initialized @@ -600,10 +598,10 @@ where self.state.tenant_id ); } - if msg.ztli != self.state.timeline_id { + if msg.timeline_id != self.state.timeline_id { bail!( "invalid timeline ID, got {}, expected {}", - msg.ztli, + msg.timeline_id, self.state.timeline_id ); } @@ -982,9 +980,9 @@ mod tests { persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let ztli = ZTimelineId::from([0u8; 16]); + let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1000,7 +998,7 @@ mod tests { persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(timeline_id, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1016,9 +1014,9 @@ mod tests { persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let ztli = ZTimelineId::from([0u8; 16]); + let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 293cf67c57..375b6eea18 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -30,7 +30,7 @@ use utils::{ 
// See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; -// zenith extension of replication protocol +// neon extension of replication protocol const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; @@ -105,7 +105,7 @@ impl ReplicationConn { match &msg { FeMessage::CopyData(m) => { // There's three possible data messages that the client is supposed to send here: - // `HotStandbyFeedback` and `StandbyStatusUpdate` and `ZenithStandbyFeedback`. + // `HotStandbyFeedback` and `StandbyStatusUpdate` and `NeonStandbyFeedback`. match m.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { @@ -165,12 +165,12 @@ impl ReplicationConn { pgb: &mut PostgresBackend, mut start_pos: Lsn, ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap()).entered(); + let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); // spawn the background thread which receives HotStandbyFeedback messages. let bg_timeline = Arc::clone(spg.timeline.get()); let bg_stream_in = self.stream_in.take().unwrap(); - let bg_timeline_id = spg.ztimelineid.unwrap(); + let bg_timeline_id = spg.timeline_id.unwrap(); let state = ReplicaState::new(); // This replica_id is used below to check if it's time to stop replication. 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 8d101e6ff6..cf317c41c3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -21,9 +21,9 @@ use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ + id::{NodeId, TenantId, TenantTimelineId}, lsn::Lsn, pq_proto::ReplicationFeedback, - zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; use crate::control_file; @@ -98,7 +98,7 @@ impl SharedState { /// Initialize timeline state, creating control file fn create( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, peer_ids: Vec, ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); @@ -119,7 +119,7 @@ impl SharedState { /// Restore SharedState from control file. /// If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, zttid: &ZTenantTimelineId) -> Result { + fn restore(conf: &SafeKeeperConf, zttid: &TenantTimelineId) -> Result { let control_store = control_file::FileStorage::restore_new(zttid, conf)?; let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); @@ -143,7 +143,7 @@ impl SharedState { /// Mark timeline active/inactive and return whether s3 offloading requires /// start/stop action. - fn update_status(&mut self, ttid: ZTenantTimelineId) -> bool { + fn update_status(&mut self, ttid: TenantTimelineId) -> bool { let is_active = self.is_active(); if self.active != is_active { info!("timeline {} active={} now", ttid, is_active); @@ -213,7 +213,7 @@ impl SharedState { // // To choose what feedback to use and resend to compute node, // we need to know which pageserver compute node considers to be main. 
- // See https://github.com/zenithdb/zenith/issues/1171 + // See https://github.com/neondatabase/neon/issues/1171 // if let Some(pageserver_feedback) = state.pageserver_feedback { if let Some(acc_feedback) = acc.pageserver_feedback { @@ -227,7 +227,7 @@ impl SharedState { // last lsn received by pageserver // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. - // See https://github.com/zenithdb/zenith/issues/1171 + // See https://github.com/neondatabase/neon/issues/1171 acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); // When at least one pageserver has preserved data up to remote_consistent_lsn, @@ -256,11 +256,11 @@ impl SharedState { /// Database instance (tenant) pub struct Timeline { - pub zttid: ZTenantTimelineId, + pub zttid: TenantTimelineId, /// Sending here asks for wal backup launcher attention (start/stop /// offloading). Sending zttid instead of concrete command allows to do /// sending without timeline lock. - wal_backup_launcher_tx: Sender, + wal_backup_launcher_tx: Sender, commit_lsn_watch_tx: watch::Sender, /// For breeding receivers. 
commit_lsn_watch_rx: watch::Receiver, @@ -269,8 +269,8 @@ pub struct Timeline { impl Timeline { fn new( - zttid: ZTenantTimelineId, - wal_backup_launcher_tx: Sender, + zttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, shared_state: SharedState, ) -> Timeline { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = @@ -539,13 +539,13 @@ impl Timeline { // Utilities needed by various Connection-like objects pub trait TimelineTools { - fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; + fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()>; fn get(&self) -> &Arc; } impl TimelineTools for Option> { - fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()> { + fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()> { *self = Some(GlobalTimelines::get(conf, zttid, create)?); Ok(()) } @@ -556,8 +556,8 @@ impl TimelineTools for Option> { } struct GlobalTimelinesState { - timelines: HashMap>, - wal_backup_launcher_tx: Option>, + timelines: HashMap>, + wal_backup_launcher_tx: Option>, } static TIMELINES_STATE: Lazy> = Lazy::new(|| { @@ -577,7 +577,7 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn init(wal_backup_launcher_tx: Sender) { + pub fn init(wal_backup_launcher_tx: Sender) { let mut state = TIMELINES_STATE.lock().unwrap(); assert!(state.wal_backup_launcher_tx.is_none()); state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); @@ -586,7 +586,7 @@ impl GlobalTimelines { fn create_internal( mut state: MutexGuard, conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, peer_ids: Vec, ) -> Result> { match state.timelines.get(&zttid) { @@ -612,7 +612,7 @@ impl GlobalTimelines { pub fn create( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, peer_ids: Vec, ) -> Result> { let state = 
TIMELINES_STATE.lock().unwrap(); @@ -623,7 +623,7 @@ impl GlobalTimelines { /// If control file doesn't exist and create=false, bails out. pub fn get( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, create: bool, ) -> Result> { let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); @@ -664,13 +664,12 @@ impl GlobalTimelines { } /// Get loaded timeline, if it exists. - pub fn get_loaded(zttid: ZTenantTimelineId) -> Option> { + pub fn get_loaded(zttid: TenantTimelineId) -> Option> { let state = TIMELINES_STATE.lock().unwrap(); state.timelines.get(&zttid).map(Arc::clone) } - /// Get ZTenantTimelineIDs of all active timelines. - pub fn get_active_timelines() -> HashSet { + pub fn get_active_timelines() -> HashSet { let state = TIMELINES_STATE.lock().unwrap(); state .timelines @@ -692,7 +691,7 @@ impl GlobalTimelines { fn delete_force_internal( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, was_active: bool, ) -> Result { match std::fs::remove_dir_all(conf.timeline_dir(zttid)) { @@ -721,7 +720,7 @@ impl GlobalTimelines { /// TODO: ensure all of the above never happens. pub async fn delete_force( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, + zttid: &TenantTimelineId, ) -> Result { info!("deleting timeline {}", zttid); let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); @@ -737,8 +736,8 @@ impl GlobalTimelines { /// There may be a race if new timelines are created simultaneously. 
pub async fn delete_force_all_for_tenant( conf: &SafeKeeperConf, - tenant_id: &ZTenantId, - ) -> Result> { + tenant_id: &TenantId, + ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); let mut to_delete = HashMap::new(); { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 5d946e37a4..85e967e218 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -23,7 +23,7 @@ use tokio::sync::watch; use tokio::time::sleep; use tracing::*; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::broker::{Election, ElectionLeader}; use crate::timeline::{GlobalTimelines, Timeline}; @@ -38,7 +38,7 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; pub fn wal_backup_launcher_thread_main( conf: SafeKeeperConf, - wal_backup_launcher_rx: Receiver, + wal_backup_launcher_rx: Receiver, ) { let rt = Builder::new_multi_thread() .worker_threads(conf.backup_runtime_threads) @@ -53,7 +53,7 @@ pub fn wal_backup_launcher_thread_main( /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. -fn is_wal_backup_required(zttid: ZTenantTimelineId) -> Option> { +fn is_wal_backup_required(zttid: TenantTimelineId) -> Option> { GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend()) } @@ -70,7 +70,7 @@ struct WalBackupTimelineEntry { /// Start per timeline task, if it makes sense for this safekeeper to offload. fn consider_start_task( conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, task: &mut WalBackupTimelineEntry, ) { if !task.timeline.can_wal_backup() { @@ -117,7 +117,7 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// panics and separate elections from offloading itself. 
async fn wal_backup_launcher_main_loop( conf: SafeKeeperConf, - mut wal_backup_launcher_rx: Receiver, + mut wal_backup_launcher_rx: Receiver, ) { info!( "WAL backup launcher started, remote config {:?}", @@ -135,7 +135,7 @@ async fn wal_backup_launcher_main_loop( // Presense in this map means launcher is aware s3 offloading is needed for // the timeline, but task is started only if it makes sense for to offload // from this safekeeper. - let mut tasks: HashMap = HashMap::new(); + let mut tasks: HashMap = HashMap::new(); let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); loop { @@ -193,7 +193,7 @@ struct WalBackupTask { /// Offload single timeline. async fn backup_task_main( - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, timeline_dir: PathBuf, mut shutdown_rx: Receiver<()>, election: Election, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 644237a00d..58b69f06e7 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -25,7 +25,7 @@ use std::path::{Path, PathBuf}; use tracing::*; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::safekeeper::SafeKeeperState; @@ -86,7 +86,7 @@ struct WalStorageMetrics { } impl WalStorageMetrics { - fn new(zttid: &ZTenantTimelineId) -> Self { + fn new(zttid: &TenantTimelineId) -> Self { let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); Self { @@ -130,7 +130,7 @@ pub trait Storage { /// When storage is just created, all LSNs are zeroes and there are no segments on disk. 
pub struct PhysicalStorage { metrics: WalStorageMetrics, - zttid: ZTenantTimelineId, + zttid: TenantTimelineId, timeline_dir: PathBuf, conf: SafeKeeperConf, @@ -161,7 +161,7 @@ pub struct PhysicalStorage { } impl PhysicalStorage { - pub fn new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { + pub fn new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { let timeline_dir = conf.timeline_dir(zttid); PhysicalStorage { metrics: WalStorageMetrics::new(zttid), diff --git a/scripts/generate_and_push_perf_report.sh b/scripts/generate_and_push_perf_report.sh index df84fa0dd8..9e03302b0f 100755 --- a/scripts/generate_and_push_perf_report.sh +++ b/scripts/generate_and_push_perf_report.sh @@ -5,8 +5,8 @@ set -eux -o pipefail SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -echo "Uploading perf report to zenith pg" -# ingest per test results data into zenith backed postgres running in staging to build grafana reports on that data +echo "Uploading perf report to neon pg" +# ingest per test results data into neon backed postgres running in staging to build grafana reports on that data DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM" # Activate poetry's venv. 
Needed because git upload does not run in a project dir (it uses tmp to store the repository) @@ -16,8 +16,8 @@ DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_ echo "Uploading perf result to zenith-perf-data" scripts/git-upload \ - --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/zenithdb/zenith-perf-data.git \ - --message="add performance test result for $GITHUB_SHA zenith revision" \ + --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \ + --message="add performance test result for $GITHUB_SHA neon revision" \ --branch=master \ copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\ --merge \ diff --git a/scripts/perf_report_template.html b/scripts/perf_report_template.html index 2847e75a00..c86ab37c2d 100644 --- a/scripts/perf_report_template.html +++ b/scripts/perf_report_template.html @@ -19,7 +19,7 @@ } -

Zenith Performance Tests

+

Neon Performance Tests

{% for suit_name, suit_data in context.items() %}

Runs for {{ suit_name }}

@@ -38,7 +38,7 @@ {% for row in suit_data.rows %} - {{ row.revision[:6] }} + {{ row.revision[:6] }} {% for column_value in row.values %} {{ column_value.value }}{{column_value.ratio}} {% endfor %} diff --git a/test_runner/README.md b/test_runner/README.md index c7ec361d65..44751944b3 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -60,7 +60,7 @@ Useful environment variables: `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -`ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as +`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as `--pageserver-config-override=${value}` parameter values when neon_local cli is invoked `RUST_LOG`: logging configuration to pass into Neon CLI diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index b9cdfdebc4..b5565dab0f 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -16,7 +16,7 @@ from typing import Iterator, Optional import pytest from _pytest.config import Config from _pytest.terminal import TerminalReporter -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks. 
@@ -365,11 +365,11 @@ class NeonBenchmarker: assert matches, f"metric {metric_name} not found" return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: Path, tenantid: ZTenantId, timelineid: ZTimelineId): + def get_timeline_size(self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId): """ Calculate the on-disk size of a timeline """ - path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid) + path = f"{repo_dir}/tenants/{tenant_id}/timelines/{timeline_id}" totalbytes = 0 for root, dirs, files in os.walk(path): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 69c6d31315..0c03429f95 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -29,7 +29,7 @@ import pytest import requests from cached_property import cached_property from fixtures.log_helper import log -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -754,7 +754,7 @@ class NeonEnv: # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
- self.initial_tenant = ZTenantId.generate() + self.initial_tenant = TenantId.generate() # Create a config file corresponding to the options toml = textwrap.dedent( @@ -776,7 +776,7 @@ class NeonEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - pageserver_auth_type = "ZenithJWT" if config.auth_enabled else "Trust" + pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust" toml += textwrap.dedent( f""" @@ -841,7 +841,7 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers]) - def timeline_dir(self, tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Path: + def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -971,7 +971,7 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, new_tenant_id: Optional[ZTenantId] = None) -> ZTenantId: + def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ @@ -983,24 +983,24 @@ class NeonPageserverHttpClient(requests.Session): raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") new_tenant_id = res.json() assert isinstance(new_tenant_id, str) - return ZTenantId(new_tenant_id) + return TenantId(new_tenant_id) - def tenant_attach(self, tenant_id: ZTenantId): + def tenant_attach(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") self.verbose_error(res) - def tenant_detach(self, tenant_id: ZTenantId): + def tenant_detach(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") self.verbose_error(res) - 
def tenant_status(self, tenant_id: ZTenantId) -> Dict[Any, Any]: + def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) return res_json - def timeline_list(self, tenant_id: ZTenantId) -> List[Dict[str, Any]]: + def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") self.verbose_error(res) res_json = res.json() @@ -1009,9 +1009,9 @@ class NeonPageserverHttpClient(requests.Session): def timeline_create( self, - tenant_id: ZTenantId, - new_timeline_id: Optional[ZTimelineId] = None, - ancestor_timeline_id: Optional[ZTimelineId] = None, + tenant_id: TenantId, + new_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, ) -> Dict[Any, Any]: res = self.post( @@ -1032,8 +1032,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_detail( self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_non_incremental_physical_size: bool = False, ) -> Dict[Any, Any]: @@ -1052,7 +1052,7 @@ class NeonPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_delete(self, tenant_id: ZTenantId, timeline_id: ZTimelineId): + def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) @@ -1174,17 +1174,17 @@ class NeonCli(AbstractNeonCli): def create_tenant( self, - tenant_id: Optional[ZTenantId] = None, - timeline_id: Optional[ZTimelineId] = None, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, conf: Optional[Dict[str, str]] = None, - ) -> 
Tuple[ZTenantId, ZTimelineId]: + ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. """ if tenant_id is None: - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() if timeline_id is None: - timeline_id = ZTimelineId.generate() + timeline_id = TimelineId.generate() if conf is None: res = self.raw_cli( [ @@ -1211,7 +1211,7 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return tenant_id, timeline_id - def config_tenant(self, tenant_id: ZTenantId, conf: Dict[str, str]): + def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]): """ Update tenant config. """ @@ -1230,8 +1230,8 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[ZTenantId] = None - ) -> ZTimelineId: + self, new_branch_name: str, tenant_id: Optional[TenantId] = None + ) -> TimelineId: cmd = [ "timeline", "create", @@ -1250,9 +1250,9 @@ class NeonCli(AbstractNeonCli): if matches is not None: created_timeline_id = matches.group("timeline_id") - return ZTimelineId(str(created_timeline_id)) + return TimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[ZTenantId] = None): + def create_root_branch(self, branch_name: str, tenant_id: Optional[TenantId] = None): cmd = [ "timeline", "create", @@ -1274,15 +1274,15 @@ class NeonCli(AbstractNeonCli): if created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return ZTimelineId(created_timeline_id) + return TimelineId(created_timeline_id) def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, ancestor_branch_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, ancestor_start_lsn: Optional[Lsn] = None, - ) -> ZTimelineId: + ) -> TimelineId: cmd = [ "timeline", "branch", @@ -1308,11 +1308,9 @@ class NeonCli(AbstractNeonCli): if 
created_timeline_id is None: raise Exception("could not find timeline id after `neon timeline create` invocation") else: - return ZTimelineId(str(created_timeline_id)) + return TimelineId(str(created_timeline_id)) - def list_timelines( - self, tenant_id: Optional[ZTenantId] = None - ) -> List[Tuple[str, ZTimelineId]]: + def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]: """ Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. """ @@ -1324,14 +1322,14 @@ class NeonCli(AbstractNeonCli): ) timelines_cli = sorted( map( - lambda branch_and_id: (branch_and_id[0], ZTimelineId(branch_and_id[1])), + lambda branch_and_id: (branch_and_id[0], TimelineId(branch_and_id[1])), TIMELINE_DATA_EXTRACTOR.findall(res.stdout), ) ) return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[ZTimelineId] = None + self, config_toml: str, initial_timeline_id: Optional[TimelineId] = None ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1410,7 +1408,7 @@ class NeonCli(AbstractNeonCli): self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": @@ -1436,7 +1434,7 @@ class NeonCli(AbstractNeonCli): def pg_start( self, node_name: str, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": @@ -1460,7 +1458,7 @@ class NeonCli(AbstractNeonCli): def pg_stop( self, node_name: str, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, destroy=False, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": @@ -1558,7 +1556,7 @@ def append_pageserver_param_overrides( 
f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" ) - env_overrides = os.getenv("ZENITH_PAGESERVER_OVERRIDES") + env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") if env_overrides is not None: params_to_update += [ f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") @@ -1867,7 +1865,7 @@ class Postgres(PgProtocol): """An object representing a running postgres daemon.""" def __init__( - self, env: NeonEnv, tenant_id: ZTenantId, port: int, check_stop_result: bool = True + self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True ): super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env @@ -2057,7 +2055,7 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2081,7 +2079,7 @@ class PostgresFactory: self, branch_name: str, node_name: Optional[str] = None, - tenant_id: Optional[ZTenantId] = None, + tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: @@ -2157,7 +2155,7 @@ class Safekeeper: return self def append_logical_message( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId, request: Dict[str, Any] + self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: """ Send JSON_CTRL query to append LogicalMessage to WAL and modify @@ -2167,7 +2165,7 @@ class Safekeeper: # "replication=0" hacks psycopg not to send additional queries # on startup, see https://github.com/psycopg/psycopg2/pull/482 - connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'" + connstr = f"host=localhost port={self.port.pg} replication=0 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" with 
closing(psycopg2.connect(connstr)) as conn: # server doesn't support transactions @@ -2202,8 +2200,8 @@ class SafekeeperTimelineStatus: class SafekeeperMetrics: # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[ZTenantId, ZTimelineId], int] = field(default_factory=dict) + flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) class SafekeeperHttpClient(requests.Session): @@ -2221,7 +2219,7 @@ class SafekeeperHttpClient(requests.Session): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() def timeline_status( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId + self, tenant_id: TenantId, timeline_id: TimelineId ) -> SafekeeperTimelineStatus: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() @@ -2234,16 +2232,14 @@ class SafekeeperHttpClient(requests.Session): remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), ) - def record_safekeeper_info(self, tenant_id: ZTenantId, timeline_id: ZTimelineId, body): + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", json=body, ) res.raise_for_status() - def timeline_delete_force( - self, tenant_id: ZTenantId, timeline_id: ZTimelineId - ) -> Dict[Any, Any]: + def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" ) @@ -2252,7 +2248,7 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def 
tenant_delete_force(self, tenant_id: ZTenantId) -> Dict[Any, Any]: + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") res.raise_for_status() res_json = res.json() @@ -2273,16 +2269,16 @@ class SafekeeperHttpClient(requests.Session): all_metrics_text, re.MULTILINE, ): - metrics.flush_lsn_inexact[ - (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) - ] = int(match.group(3)) + metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( + match.group(3) + ) for match in re.finditer( r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', all_metrics_text, re.MULTILINE, ): metrics.commit_lsn_inexact[ - (ZTenantId(match.group(1)), ZTimelineId(match.group(2))) + (TenantId(match.group(1)), TimelineId(match.group(2))) ] = int(match.group(3)) return metrics @@ -2456,7 +2452,7 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = ZTimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) + timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) # stop postgres to ensure that files won't change pg.stop() @@ -2540,7 +2536,7 @@ def wait_until(number_of_iterations: int, interval: float, func): def assert_timeline_local( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ): timeline_detail = pageserver_http_client.timeline_detail( tenant, @@ -2554,14 +2550,14 @@ def assert_timeline_local( def assert_no_in_progress_downloads_for_tenant( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, + tenant: TenantId, ): tenant_status = pageserver_http_client.tenant_status(tenant) assert tenant_status["has_in_progress_downloads"] is False, tenant_status def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2578,8 +2574,8 @@ def remote_consistent_lsn( def wait_for_upload( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, - timeline: ZTimelineId, + tenant: TenantId, + timeline: TimelineId, lsn: Lsn, ): """waits for local timeline upload up to specified lsn""" @@ -2601,7 +2597,7 @@ def wait_for_upload( def last_record_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + pageserver_http_client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2612,8 +2608,8 @@ def last_record_lsn( def wait_for_last_record_lsn( pageserver_http_client: NeonPageserverHttpClient, - tenant: ZTenantId, - timeline: ZTimelineId, + 
tenant: TenantId, + timeline: TimelineId, lsn: Lsn, ): """waits for pageserver to catch up to a certain lsn""" @@ -2632,7 +2628,7 @@ def wait_for_last_record_lsn( ) -def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: ZTenantId, timeline: ZTimelineId): +def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId): """Wait for pageserver to catch up the latest flush LSN""" last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) @@ -2643,8 +2639,8 @@ def fork_at_current_lsn( pg: Postgres, new_branch_name: str, ancestor_branch_name: str, - tenant_id: Optional[ZTenantId] = None, -) -> ZTimelineId: + tenant_id: Optional[TenantId] = None, +) -> TimelineId: """ Create new branch at the last LSN of an existing branch. The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index bdf675a785..de2e131b79 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -46,11 +46,11 @@ class Lsn: @total_ordering -class ZId: +class Id: """ Datatype for a Neon tenant and timeline IDs. Internally it's a 16-byte array, and - the string representation is in hex. This corresponds to the ZId / ZTenantId / - ZTimelineIds in the Rust code. + the string representation is in hex. This corresponds to the Id / TenantId / + TimelineIds in the Rust code. 
""" def __init__(self, x: str): @@ -79,11 +79,11 @@ class ZId: return cls(random.randbytes(16).hex()) -class ZTenantId(ZId): +class TenantId(Id): def __repr__(self): - return f'ZTenantId("{self.id.hex()}")' + return f'TenantId("{self.id.hex()}")' -class ZTimelineId(ZId): +class TimelineId(Id): def __repr__(self): - return f'ZTimelineId("{self.id.hex()}")' + return f'TimelineId("{self.id.hex()}")' diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 8bac8080db..21e48cf899 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -20,4 +20,4 @@ All tests run only once. Usually to obtain more consistent performance numbers, Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. -There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with zenith-local-ci and zenith-staging variants. I.e. some tests under zenith-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[zenith]` which is highly confusing. +There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. 
But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with neon-local-ci and neon-staging variants. I.e. some tests under neon-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]` which is highly confusing. diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index b8e81824b0..cb2621ff02 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,6 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import query_scalar @@ -27,7 +27,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_timeline = ZTimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) + branch0_timeline = TimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) log.info(f"b0 timeline {branch0_timeline}") # Create table, and insert 100k rows. 
@@ -51,7 +51,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_timeline = ZTimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) + branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) log.info(f"b1 timeline {branch1_timeline}") branch1_lsn = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") @@ -74,7 +74,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_timeline = ZTimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) + branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) log.info(f"b2 timeline {branch2_timeline}") branch2_lsn = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 08e38e1461..d9082efada 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -2,7 +2,7 @@ from contextlib import closing import pytest from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException -from fixtures.types import ZTenantId +from fixtures.types import TenantId def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -13,7 +13,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant) tenant_http_client = env.pageserver.http_client(tenant_token) - invalid_tenant_token = env.auth_keys.generate_tenant_token(ZTenantId.generate()) + invalid_tenant_token = env.auth_keys.generate_tenant_token(TenantId.generate()) invalid_tenant_http_client = env.pageserver.http_client(invalid_tenant_token) management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/regress/test_branch_behind.py 
b/test_runner/regress/test_branch_behind.py index 5bd6368bfc..cfb9649867 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -2,7 +2,7 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import Lsn, ZTimelineId +from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -28,7 +28,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): main_cur = pgmain.connect().cursor() - timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows main_cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index ce3a74930e..fd81981b2b 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -5,7 +5,7 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep @@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_timelines: List[Tuple[ZTenantId, ZTimelineId, Postgres]] = [] + tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = [] for n in range(4): tenant_id, timeline_id = env.neon_cli.create_tenant() diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index af94865549..8de2687c9b 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -8,7 +8,7 @@ from 
fixtures.neon_fixtures import ( VanillaPostgres, pg_distrib_dir, ) -from fixtures.types import Lsn, ZTimelineId +from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 @@ -27,7 +27,7 @@ def test_fullbackup( log.info("postgres is running on 'test_fullbackup' branch") with pgmain.cursor() as cur: - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 67ce8871cd..88d4ad8a6e 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -3,7 +3,7 @@ import random from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import query_scalar # Test configuration @@ -29,7 +29,7 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon -async def gc(env: NeonEnv, timeline: ZTimelineId): +async def gc(env: NeonEnv, timeline: TimelineId): psconn = await env.pageserver.connect_async() while updates_performed < updates_to_perform: @@ -37,7 +37,7 @@ async def gc(env: NeonEnv, timeline: ZTimelineId): # At the same time, run UPDATEs and GC -async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: ZTimelineId): +async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId): workers = [] for worker_id in range(num_connections): workers.append(asyncio.create_task(update_table(pg))) @@ -62,7 +62,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): log.info("postgres is running on test_gc_aggressive branch") with pg.cursor() as cur: - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) 
+ timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (id int, counter int, t text)") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index fc9f41bda0..60cc0551ab 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import subprocess_capture @@ -69,8 +69,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] node_name = "import_from_vanilla" - tenant = ZTenantId.generate() - timeline = ZTimelineId.generate() + tenant = TenantId.generate() + timeline = TimelineId.generate() # Set up pageserver for import neon_env_builder.enable_local_fs_remote_storage() @@ -195,7 +195,7 @@ def _generate_data(num_rows: int, pg: Postgres) -> Lsn: def _import( - expected_num_rows: int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: ZTimelineId + expected_num_rows: int, lsn: Lsn, env: NeonEnv, pg_bin: PgBin, timeline: TimelineId ) -> str: """Test importing backup data to the pageserver. @@ -228,9 +228,9 @@ def _import( # start the pageserver again env.pageserver.start() - # Import using another tenantid, because we use the same pageserver. + # Import using another tenant_id, because we use the same pageserver. # TODO Create another pageserver to make test more realistic. 
- tenant = ZTenantId.generate() + tenant = TenantId.generate() # Import to pageserver node_name = "import_from_pageserver" diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index b2342e5ee8..a9dc63dd50 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -7,11 +7,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, NeonPageserverHttpClient, ) -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( - pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: ZTenantId + pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: TenantId ): """ Compare timelines list returned by CLI and directly via API. @@ -20,7 +20,7 @@ def helper_compare_timeline_list( timelines_api = sorted( map( - lambda t: ZTimelineId(t["timeline_id"]), + lambda t: TimelineId(t["timeline_id"]), pageserver_http_client.timeline_list(initial_tenant), ) ) @@ -85,7 +85,7 @@ def test_cli_tenant_list(neon_simple_env: NeonEnv): helper_compare_tenant_list(pageserver_http_client, env) res = env.neon_cli.list_tenants() - tenants = sorted(map(lambda t: ZTenantId(t.split()[0]), res.stdout.splitlines())) + tenants = sorted(map(lambda t: TenantId(t.split()[0]), res.stdout.splitlines())) assert env.initial_tenant in tenants assert tenant1 in tenants diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 2b5e2edb5f..c99e13f45f 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,7 +1,7 @@ import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -27,7 +27,7 @@ def test_old_request_lsn(neon_env_builder: 
NeonEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. We need it for the 'do_gc' command - timeline = ZTimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) psconn = env.pageserver.connect() pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index a7b7189824..def6bd5b33 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -11,7 +11,7 @@ from fixtures.neon_fixtures import ( pg_distrib_dir, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId # test that we cannot override node id after init @@ -60,39 +60,39 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv): assert "has node id already, it cannot be overridden" in bad_update.stderr -def check_client(client: NeonPageserverHttpClient, initial_tenant: ZTenantId): +def check_client(client: NeonPageserverHttpClient, initial_tenant: TenantId): client.check_status() # check initial tenant is there - assert initial_tenant in {ZTenantId(t["id"]) for t in client.tenant_list()} + assert initial_tenant in {TenantId(t["id"]) for t in client.tenant_list()} # create new tenant and check it is also there - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() client.tenant_create(tenant_id) - assert tenant_id in {ZTenantId(t["id"]) for t in client.tenant_list()} + assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) assert len(timelines) == 0, "initial tenant should not have any timelines" # create timeline - timeline_id = ZTimelineId.generate() + timeline_id = TimelineId.generate() client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) timelines = client.timeline_list(tenant_id) assert len(timelines) > 
0 # check it is there - assert timeline_id in {ZTimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} + assert timeline_id in {TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} for timeline in timelines: - timeline_id = ZTimelineId(timeline["timeline_id"]) + timeline_id = TimelineId(timeline["timeline_id"]) timeline_details = client.timeline_detail( tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True, ) - assert ZTenantId(timeline_details["tenant_id"]) == tenant_id - assert ZTimelineId(timeline_details["timeline_id"]) == timeline_id + assert TenantId(timeline_details["tenant_id"]) == tenant_id + assert TimelineId(timeline_details["timeline_id"]) == timeline_id assert timeline_details.get("local") is not None @@ -118,8 +118,8 @@ def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): def expect_updated_msg_lsn( client: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, prev_msg_lsn: Optional[Lsn], ) -> Lsn: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 329f4b7d24..786266b70e 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -3,7 +3,7 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import ZTimelineId +from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -25,7 +25,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - timeline = ZTimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) + timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) # Create table main_cur.execute("CREATE TABLE 
foo (t text)") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 04baef6ba0..cbe74cad5c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -1,5 +1,5 @@ # It's possible to run any regular test with the local fs remote storage via -# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... +# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... import os import shutil @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar @@ -61,8 +61,8 @@ def test_remote_storage_backup_and_restore( client = env.pageserver.http_client() - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) checkpoint_numbers = range(1, 3) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 147e22b38f..e3c9a091f9 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -4,10 +4,10 @@ import psycopg2 import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId -def do_gc_target(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): +def do_gc_target(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: env.pageserver.safe_psql(f"do_gc {tenant_id} 
{timeline_id} 0") @@ -20,7 +20,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() # first check for non existing tenant - tenant_id = ZTenantId.generate() + tenant_id = TenantId.generate() with pytest.raises( expected_exception=NeonPageserverApiException, match=f"Tenant not found for id {tenant_id}", @@ -46,7 +46,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): with pytest.raises( expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" ): - bogus_timeline_id = ZTimelineId.generate() + bogus_timeline_id = TimelineId.generate() env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") # try to concurrently run gc and detach diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 56563ebe87..aa7d92f1fd 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -24,7 +24,7 @@ from fixtures.neon_fixtures import ( wait_for_upload, wait_until, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, subprocess_capture @@ -113,15 +113,15 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def populate_branch( pg: Postgres, - tenant_id: ZTenantId, + tenant_id: TenantId, ps_http: NeonPageserverHttpClient, create_table: bool, expected_sum: Optional[int], -) -> Tuple[ZTimelineId, Lsn]: +) -> Tuple[TimelineId, Lsn]: # insert some data with pg_cur(pg) as cur: cur.execute("SHOW neon.timeline_id") - timeline_id = ZTimelineId(cur.fetchone()[0]) + timeline_id = TimelineId(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline_id) log.info( @@ -149,8 +149,8 @@ def populate_branch( def ensure_checkpoint( pageserver_cur, pageserver_http: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + 
tenant_id: TenantId, + timeline_id: TimelineId, current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage @@ -162,8 +162,8 @@ def ensure_checkpoint( def check_timeline_attached( new_pageserver_http_client: NeonPageserverHttpClient, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, old_timeline_detail: Dict[str, Any], old_current_lsn: Lsn, ): @@ -187,8 +187,8 @@ def switch_pg_to_new_pageserver( env: NeonEnv, pg: Postgres, new_pageserver_port: int, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> pathlib.Path: pg.stop() @@ -265,7 +265,7 @@ def test_tenant_relocation( pageserver_http = env.pageserver.http_client() tenant_id, initial_timeline_id = env.neon_cli.create_tenant( - ZTenantId("74ee8b079a0e437eb0afea7d26a07209") + TenantId("74ee8b079a0e437eb0afea7d26a07209") ) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 1214d703d0..97a13bbcb0 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,6 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_until -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def get_only_element(l): # noqa: E741 @@ -23,7 +23,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): def get_state(tenant): all_states = client.tenant_list() - matching = [t for t in all_states if ZTenantId(t["id"]) == tenant] + matching = [t for t in all_states if TenantId(t["id"]) == tenant] return get_only_element(matching)["state"] def get_metric_value(name): @@ -35,8 +35,8 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): value = line.lstrip(name).strip() return int(value) - def delete_all_timelines(tenant: 
ZTenantId): - timelines = [ZTimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] + def delete_all_timelines(tenant: TenantId): + timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) @@ -56,7 +56,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Delete all timelines on all tenants for tenant_info in client.tenant_list(): - tenant_id = ZTenantId(tenant_info["id"]) + tenant_id = TenantId(tenant_info["id"]) delete_all_timelines(tenant_id) wait_until(10, 0.2, lambda: assert_active_without_jobs(tenant_id)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index bd53aae25c..4e7610a96f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -8,7 +8,7 @@ import pytest from fixtures.log_helper import log from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, ZTenantId +from fixtures.types import Lsn, TenantId from prometheus_client.samples import Sample @@ -188,7 +188,7 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (5000050000,) - def get_ps_metric_samples_for_tenant(tenant_id: ZTenantId) -> List[Sample]: + def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") samples = [] for metric_name in ps_metrics.metrics: diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 70b474c9a9..85f371c845 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -19,7 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, 
wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId async def tenant_workload(env: NeonEnv, pg: Postgres): @@ -58,7 +58,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem env = neon_env_builder.init_start() - tenants_pgs: List[Tuple[ZTenantId, Postgres]] = [] + tenants_pgs: List[Tuple[TenantId, Postgres]] = [] for _ in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly @@ -83,8 +83,8 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem res = pg.safe_psql_many( ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] ) - tenant_id = ZTenantId(res[0][0][0]) - timeline_id = ZTimelineId(res[1][0][0]) + tenant_id = TenantId(res[0][0][0]) + timeline_id = TimelineId(res[1][0][0]) current_lsn = Lsn(res[2][0][0]) # wait until pageserver receives all the data diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 5a20dbd232..2eea8dd3cc 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,6 +1,6 @@ import pytest from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId def test_timeline_delete(neon_simple_env: NeonEnv): @@ -10,12 +10,12 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # first try to delete non existing timeline # for existing tenant: - invalid_timeline_id = ZTimelineId.generate() + invalid_timeline_id = TimelineId.generate() with pytest.raises(NeonPageserverApiException, match="timeline not found"): ps_http.timeline_delete(tenant_id=env.initial_tenant, timeline_id=invalid_timeline_id) # for non existing tenant: - invalid_tenant_id = ZTenantId.generate() + invalid_tenant_id = TenantId.generate() with pytest.raises( 
NeonPageserverApiException, match=f"Tenant {invalid_tenant_id} not found in the local state", diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 6fbc430e80..83018f46f5 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -15,7 +15,7 @@ from fixtures.neon_fixtures import ( assert_timeline_local, wait_for_last_flush_lsn, ) -from fixtures.types import ZTenantId, ZTimelineId +from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size @@ -386,7 +386,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: ZTimelineId): + def get_timeline_physical_size(timeline: TimelineId): res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) return res["local"]["current_physical_size_non_incremental"] @@ -415,7 +415,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): assert tenant_physical_size == timeline_total_size -def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimelineId): +def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): """Check the current physical size returned from timeline API matches the total physical size of the timeline on disk""" client = env.pageserver.http_client() @@ -431,7 +431,7 @@ def assert_physical_size(env: NeonEnv, tenant_id: ZTenantId, timeline_id: ZTimel # Timeline logical size initialization is an asynchronous background task that runs once, # try a few times to ensure it's activated properly def wait_for_timeline_size_init( - client: NeonPageserverHttpClient, tenant: ZTenantId, timeline: ZTimelineId + client: NeonPageserverHttpClient, tenant: TenantId, timeline: TimelineId ): for i in range(10): timeline_details = assert_timeline_local(client, tenant, timeline) diff --git 
a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cd370e60c0..8c5b4c8c30 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -32,13 +32,13 @@ from fixtures.neon_fixtures import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar def wait_lsn_force_checkpoint( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, pg: Postgres, ps: NeonPageserver, pageserver_conn_options={}, @@ -74,7 +74,7 @@ def wait_lsn_force_checkpoint( @dataclass class TimelineMetrics: - timeline_id: ZTimelineId + timeline_id: TimelineId last_record_lsn: Lsn # One entry per each Safekeeper, order is the same flush_lsns: List[Lsn] = field(default_factory=list) @@ -126,7 +126,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): timeline_metrics = [] for timeline_detail in timeline_details: - timeline_id = ZTimelineId(timeline_detail["timeline_id"]) + timeline_id = TimelineId(timeline_detail["timeline_id"]) local_timeline_detail = timeline_detail.get("local") if local_timeline_detail is None: @@ -273,8 +273,8 @@ def test_broker(neon_env_builder: NeonEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -325,8 +325,8 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ] ) - tenant_id = ZTenantId(pg.safe_psql("show 
neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) # force checkpoint to advance remote_consistent_lsn pageserver_conn_options = {} @@ -348,7 +348,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(tenant_id) ) http_cli_other = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) + auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) http_cli_noauth = env.safekeepers[0].http_client() @@ -438,8 +438,8 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot pg = env.postgres.create_start("test_safekeepers_wal_backup") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) pg_conn = pg.connect() cur = pg_conn.cursor() @@ -493,8 +493,8 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re pg = env.postgres.create_start("test_s3_wal_replay") # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) expected_sum = 0 @@ -584,8 +584,8 @@ class ProposerPostgres(PgProtocol): self, pgdata_dir: str, pg_bin, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, listen_addr: str, port: int, ): @@ -593,8 +593,8 @@ class ProposerPostgres(PgProtocol): 
self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin - self.tenant_id: ZTenantId = tenant_id - self.timeline_id: ZTimelineId = timeline_id + self.tenant_id: TenantId = tenant_id + self.timeline_id: TimelineId = timeline_id self.listen_addr: str = listen_addr self.port: int = port @@ -672,8 +672,8 @@ def test_sync_safekeepers( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_id = ZTenantId.generate() - timeline_id = ZTimelineId.generate() + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() # write config for proposer pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") @@ -725,8 +725,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa = env.safekeepers[0] # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) if not auth_enabled: wa_http_cli = wa.http_client() @@ -735,7 +735,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) wa_http_cli.check_status() wa_http_cli_bad = wa.http_client( - auth_token=env.auth_keys.generate_tenant_token(ZTenantId.generate()) + auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) wa_http_cli_bad.check_status() wa_http_cli_noauth = wa.http_client() @@ -785,15 +785,15 @@ class SafekeeperEnv: self.bin_safekeeper = os.path.join(str(neon_binpath), "safekeeper") self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None - self.tenant_id: Optional[ZTenantId] = None - self.timeline_id: Optional[ZTimelineId] = None + self.tenant_id: Optional[TenantId] = None + self.timeline_id: 
Optional[TimelineId] = None def init(self) -> "SafekeeperEnv": assert self.postgres is None, "postgres is already initialized" assert self.safekeepers is None, "safekeepers are already initialized" - self.tenant_id = ZTenantId.generate() - self.timeline_id = ZTimelineId.generate() + self.tenant_id = TenantId.generate() + self.timeline_id = TimelineId.generate() self.repo_dir.mkdir(exist_ok=True) # Create config and a Safekeeper object for each safekeeper @@ -912,9 +912,7 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): sum_after = query_scalar(cur, "SELECT SUM(key) FROM t") assert sum_after == sum_before + 5000050000 - def show_statuses( - safekeepers: List[Safekeeper], tenant_id: ZTenantId, timeline_id: ZTimelineId - ): + def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): for sk in safekeepers: http_cli = sk.http_client() try: @@ -935,8 +933,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): pg.start() # learn neon timeline from compute - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = ZTimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -1134,7 +1132,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove non-existing branch, should succeed - assert sk_http.timeline_delete_force(tenant_id, ZTimelineId("00" * 16)) == { + assert sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16)) == { "dir_existed": False, "was_active": False, } diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index e36d3cf94b..9d2008296a 100644 --- 
a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -7,7 +7,7 @@ from typing import List, Optional import asyncpg from fixtures.log_helper import getLogger from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper -from fixtures.types import Lsn, ZTenantId, ZTimelineId +from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") @@ -103,8 +103,8 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou async def wait_for_lsn( safekeeper: Safekeeper, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, wait_lsn: Lsn, polling_interval=1, timeout=60, @@ -155,8 +155,8 @@ async def run_restarts_under_load( test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = ZTenantId(await pg_conn.fetchval("show neon.tenant_id")) - timeline_id = ZTimelineId(await pg_conn.fetchval("show neon.timeline_id")) + tenant_id = TenantId(await pg_conn.fetchval("show neon.tenant_id")) + timeline_id = TimelineId(await pg_conn.fetchval("show neon.timeline_id")) bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 6fd509c4d1..21921a3bc2 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -9,7 +9,7 @@ from fixtures.neon_fixtures import ( base_dir, pg_distrib_dir, ) -from fixtures.types import ZTenantId +from fixtures.types import TenantId def test_wal_restore( @@ -22,7 +22,7 @@ def test_wal_restore( env.neon_cli.create_branch("test_wal_restore") pg = env.postgres.create_start("test_wal_restore") pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = ZTenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + tenant_id = TenantId(pg.safe_psql("show 
neon.tenant_id")[0][0]) env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" From 6db6e7ddda3c67a3d48387955859452e93f7d751 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Sep 2022 22:51:28 +0300 Subject: [PATCH 057/166] Use backward-compatible safekeeper code --- safekeeper/src/handler.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 41b9ad66e1..ad2c0ec8bf 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -68,11 +68,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { if let FeStartupPacket::StartupMessage { params, .. } = sm { if let Some(options) = params.options_raw() { for opt in options { + // FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy, + // remove these after the PR gets deployed: + // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 match opt.split_once('=') { - Some(("tenant_id", value)) => { + Some(("ztenantid", value)) | Some(("tenant_id", value)) => { self.tenant_id = Some(value.parse()?); } - Some(("timeline_id", value)) => { + Some(("ztimelineid", value)) | Some(("timeline_id", value)) => { self.timeline_id = Some(value.parse()?); } _ => continue, From c3096532f9ceee8fad82b4c741b0108bd143cc06 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 14 Sep 2022 09:23:51 +0300 Subject: [PATCH 058/166] Fix vendor/postgres-v15 to point to correct v15 branch. Commit f44afbaf62 updated vendor/postgres-v15 to point to a commit that was built on top of PostgreSQL 14 rather than 15. So we accidentally had two copies of PostgreSQL v14 in the repository. Oops. This updates it to point to the correct version. 
--- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index b1dbd93e2b..cf4db95b84 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit b1dbd93e2b1691e93860f7e59b9e1fe5a6e79786 +Subproject commit cf4db95b8480e08425e52ef46f78cb5a234baa0e From d87c9e62d64c8a4628096a4ce5c8307fc1daa2e6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Sep 2022 11:53:34 +0100 Subject: [PATCH 059/166] Nightly Benchmarks: perform tests on both pre-created and fresh projects (#2443) --- .github/workflows/benchmarking.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 49fbc74dd6..fab0a9aa04 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -144,7 +144,9 @@ jobs: strategy: fail-fast: false matrix: - platform: [ neon-captest, rds-aurora ] + # neon-captest: Run pgbench, reusing existing project + # neon-captest-new: Same, but on a freshly created project + platform: [ neon-captest, neon-captest-new, rds-aurora ] runs-on: dev container: @@ -162,7 +164,7 @@ jobs: sudo apt install -y postgresql-14 - name: Create Neon Project - if: matrix.platform == 'neon-captest' + if: matrix.platform == 'neon-captest-new' id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -174,13 +176,16 @@ jobs: run: | case "${PLATFORM}" in neon-captest) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + ;; + neon-captest-new) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest' or 'rds-aurora'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'" exit 1 ;; esac @@ -240,7 +245,7 @@ jobs: build_type: ${{ env.BUILD_TYPE }} - name: Delete Neon Project - if: ${{ matrix.platform == 'neon-captest' && always() }} + if: ${{ matrix.platform == 'neon-captest-new' && always() }} uses: ./.github/actions/neon-project-delete with: environment: dev @@ -252,6 +257,6 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} From f86ea09323ac0d6f2904dcf603652044cea50664 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 14 Sep 2022 09:53:06 +0300 Subject: [PATCH 060/166] Avoid recompiling postgres_ffi every time you run "make". Running "make" at the top level calls "make install" to install the PostgreSQL headers into the pg_install/ directory. That always updated the modification time of the headers even if there were no changes, triggering recompilation of the postgres_ffi bindings. To avoid that, use 'install -C', to install the PostgreSQL headers. However, there was an upstream PostgreSQL issue that the src/include/Makefile didn't respect the INSTALL configure option. That was just fixed in upstream PostgreSQL, so cherry-pick that fix to our vendor/postgres repositories. Fixes https://github.com/neondatabase/neon/issues/1873. 
--- Makefile | 6 ++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4d7b1bee07..4ac51ed174 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,12 @@ ifeq ($(UNAME_S),Darwin) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib endif +# Use -C option so that when PostgreSQL "make install" installs the +# headers, the mtime of the headers are not changed when there have +# been no changes to the files. Changing the mtime triggers an +# unnecessary rebuild of 'postgres_ffi'. +PG_CONFIGURE_OPTS += INSTALL='install -C' + # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) # Fix for a corner case when make doesn't pass a jobserver diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 114676d2ed..ce723ee499 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 114676d2edd5307226d9448ec467821fdb77467d +Subproject commit ce723ee499450cb108aede464a35a17f3d75cf84 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index cf4db95b84..0858387047 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit cf4db95b8480e08425e52ef46f78cb5a234baa0e +Subproject commit 08583870479e30c64aeb5a97d6fee9cf470f05fb From 87bf7be5370cc2a621cd51d5a4cb3b1ed76e4633 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 14 Sep 2022 21:27:47 +0300 Subject: [PATCH 061/166] [proxy] Drop support for legacy cloud API (#2448) Apparently, it no longer exists in the cloud. 
--- proxy/src/auth.rs | 5 - proxy/src/auth/backend.rs | 22 +-- proxy/src/auth/backend/legacy_console.rs | 208 ----------------------- proxy/src/config.rs | 19 +-- proxy/src/main.rs | 31 +++- 5 files changed, 30 insertions(+), 255 deletions(-) delete mode 100644 proxy/src/auth/backend/legacy_console.rs diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index d09470d15e..a50d23e351 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -22,10 +22,6 @@ pub type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] pub enum AuthErrorImpl { - // This will be dropped in the future. - #[error(transparent)] - Legacy(#[from] backend::LegacyAuthError), - #[error(transparent)] Link(#[from] backend::LinkAuthError), @@ -78,7 +74,6 @@ impl UserFacingError for AuthError { fn to_string_client(&self) -> String { use AuthErrorImpl::*; match self.0.as_ref() { - Legacy(e) => e.to_string_client(), Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), WakeCompute(e) => e.to_string_client(), diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 9c43620ffb..de0719a196 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -6,9 +6,6 @@ pub use link::LinkAuthError; mod console; pub use console::{GetAuthInfoError, WakeComputeError}; -mod legacy_console; -pub use legacy_console::LegacyAuthError; - use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, config, mgmt, @@ -56,7 +53,7 @@ impl std::fmt::Debug for DatabaseInfo { fmt.debug_struct("DatabaseInfo") .field("host", &self.host) .field("port", &self.port) - .finish() + .finish_non_exhaustive() } } @@ -88,8 +85,6 @@ impl From for tokio_postgres::Config { /// backends which require them for the authentication process. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BackendType { - /// Legacy Cloud API (V1) + link auth. - LegacyConsole(T), /// Current Cloud API (V2). Console(T), /// Local mock of Cloud API (V2). 
@@ -105,7 +100,6 @@ impl BackendType { pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { use BackendType::*; match self { - LegacyConsole(x) => LegacyConsole(f(x)), Console(x) => Console(f(x)), Postgres(x) => Postgres(f(x)), Link => Link, @@ -119,7 +113,6 @@ impl BackendType> { pub fn transpose(self) -> Result, E> { use BackendType::*; match self { - LegacyConsole(x) => x.map(LegacyConsole), Console(x) => x.map(Console), Postgres(x) => x.map(Postgres), Link => Ok(Link), @@ -176,15 +169,6 @@ impl BackendType> { } match self { - LegacyConsole(creds) => { - legacy_console::handle_user( - &urls.auth_endpoint, - &urls.auth_link_uri, - &creds, - client, - ) - .await - } Console(creds) => { console::Api::new(&urls.auth_endpoint, &creds) .handle_user(client) @@ -208,7 +192,6 @@ mod tests { #[test] fn test_backend_type_map() { let values = [ - BackendType::LegacyConsole(0), BackendType::Console(0), BackendType::Postgres(0), BackendType::Link, @@ -222,8 +205,7 @@ mod tests { #[test] fn test_backend_type_transpose() { let values = [ - BackendType::LegacyConsole(Ok::<_, ()>(0)), - BackendType::Console(Ok(0)), + BackendType::Console(Ok::<_, ()>(0)), BackendType::Postgres(Ok(0)), BackendType::Link, ]; diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs deleted file mode 100644 index b99a004dcd..0000000000 --- a/proxy/src/auth/backend/legacy_console.rs +++ /dev/null @@ -1,208 +0,0 @@ -//! Cloud API V1. - -use super::DatabaseInfo; -use crate::{ - auth::{self, ClientCredentials}, - compute, - error::UserFacingError, - stream::PqStream, - waiters, -}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::BeMessage as Be; - -#[derive(Debug, Error)] -pub enum LegacyAuthError { - /// Authentication error reported by the console. - #[error("Authentication failed: {0}")] - AuthFailed(String), - - /// HTTP status (other than 200) returned by the console. 
- #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error("Console responded with a malformed JSON: {0}")] - BadResponse(#[from] serde_json::Error), - - #[error(transparent)] - Transport(#[from] reqwest::Error), - - #[error(transparent)] - WaiterRegister(#[from] waiters::RegisterError), - - #[error(transparent)] - WaiterWait(#[from] waiters::WaitError), -} - -impl UserFacingError for LegacyAuthError { - fn to_string_client(&self) -> String { - use LegacyAuthError::*; - match self { - AuthFailed(_) | HttpStatus(_) => self.to_string(), - _ => "Internal error".to_string(), - } - } -} - -// NOTE: the order of constructors is important. -// https://serde.rs/enum-representations.html#untagged -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum ProxyAuthResponse { - Ready { conn_info: DatabaseInfo }, - Error { error: String }, - NotReady { ready: bool }, // TODO: get rid of `ready` -} - -impl ClientCredentials<'_> { - fn is_existing_user(&self) -> bool { - self.user.ends_with("@zenith") - } -} - -async fn authenticate_proxy_client( - auth_endpoint: &reqwest::Url, - creds: &ClientCredentials<'_>, - md5_response: &str, - salt: &[u8; 4], - psql_session_id: &str, -) -> Result { - let mut url = auth_endpoint.clone(); - url.query_pairs_mut() - .append_pair("login", creds.user) - .append_pair("database", creds.dbname) - .append_pair("md5response", md5_response) - .append_pair("salt", &hex::encode(salt)) - .append_pair("psql_session_id", psql_session_id); - - super::with_waiter(psql_session_id, |waiter| async { - println!("cloud request: {}", url); - // TODO: leverage `reqwest::Client` to reuse connections - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(LegacyAuthError::HttpStatus(resp.status())); - } - - let auth_info = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: {:?}", auth_info); - - use ProxyAuthResponse::*; - let db_info = match auth_info { 
- Ready { conn_info } => conn_info, - Error { error } => return Err(LegacyAuthError::AuthFailed(error)), - NotReady { .. } => waiter.await?.map_err(LegacyAuthError::AuthFailed)?, - }; - - Ok(db_info) - }) - .await -} - -async fn handle_existing_user( - auth_endpoint: &reqwest::Url, - client: &mut PqStream, - creds: &ClientCredentials<'_>, -) -> auth::Result { - let psql_session_id = super::link::new_psql_session_id(); - let md5_salt = rand::random(); - - client - .write_message(&Be::AuthenticationMD5Password(md5_salt)) - .await?; - - // Read client's password hash - let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword( - "the password should be a valid null-terminated utf-8 string", - ))?; - - let db_info = authenticate_proxy_client( - auth_endpoint, - creds, - md5_response, - &md5_salt, - &psql_session_id, - ) - .await?; - - Ok(compute::NodeInfo { - reported_auth_ok: false, - config: db_info.into(), - }) -} - -pub async fn handle_user( - auth_endpoint: &reqwest::Url, - auth_link_uri: &reqwest::Url, - creds: &ClientCredentials<'_>, - client: &mut PqStream, -) -> auth::Result { - if creds.is_existing_user() { - handle_existing_user(auth_endpoint, client, creds).await - } else { - super::link::handle_user(auth_link_uri, client).await - } -} - -fn parse_password(bytes: &[u8]) -> Option<&str> { - std::str::from_utf8(bytes).ok()?.strip_suffix('\0') -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_proxy_auth_response() { - // Ready - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": true, - "conn_info": DatabaseInfo::default(), - })) - .unwrap(); - assert!(matches!( - auth, - ProxyAuthResponse::Ready { - conn_info: DatabaseInfo { .. 
} - } - )); - - // Error - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - "error": "too bad, so sad", - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::Error { .. })); - - // NotReady - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::NotReady { .. })); - } - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1f01c25734..8835d660d5 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,21 +1,6 @@ use crate::{auth, url::ApiUrl}; -use anyhow::{bail, ensure, Context}; -use std::{str::FromStr, sync::Arc}; - -impl FromStr for auth::BackendType<()> { - type Err = anyhow::Error; - - fn from_str(s: &str) -> anyhow::Result { - use auth::BackendType::*; - Ok(match s { - "legacy" => LegacyConsole(()), - "console" => Console(()), - "postgres" => Postgres(()), - "link" => Link, - _ => bail!("Invalid option `{s}` for auth method"), - }) - } -} +use anyhow::{ensure, Context}; +use std::sync::Arc; pub struct ProxyConfig { pub tls_config: Option, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 2521f2af21..efe45f6386 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -20,7 +20,7 @@ mod url; mod waiters; use anyhow::{bail, Context}; -use clap::{App, Arg}; +use clap::{self, Arg}; use config::ProxyConfig; use futures::FutureExt; use std::{future::Future, net::SocketAddr}; @@ -36,9 +36,26 @@ async fn flatten_err( f.map(|r| r.context("join error").and_then(|x| x)).await } +/// A proper parser for auth backend parameter. 
+impl clap::ValueEnum for auth::BackendType<()> { + fn value_variants<'a>() -> &'a [Self] { + use auth::BackendType::*; + &[Console(()), Postgres(()), Link] + } + + fn to_possible_value<'a>(&self) -> Option> { + use auth::BackendType::*; + Some(clap::PossibleValue::new(match self { + Console(_) => "console", + Postgres(_) => "postgres", + Link => "link", + })) + } +} + #[tokio::main] async fn main() -> anyhow::Result<()> { - let arg_matches = App::new("Neon proxy/router") + let arg_matches = clap::App::new("Neon proxy/router") .version(GIT_VERSION) .arg( Arg::new("proxy") @@ -52,8 +69,8 @@ async fn main() -> anyhow::Result<()> { Arg::new("auth-backend") .long("auth-backend") .takes_value(true) - .help("Possible values: legacy | console | postgres | link") - .default_value("legacy"), + .value_parser(clap::builder::EnumValueParser::>::new()) + .default_value("link"), ) .arg( Arg::new("mgmt") @@ -118,6 +135,10 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; + let auth_backend = *arg_matches + .try_get_one::>("auth-backend")? 
+ .unwrap(); + let auth_urls = config::AuthUrls { auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, @@ -125,7 +146,7 @@ async fn main() -> anyhow::Result<()> { let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, - auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?, + auth_backend, auth_urls, })); From 757e2147c12a4d63cfecf84018b5453cbec474bd Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 15 Sep 2022 14:21:22 +0200 Subject: [PATCH 062/166] Follow-up for neondatabase/neon#2448 (#2452) * remove `legacy` mode from the proxy readme * explicitly specify `authBackend` in the link auth proxy helm-values for all envs --- .github/helm-values/neon-stress.proxy.yaml | 1 + .github/helm-values/production.proxy.yaml | 1 + .github/helm-values/staging.proxy.yaml | 1 + proxy/README.md | 17 +++++++---------- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml index 8236f9873a..ce432ca23c 100644 --- a/.github/helm-values/neon-stress.proxy.yaml +++ b/.github/helm-values/neon-stress.proxy.yaml @@ -1,6 +1,7 @@ fullnameOverride: "neon-stress-proxy" settings: + authBackend: "link" authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/" uri: "https://console.dev.neon.tech/psql_session/" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml index 87c61c90cf..c26a6258be 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/production.proxy.yaml @@ -1,4 +1,5 @@ settings: + authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml index 34ba972b64..25842429a5 100644 --- 
a/.github/helm-values/staging.proxy.yaml +++ b/.github/helm-values/staging.proxy.yaml @@ -5,6 +5,7 @@ image: repository: neondatabase/neon settings: + authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" diff --git a/proxy/README.md b/proxy/README.md index 458a7d9bbf..4ead098b73 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -2,10 +2,8 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: -* legacy - old method, when username ends with `@zenith` it uses md5 auth dbname as the cluster name; otherwise, it sends a login link and waits for the console to call back * console - new SCRAM-based console API; uses SNI info to select the destination cluster + new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) * postgres uses postgres to select auth secrets of existing roles. Useful for local testing * link @@ -13,21 +11,20 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a ## Using SNI-based routing on localhost -Now proxy determines cluster name from the subdomain, request to the `my-cluster-42.somedomain.tld` will be routed to the cluster named `my-cluster-42`. Unfortunately `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: +Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. 
Now we can create self-signed certificate and play with proxy: -``` +```sh openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" - ``` -now you can start proxy: +start proxy -``` +```sh ./target/debug/proxy -c server.crt -k server.key ``` -and connect to it: +and connect to it -``` +```sh PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full' ``` From a8d97325291b207d3481ed9578246398c6576ec2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 15 Sep 2022 03:28:24 +0000 Subject: [PATCH 063/166] Bump axum-core from 0.2.7 to 0.2.8 Bumps [axum-core](https://github.com/tokio-rs/axum) from 0.2.7 to 0.2.8. - [Release notes](https://github.com/tokio-rs/axum/releases) - [Changelog](https://github.com/tokio-rs/axum/blob/main/CHANGELOG.md) - [Commits](https://github.com/tokio-rs/axum/compare/axum-core-v0.2.7...axum-core-v0.2.8) --- updated-dependencies: - dependency-name: axum-core dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Cargo.lock | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4234d2b00..a258fab5f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -183,9 +183,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4f44a0e6200e9d11a1cdc989e4b358f6e3d354fbf48478f345a17f4e43f8635" +checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b" dependencies = [ "async-trait", "bytes", @@ -193,6 +193,8 @@ dependencies = [ "http", "http-body", "mime", + "tower-layer", + "tower-service", ] [[package]] From 1062e57feeae80fa9771ad42dc66cd10ffcf5e36 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 15 Sep 2022 16:33:42 +0300 Subject: [PATCH 064/166] Don't run codestyle checks separately for Postgres v14 and v15. 
Previously, we compiled neon separately for Postgres v14 and v15, for the codestyle checks. But that was bogus; we actually just ran "make postgres", which always compiled both versions. The version really only affected the caching. Fix that, by copying the build steps from the main build_and_test.yml workflow. --- .github/workflows/codestyle.yml | 53 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 53d0f9c5d8..237cf81205 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -30,8 +30,6 @@ jobs: # this is all we need to install our toolchain later via rust-toolchain.toml # so don't install any toolchain explicitly. os: [ubuntu-latest, macos-latest] - # To support several Postgres versions, add them here. - postgres_version: [v14, v15] timeout-minutes: 60 name: check codestyle rust and postgres runs-on: ${{ matrix.os }} @@ -56,17 +54,29 @@ jobs: if: matrix.os == 'macos-latest' run: brew install flex bison openssl - - name: Set pg revision for caching - id: pg_ver - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-${{matrix.postgres_version}}) + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14) + shell: bash -euxo pipefail {0} - - name: Cache postgres ${{matrix.postgres_version}} build - id: cache_pg + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15) + shell: bash -euxo pipefail {0} + + - name: Cache postgres v14 build + id: cache_pg_14 uses: actions/cache@v3 with: - path: | - pg_install/${{matrix.postgres_version}} - key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + 
- name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS if: matrix.os == 'macos-latest' @@ -74,24 +84,19 @@ jobs: echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - name: Build postgres - if: steps.cache_pg.outputs.cache-hit != 'true' - run: make postgres + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: make postgres-v14 + shell: bash -euxo pipefail {0} + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: make postgres-v15 + shell: bash -euxo pipefail {0} - name: Build neon extensions run: make neon-pg-ext - # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' - # and the real cause will be inside config.log - - name: Print configure logs in case of failure - if: failure() - continue-on-error: true - run: | - echo '' && echo '=== Postgres ${{matrix.postgres_version}} config.log ===' && echo '' - cat pg_install/build/${{matrix.postgres_version}}/config.log - echo '' && echo '=== Postgres ${{matrix.postgres_version}} configure.log ===' && echo '' - cat pg_install/build/${{matrix.postgres_version}}/configure.log - - name: Cache cargo deps id: cache_cargo uses: actions/cache@v3 From 9d9d8e951947b9cbaca4ab11937bda8d681dc24c Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 15 Sep 2022 19:16:07 +0200 Subject: [PATCH 065/166] docs/sourcetree: update CLion set up instructions (#2454) After #2325 the old method no longer works as our Makefile does not print compilation commands when run with --dry-run, see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325 This method is much slower but is hopefully robust. Add some more notes while we're here. 
--- docs/sourcetree.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index c1a860f126..8043450a55 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -147,8 +147,16 @@ C code requires some extra care, as it's built via Make, not CMake. Some of our ```bash # Install a `compiledb` tool which can parse make's output and generate the compilation database. poetry add -D compiledb - # Run Make without actually compiling code so we can generate the compilation database. It still may take a few minutes. - make --dry-run --print-directory --keep-going --assume-new=* postgres neon-pg-ext | poetry run compiledb --verbose --no-build + # Clean the build tree so we can rebuild from scratch. + # Unfortunately, our and Postgres Makefiles do not work well with either --dry-run or --assume-new, + # so we don't know a way to generate the compilation database without recompiling everything, + # see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325 + make distclean + # Rebuild the Postgres parts from scratch and save the compilation commands to the compilation database. + # You can alter the -j parameter to your liking. + # Note that we only build for a specific version of Postgres. The extension code is shared, but headers are + # different, so we set up CLion to only use a specific version of the headers. + make -j$(nproc) --print-directory postgres-v15 neon-pg-ext-v15 | poetry run compiledb --verbose --no-build # Uninstall the tool poetry remove -D compiledb # Make sure the compile_commands.json file is not committed. @@ -157,7 +165,8 @@ C code requires some extra care, as it's built via Make, not CMake. Some of our 3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. 
_Do not_ open the directory as a project, open the file. 4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain). 5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code. -7. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. +6. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. +7. Set up correct code indentation in CLion's settings: Editor > Code Style > C/C++, choose the "Project" scheme on the top, and tick the "Use tab character" on the "Tabs and Indents" tab. Ensure that "Tab size" is 4. You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter. @@ -168,3 +177,4 @@ Known issues (fixes and suggestions are welcome): * Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead. * CLion does not support non-local Python interpreters, unlike PyCharm. E.g. if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited. * Cargo Clippy diagnostics in CLion may take a lot of resources. +* `poetry add -D` updates some packages and changes `poetry.lock` drastically even when followed by `poetry remove -D`. Feel free to `git checkout poetry.lock` and `./scripts/pysync` to revert these changes. From e968b5e5025616f2a7d03cd7307c54a49185925c Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 15 Sep 2022 20:43:51 +0200 Subject: [PATCH 066/166] tests: do not set num_safekeepers = 1, it's the default (#2457) Also get rid of `with_safekeepers` parameter in tests.
Its meaning has changed: `False` meant "no safekeepers" which is not supported anymore, so we assume it's always `True`. See #1648 --- test_runner/performance/test_perf_pgbench.py | 1 - test_runner/regress/test_auth.py | 8 +++----- test_runner/regress/test_branch_behind.py | 7 ------- test_runner/regress/test_crafted_wal_end.py | 1 - test_runner/regress/test_fullbackup.py | 2 -- test_runner/regress/test_import.py | 2 -- test_runner/regress/test_lsn_mapping.py | 1 - test_runner/regress/test_pitr_gc.py | 2 -- test_runner/regress/test_recovery.py | 1 - test_runner/regress/test_tenants.py | 18 ++++++------------ test_runner/regress/test_wal_acceptor.py | 1 - 11 files changed, 9 insertions(+), 35 deletions(-) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 934642d095..2a2213b783 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -173,7 +173,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): - neon_env_builder.num_safekeepers = 1 neon_env_builder.pageserver_config_override = """ profiling="page_requests" """ diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index d9082efada..ce4a8ffa9e 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -56,14 +56,12 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): tenant_http_client.tenant_create() -@pytest.mark.parametrize("with_safekeepers", [False, True]) -def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): +def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True - if with_safekeepers: - 
neon_env_builder.num_safekeepers = 3 + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - branch = f"test_compute_auth_to_pageserver{with_safekeepers}" + branch = "test_compute_auth_to_pageserver" env.neon_cli.create_branch(branch) pg = env.postgres.create_start(branch) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index cfb9649867..b0d0737172 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -10,13 +10,6 @@ from fixtures.utils import print_gc_result, query_scalar # Create a couple of branches off the main branch, at a historical point in time. # def test_branch_behind(neon_env_builder: NeonEnvBuilder): - - # Use safekeeper in this test to avoid a subtle race condition. - # Without safekeeper, walreceiver reconnection can stuck - # because of IO deadlock. - # - # See https://github.com/neondatabase/neon/issues/1068 - neon_env_builder.num_safekeepers = 1 # Disable pitr, because here we want to test branch creation after GC neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 32e5366945..e94c9a2bd0 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -17,7 +17,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft ], ) def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 8de2687c9b..0048e7b580 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -18,8 +18,6 @@ num_rows = 1000 def test_fullbackup( 
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor ): - - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_fullbackup") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 60cc0551ab..7b61b03b97 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -122,7 +122,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build @pytest.mark.timeout(600) def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() @@ -140,7 +139,6 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu # @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255") def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 9d1efec2c1..ef99954a76 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -9,7 +9,6 @@ from fixtures.utils import query_scalar # Test pageserver get_lsn_by_timestamp API # def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 786266b70e..57b2ee1c04 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -12,8 +12,6 @@ from 
fixtures.utils import print_gc_result, query_scalar # Insert some data, run GC and create a branch in the past. # def test_pitr_gc(neon_env_builder: NeonEnvBuilder): - - neon_env_builder.num_safekeepers = 1 # Set pitr interval such that we need to keep the data neon_env_builder.pageserver_config_override = ( "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 6aa8b4e9be..08c15d8f09 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -10,7 +10,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Test pageserver recovery after crash # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 # Override default checkpointer settings to run it more often neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4e7610a96f..4500395c8f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -50,29 +50,23 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ), "pageserver should clean its temp tenant dirs on restart" -@pytest.mark.parametrize("with_safekeepers", [False, True]) -def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): - if with_safekeepers: - neon_env_builder.num_safekeepers = 3 +def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() """Tests tenants with and without wal acceptors""" tenant_1, _ = env.neon_cli.create_tenant() tenant_2, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_1 - ) - env.neon_cli.create_timeline( - 
f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_2 - ) + env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_1) + env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", + "test_tenants_normal_work", tenant_id=tenant_1, ) pg_tenant2 = env.postgres.create_start( - f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", + "test_tenants_normal_work", tenant_id=tenant_2, ) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8c5b4c8c30..089ed91c98 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1037,7 +1037,6 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): - neon_env_builder.num_safekeepers = 1 neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() From 96e867642fbe730a3fe13c572383d68b393ca567 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 15 Sep 2022 18:20:23 -0400 Subject: [PATCH 067/166] Validate tenant create options (#2450) Co-authored-by: Kirill Bulatov --- control_plane/src/storage.rs | 79 ++++++++++++++----------- test_runner/regress/test_tenant_conf.py | 16 ++++- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2cc5e096c..3bbbdc5865 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -371,43 +371,50 @@ impl PageServerNode { new_tenant_id: Option, settings: HashMap<&str, &str>, ) -> anyhow::Result { + let mut settings = settings.clone(); + let request = TenantCreateRequest { + new_tenant_id, + checkpoint_distance: settings + .remove("checkpoint_distance") + .map(|x| 
x.parse::()) + .transpose()?, + checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + compaction_target_size: settings + .remove("compaction_target_size") + .map(|x| x.parse::()) + .transpose()?, + compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_threshold: settings + .remove("compaction_threshold") + .map(|x| x.parse::()) + .transpose()?, + gc_horizon: settings + .remove("gc_horizon") + .map(|x| x.parse::()) + .transpose()?, + gc_period: settings.remove("gc_period").map(|x| x.to_string()), + image_creation_threshold: settings + .remove("image_creation_threshold") + .map(|x| x.parse::()) + .transpose()?, + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .remove("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings + .remove("lagging_wal_timeout") + .map(|x| x.to_string()), + max_lsn_wal_lag: settings + .remove("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, + }; + if !settings.is_empty() { + bail!("Unrecognized tenant settings: {settings:?}") + } self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) - .json(&TenantCreateRequest { - new_tenant_id, - checkpoint_distance: settings - .get("checkpoint_distance") - .map(|x| x.parse::()) - .transpose()?, - checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()), - compaction_target_size: settings - .get("compaction_target_size") - .map(|x| x.parse::()) - .transpose()?, - compaction_period: settings.get("compaction_period").map(|x| x.to_string()), - compaction_threshold: settings - .get("compaction_threshold") - .map(|x| x.parse::()) - .transpose()?, - gc_horizon: settings - .get("gc_horizon") - .map(|x| x.parse::()) - .transpose()?, - gc_period: settings.get("gc_period").map(|x| x.to_string()), - image_creation_threshold: 
settings - .get("image_creation_threshold") - .map(|x| x.parse::()) - .transpose()?, - pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), - walreceiver_connect_timeout: settings - .get("walreceiver_connect_timeout") - .map(|x| x.to_string()), - lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), - max_lsn_wal_lag: settings - .get("max_lsn_wal_lag") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - }) + .json(&request) .send()? .error_from_body()? .json::>() diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 51a8101b11..c6cf416d12 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -6,6 +6,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_tenant_config(neon_env_builder: NeonEnvBuilder): + """Test per tenant configuration""" # set some non-default global config neon_env_builder.pageserver_config_override = """ page_cache_size=444; @@ -13,7 +14,20 @@ wait_lsn_timeout='111 s'; tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" env = neon_env_builder.init_start() - """Test per tenant configuration""" + + # Check that we raise on misspelled configs + invalid_conf_key = "some_invalid_setting_name_blah_blah_123" + try: + env.neon_cli.create_tenant( + conf={ + invalid_conf_key: "20000", + } + ) + except Exception as e: + assert invalid_conf_key in str(e) + else: + raise AssertionError("Expected validation error") + tenant, _ = env.neon_cli.create_tenant( conf={ "checkpoint_distance": "20000", From 031e57a973d5be159012a7af44d4b41f7abd61be Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 14 Sep 2022 16:10:52 +0300 Subject: [PATCH 068/166] Disable failpoints by default --- .github/workflows/build_and_test.yml | 6 ++++-- pageserver/Cargo.toml | 10 +++++----- test_runner/README.md | 1 + 3 files changed, 10 insertions(+), 7 
deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d586741d68..7688f9c1bd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -94,15 +94,17 @@ jobs: # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, # because "cargo metadata" doesn't accept --release or --debug options # + # We run tests with additional features, that are turned off by default (e.g. in release builds), see + # corresponding Cargo.toml files for their descriptions. - name: Set env variables run: | if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FEATURES="" + CARGO_FEATURES="--features failpoints" CARGO_FLAGS="--locked --timings" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FEATURES="--features profiling" + CARGO_FEATURES="--features failpoints,profiling" CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 11d2d94906..ce09e788bd 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -4,12 +4,12 @@ version = "0.1.0" edition = "2021" [features] -# It is simpler infra-wise to have failpoints enabled by default -# It shouldn't affect performance in any way because failpoints -# are not placed in hot code paths -default = ["failpoints"] -profiling = ["pprof"] +default = [] + +# Feature that enables a special API, fail_point!
macro (adds some runtime cost) +# to run tests on outage conditions failpoints = ["fail/failpoints"] +profiling = ["pprof"] [dependencies] async-stream = "0.3" diff --git a/test_runner/README.md b/test_runner/README.md index 44751944b3..01fe4ff863 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,6 +6,7 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions + If you want to run tests with failpoints, you need to add `--features failpoints` to Rust code build commands. - Tests can be run from the git tree; or see the environment variables below to run from other directories. - The neon git repo, including the postgres submodule From db5ec0dae70aed65d79a23574afb4f2ea8d4fa06 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Thu, 15 Sep 2022 23:50:46 -0700 Subject: [PATCH 069/166] Cleanup/simplify logical size calculation (#2459) Should produce identical results; replaces an error case that shouldn't be possible with `expect`. --- pageserver/src/tenant/timeline.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e821ef1b9a..95bdf715b5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -232,14 +232,16 @@ impl LogicalSize { } fn current_size(&self) -> anyhow::Result { - let size_increment = self.size_added_after_initial.load(AtomicOrdering::Acquire); + let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); + // ^^^ keep this type explicit so that the casts in this function break if + // we change the type.
match self.initial_logical_size.get() { Some(initial_size) => { let absolute_size_increment = u64::try_from( size_increment .checked_abs() .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?, - ).with_context(|| format!("Failed to convert size increment {size_increment} to u64"))?; + ).expect("casting nonnegative i64 to u64 should not fail"); if size_increment < 0 { initial_size.checked_sub(absolute_size_increment) @@ -249,11 +251,7 @@ impl LogicalSize { .map(CurrentLogicalSize::Exact) } None => { - let non_negative_size_increment = if size_increment < 0 { - 0 - } else { - u64::try_from(size_increment).expect("not negative, cannot fail") - }; + let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) } } From 74312e268febaff8829b6fa795268231bd985699 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 16 Sep 2022 09:49:33 +0300 Subject: [PATCH 070/166] Tidy up storage artifact build flags * Simplify test build features handling * Build only necessary binaries during the release build --- .github/workflows/build_and_test.yml | 4 ++-- Dockerfile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7688f9c1bd..f67d42f2ff 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -101,7 +101,7 @@ jobs: if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FEATURES="--features failpoints" - CARGO_FLAGS="--locked --timings" + CARGO_FLAGS="--locked --timings $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FEATURES="--features failpoints,profiling" @@ -160,7 +160,7 @@ jobs: - name: Run cargo build run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + 
${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests shell: bash -euxo pipefail {0} - name: Run cargo test diff --git a/Dockerfile b/Dockerfile index eacb88d168..711a92a90e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ -&& mold -run cargo build --locked --release \ +&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image From 72b33997c773a963521d8007136c30080292e85e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 16 Sep 2022 10:09:54 +0100 Subject: [PATCH 071/166] Nightly Benchmarks: trigger tests earlier (#2463) --- .github/workflows/benchmarking.yml | 3 ++- .github/workflows/build_and_test.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index fab0a9aa04..df0e8a4275 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '36 4 * * *' # run once a day, timezone is utc + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: @@ -239,6 +239,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Create Allure report + if: always() uses: ./.github/actions/allure-report with: action: generate diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f67d42f2ff..5bff469582 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -324,6 +324,7 @@ jobs: build_type: ${{ matrix.build_type 
}} - name: Store Allure test stat in the DB + if: ${{ steps.create-allure-report.outputs.report-url }} env: BUILD_TYPE: ${{ matrix.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} From 4db15d3c7cbfbbe17c6f18af7b5eae3198fafadf Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 14 Sep 2022 18:22:00 +0300 Subject: [PATCH 072/166] change prefix_in_bucket in pageserver config --- .github/ansible/deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index b47db6a9b5..c06a0ef5b3 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -71,7 +71,7 @@ - "[remote_storage]" - "bucket_name = '{{ bucket_name }}'" - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ inventory_hostname }}'" + - "prefix_in_bucket = 'pageserver/v1'" become: true tags: - pageserver From 44fd4e3c9f9b8087dc0871785f87ed7848538839 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 16 Sep 2022 16:59:05 +0300 Subject: [PATCH 073/166] add more logs --- pageserver/src/storage_sync.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 9d259bf1e2..64e0f9a9e3 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -601,6 +601,7 @@ pub fn spawn_storage_sync_task( for (tenant_id, timeline_data) in local_timeline_files.0 { if timeline_data.is_empty() { + info!("got empty tenant {}", tenant_id); let _ = empty_tenants.0.entry(tenant_id).or_default(); } else { for (timeline_id, timeline_data) in timeline_data { @@ -1303,6 +1304,10 @@ fn schedule_first_sync_tasks( None => { // TODO (rodionov) does this mean that we've crashed during tenant creation? // is it safe to upload this checkpoint? could it be half broken? 
+ warn!( + "marking {} as locally complete, while it doesnt exist in remote index", + sync_id + ); new_sync_tasks.push_back(( sync_id, SyncTask::upload(LayersUpload { @@ -1337,6 +1342,8 @@ fn compare_local_and_remote_timeline( local_files: HashSet, remote_entry: &RemoteTimeline, ) -> (LocalTimelineInitStatus, bool) { + let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered(); + let remote_files = remote_entry.stored_files(); let number_of_layers_to_download = remote_files.difference(&local_files).count(); @@ -1347,10 +1354,12 @@ fn compare_local_and_remote_timeline( layers_to_skip: local_files.clone(), }), )); + info!("NeedsSync"); (LocalTimelineInitStatus::NeedsSync, true) // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { + info!("LocallyComplete"); ( LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()), false, From 9c35a094527fea58f1f402f99682fe9dc8c23b02 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Fri, 16 Sep 2022 08:37:44 -0700 Subject: [PATCH 074/166] Improve build errors when `postgres_ffi` fails (#2460) This commit does two things of note: 1. Bumps the bindgen dependency from `0.59.1` to `0.60.1`. This gets us an actual error type from bindgen, so we can display what's wrong. 2. Adds `anyhow` as a build dependency, so our error message can be prettier. It's already used heavily elsewhere in the crates in this repo, so I figured the fact it's a build dependency doesn't matter much. I ran into this from running `cargo ` without running `make` first. Here's a comparison of the compiler output in those two cases. 
Before this commit: ``` error: failed to run custom build command for `postgres_ffi v0.1.0 ($repo_path/libs/postgres_ffi)` Caused by: process didn't exit successfully: `$repo_path/target/debug/build/postgres_ffi-2f7253b3ad3ca840/build-script-build` (exit status: 101) --- stdout cargo:rerun-if-changed=bindgen_deps.h --- stderr bindgen_deps.h:7:10: fatal error: 'c.h' file not found bindgen_deps.h:7:10: fatal error: 'c.h' file not found, err: true thread 'main' panicked at 'Unable to generate bindings: ()', libs/postgres_ffi/build.rs:135:14 note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace ``` After this commit: ``` error: failed to run custom build command for `postgres_ffi v0.1.0 ($repo_path/libs/postgres_ffi)` Caused by: process didn't exit successfully: `$repo_path/target/debug/build/postgres_ffi-e01fb59602596748/build-script-build` (exit status: 1) --- stdout cargo:rerun-if-changed=bindgen_deps.h --- stderr bindgen_deps.h:7:10: fatal error: 'c.h' file not found Error: Unable to generate bindings Caused by: clang diagnosed error: bindgen_deps.h:7:10: fatal error: 'c.h' file not found ``` --- Cargo.lock | 6 +++--- libs/postgres_ffi/Cargo.toml | 3 ++- libs/postgres_ffi/build.rs | 29 +++++++++++++++++++---------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a258fab5f6..ca169dc0c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -229,14 +229,14 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.59.2" +version = "0.60.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" dependencies = [ "bitflags", "cexpr", "clang-sys", - "clap 2.34.0", + "clap 3.2.16", "env_logger", "lazy_static", "lazycell", diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 2b453fa0dc..60caca76b8 100644 --- 
a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -25,4 +25,5 @@ postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d05 wal_craft = { path = "wal_craft" } [build-dependencies] -bindgen = "0.59.1" +anyhow = "1.0" +bindgen = "0.60.1" diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 8389ac37fe..25ff398bbd 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -4,6 +4,7 @@ use std::env; use std::path::PathBuf; use std::process::Command; +use anyhow::{anyhow, Context}; use bindgen::callbacks::ParseCallbacks; #[derive(Debug)] @@ -42,7 +43,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { } } -fn main() { +fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); @@ -58,7 +59,7 @@ fn main() { for pg_version in &["v14", "v15"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { - let cwd = env::current_dir().unwrap(); + let cwd = env::current_dir().context("Failed to get current_dir")?; pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); } @@ -70,21 +71,25 @@ fn main() { let output = Command::new(pg_config_bin) .arg("--includedir-server") .output() - .expect("failed to execute `pg_config --includedir-server`"); + .context("failed to execute `pg_config --includedir-server`")?; if !output.status.success() { panic!("`pg_config --includedir-server` failed") } - String::from_utf8(output.stdout).unwrap().trim_end().into() + String::from_utf8(output.stdout) + .context("pg_config output is not UTF-8")? + .trim_end() + .into() } else { - pg_install_dir_versioned + let server_path = pg_install_dir_versioned .join("include") .join("postgresql") .join("server") - .into_os_string() + .into_os_string(); + server_path .into_string() - .unwrap() + .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? 
}; // The bindgen::Builder is the main entry point @@ -132,14 +137,18 @@ fn main() { // Finish the builder and generate the bindings. // .generate() - .expect("Unable to generate bindings"); + .context("Unable to generate bindings")?; // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file. - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + let out_path: PathBuf = env::var("OUT_DIR") + .context("Couldn't read OUT_DIR environment variable var")? + .into(); let filename = format!("bindings_{pg_version}.rs"); bindings .write_to_file(out_path.join(filename)) - .expect("Couldn't write bindings!"); + .context("Couldn't write bindings")?; } + + Ok(()) } From 65a5010e256da28cbf9a9410ecd7953d8f57cd00 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 16 Sep 2022 17:44:02 +0200 Subject: [PATCH 075/166] Use custom `install` command in Makefile to speed up incremental builds (#2458) Fixes #1873: previously any run of `make` caused the `postgres-v15-headers` target to build. It copied a bunch of headers via `install -C`. Unfortunately, some origins were symlinks in the `./pg_install/build` directory pointing inside `./vendor/postgres-v15` (e.g. `pg_config_os.h` pointing to `linux.h`). GNU coreutils' `install` ignores the `-C` key for non-regular files and always overwrites the destination if the origin is a symlink. That in turn made Cargo rebuild the `postgres_ffi` crate and all its dependencies because it thinks that Postgres headers changed, even if they did not. That was slow. Now we use a custom script that wraps the `install` program. It handles one specific case and makes sure individual headers are never copied if their content did not change. Hence, `postgres_ffi` is not rebuilt unless there were some changes to the C code. One may still have slow incremental single-threaded builds because Postgres Makefiles spawn about 2800 sub-makes even if no files have been changed. 
A no-op build takes "only" 3-4 seconds on my machine now when run with `-j30`, and 20 seconds when run with `-j1`. --- Makefile | 2 +- scripts/ninstall.sh | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100755 scripts/ninstall.sh diff --git a/Makefile b/Makefile index 4ac51ed174..738a45fd5e 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ endif # headers, the mtime of the headers are not changed when there have # been no changes to the files. Changing the mtime triggers an # unnecessary rebuild of 'postgres_ffi'. -PG_CONFIGURE_OPTS += INSTALL='install -C' +PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C' # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) diff --git a/scripts/ninstall.sh b/scripts/ninstall.sh new file mode 100755 index 0000000000..3554e3e4df --- /dev/null +++ b/scripts/ninstall.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -euo pipefail +# GNU coreutil's `install -C` always overrides the destination if the source +# is not a regular file, which is the case with lots of headers symlinked into +# the build directory by `./configure`. That causes Rust's Cargo to think that +# Postgres headers have been updated after `make` call even if no files have been +# touched. That causes long recompilation of `postgres_ffi` and all dependent +# packages. To counter that, we handle a special case here: do not copy the file +# if its content did not change. We only handle a single case where `install` +# installs a single file with a specific set of arguments, the rest does not +# matter in our configuration. +# +# Such behavior may be incorrect if e.g. permissions have changed, but it should +# not happen during normal Neon development that often, and rebuild should help. 
+# +# See https://github.com/neondatabase/neon/issues/1873 +if [ "$#" == "5" ]; then + if [ "$1" == "-C" ] && [ "$2" == "-m" ] && [ "$3" == "644" ]; then + if [ -e "$5" ] && diff -q "$4" "$5" >/dev/null 2>&1; then + exit 0 + fi + fi +fi +install "$@" From b46c8b4ae008f88a0693837752d0ca8007a54dd5 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 16 Sep 2022 11:35:51 +0300 Subject: [PATCH 076/166] Add an alias to build test images simply --- .cargo/config.toml | 3 +++ test_runner/README.md | 2 ++ 2 files changed, 5 insertions(+) diff --git a/.cargo/config.toml b/.cargo/config.toml index 76a2ff549e..d70d57a817 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -11,3 +11,6 @@ opt-level = 3 [profile.dev] # Turn on a small amount of optimization in Development mode. opt-level = 1 + +[alias] +build_testing = ["build", "--features", "failpoints"] diff --git a/test_runner/README.md b/test_runner/README.md index 01fe4ff863..f17a4a5a5d 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -7,6 +7,8 @@ Prerequisites: - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions If you want to test tests with failpoints, you would need to add `--features failpoints` to Rust code build commands. + For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. + Usage example: `cargo build_testing --release` is equivalent to `cargo build --features failpoints --release` - Tests can be run from the git tree; or see the environment variables below to run from other directories. 
- The neon git repo, including the postgres submodule From c9c3c77c31e45cf59c02dbe142d0c99432fc4f18 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Fri, 16 Sep 2022 19:51:35 +0200 Subject: [PATCH 077/166] Fix Docker image builds (follow-up for #2458) (#2469) Put ninstall.sh inside Docker images for building --- .dockerignore | 1 + Dockerfile | 1 + 2 files changed, 2 insertions(+) diff --git a/.dockerignore b/.dockerignore index 4bc8e5fa13..92eb4f24de 100644 --- a/.dockerignore +++ b/.dockerignore @@ -18,3 +18,4 @@ !vendor/postgres-v15/ !workspace_hack/ !neon_local/ +!scripts/ninstall.sh diff --git a/Dockerfile b/Dockerfile index 711a92a90e..213934a844 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,7 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile +COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE release RUN set -e \ From 846d126579bd34f0b57b11a4e5477d8d239feea2 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 19 Sep 2022 12:56:08 +0300 Subject: [PATCH 078/166] Set last written lsn for created relation (#2398) * Set last written lsn for created relation * use current LSN for updating last written LSN of relation metadata * Update LSN for the extended blocks even for pges without LSN (zeroed) * Update pgxn/neon/pagestore_smgr.c Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- pgxn/neon/pagestore_smgr.c | 12 +++++++++++- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 24adee019f..8e6dd373b0 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -959,7 +959,17 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, 
skipFsync); #endif - + /* + * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr. + * An smgr_write() call will come for the buffer later, after it has been initialized + * with the real page contents, and it is eventually evicted from the buffer cache. + * But we need a valid LSN to the relation metadata update now. + */ + if (lsn == InvalidXLogRecPtr) + { + lsn = GetXLogInsertRecPtr(); + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno); + } SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ce723ee499..796770565f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ce723ee499450cb108aede464a35a17f3d75cf84 +Subproject commit 796770565ff668b585e80733b8d679961ad50e93 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 0858387047..7d144ae2f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 08583870479e30c64aeb5a97d6fee9cf470f05fb +Subproject commit 7d144ae2f3649570f60a0477993b8c8ad9dd8c4b From 90ed12630e698441a66fce7c095cc5a02487a26d Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Mon, 19 Sep 2022 12:57:44 +0200 Subject: [PATCH 079/166] Add zenith-us-stage-ps-4 and undo changes in prefix_in_bucket in pageserver config (#2473) * Add zenith-us-stage-ps-4 * Undo changes in prefix_in_bucket in pageserver config (Rollback #2449) --- .github/ansible/deploy.yaml | 2 +- .github/ansible/staging.hosts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index c06a0ef5b3..b47db6a9b5 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -71,7 +71,7 @@ - "[remote_storage]" - "bucket_name = '{{ bucket_name }}'" - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = 'pageserver/v1'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" become: true tags: - 
pageserver diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index 2bb28f1972..c470f8a814 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -2,6 +2,7 @@ #zenith-us-stage-ps-1 console_region_id=27 zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-ps-3 console_region_id=27 +zenith-us-stage-ps-4 console_region_id=27 [safekeepers] zenith-us-stage-sk-4 console_region_id=27 From d11cb4b2f115eb3be48f31926b952bbbbd21e6f7 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 14:23:44 +0300 Subject: [PATCH 080/166] Bump vendor/postgres-v15 to the latest state of REL_15_STABLE_neon branch --- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 7d144ae2f3..34c47d6c99 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 7d144ae2f3649570f60a0477993b8c8ad9dd8c4b +Subproject commit 34c47d6c99415c94296d5e599ec5590d0001d6c2 From 4b5e7f2f82aaa0c1427b42976a555d7c236ee5ad Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 19 Sep 2022 11:14:34 +0300 Subject: [PATCH 081/166] Temporarily disable storage deployments Do not update configs Do not restart services Still update binaries --- .github/ansible/deploy.yaml | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index b47db6a9b5..6982445558 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -63,18 +63,18 @@ tags: - pageserver - - name: update remote storage (s3) config - lineinfile: - path: /storage/pageserver/data/pageserver.toml - line: "{{ item }}" - loop: - - "[remote_storage]" - - "bucket_name = '{{ bucket_name }}'" - - "bucket_region = '{{ bucket_region }}'" - - "prefix_in_bucket = '{{ inventory_hostname }}'" - become: true - tags: - - pageserver + # - name: update remote storage (s3)
config + # lineinfile: + # path: /storage/pageserver/data/pageserver.toml + # line: "{{ item }}" + # loop: + # - "[remote_storage]" + # - "bucket_name = '{{ bucket_name }}'" + # - "bucket_region = '{{ bucket_region }}'" + # - "prefix_in_bucket = '{{ inventory_hostname }}'" + # become: true + # tags: + # - pageserver - name: upload systemd service definition ansible.builtin.template: @@ -87,15 +87,15 @@ tags: - pageserver - - name: start systemd service - ansible.builtin.systemd: - daemon_reload: yes - name: pageserver - enabled: yes - state: restarted - become: true - tags: - - pageserver + # - name: start systemd service + # ansible.builtin.systemd: + # daemon_reload: yes + # name: pageserver + # enabled: yes + # state: restarted + # become: true + # tags: + # - pageserver - name: post version to console when: console_mgmt_base_url is defined From fcb4a61a120ab29de19f8a0bbe64aa29bed5f194 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 19 Sep 2022 18:41:18 +0300 Subject: [PATCH 082/166] Adjust spans around gc and compaction So compaction and gc loops have their own span to always show tenant id in log messages. 
--- pageserver/src/page_service.rs | 3 +++ pageserver/src/tenant.rs | 6 +----- pageserver/src/tenant/timeline.rs | 9 +++++---- pageserver/src/tenant_tasks.rs | 20 +++++++++++-------- .../src/walreceiver/connection_manager.rs | 9 +++++++-- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b06814c557..1461a6d117 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1090,6 +1090,9 @@ impl postgres_backend_async::Handler for PageServerHandler { let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; + let _span_guard = + info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; let gc_horizon: u64 = caps diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 41fd98ec07..f56f10d7ea 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -342,8 +342,7 @@ impl Tenant { drop(timelines); for (timeline_id, timeline) in &timelines_to_compact { - let _entered = - info_span!("compact", timeline = %timeline_id, tenant = %self.tenant_id).entered(); + let _entered = info_span!("compact_timeline", timeline = %timeline_id).entered(); timeline.compact()?; } @@ -835,9 +834,6 @@ impl Tenant { pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let _span_guard = - info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id) - .entered(); let mut totals: GcResult = Default::default(); let now = Instant::now(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 95bdf715b5..8670e979ee 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1916,18 +1916,19 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let _enter = + info_span!("gc_timeline", 
timeline = %self.timeline_id, cutoff = %new_gc_cutoff) + .entered(); + // Nothing to GC. Return early. let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( - "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", - self.timeline_id + "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", ); return Ok(result); } - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); - // We need to ensure that no one tries to read page versions or create // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() // for details. This will block until the old value is no longer in use. diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index c543a0ecb1..8329b15c08 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -21,7 +21,9 @@ pub fn start_background_loops(tenant_id: TenantId) { &format!("compactor for tenant {tenant_id}"), false, async move { - compaction_loop(tenant_id).await; + compaction_loop(tenant_id) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .await; Ok(()) }, ); @@ -33,7 +35,9 @@ pub fn start_background_loops(tenant_id: TenantId) { &format!("garbage collector for tenant {tenant_id}"), false, async move { - gc_loop(tenant_id).await; + gc_loop(tenant_id) + .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .await; Ok(()) }, ); @@ -44,7 +48,7 @@ pub fn start_background_loops(tenant_id: TenantId) { /// async fn compaction_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting compaction loop for {tenant_id}"); + info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { loop { @@ -52,7 +56,7 @@ async fn compaction_loop(tenant_id: TenantId) { let tenant = tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { - info!("received compaction cancellation request"); + info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { @@ -73,7 +77,7 @@ async fn compaction_loop(tenant_id: TenantId) { // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received compaction cancellation request during idling"); + info!("received cancellation request during idling"); break ; }, _ = tokio::time::sleep(sleep_duration) => {}, @@ -91,7 +95,7 @@ async fn compaction_loop(tenant_id: TenantId) { /// async fn gc_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting gc loop for {tenant_id}"); + info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { loop { @@ -99,7 +103,7 @@ async fn gc_loop(tenant_id: TenantId) { let tenant = tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received GC cancellation request"); + info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { @@ -123,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) { // Sleep tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { - info!("received GC cancellation request during idling"); + info!("received cancellation request during idling"); break; }, _ = tokio::time::sleep(sleep_duration) => {}, diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 1e4b4e7d52..799062e935 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -58,7 +58,10 @@ pub fn spawn_connection_manager_task( TaskKind::WalReceiverManager, Some(tenant_id), Some(timeline_id), - &format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id), + &format!( + "walreceiver for tenant {} timeline {}", + timeline.tenant_id, timeline.timeline_id + ), false, async move { info!("WAL receiver broker started, connecting to etcd"); @@ -88,7 +91,9 @@ pub fn spawn_connection_manager_task( } } } - .instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + .instrument( + info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), + ), ); Ok(()) } From 6985f6cd6c53ae96ad4afaaaf546f5d94c869d50 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Sep 2022 20:56:11 +0300 Subject: [PATCH 083/166] Add a new benchmark data series for prefetching. Also run benchmarks with the seqscan prefetching (commit f44afbaf62) enabled. 
Renames the 'neon-captest' test to 'neon-captest-reuse', for clarity --- .github/workflows/benchmarking.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index df0e8a4275..9a9021ac37 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -144,9 +144,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest: Run pgbench, reusing existing project - # neon-captest-new: Same, but on a freshly created project - platform: [ neon-captest, neon-captest-new, rds-aurora ] + # neon-captest-new: Run pgbench in a freshly created project + # neon-captest-reuse: Same, but reusing existing project + # neon-captest-prefetch: Same, with prefetching enabled (new project) + platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-aurora ] runs-on: dev container: @@ -164,7 +165,7 @@ jobs: sudo apt install -y postgresql-14 - name: Create Neon Project - if: matrix.platform == 'neon-captest-new' + if: matrix.platform != 'neon-captest-reuse' id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -175,17 +176,20 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neon-captest-new) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; + neon-captest-prefetch) + CONNSTR=${{ steps.create-neon-project.outputs.dsn }}?options=-cenable_seqscan_prefetch%3Don%20-cseqscan_prefetch_buffers%3D10 + ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'" exit 1 ;; esac @@ -246,7 +250,7 @@ jobs: build_type: ${{ env.BUILD_TYPE }} - name: Delete Neon Project - if: ${{ matrix.platform == 'neon-captest-new' && always() }} + if: ${{ matrix.platform != 'neon-captest-reuse' && always() }} uses: ./.github/actions/neon-project-delete with: environment: dev From bb3c66d86f6c91e05d72d52baedcb4ff32617c2e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 19 Sep 2022 23:28:51 +0100 Subject: [PATCH 084/166] github/workflows: Make publishing perf reports more configurable (#2440) --- .github/actions/neon-project-create/action.yml | 1 + .github/actions/run-python-test-set/action.yml | 14 +++++--------- .github/workflows/benchmarking.yml | 10 +++++++--- .github/workflows/build_and_test.yml | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ba81afaaff..2f58ae77ad 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -60,6 +60,7 @@ runs: --header "Authorization: Bearer ${API_KEY}" \ --data "{ \"project\": { + \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"platform_id\": \"aws\", \"region_id\": \"${REGION_ID}\", \"settings\": { } diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4c18641938..e69cb28df1 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -112,10 +112,8 @@ runs: fi if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then - mkdir -p "$PERF_REPORT_DIR" - EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" - fi + mkdir -p "$PERF_REPORT_DIR" + EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi if [[ "${{ 
inputs.build_type }}" == "debug" ]]; then @@ -150,11 +148,9 @@ runs: -rA $TEST_SELECTION $EXTRA_PARAMS if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO="$PLATFORM" - scripts/generate_and_push_perf_report.sh - fi + export REPORT_FROM="$PERF_REPORT_DIR" + export REPORT_TO="$PLATFORM" + scripts/generate_and_push_perf_report.sh fi - name: Create Allure report diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9a9021ac37..0430f0b9c0 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -19,8 +19,12 @@ on: description: 'Environment to run remote tests on (dev or staging)' required: false region_id: - description: 'Use a particular region. If empty the default one will be used' - false: true + description: 'Use a particular region. If not set the default region will be used' + required: false + save_perf_report: + type: boolean + description: 'Publish perf report or not. 
If not set, the report is published only for the main branch' + required: false defaults: run: @@ -139,7 +143,7 @@ jobs: POSTGRES_DISTRIB_DIR: /usr TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: true + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} strategy: fail-fast: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5bff469582..0b6cb21120 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -292,7 +292,7 @@ jobs: build_type: ${{ matrix.build_type }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ github.ref == 'refs/heads/main' }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" From e4f775436f534e8de49d0cb5a2c955e73ac6f03e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Sep 2022 23:52:21 +0300 Subject: [PATCH 085/166] Don't override other options than statement_timeout in test conn string. In commit 6985f6cd6c, I tried passing extra GUCs in the 'options' part of the connection string, but it didn't work because the pgbench test overrode it with the statement_timeout. Change it so that it adds the statement_timeout to any other options, instead of replacing them. 
--- test_runner/performance/test_perf_pgbench.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 2a2213b783..d9bf237e49 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -84,9 +84,8 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if workload_type == PgBenchLoadType.INIT: # Run initialize - init_pgbench( - env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options="-cstatement_timeout=1h")] - ) + options = "-cstatement_timeout=1h " + env.pg.default_options["options"] + init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload From 566e816298a201c9150f0c42846949296997d74d Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 20 Sep 2022 09:42:39 +0200 Subject: [PATCH 086/166] Refactor safekeeper timelines handling (#2329) See https://github.com/neondatabase/neon/pull/2329 for details --- Cargo.lock | 2 + libs/utils/src/postgres_backend.rs | 18 +- pageserver/src/lib.rs | 2 +- safekeeper/Cargo.toml | 2 + safekeeper/src/bin/safekeeper.rs | 6 +- safekeeper/src/broker.rs | 42 +- safekeeper/src/control_file.rs | 73 ++- safekeeper/src/handler.rs | 67 +-- safekeeper/src/http/routes.rs | 62 ++- safekeeper/src/json_ctrl.rs | 61 +-- safekeeper/src/lib.rs | 9 +- safekeeper/src/metrics.rs | 19 +- safekeeper/src/receive_wal.rs | 27 +- safekeeper/src/remove_wal.rs | 21 +- safekeeper/src/safekeeper.rs | 141 +++--- safekeeper/src/send_wal.rs | 20 +- safekeeper/src/timeline.rs | 665 ++++++++++++------------- safekeeper/src/timelines_global_map.rs | 348 +++++++++++++ safekeeper/src/wal_backup.rs | 72 +-- safekeeper/src/wal_storage.rs | 249 ++++----- 20 files changed, 1097 insertions(+), 809 deletions(-) create mode 100644 
safekeeper/src/timelines_global_map.rs diff --git a/Cargo.lock b/Cargo.lock index ca169dc0c8..2f4a57b698 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2723,6 +2723,7 @@ dependencies = [ "hyper", "metrics", "once_cell", + "parking_lot 0.12.1", "postgres", "postgres-protocol", "postgres_ffi", @@ -2733,6 +2734,7 @@ dependencies = [ "serde_with", "signal-hook", "tempfile", + "thiserror", "tokio", "tokio-postgres", "toml_edit", diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 0498e0887b..adee46c2dd 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -429,8 +429,22 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; + // + // We also don't want to log full stacktrace when the error is primitive, + // such as usual connection closed. 
+ let short_error = format!("{:#}", e); + let root_cause = e.root_cause().to_string(); + if root_cause.contains("connection closed unexpectedly") + || root_cause.contains("Broken pipe (os error 32)") + { + error!( + "query handler for '{}' failed: {}", + query_string, short_error + ); + } else { + error!("query handler for '{}' failed: {:?}", query_string, e); + } + self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { return Ok(ProcessMsgResult::Break); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 959942aa12..acd37161a0 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -105,7 +105,7 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds } /// A newtype to store arbitrary data grouped by tenant and timeline ids. -/// One could use [`utils::zid::TenantTimelineId`] for grouping, but that would +/// One could use [`utils::id::TenantTimelineId`] for grouping, but that would /// not include the cases where a certain tenant has zero timelines. /// This is sometimes important: a tenant could be registered during initial load from FS, /// even if he has no timelines on disk. 
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cae095c3c2..87ee63d1df 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,8 @@ git-version = "0.3.5" async-trait = "0.1" once_cell = "1.13.0" toml_edit = { version = "0.13", features = ["easy"] } +thiserror = "1" +parking_lot = "0.12.1" postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d518ac01cc..7726f25a2d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -24,9 +24,9 @@ use safekeeper::defaults::{ }; use safekeeper::http; use safekeeper::remove_wal; -use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_backup; use safekeeper::wal_service; +use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use utils::auth::JwtAuth; use utils::{ @@ -298,7 +298,9 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - GlobalTimelines::init(wal_backup_launcher_tx); + + // Load all timelines from disk to memory. 
+ GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?; let conf_ = conf.clone(); threads.push( diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index f276fad613..6a2456ecda 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,6 +10,7 @@ use etcd_broker::LeaseKeeper; use std::collections::hash_map::Entry; use std::collections::HashMap; +use std::collections::HashSet; use std::time::Duration; use tokio::spawn; use tokio::task::JoinHandle; @@ -17,7 +18,8 @@ use tokio::{runtime, time::sleep}; use tracing::*; use url::Url; -use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use crate::GlobalTimelines; +use crate::SafeKeeperConf; use etcd_broker::{ subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, Client, PutOptions, @@ -45,12 +47,12 @@ pub fn thread_main(conf: SafeKeeperConf) { /// Key to per timeline per safekeeper data. fn timeline_safekeeper_path( broker_etcd_prefix: String, - zttid: TenantTimelineId, + ttid: TenantTimelineId, sk_id: NodeId, ) -> String { format!( "{}/{sk_id}", - SubscriptionKey::sk_timeline_info(broker_etcd_prefix, zttid).watch_key() + SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key() ) } @@ -162,7 +164,7 @@ pub fn get_candiate_name(system_id: NodeId) -> String { } async fn push_sk_info( - zttid: TenantTimelineId, + ttid: TenantTimelineId, mut client: Client, key: String, sk_info: SkTimelineInfo, @@ -190,7 +192,7 @@ async fn push_sk_info( .await .context("failed to receive LeaseKeepAliveResponse")?; - Ok((zttid, lease)) + Ok((ttid, lease)) } struct Lease { @@ -210,11 +212,15 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. 
- let active_tlis = GlobalTimelines::get_active_timelines(); + let mut active_tlis = GlobalTimelines::get_all(); + active_tlis.retain(|tli| tli.is_active()); + + let active_tlis_set: HashSet = + active_tlis.iter().map(|tli| tli.ttid).collect(); // // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. - for zttid in active_tlis.iter() { - if let Entry::Vacant(v) = leases.entry(*zttid) { + for tli in &active_tlis { + if let Entry::Vacant(v) = leases.entry(tli.ttid) { let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; v.insert(Lease { @@ -224,30 +230,26 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { }); } } - leases.retain(|zttid, _| active_tlis.contains(zttid)); + leases.retain(|ttid, _| active_tlis_set.contains(ttid)); // Push data concurrently to not suffer from latency, with many timelines it can be slow. let handles = active_tlis .iter() - .filter_map(|zttid| GlobalTimelines::get_loaded(*zttid)) .map(|tli| { let sk_info = tli.get_public_info(&conf); - let key = timeline_safekeeper_path( - conf.broker_etcd_prefix.clone(), - tli.zttid, - conf.my_id, - ); - let lease = leases.remove(&tli.zttid).unwrap(); - tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease)) + let key = + timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id); + let lease = leases.remove(&tli.ttid).unwrap(); + tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease)) }) .collect::>(); for h in handles { - let (zttid, lease) = h.await??; + let (ttid, lease) = h.await??; // It is ugly to pull leases from hash and then put it back, but // otherwise we have to resort to long living per tli tasks (which // would generate a lot of errors when etcd is down) as task wants to // have 'static objects, we can't borrow to it. 
- leases.insert(zttid, lease); + leases.insert(ttid, lease); } sleep(push_interval).await; @@ -279,7 +281,7 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { match subscription.value_updates.recv().await { Some(new_info) => { // note: there are blocking operations below, but it's considered fine for now - if let Ok(tli) = GlobalTimelines::get(&conf, new_info.key.id, false) { + if let Ok(tli) = GlobalTimelines::get(new_info.key.id) { tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) .await? } diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index ff23f0360f..22ed34cc00 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -9,8 +9,6 @@ use std::io::{Read, Write}; use std::ops::Deref; use std::path::{Path, PathBuf}; -use tracing::*; - use crate::control_file_upgrade::upgrade_control_file; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; @@ -55,12 +53,13 @@ pub struct FileStorage { } impl FileStorage { - pub fn restore_new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { - let timeline_dir = conf.timeline_dir(zttid); - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); + /// Initialize storage by loading state from disk. + pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { + let timeline_dir = conf.timeline_dir(ttid); + let tenant_id = ttid.tenant_id.to_string(); + let timeline_id = ttid.timeline_id.to_string(); - let state = Self::load_control_file_conf(conf, zttid)?; + let state = Self::load_control_file_conf(conf, ttid)?; Ok(FileStorage { timeline_dir, @@ -71,28 +70,28 @@ impl FileStorage { }) } + /// Create file storage for a new timeline, but don't persist it yet. 
pub fn create_new( - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { - let timeline_dir = conf.timeline_dir(zttid); - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); + let timeline_dir = conf.timeline_dir(ttid); + let tenant_id = ttid.tenant_id.to_string(); + let timeline_id = ttid.timeline_id.to_string(); - let mut store = FileStorage { + let store = FileStorage { timeline_dir, conf: conf.clone(), persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS .with_label_values(&[&tenant_id, &timeline_id]), - state: state.clone(), + state, }; - store.persist(&state)?; Ok(store) } - // Check the magic/version in the on-disk data and deserialize it, if possible. + /// Check the magic/version in the on-disk data and deserialize it, if possible. fn deser_sk_state(buf: &mut &[u8]) -> Result { // Read the version independent part let magic = buf.read_u32::()?; @@ -112,23 +111,17 @@ impl FileStorage { upgrade_control_file(buf, version) } - // Load control file for given zttid at path specified by conf. + /// Load control file for given ttid at path specified by conf. pub fn load_control_file_conf( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result { - let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); + let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME); Self::load_control_file(path) } /// Read in the control file. - /// If create=false and file doesn't exist, bails out. 
pub fn load_control_file>(control_file_path: P) -> Result { - info!( - "loading control file {}", - control_file_path.as_ref().display(), - ); - let mut control_file = OpenOptions::new() .read(true) .write(true) @@ -179,8 +172,8 @@ impl Deref for FileStorage { } impl Storage for FileStorage { - // persists state durably to underlying storage - // for description see https://lwn.net/Articles/457667/ + /// persists state durably to underlying storage + /// for description see https://lwn.net/Articles/457667/ fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { let _timer = &self.persist_control_file_seconds.start_timer(); @@ -264,57 +257,57 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); Ok(( - FileStorage::restore_new(zttid, conf)?, - FileStorage::load_control_file_conf(conf, zttid)?, + FileStorage::restore_new(ttid, conf)?, + FileStorage::load_control_file_conf(conf, ttid)?, )) } fn create( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); - let storage = FileStorage::create_new(zttid, conf, state.clone())?; + let storage = FileStorage::create_new(ttid, conf, state.clone())?; Ok((storage, state)) } #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); - let zttid = TenantTimelineId::generate(); + let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); + let (mut storage, mut state) = 
create(&conf, &ttid).expect("failed to create state"); // change something state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state"); + let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state"); assert_eq!(state.commit_lsn, Lsn(42)); } #[test] fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); - let zttid = TenantTimelineId::generate(); + let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state"); // change something state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME); + let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data).expect("failed to write control file"); - match load_from_control_file(&conf, &zttid) { + match load_from_control_file(&conf, &ttid) { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index ad2c0ec8bf..ca887399e1 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -3,15 +3,15 @@ use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::receive_wal::ReceiveWalConn; -use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; + use crate::send_wal::ReplicationConn; -use crate::timeline::{Timeline, TimelineTools}; -use crate::SafeKeeperConf; + +use crate::{GlobalTimelines, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use postgres_ffi::PG_TLI; use regex::Regex; -use std::sync::Arc; + 
use tracing::info; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, @@ -27,7 +27,7 @@ pub struct SafekeeperPostgresHandler { pub appname: Option, pub tenant_id: Option, pub timeline_id: Option, - pub timeline: Option>, + pub ttid: TenantTimelineId, } /// Parsed Postgres command. @@ -101,30 +101,21 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { query_string, self.timeline_id ); - let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) - || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); - - let tenant_id = self.tenant_id.context("tenant_id is required")?; - let timeline_id = self.timeline_id.context("timeline_id is required")?; - if self.timeline.is_none() { - self.timeline.set( - &self.conf, - TenantTimelineId::new(tenant_id, timeline_id), - create, - )?; - } + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb) - .run(self) - .context("failed to run ReceiveWalConn"), - SafekeeperPostgresCommand::StartReplication { start_lsn } => ReplicationConn::new(pgb) - .run(self, pgb, start_lsn) - .context("failed to run ReplicationConn"), + SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), + SafekeeperPostgresCommand::StartReplication { start_lsn } => { + ReplicationConn::new(pgb).run(self, pgb, start_lsn) + } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } - .context(format!("timeline {timeline_id}"))?; + .context(format!( + "Failed to process query for timeline {timeline_id}" + ))?; Ok(()) } @@ -137,42 +128,26 @@ impl SafekeeperPostgresHandler { appname: None, tenant_id: None, timeline_id: None, - timeline: None, + ttid: 
TenantTimelineId::empty(), } } - /// Shortcut for calling `process_msg` in the timeline. - pub fn process_safekeeper_msg( - &self, - msg: &ProposerAcceptorMessage, - ) -> Result> { - self.timeline - .get() - .process_msg(msg) - .context("failed to process ProposerAcceptorMessage") - } - /// /// Handle IDENTIFY_SYSTEM replication command /// fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { + let tli = GlobalTimelines::get(self.ttid)?; + let lsn = if self.is_walproposer_recovery() { // walproposer should get all local WAL until flush_lsn - self.timeline.get().get_end_of_wal() + tli.get_flush_lsn() } else { // other clients shouldn't get any uncommitted WAL - self.timeline.get().get_state().0.commit_lsn + tli.get_state().0.commit_lsn } .to_string(); - let sysid = self - .timeline - .get() - .get_state() - .1 - .server - .system_id - .to_string(); + let sysid = tli.get_state().1.server.system_id.to_string(); let lsn_bytes = lsn.as_bytes(); let tli = PG_TLI.to_string(); let tli_bytes = tli.as_bytes(); diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 14c9414c09..244325368b 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,3 +1,4 @@ +use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode, Uri}; use once_cell::sync::Lazy; @@ -9,7 +10,9 @@ use std::sync::Arc; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; -use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; + +use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::GlobalTimelines; use crate::SafeKeeperConf; use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ @@ -90,15 +93,15 @@ struct TimelineStatus { /// Report info about timeline. 
async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; - let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + let tli = GlobalTimelines::get(ttid)?; let (inmem, state) = tli.get_state(); - let flush_lsn = tli.get_end_of_wal(); + let flush_lsn = tli.get_flush_lsn(); let acc_state = AcceptorStateStatus { term: state.acceptor_state.term, @@ -108,8 +111,8 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { let request_data: TimelineCreateRequest = json_request(&mut request).await?; - let zttid = TenantTimelineId { + let ttid = TenantTimelineId { tenant_id: parse_request_param(&request, "tenant_id")?, timeline_id: request_data.timeline_id, }; - check_permission(&request, Some(zttid.tenant_id))?; - GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids) - .map_err(ApiError::from_err)?; + check_permission(&request, Some(ttid.tenant_id))?; - json_response(StatusCode::CREATED, ()) + Err(ApiError::from_err(anyhow!("not implemented"))) } /// Deactivates the timeline and removes its data directory. -/// -/// It does not try to stop any processing of the timeline; there is no such code at the time of writing. -/// However, it tries to check whether the timeline was active and report it to caller just in case. -/// Note that this information is inaccurate: -/// 1. There is a race condition between checking the timeline for activity and actual directory deletion. -/// 2. At the time of writing Safekeeper rarely marks a timeline inactive. E.g. disconnecting the compute node does nothing. 
async fn timeline_delete_force_handler( mut request: Request, ) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; ensure_no_body(&mut request).await?; - json_response( - StatusCode::OK, - GlobalTimelines::delete_force(get_conf(&request), &zttid) - .await - .map_err(ApiError::from_err)?, - ) + let resp = tokio::task::spawn_blocking(move || GlobalTimelines::delete_force(&ttid)) + .await + .map_err(ApiError::from_err)??; + json_response(StatusCode::OK, resp) } /// Deactivates all timelines for the tenant and removes its data directory. @@ -168,27 +161,30 @@ async fn tenant_delete_force_handler( let tenant_id = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; ensure_no_body(&mut request).await?; + let delete_info = tokio::task::spawn_blocking(move || { + GlobalTimelines::delete_force_all_for_tenant(&tenant_id) + }) + .await + .map_err(ApiError::from_err)??; json_response( StatusCode::OK, - GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id) - .await - .map_err(ApiError::from_err)? + delete_info .iter() - .map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp)) + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) .collect::>(), ) } /// Used only in tests to hand craft required data. 
async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?; - let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + let tli = GlobalTimelines::get(ttid)?; tli.record_safekeeper_info(&safekeeper_info, NodeId(1)) .await?; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 00fc43521b..2456eb0752 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -6,18 +6,22 @@ //! modifications in tests. //! +use std::sync::Arc; + use anyhow::Result; use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; +use utils::id::TenantTimelineId; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, ProposerGreeting, + AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; -use crate::timeline::TimelineTools; +use crate::timeline::Timeline; +use crate::GlobalTimelines; use postgres_ffi::v14::xlog_utils; use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ @@ -57,23 +61,23 @@ struct AppendResult { /// content, and then append it with specified term and lsn. This /// function is used to test safekeepers in different scenarios. 
pub fn handle_json_ctrl( - spg: &mut SafekeeperPostgresHandler, + spg: &SafekeeperPostgresHandler, pgb: &mut PostgresBackend, append_request: &AppendLogicalMessage, ) -> Result<()> { info!("JSON_CTRL request: {:?}", append_request); // need to init safekeeper state before AppendRequest - prepare_safekeeper(spg)?; + let tli = prepare_safekeeper(spg.ttid)?; // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { - send_proposer_elected(spg, append_request.term, append_request.epoch_start_lsn)?; + send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?; } - let inserted_wal = append_logical_message(spg, append_request)?; + let inserted_wal = append_logical_message(&tli, append_request)?; let response = AppendResult { - state: spg.timeline.get().get_state().1, + state: tli.get_state().1, inserted_wal, }; let response_data = serde_json::to_vec(&response)?; @@ -91,28 +95,20 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. 
-fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting { - protocol_version: 2, // current protocol - pg_version: 0, // unknown - proposer_id: [0u8; 16], - system_id: 0, - timeline_id: spg.timeline_id.unwrap(), - tenant_id: spg.tenant_id.unwrap(), - tli: 0, - wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests - }); - - let response = spg.timeline.get().process_msg(&greeting_request)?; - match response { - Some(AcceptorProposerMessage::Greeting(_)) => Ok(()), - _ => anyhow::bail!("not GreetingResponse"), - } +fn prepare_safekeeper(ttid: TenantTimelineId) -> Result> { + GlobalTimelines::create( + ttid, + ServerInfo { + pg_version: 0, // unknown + wal_seg_size: WAL_SEGMENT_SIZE as u32, + system_id: 0, + }, + ) } -fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: Lsn) -> Result<()> { +fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> Result<()> { // add new term to existing history - let history = spg.timeline.get().get_state().1.acceptor_state.term_history; + let history = tli.get_state().1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); let mut history_entries = history.0; history_entries.push(TermSwitchEntry { term, lsn }); @@ -125,7 +121,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L timeline_start_lsn: lsn, }); - spg.timeline.get().process_msg(&proposer_elected_request)?; + tli.process_msg(&proposer_elected_request)?; Ok(()) } @@ -138,12 +134,9 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
-fn append_logical_message( - spg: &mut SafekeeperPostgresHandler, - msg: &AppendLogicalMessage, -) -> Result { +fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = spg.timeline.get().get_state().1; + let sk_state = tli.get_state().1; let begin_lsn = msg.begin_lsn; let end_lsn = begin_lsn + wal_data.len() as u64; @@ -167,7 +160,7 @@ fn append_logical_message( wal_data: Bytes::from(wal_data), }); - let response = spg.timeline.get().process_msg(&append_request)?; + let response = tli.process_msg(&append_request)?; let append_response = match response { Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index b466d5aab5..58a237a5d3 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -23,6 +23,9 @@ pub mod wal_backup; pub mod wal_service; pub mod wal_storage; +mod timelines_global_map; +pub use timelines_global_map::GlobalTimelines; + pub mod defaults { use const_format::formatcp; use std::time::Duration; @@ -65,9 +68,9 @@ impl SafeKeeperConf { self.workdir.join(tenant_id.to_string()) } - pub fn timeline_dir(&self, zttid: &TenantTimelineId) -> PathBuf { - self.tenant_dir(&zttid.tenant_id) - .join(zttid.timeline_id.to_string()) + pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> PathBuf { + self.tenant_dir(&ttid.tenant_id) + .join(ttid.timeline_id.to_string()) } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 3fa3916266..851a568aec 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -12,11 +12,12 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, - timeline::{GlobalTimelines, ReplicaState}, + timeline::ReplicaState, + GlobalTimelines, }; pub struct FullTimelineInfo { - pub zttid: TenantTimelineId, + pub ttid: TenantTimelineId, pub replicas: Vec, pub 
wal_backup_active: bool, pub timeline_is_active: bool, @@ -235,11 +236,17 @@ impl Collector for TimelineCollector { self.disk_usage.reset(); self.acceptor_term.reset(); - let timelines = GlobalTimelines::active_timelines_metrics(); + let timelines = GlobalTimelines::get_all(); - for tli in timelines { - let tenant_id = tli.zttid.tenant_id.to_string(); - let timeline_id = tli.zttid.timeline_id.to_string(); + for arc_tli in timelines { + let tli = arc_tli.info_for_metrics(); + if tli.is_none() { + continue; + } + let tli = tli.unwrap(); + + let tenant_id = tli.ttid.tenant_id.to_string(); + let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; let mut most_advanced: Option = None; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index b0b6a73621..e28caa2f19 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -7,7 +7,9 @@ use anyhow::{anyhow, bail, Result}; use bytes::BytesMut; use tracing::*; +use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; +use crate::GlobalTimelines; use std::net::SocketAddr; use std::sync::mpsc::channel; @@ -20,7 +22,6 @@ use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::TimelineTools; use utils::{ postgres_backend::PostgresBackend, pq_proto::{BeMessage, FeMessage}, @@ -67,15 +68,21 @@ impl<'pg> ReceiveWalConn<'pg> { // Receive information about server let next_msg = poll_reader.recv_msg()?; - match next_msg { + let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( "start handshake with wal proposer {} sysid {} timeline {}", self.peer_addr, greeting.system_id, greeting.tli, ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + GlobalTimelines::create(spg.ttid, server_info)? 
} _ => bail!("unexpected message {:?} instead of greeting", next_msg), - } + }; let mut next_msg = Some(next_msg); @@ -88,7 +95,7 @@ impl<'pg> ReceiveWalConn<'pg> { while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg { let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - let reply = spg.process_safekeeper_msg(&msg)?; + let reply = tli.process_msg(&msg)?; if let Some(reply) = reply { self.write_msg(&reply)?; } @@ -97,13 +104,13 @@ impl<'pg> ReceiveWalConn<'pg> { } // flush all written WAL to the disk - let reply = spg.process_safekeeper_msg(&ProposerAcceptorMessage::FlushWAL)?; + let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?; if let Some(reply) = reply { self.write_msg(&reply)?; } } else if let Some(msg) = next_msg.take() { // process other message - let reply = spg.process_safekeeper_msg(&msg)?; + let reply = tli.process_msg(&msg)?; if let Some(reply) = reply { self.write_msg(&reply)?; } @@ -112,9 +119,9 @@ impl<'pg> ReceiveWalConn<'pg> { // Register the connection and defer unregister. Do that only // after processing first message, as it sets wal_seg_size, // wanted by many. 
- spg.timeline.get().on_compute_connect()?; + tli.on_compute_connect()?; _guard = Some(ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), + timeline: Arc::clone(&tli), }); first_time_through = false; } @@ -190,6 +197,8 @@ struct ComputeConnectionGuard { impl Drop for ComputeConnectionGuard { fn drop(&mut self) { - self.timeline.on_compute_disconnect().unwrap(); + if let Err(e) = self.timeline.on_compute_disconnect() { + error!("failed to unregister compute connection: {}", e); + } } } diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 004c0243f9..b6d497f34e 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -4,20 +4,21 @@ use std::{thread, time::Duration}; use tracing::*; -use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use crate::{GlobalTimelines, SafeKeeperConf}; pub fn thread_main(conf: SafeKeeperConf) { let wal_removal_interval = Duration::from_millis(5000); loop { - let active_tlis = GlobalTimelines::get_active_timelines(); - for zttid in &active_tlis { - if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { - warn!( - "failed to remove WAL for tenant {} timeline {}: {}", - tli.zttid.tenant_id, tli.zttid.timeline_id, e - ); - } + let tlis = GlobalTimelines::get_all(); + for tli in &tlis { + if !tli.is_active() { + continue; + } + let ttid = tli.ttid; + let _enter = + info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered(); + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { + warn!("failed to remove WAL: {}", e); } } thread::sleep(wal_removal_interval) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fa045eed90..d34a77e02b 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -218,19 +218,19 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &TenantTimelineId, peers: Vec) -> 
SafeKeeperState { + pub fn new( + ttid: &TenantTimelineId, + server_info: ServerInfo, + peers: Vec, + ) -> SafeKeeperState { SafeKeeperState { - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, acceptor_state: AcceptorState { term: 0, term_history: TermHistory::empty(), }, - server: ServerInfo { - pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ - system_id: 0, /* Postgres system identifier */ - wal_seg_size: 0, - }, + server: server_info, proposer_uuid: [0; 16], timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), @@ -244,7 +244,15 @@ impl SafeKeeperState { #[cfg(test)] pub fn empty() -> Self { - SafeKeeperState::new(&TenantTimelineId::empty(), vec![]) + SafeKeeperState::new( + &TenantTimelineId::empty(), + ServerInfo { + pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ + system_id: 0, /* Postgres system identifier */ + wal_seg_size: 0, + }, + vec![], + ) } } @@ -479,8 +487,12 @@ impl AcceptorProposerMessage { } } -/// SafeKeeper which consumes events (messages from compute) and provides -/// replies. +/// Safekeeper implements consensus to reliably persist WAL across nodes. +/// It controls all WAL disk writes and updates of control file. +/// +/// Currently safekeeper processes: +/// - messages from compute (proposers) and provides replies +/// - messages from broker peers pub struct SafeKeeper { /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. 
/// Note: be careful to set only if we are sure our WAL (term history) matches @@ -503,20 +515,20 @@ where CTRL: control_file::Storage, WAL: wal_storage::Storage, { - // constructor - pub fn new( - timeline_id: TimelineId, - state: CTRL, - mut wal_store: WAL, - node_id: NodeId, - ) -> Result> { - if state.timeline_id != TimelineId::from([0u8; 16]) && timeline_id != state.timeline_id { - bail!("Calling SafeKeeper::new with inconsistent timeline_id ({}) and SafeKeeperState.server.timeline_id ({})", timeline_id, state.timeline_id); + /// Accepts a control file storage containing the safekeeper state. + /// State must be initialized, i.e. contain filled `tenant_id`, `timeline_id` + /// and `server` (`wal_seg_size` inside it) fields. + pub fn new(state: CTRL, wal_store: WAL, node_id: NodeId) -> Result> { + if state.tenant_id == TenantId::from([0u8; 16]) + || state.timeline_id == TimelineId::from([0u8; 16]) + { + bail!( + "Calling SafeKeeper::new with empty tenant_id ({}) or timeline_id ({})", + state.tenant_id, + state.timeline_id + ); } - // initialize wal_store, if state is already initialized - wal_store.init_storage(&state)?; - Ok(SafeKeeper { global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), @@ -574,7 +586,7 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - /* Check protocol compatibility */ + // Check protocol compatibility if msg.protocol_version != SK_PROTOCOL_VERSION { bail!( "incompatible protocol version {}, expected {}", @@ -582,11 +594,11 @@ where SK_PROTOCOL_VERSION ); } - /* Postgres upgrade is not treated as fatal error */ + // Postgres upgrade is not treated as fatal error if msg.pg_version != self.state.server.pg_version && self.state.server.pg_version != UNKNOWN_SERVER_VERSION { - info!( + warn!( "incompatible server version {}, expected {}", msg.pg_version, self.state.server.pg_version ); @@ -605,17 +617,25 @@ where self.state.timeline_id ); } - - // set basic info about server, if not yet - // TODO: verify that is doesn't 
change after - { - let mut state = self.state.clone(); - state.server.system_id = msg.system_id; - state.server.wal_seg_size = msg.wal_seg_size; - self.state.persist(&state)?; + if self.state.server.wal_seg_size != msg.wal_seg_size { + bail!( + "invalid wal_seg_size, got {}, expected {}", + msg.wal_seg_size, + self.state.server.wal_seg_size + ); } - self.wal_store.init_storage(&self.state)?; + // system_id will be updated on mismatch + if self.state.server.system_id != msg.system_id { + warn!( + "unexpected system ID arrived, got {}, expected {}", + msg.system_id, self.state.server.system_id + ); + + let mut state = self.state.clone(); + state.server.system_id = msg.system_id; + self.state.persist(&state)?; + } info!( "processed greeting from proposer {:?}, sending term {:?}", @@ -665,16 +685,6 @@ where Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } - /// Bump our term if received a note from elected proposer with higher one - fn bump_if_higher(&mut self, term: Term) -> Result<()> { - if self.state.acceptor_state.term < term { - let mut state = self.state.clone(); - state.acceptor_state.term = term; - self.state.persist(&state)?; - } - Ok(()) - } - /// Form AppendResponse from current state. 
fn append_response(&self) -> AppendResponse { let ar = AppendResponse { @@ -691,7 +701,12 @@ where fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { info!("received ProposerElected {:?}", msg); - self.bump_if_higher(msg.term)?; + if self.state.acceptor_state.term < msg.term { + let mut state = self.state.clone(); + state.acceptor_state.term = msg.term; + self.state.persist(&state)?; + } + // If our term is higher, ignore the message (next feedback will inform the compute) if self.state.acceptor_state.term > msg.term { return Ok(None); @@ -748,7 +763,7 @@ where } /// Advance commit_lsn taking into account what we have locally - pub fn update_commit_lsn(&mut self) -> Result<()> { + fn update_commit_lsn(&mut self) -> Result<()> { let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); assert!(commit_lsn >= self.inmem.commit_lsn); @@ -768,6 +783,11 @@ where Ok(()) } + /// Persist control file to disk, called only after timeline creation (bootstrap). + pub fn persist(&mut self) -> Result<()> { + self.persist_control_file(self.state.clone()) + } + /// Persist in-memory state to the disk, taking other data from state. 
fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; @@ -918,6 +938,8 @@ where #[cfg(test)] mod tests { + use postgres_ffi::WAL_SEGMENT_SIZE; + use super::*; use crate::wal_storage::Storage; use std::ops::Deref; @@ -942,6 +964,14 @@ mod tests { } } + fn test_sk_state() -> SafeKeeperState { + let mut state = SafeKeeperState::empty(); + state.server.wal_seg_size = WAL_SEGMENT_SIZE as u32; + state.tenant_id = TenantId::from([1u8; 16]); + state.timeline_id = TimelineId::from([1u8; 16]); + state + } + struct DummyWalStore { lsn: Lsn, } @@ -951,10 +981,6 @@ mod tests { self.lsn } - fn init_storage(&mut self, _state: &SafeKeeperState) -> Result<()> { - Ok(()) - } - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) @@ -977,12 +1003,10 @@ mod tests { #[test] fn test_voting() { let storage = InMemoryState { - persisted_state: SafeKeeperState::empty(), + persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let timeline_id = TimelineId::from([0u8; 16]); - - let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -998,7 +1022,7 @@ mod tests { persisted_state: state, }; - sk = SafeKeeper::new(timeline_id, storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1011,12 +1035,11 @@ mod tests { #[test] fn test_epoch_switch() { let storage = InMemoryState { - persisted_state: SafeKeeperState::empty(), + persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = 
SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 375b6eea18..5a38558e9c 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,8 +2,9 @@ //! with the "START_REPLICATION" message. use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::{ReplicaState, Timeline, TimelineTools}; +use crate::timeline::{ReplicaState, Timeline}; use crate::wal_storage::WalReader; +use crate::GlobalTimelines; use anyhow::{bail, Context, Result}; use bytes::Bytes; @@ -167,8 +168,10 @@ impl ReplicationConn { ) -> Result<()> { let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); + let tli = GlobalTimelines::get(spg.ttid)?; + // spawn the background thread which receives HotStandbyFeedback messages. - let bg_timeline = Arc::clone(spg.timeline.get()); + let bg_timeline = Arc::clone(&tli); let bg_stream_in = self.stream_in.take().unwrap(); let bg_timeline_id = spg.timeline_id.unwrap(); @@ -201,11 +204,8 @@ impl ReplicationConn { .build()?; runtime.block_on(async move { - let (inmem_state, persisted_state) = spg.timeline.get().get_state(); + let (inmem_state, persisted_state) = tli.get_state(); // add persisted_state.timeline_start_lsn == Lsn(0) check - if persisted_state.server.wal_seg_size == 0 { - bail!("Cannot start replication before connecting to walproposer"); - } // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will @@ -217,7 +217,7 @@ impl ReplicationConn { // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. 
let stop_pos: Option = if spg.is_walproposer_recovery() { - let wal_end = spg.timeline.get().get_end_of_wal(); + let wal_end = tli.get_flush_lsn(); Some(wal_end) } else { None @@ -231,7 +231,7 @@ impl ReplicationConn { let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( - spg.conf.timeline_dir(&spg.timeline.get().zttid), + spg.conf.timeline_dir(&tli.ttid), &persisted_state, start_pos, spg.conf.wal_backup_enabled, @@ -241,7 +241,7 @@ impl ReplicationConn { let mut send_buf = vec![0u8; MAX_SEND_SIZE]; // watcher for commit_lsn updates - let mut commit_lsn_watch_rx = spg.timeline.get().get_commit_lsn_watch_rx(); + let mut commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); loop { if let Some(stop_pos) = stop_pos { @@ -258,7 +258,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().stop_walsender(replica_id)? { + if tli.should_walsender_stop(replica_id) { // Shut down, timeline is suspended. // TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index cf317c41c3..4000815857 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,27 +1,25 @@ -//! This module contains timeline id -> safekeeper state map with file-backed -//! persistence and support for interaction between sending and receiving wal. +//! This module implements Timeline lifecycle management and has all necessary code +//! to glue together SafeKeeper and all other background services. 
-use anyhow::{bail, Context, Result}; +use anyhow::{bail, Result}; use etcd_broker::subscription_value::SkTimelineInfo; -use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use serde::Serialize; use tokio::sync::watch; use std::cmp::{max, min}; -use std::collections::{HashMap, HashSet}; -use std::fs::{self}; -use std::sync::{Arc, Mutex, MutexGuard}; +use parking_lot::{Mutex, MutexGuard}; + +use std::path::PathBuf; use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ - id::{NodeId, TenantId, TenantTimelineId}, + id::{NodeId, TenantTimelineId}, lsn::Lsn, pq_proto::ReplicationFeedback, }; @@ -29,7 +27,7 @@ use utils::{ use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, - SafekeeperMemState, + SafekeeperMemState, ServerInfo, }; use crate::send_wal::HotStandbyFeedback; @@ -73,7 +71,7 @@ impl ReplicaState { } /// Shared state associated with database instance -struct SharedState { +pub struct SharedState { /// Safekeeper object sk: SafeKeeper, /// State of replicas @@ -95,17 +93,21 @@ struct SharedState { } impl SharedState { - /// Initialize timeline state, creating control file - fn create( + /// Initialize fresh timeline state without persisting anything to disk. + fn create_new( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - peer_ids: Vec, + ttid: &TenantTimelineId, + state: SafeKeeperState, ) -> Result { - let state = SafeKeeperState::new(zttid, peer_ids); - let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; + // We don't want to write anything to disk, because we may have existing timeline there. + // These functions should not change anything on disk. 
+ let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; + let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; Ok(Self { sk, @@ -117,16 +119,17 @@ impl SharedState { }) } - /// Restore SharedState from control file. - /// If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, zttid: &TenantTimelineId) -> Result { - let control_store = control_file::FileStorage::restore_new(zttid, conf)?; - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); + /// Restore SharedState from control file. If file doesn't exist, bails out. + fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let control_store = control_file::FileStorage::restore_new(ttid, conf)?; + if control_store.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } - info!("timeline {} restored", zttid.timeline_id); + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; Ok(Self { - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, + sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, replicas: Vec::new(), wal_backup_active: false, active: false, @@ -134,6 +137,7 @@ impl SharedState { last_removed_segno: 0, }) } + fn is_active(&self) -> bool { self.is_wal_backup_required() // FIXME: add tracking of relevant pageservers and check them here individually, @@ -254,148 +258,289 @@ impl SharedState { } } -/// Database instance (tenant) +#[derive(Debug, thiserror::Error)] +pub enum TimelineError { + #[error("Timeline {0} was cancelled and cannot be used anymore")] + Cancelled(TenantTimelineId), + #[error("Timeline {0} was not found in global map")] + NotFound(TenantTimelineId), + #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] + Invalid(TenantTimelineId), + #[error("Timeline {0} is already exists")] + AlreadyExists(TenantTimelineId), + 
#[error("Timeline {0} is not initialized, wal_seg_size is zero")] + UninitializedWalSegSize(TenantTimelineId), +} + +/// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. +/// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { - pub zttid: TenantTimelineId, + pub ttid: TenantTimelineId, + /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending zttid instead of concrete command allows to do + /// offloading). Sending ttid instead of concrete command allows to do /// sending without timeline lock. wal_backup_launcher_tx: Sender, + + /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, - /// For breeding receivers. commit_lsn_watch_rx: watch::Receiver, + + /// Safekeeper and other state, that should remain consistent and synchronized + /// with the disk. mutex: Mutex, + + /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal. + cancellation_tx: watch::Sender, + + /// Timeline should not be used after cancellation. Background tasks should + /// monitor this channel and stop eventually after receiving `true` from this channel. + cancellation_rx: watch::Receiver, + + /// Directory where timeline state is stored. + timeline_dir: PathBuf, } impl Timeline { - fn new( - zttid: TenantTimelineId, + /// Load existing timeline from disk. 
+ pub fn load_timeline( + conf: SafeKeeperConf, + ttid: TenantTimelineId, wal_backup_launcher_tx: Sender, - shared_state: SharedState, - ) -> Timeline { + ) -> Result { + let shared_state = SharedState::restore(&conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = - watch::channel(shared_state.sk.inmem.commit_lsn); - Timeline { - zttid, + watch::channel(shared_state.sk.state.commit_lsn); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + + Ok(Timeline { + ttid, wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, mutex: Mutex::new(shared_state), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Create a new timeline, which is not yet persisted to disk. + pub fn create_empty( + conf: SafeKeeperConf, + ttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, + server_info: ServerInfo, + ) -> Result { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + let state = SafeKeeperState::new(&ttid, server_info, vec![]); + + Ok(Timeline { + ttid, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, + mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Initialize fresh timeline on disk and start background tasks. If bootstrap + /// fails, timeline is cancelled and cannot be used anymore. + /// + /// Bootstrap is transactional, so if it fails, created files will be deleted, + /// and state on disk should remain unchanged. + pub fn bootstrap(&self, shared_state: &mut MutexGuard) -> Result<()> { + match std::fs::metadata(&self.timeline_dir) { + Ok(_) => { + // Timeline directory exists on disk, we should leave state unchanged + // and return error. 
+ bail!(TimelineError::Invalid(self.ttid)); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + return Err(e.into()); + } } + + // Create timeline directory. + std::fs::create_dir_all(&self.timeline_dir)?; + + // Write timeline to disk and TODO: start background tasks. + match || -> Result<()> { + shared_state.sk.persist()?; + // TODO: add more initialization steps here + Ok(()) + }() { + Ok(_) => Ok(()), + Err(e) => { + // Bootstrap failed, cancel timeline and remove timeline directory. + self.cancel(); + + if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) { + warn!( + "failed to remove timeline {} directory after bootstrap failure: {}", + self.ttid, fs_err + ); + } + + Err(e) + } + } + } + + /// Delete timeline from disk completely, by removing timeline directory. Background + /// timeline activities will stop eventually. + pub fn delete_from_disk( + &self, + shared_state: &mut MutexGuard, + ) -> Result<(bool, bool)> { + let was_active = shared_state.active; + self.cancel(); + let dir_existed = delete_dir(&self.timeline_dir)?; + Ok((dir_existed, was_active)) + } + + /// Cancel timeline to prevent further usage. Background tasks will stop + /// eventually after receiving cancellation signal. + fn cancel(&self) { + info!("Timeline {} is cancelled", self.ttid); + let _ = self.cancellation_tx.send(true); + let res = self.wal_backup_launcher_tx.blocking_send(self.ttid); + if let Err(e) = res { + error!("Failed to send stop signal to wal_backup_launcher: {}", e); + } + } + + /// Returns if timeline is cancelled. + pub fn is_cancelled(&self) -> bool { + *self.cancellation_rx.borrow() + } + + /// Take a writing mutual exclusive lock on timeline shared_state. + pub fn write_shared_state(&self) -> MutexGuard { + self.mutex.lock() } /// Register compute connection, starting timeline-related activity if it is /// not running yet. - /// Can fail only if channel to a static thread got closed, which is not normal at all. 
pub fn on_compute_connect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let is_wal_backup_action_pending: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.num_computes += 1; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); } // Wake up wal backup launcher, if offloading not started yet. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) } /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. - /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_disconnect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let is_wal_backup_action_pending: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.num_computes -= 1; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); } // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) } - /// Whether we still need this walsender running? + /// Returns true if walsender should stop sending WAL to pageserver. /// TODO: check this pageserver is actually interested in this timeline. 
- pub fn stop_walsender(&self, replica_id: usize) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); + pub fn should_walsender_stop(&self, replica_id: usize) -> bool { + if self.is_cancelled() { + return true; + } + + let mut shared_state = self.write_shared_state(); if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { - shared_state.update_status(self.zttid); - return Ok(true); + shared_state.update_status(self.ttid); + return true; } } - Ok(false) + false } /// Returns whether s3 offloading is required and sets current status as /// matching it. pub fn wal_backup_attend(&self) -> bool { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.wal_backup_attend() - } - - // Can this safekeeper offload to s3? Recently joined safekeepers might not - // have necessary WAL. - pub fn can_wal_backup(&self) -> bool { - self.mutex.lock().unwrap().can_wal_backup() - } - - /// Deactivates the timeline, assuming it is being deleted. - /// Returns whether the timeline was already active. - /// - /// We assume all threads will stop by themselves eventually (possibly with errors, but no panics). - /// There should be no compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but - /// we're deleting the timeline anyway. 
- pub async fn deactivate_for_delete(&self) -> Result { - let was_active: bool; - { - let shared_state = self.mutex.lock().unwrap(); - was_active = shared_state.active; + if self.is_cancelled() { + return false; } - self.wal_backup_launcher_tx.send(self.zttid).await?; - Ok(was_active) + + self.write_shared_state().wal_backup_attend() } - fn is_active(&self) -> bool { - let shared_state = self.mutex.lock().unwrap(); - shared_state.active + /// Can this safekeeper offload to s3? Recently joined safekeepers might not + /// have necessary WAL. + pub fn can_wal_backup(&self) -> bool { + if self.is_cancelled() { + return false; + } + + let shared_state = self.write_shared_state(); + shared_state.can_wal_backup() } - /// Returns full timeline info, required for the metrics. - /// If the timeline is not active, returns None instead. + /// Returns full timeline info, required for the metrics. If the timeline is + /// not active, returns None instead. pub fn info_for_metrics(&self) -> Option { - let shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { + if self.is_cancelled() { return None; } - Some(FullTimelineInfo { - zttid: self.zttid, - replicas: shared_state - .replicas - .iter() - .filter_map(|r| r.as_ref()) - .copied() - .collect(), - wal_backup_active: shared_state.wal_backup_active, - timeline_is_active: shared_state.active, - num_computes: shared_state.num_computes, - last_removed_segno: shared_state.last_removed_segno, - epoch_start_lsn: shared_state.sk.epoch_start_lsn, - mem_state: shared_state.sk.inmem.clone(), - persisted_state: shared_state.sk.state.clone(), - flush_lsn: shared_state.sk.wal_store.flush_lsn(), - }) + let state = self.write_shared_state(); + if state.active { + Some(FullTimelineInfo { + ttid: self.ttid, + replicas: state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: state.wal_backup_active, + timeline_is_active: state.active, + num_computes: state.num_computes, + 
last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + }) + } else { + None + } } + /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() } @@ -405,10 +550,14 @@ impl Timeline { &self, msg: &ProposerAcceptorMessage, ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let mut rmsg: Option; let commit_lsn: Lsn; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); rmsg = shared_state.sk.process_msg(msg)?; // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn @@ -426,28 +575,46 @@ impl Timeline { Ok(rmsg) } + /// Returns wal_seg_size. pub fn get_wal_seg_size(&self) -> usize { - self.mutex.lock().unwrap().get_wal_seg_size() + self.write_shared_state().get_wal_seg_size() } + /// Returns true only if the timeline is loaded and active. + pub fn is_active(&self) -> bool { + if self.is_cancelled() { + return false; + } + + self.write_shared_state().active + } + + /// Returns state of the timeline. pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { - let shared_state = self.mutex.lock().unwrap(); - (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) + let state = self.write_shared_state(); + (state.sk.inmem.clone(), state.sk.state.clone()) } + /// Returns latest backup_lsn. pub fn get_wal_backup_lsn(&self) -> Lsn { - self.mutex.lock().unwrap().sk.inmem.backup_lsn + self.write_shared_state().sk.inmem.backup_lsn } - pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) { - self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn; + /// Sets backup_lsn to the given value. 
+ pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + self.write_shared_state().sk.inmem.backup_lsn = backup_lsn; // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. + Ok(()) } - /// Prepare public safekeeper info for reporting. + /// Return public safekeeper info for broadcasting to broker and other peers. pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { - let shared_state = self.mutex.lock().unwrap(); + let shared_state = self.write_shared_state(); SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), @@ -473,54 +640,53 @@ impl Timeline { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { - let mut shared_state = self.mutex.lock().unwrap(); - // WAL seg size not initialized yet (no message from compute ever - // received), can't do much without it. - if shared_state.get_wal_seg_size() == 0 { - return Ok(()); - } + let mut shared_state = self.write_shared_state(); shared_state.sk.record_safekeeper_info(sk_info)?; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); commit_lsn = shared_state.sk.inmem.commit_lsn; } self.commit_lsn_watch_tx.send(commit_lsn)?; // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.zttid).await?; + self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } + /// Add send_wal replica to the in-memory vector of replicas. pub fn add_replica(&self, state: ReplicaState) -> usize { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.add_replica(state) + self.write_shared_state().add_replica(state) } + /// Update replication replica state. 
pub fn update_replica_state(&self, id: usize, state: ReplicaState) { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.replicas[id] = Some(state); } + /// Remove send_wal replica from the in-memory vector of replicas. pub fn remove_replica(&self, id: usize) { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); assert!(shared_state.replicas[id].is_some()); shared_state.replicas[id] = None; } - pub fn get_end_of_wal(&self) -> Lsn { - let shared_state = self.mutex.lock().unwrap(); - shared_state.sk.wal_store.flush_lsn() + /// Returns flush_lsn. + pub fn get_flush_lsn(&self) -> Lsn { + self.write_shared_state().sk.wal_store.flush_lsn() } + /// Delete WAL segments from disk that are no longer needed. This is determined + /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { - let shared_state = self.mutex.lock().unwrap(); - // WAL seg size not initialized yet, no WAL exists. 
- if shared_state.get_wal_seg_size() == 0 { - return Ok(()); - } + let shared_state = self.write_shared_state(); horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { @@ -528,243 +694,22 @@ impl Timeline { } // release the lock before removing } - let _enter = - info_span!("", tenant = %self.zttid.tenant_id, timeline = %self.zttid.timeline_id) - .entered(); + + // delete old WAL files remover(horizon_segno - 1)?; - self.mutex.lock().unwrap().last_removed_segno = horizon_segno; + + // update last_removed_segno + let mut shared_state = self.write_shared_state(); + shared_state.last_removed_segno = horizon_segno; Ok(()) } } -// Utilities needed by various Connection-like objects -pub trait TimelineTools { - fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()>; - - fn get(&self) -> &Arc; -} - -impl TimelineTools for Option> { - fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()> { - *self = Some(GlobalTimelines::get(conf, zttid, create)?); - Ok(()) - } - - fn get(&self) -> &Arc { - self.as_ref().unwrap() - } -} - -struct GlobalTimelinesState { - timelines: HashMap>, - wal_backup_launcher_tx: Option>, -} - -static TIMELINES_STATE: Lazy> = Lazy::new(|| { - Mutex::new(GlobalTimelinesState { - timelines: HashMap::new(), - wal_backup_launcher_tx: None, - }) -}); - -#[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteForceResult { - pub dir_existed: bool, - pub was_active: bool, -} - -/// A zero-sized struct used to manage access to the global timelines map. 
-pub struct GlobalTimelines; - -impl GlobalTimelines { - pub fn init(wal_backup_launcher_tx: Sender) { - let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.wal_backup_launcher_tx.is_none()); - state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); - } - - fn create_internal( - mut state: MutexGuard, - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - peer_ids: Vec, - ) -> Result> { - match state.timelines.get(&zttid) { - Some(_) => bail!("timeline {} already exists", zttid), - None => { - // TODO: check directory existence - let dir = conf.timeline_dir(&zttid); - fs::create_dir_all(dir)?; - - let shared_state = SharedState::create(conf, &zttid, peer_ids) - .context("failed to create shared state")?; - - let new_tli = Arc::new(Timeline::new( - zttid, - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - shared_state, - )); - state.timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } - - pub fn create( - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - peer_ids: Vec, - ) -> Result> { - let state = TIMELINES_STATE.lock().unwrap(); - GlobalTimelines::create_internal(state, conf, zttid, peer_ids) - } - - /// Get a timeline with control file loaded from the global TIMELINES_STATE.timelines map. - /// If control file doesn't exist and create=false, bails out. 
- pub fn get( - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - create: bool, - ) -> Result> { - let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); - - let mut state = TIMELINES_STATE.lock().unwrap(); - - match state.timelines.get(&zttid) { - Some(result) => Ok(Arc::clone(result)), - None => { - let shared_state = SharedState::restore(conf, &zttid); - - let shared_state = match shared_state { - Ok(shared_state) => shared_state, - Err(error) => { - // TODO: always create timeline explicitly - if error - .root_cause() - .to_string() - .contains("No such file or directory") - && create - { - return GlobalTimelines::create_internal(state, conf, zttid, vec![]); - } else { - return Err(error); - } - } - }; - - let new_tli = Arc::new(Timeline::new( - zttid, - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - shared_state, - )); - state.timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } - - /// Get loaded timeline, if it exists. - pub fn get_loaded(zttid: TenantTimelineId) -> Option> { - let state = TIMELINES_STATE.lock().unwrap(); - state.timelines.get(&zttid).map(Arc::clone) - } - - pub fn get_active_timelines() -> HashSet { - let state = TIMELINES_STATE.lock().unwrap(); - state - .timelines - .iter() - .filter(|&(_, tli)| tli.is_active()) - .map(|(zttid, _)| *zttid) - .collect() - } - - /// Return FullTimelineInfo for all active timelines. 
- pub fn active_timelines_metrics() -> Vec { - let state = TIMELINES_STATE.lock().unwrap(); - state - .timelines - .iter() - .filter_map(|(_, tli)| tli.info_for_metrics()) - .collect() - } - - fn delete_force_internal( - conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - was_active: bool, - ) -> Result { - match std::fs::remove_dir_all(conf.timeline_dir(zttid)) { - Ok(_) => Ok(TimelineDeleteForceResult { - dir_existed: true, - was_active, - }), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(TimelineDeleteForceResult { - dir_existed: false, - was_active, - }), - Err(e) => Err(e.into()), - } - } - - /// Deactivates and deletes the timeline, see `Timeline::deactivate_for_delete()`, the deletes - /// the corresponding data directory. - /// We assume all timeline threads do not care about `GlobalTimelines` not containing the timeline - /// anymore, and they will eventually terminate without panics. - /// - /// There are multiple ways the timeline may be accidentally "re-created" (so we end up with two - /// `Timeline` objects in memory): - /// a) a compute node connects after this method is called, or - /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or - /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. - /// TODO: ensure all of the above never happens. - pub async fn delete_force( - conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - ) -> Result { - info!("deleting timeline {}", zttid); - let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); - let mut was_active = false; - if let Some(tli) = timeline { - was_active = tli.deactivate_for_delete().await?; - } - GlobalTimelines::delete_force_internal(conf, zttid, was_active) - } - - /// Deactivates and deletes all timelines for the tenant, see `delete()`. - /// Returns map of all timelines which the tenant had, `true` if a timeline was active. 
- /// There may be a race if new timelines are created simultaneously. - pub async fn delete_force_all_for_tenant( - conf: &SafeKeeperConf, - tenant_id: &TenantId, - ) -> Result> { - info!("deleting all timelines for tenant {}", tenant_id); - let mut to_delete = HashMap::new(); - { - // Keep mutex in this scope. - let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; - for (&zttid, tli) in timelines.iter() { - if zttid.tenant_id == *tenant_id { - to_delete.insert(zttid, tli.clone()); - } - } - // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. - timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); - } - let mut deleted = HashMap::new(); - for (zttid, timeline) in to_delete { - let was_active = timeline.deactivate_for_delete().await?; - deleted.insert( - zttid, - GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, - ); - } - // There may be inactive timelines, so delete the whole tenant dir as well. - match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) { - Ok(_) => (), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), - e => e?, - }; - Ok(deleted) +/// Deletes directory and it's contents. Returns false if directory does not exist. +fn delete_dir(path: &PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), } } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs new file mode 100644 index 0000000000..cf99a243d7 --- /dev/null +++ b/safekeeper/src/timelines_global_map.rs @@ -0,0 +1,348 @@ +//! This module contains global (tenant_id, timeline_id) -> Arc mapping. +//! All timelines should always be present in this map, this is done by loading them +//! all from the disk on startup and keeping them in memory. 
+ +use crate::safekeeper::ServerInfo; +use crate::timeline::{Timeline, TimelineError}; +use crate::SafeKeeperConf; +use anyhow::{anyhow, bail, Context, Result}; +use once_cell::sync::Lazy; +use serde::Serialize; +use std::collections::HashMap; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::{Arc, Mutex, MutexGuard}; +use tokio::sync::mpsc::Sender; +use tracing::*; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +struct GlobalTimelinesState { + timelines: HashMap>, + wal_backup_launcher_tx: Option>, + conf: SafeKeeperConf, +} + +impl GlobalTimelinesState { + /// Get dependencies for a timeline constructor. + fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { + ( + self.conf.clone(), + self.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) + } + + /// Insert timeline into the map. Returns error if timeline with the same id already exists. + fn try_insert(&mut self, timeline: Arc) -> Result<()> { + let ttid = timeline.ttid; + if self.timelines.contains_key(&ttid) { + bail!(TimelineError::AlreadyExists(ttid)); + } + self.timelines.insert(ttid, timeline); + Ok(()) + } + + /// Get timeline from the map. Returns error if timeline doesn't exist. + fn get(&self, ttid: &TenantTimelineId) -> Result> { + self.timelines + .get(ttid) + .cloned() + .ok_or_else(|| anyhow!(TimelineError::NotFound(*ttid))) + } +} + +static TIMELINES_STATE: Lazy> = Lazy::new(|| { + Mutex::new(GlobalTimelinesState { + timelines: HashMap::new(), + wal_backup_launcher_tx: None, + conf: SafeKeeperConf::default(), + }) +}); + +/// A zero-sized struct used to manage access to the global timelines map. +pub struct GlobalTimelines; + +impl GlobalTimelines { + /// Inject dependencies needed for the timeline constructors and load all timelines to memory. 
+ pub fn init( + conf: SafeKeeperConf, + wal_backup_launcher_tx: Sender, + ) -> Result<()> { + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(state.wal_backup_launcher_tx.is_none()); + state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); + state.conf = conf; + + // Iterate through all directories and load tenants for all directories + // named as a valid tenant_id. + let mut tenant_count = 0; + let tenants_dir = state.conf.workdir.clone(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + if let Ok(tenant_id) = + TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or("")) + { + tenant_count += 1; + GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?; + } + } + Err(e) => error!( + "failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } + } + + info!( + "found {} tenants directories, successfully loaded {} timelines", + tenant_count, + state.timelines.len() + ); + Ok(()) + } + + /// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any. + fn load_tenant_timelines( + state: &mut MutexGuard, + tenant_id: TenantId, + ) -> Result<()> { + let timelines_dir = state.conf.tenant_dir(&tenant_id); + for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))? 
+ { + match &timelines_dir_entry { + Ok(timeline_dir_entry) => { + if let Ok(timeline_id) = + TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) + { + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + match Timeline::load_timeline( + state.conf.clone(), + ttid, + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) { + Ok(timeline) => { + state.timelines.insert(ttid, Arc::new(timeline)); + } + // If we can't load a timeline, it's most likely because of a corrupted + // directory. We will log an error and won't allow to delete/recreate + // this timeline. The only way to fix this timeline is to repair manually + // and restart the safekeeper. + Err(e) => error!( + "failed to load timeline {} for tenant {}, reason: {:?}", + timeline_id, tenant_id, e + ), + } + } + } + Err(e) => error!( + "failed to list timelines dir entry {:?} in directory {}, reason: {:?}", + timelines_dir_entry, + timelines_dir.display(), + e + ), + } + } + + Ok(()) + } + + /// Create a new timeline with the given id. If the timeline already exists, returns + /// an existing timeline. + pub fn create(ttid: TenantTimelineId, server_info: ServerInfo) -> Result> { + let (conf, wal_backup_launcher_tx) = { + let state = TIMELINES_STATE.lock().unwrap(); + if let Ok(timeline) = state.get(&ttid) { + // Timeline already exists, return it. + return Ok(timeline); + } + state.get_dependencies() + }; + + info!("creating new timeline {}", ttid); + + let timeline = Arc::new(Timeline::create_empty( + conf, + ttid, + wal_backup_launcher_tx, + server_info, + )?); + + // Take a lock and finish the initialization holding this mutex. No other threads + // can interfere with creation after we will insert timeline into the map. + let mut shared_state = timeline.write_shared_state(); + + // We can get a race condition here in case of concurrent create calls, but only + // in theory. create() will return valid timeline on the next try. 
+ TIMELINES_STATE + .lock() + .unwrap() + .try_insert(timeline.clone())?; + + // Write the new timeline to the disk and start background workers. + // Bootstrap is transactional, so if it fails, the timeline will be deleted, + // and the state on disk should remain unchanged. + match timeline.bootstrap(&mut shared_state) { + Ok(_) => { + // We are done with bootstrap, release the lock, return the timeline. + drop(shared_state); + Ok(timeline) + } + Err(e) => { + // Note: the most likely reason for bootstrap failure is that the timeline + // directory already exists on disk. This happens when timeline is corrupted + // and wasn't loaded from disk on startup because of that. We want to preserve + // the timeline directory in this case, for further inspection. + + // TODO: this is an unusual error, perhaps we should send it to sentry + // TODO: compute will try to create timeline every second, we should add backoff + error!("failed to bootstrap timeline {}: {}", ttid, e); + + // Timeline failed to bootstrap, it cannot be used. Remove it from the map. + TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid); + Err(e) + } + } + } + + /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, + /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, + /// i.e. loaded in memory and not cancelled. + pub fn get(ttid: TenantTimelineId) -> Result> { + let res = TIMELINES_STATE.lock().unwrap().get(&ttid); + + match res { + Ok(tli) => { + if tli.is_cancelled() { + anyhow::bail!(TimelineError::Cancelled(ttid)); + } + Ok(tli) + } + Err(e) => Err(e), + } + } + + /// Returns all timelines. This is used for background timeline proccesses. + pub fn get_all() -> Vec> { + let global_lock = TIMELINES_STATE.lock().unwrap(); + global_lock + .timelines + .values() + .cloned() + .filter(|t| !t.is_cancelled()) + .collect() + } + + /// Returns all timelines belonging to a given tenant. 
Used for deleting all timelines of a tenant, + /// and that's why it can return cancelled timelines, to retry deleting them. + fn get_all_for_tenant(tenant_id: TenantId) -> Vec> { + let global_lock = TIMELINES_STATE.lock().unwrap(); + global_lock + .timelines + .values() + .filter(|t| t.ttid.tenant_id == tenant_id) + .cloned() + .collect() + } + + /// Cancels timeline, then deletes the corresponding data directory. + pub fn delete_force(ttid: &TenantTimelineId) -> Result { + let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); + match tli_res { + Ok(timeline) => { + // Take a lock and finish the deletion holding this mutex. + let mut shared_state = timeline.write_shared_state(); + + info!("deleting timeline {}", ttid); + let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?; + + // Remove timeline from the map. + TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); + + Ok(TimelineDeleteForceResult { + dir_existed, + was_active, + }) + } + Err(_) => { + // Timeline is not memory, but it may still exist on disk in broken state. + let dir_path = TIMELINES_STATE.lock().unwrap().conf.timeline_dir(ttid); + let dir_existed = delete_dir(dir_path)?; + + Ok(TimelineDeleteForceResult { + dir_existed, + was_active: false, + }) + } + } + } + + /// Deactivates and deletes all timelines for the tenant. Returns map of all timelines which + /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are + /// created simultaneously. In that case the function will return error and the caller should + /// retry tenant deletion again later. 
+ pub fn delete_force_all_for_tenant( + tenant_id: &TenantId, + ) -> Result> { + info!("deleting all timelines for tenant {}", tenant_id); + let to_delete = Self::get_all_for_tenant(*tenant_id); + + let mut err = None; + + let mut deleted = HashMap::new(); + for tli in &to_delete { + match Self::delete_force(&tli.ttid) { + Ok(result) => { + deleted.insert(tli.ttid, result); + } + Err(e) => { + error!("failed to delete timeline {}: {}", tli.ttid, e); + // Save error to return later. + err = Some(e); + } + } + } + + // If there was an error, return it. + if let Some(e) = err { + return Err(e); + } + + // There may be broken timelines on disk, so delete the whole tenant dir as well. + // Note that we could concurrently create new timelines while we were deleting them, + // so the directory may be not empty. In this case timelines will have bad state + // and timeline background jobs can panic. + delete_dir(TIMELINES_STATE.lock().unwrap().conf.tenant_dir(tenant_id))?; + + let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); + if !tlis_after_delete.is_empty() { + // Some timelines were created while we were deleting them, returning error + // to the caller, so it can retry later. + bail!( + "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", + tenant_id + ); + } + + Ok(deleted) + } +} + +#[derive(Clone, Copy, Serialize)] +pub struct TimelineDeleteForceResult { + pub dir_existed: bool, + pub was_active: bool, +} + +/// Deletes directory and it's contents. Returns false if directory does not exist. 
+fn delete_dir(path: PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 85e967e218..0d5321fb3a 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -26,8 +26,8 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::broker::{Election, ElectionLeader}; -use crate::timeline::{GlobalTimelines, Timeline}; -use crate::{broker, SafeKeeperConf}; +use crate::timeline::Timeline; +use crate::{broker, GlobalTimelines, SafeKeeperConf}; use once_cell::sync::OnceCell; @@ -53,8 +53,10 @@ pub fn wal_backup_launcher_thread_main( /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. -fn is_wal_backup_required(zttid: TenantTimelineId) -> Option> { - GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend()) +fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { + GlobalTimelines::get(ttid) + .ok() + .filter(|tli| tli.wal_backup_attend()) } struct WalBackupTaskHandle { @@ -70,20 +72,20 @@ struct WalBackupTimelineEntry { /// Start per timeline task, if it makes sense for this safekeeper to offload. fn consider_start_task( conf: &SafeKeeperConf, - zttid: TenantTimelineId, + ttid: TenantTimelineId, task: &mut WalBackupTimelineEntry, ) { if !task.timeline.can_wal_backup() { return; } - info!("starting WAL backup task for {}", zttid); + info!("starting WAL backup task for {}", ttid); // TODO: decide who should offload right here by simply checking current // state instead of running elections in offloading task. 
let election_name = SubscriptionKey { cluster_prefix: conf.broker_etcd_prefix.clone(), kind: SubscriptionKind::Operation( - zttid, + ttid, NodeKind::Safekeeper, OperationKind::Safekeeper(SkOperationKind::WalBackup), ), @@ -97,11 +99,11 @@ fn consider_start_task( ); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&zttid); + let timeline_dir = conf.timeline_dir(&ttid); let handle = tokio::spawn( - backup_task_main(zttid, timeline_dir, shutdown_rx, election) - .instrument(info_span!("WAL backup task", zttid = %zttid)), + backup_task_main(ttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup task", ttid = %ttid)), ); task.handle = Some(WalBackupTaskHandle { @@ -140,33 +142,33 @@ async fn wal_backup_launcher_main_loop( let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); loop { tokio::select! { - zttid = wal_backup_launcher_rx.recv() => { + ttid = wal_backup_launcher_rx.recv() => { // channel is never expected to get closed - let zttid = zttid.unwrap(); + let ttid = ttid.unwrap(); if conf.remote_storage.is_none() || !conf.wal_backup_enabled { continue; /* just drain the channel and do nothing */ } - let timeline = is_wal_backup_required(zttid); + let timeline = is_wal_backup_required(ttid); // do we need to do anything at all? 
- if timeline.is_some() != tasks.contains_key(&zttid) { + if timeline.is_some() != tasks.contains_key(&ttid) { if let Some(timeline) = timeline { // need to start the task - let entry = tasks.entry(zttid).or_insert(WalBackupTimelineEntry { + let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { timeline, handle: None, }); - consider_start_task(&conf, zttid, entry); + consider_start_task(&conf, ttid, entry); } else { // need to stop the task - info!("stopping WAL backup task for {}", zttid); + info!("stopping WAL backup task for {}", ttid); - let entry = tasks.remove(&zttid).unwrap(); + let entry = tasks.remove(&ttid).unwrap(); if let Some(wb_handle) = entry.handle { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", zttid, e); + warn!("WAL backup task for {} panicked: {}", ttid, e); } } } @@ -174,8 +176,8 @@ async fn wal_backup_launcher_main_loop( } // Start known tasks, if needed and possible. _ = ticker.tick() => { - for (zttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { - consider_start_task(&conf, *zttid, entry); + for (ttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { + consider_start_task(&conf, *ttid, entry); } } } @@ -191,26 +193,26 @@ struct WalBackupTask { election: Election, } -/// Offload single timeline. +/// Offload single timeline. Called only after we checked that backup +/// is required (wal_backup_attend) and possible (can_wal_backup). 
async fn backup_task_main( - zttid: TenantTimelineId, + ttid: TenantTimelineId, timeline_dir: PathBuf, mut shutdown_rx: Receiver<()>, election: Election, ) { info!("started"); - let timeline: Arc = if let Some(tli) = GlobalTimelines::get_loaded(zttid) { - tli - } else { - /* Timeline could get deleted while task was starting, just exit then. */ - info!("no timeline, exiting"); + let res = GlobalTimelines::get(ttid); + if let Err(e) = res { + error!("backup error for timeline {}: {}", ttid, e); return; - }; + } + let tli = res.unwrap(); let mut wb = WalBackupTask { - wal_seg_size: timeline.get_wal_seg_size(), - commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(), - timeline, + wal_seg_size: tli.get_wal_seg_size(), + commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline: tli, timeline_dir, leader: None, election, @@ -322,7 +324,11 @@ impl WalBackupTask { { Ok(backup_lsn_result) => { backup_lsn = backup_lsn_result; - self.timeline.set_wal_backup_lsn(backup_lsn_result); + let res = self.timeline.set_wal_backup_lsn(backup_lsn_result); + if let Err(e) = res { + error!("backup error: {}", e); + return; + } retry_attempt = 0; } Err(e) => { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 58b69f06e7..ea613dd0f1 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -7,7 +7,7 @@ //! //! Note that last file has `.partial` suffix, that's different from postgres. 
-use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{bail, Context, Result}; use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; @@ -17,7 +17,7 @@ use postgres_ffi::v14::xlog_utils::{ find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; use postgres_ffi::{XLogSegNo, PG_TLI}; -use std::cmp::min; +use std::cmp::{max, min}; use std::fs::{self, remove_file, File, OpenOptions}; use std::io::Write; @@ -86,9 +86,9 @@ struct WalStorageMetrics { } impl WalStorageMetrics { - fn new(zttid: &TenantTimelineId) -> Self { - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); + fn new(ttid: &TenantTimelineId) -> Self { + let tenant_id = ttid.tenant_id.to_string(); + let timeline_id = ttid.timeline_id.to_string(); Self { write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), @@ -101,9 +101,6 @@ pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; - /// Init storage with wal_seg_size and read WAL from disk to get latest LSN. - fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()>; - /// Write piece of WAL from buf to disk, but not necessarily sync it. fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -119,7 +116,7 @@ pub trait Storage { } /// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes -/// for better performance. Storage must be initialized before use. +/// for better performance. Storage is initialized in the constructor. /// /// WAL is stored in segments, each segment is a file. Last segment has ".partial" suffix in /// its filename and may be not fully flushed. 
@@ -127,16 +124,14 @@ pub trait Storage { /// Relationship of LSNs: /// `write_lsn` >= `write_record_lsn` >= `flush_record_lsn` /// -/// When storage is just created, all LSNs are zeroes and there are no segments on disk. +/// When storage is created first time, all LSNs are zeroes and there are no segments on disk. pub struct PhysicalStorage { metrics: WalStorageMetrics, - zttid: TenantTimelineId, timeline_dir: PathBuf, conf: SafeKeeperConf, - // fields below are filled upon initialization - /// None if uninitialized, Some(usize) if storage is initialized. - wal_seg_size: Option, + /// Size of WAL segment in bytes. + wal_seg_size: usize, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. @@ -161,25 +156,47 @@ pub struct PhysicalStorage { } impl PhysicalStorage { - pub fn new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { - let timeline_dir = conf.timeline_dir(zttid); - PhysicalStorage { - metrics: WalStorageMetrics::new(zttid), - zttid: *zttid, + /// Create new storage. If commit_lsn is not zero, flush_lsn is tried to be restored from + /// the disk. Otherwise, all LSNs are set to zero. + pub fn new( + ttid: &TenantTimelineId, + conf: &SafeKeeperConf, + state: &SafeKeeperState, + ) -> Result { + let timeline_dir = conf.timeline_dir(ttid); + let wal_seg_size = state.server.wal_seg_size as usize; + + // Find out where stored WAL ends, starting at commit_lsn which is a + // known recent record boundary (unless we don't have WAL at all). + let write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + find_end_of_wal(&timeline_dir, wal_seg_size, state.commit_lsn)? + }; + + // TODO: do we really know that write_lsn is fully flushed to disk? + // If not, maybe it's better to call fsync() here to be sure? 
+ let flush_lsn = write_lsn; + + info!( + "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", + ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, + ); + if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn { + warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id); + } + + Ok(PhysicalStorage { + metrics: WalStorageMetrics::new(ttid), timeline_dir, conf: conf.clone(), - wal_seg_size: None, - write_lsn: Lsn(0), - write_record_lsn: Lsn(0), - flush_record_lsn: Lsn(0), - decoder: WalStreamDecoder::new(Lsn(0)), + wal_seg_size, + write_lsn, + write_record_lsn: write_lsn, + flush_record_lsn: flush_lsn, + decoder: WalStreamDecoder::new(write_lsn), file: None, - } - } - - /// Wrapper for flush_lsn updates that also updates metrics. - fn update_flush_lsn(&mut self) { - self.flush_record_lsn = self.write_record_lsn; + }) } /// Call fdatasync if config requires so. @@ -204,9 +221,9 @@ impl PhysicalStorage { /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. 
- fn open_or_create(&self, segno: XLogSegNo, wal_seg_size: usize) -> Result<(File, bool)> { + fn open_or_create(&self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; // Try to open already completed segment if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { @@ -222,24 +239,18 @@ impl PhysicalStorage { .open(&wal_file_partial_path) .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; - write_zeroes(&mut file, wal_seg_size)?; + write_zeroes(&mut file, self.wal_seg_size)?; self.fsync_file(&mut file)?; Ok((file, true)) } } /// Write WAL bytes, which are known to be located in a single WAL segment. - fn write_in_segment( - &mut self, - segno: u64, - xlogoff: usize, - buf: &[u8], - wal_seg_size: usize, - ) -> Result<()> { + fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { let mut file = if let Some(file) = self.file.take() { file } else { - let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + let (mut file, is_partial) = self.open_or_create(segno)?; assert!(is_partial, "unexpected write into non-partial segment file"); file.seek(SeekFrom::Start(xlogoff as u64))?; file @@ -247,13 +258,13 @@ impl PhysicalStorage { file.write_all(buf)?; - if xlogoff + buf.len() == wal_seg_size { + if xlogoff + buf.len() == self.wal_seg_size { // If we reached the end of a WAL segment, flush and close it. self.fdatasync_file(&mut file)?; // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; fs::rename(&wal_file_partial_path, &wal_file_path)?; } else { // otherwise, file can be reused later @@ -269,10 +280,6 @@ impl PhysicalStorage { /// /// Updates `write_lsn`. 
fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - if self.write_lsn != pos { // need to flush the file before discarding it if let Some(mut file) = self.file.take() { @@ -284,17 +291,17 @@ impl PhysicalStorage { while !buf.is_empty() { // Extract WAL location for this block - let xlogoff = self.write_lsn.segment_offset(wal_seg_size) as usize; - let segno = self.write_lsn.segment_number(wal_seg_size); + let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize; + let segno = self.write_lsn.segment_number(self.wal_seg_size); // If crossing a WAL boundary, only write up until we reach wal segment size. - let bytes_write = if xlogoff + buf.len() > wal_seg_size { - wal_seg_size - xlogoff + let bytes_write = if xlogoff + buf.len() > self.wal_seg_size { + self.wal_seg_size - xlogoff } else { buf.len() }; - self.write_in_segment(segno, xlogoff, &buf[..bytes_write], wal_seg_size)?; + self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?; self.write_lsn += bytes_write as u64; buf = &buf[bytes_write..]; } @@ -309,53 +316,6 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } - /// Storage needs to know wal_seg_size to know which segment to read/write, but - /// wal_seg_size is not always known at the moment of storage creation. This method - /// allows to postpone its initialization. - fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { - if state.server.wal_seg_size == 0 { - // wal_seg_size is still unknown. This is dead path normally, should - // be used only in tests. 
- return Ok(()); - } - - if let Some(wal_seg_size) = self.wal_seg_size { - // physical storage is already initialized - assert_eq!(wal_seg_size, state.server.wal_seg_size as usize); - return Ok(()); - } - - // initialize physical storage - let wal_seg_size = state.server.wal_seg_size as usize; - self.wal_seg_size = Some(wal_seg_size); - - // Find out where stored WAL ends, starting at commit_lsn which is a - // known recent record boundary (unless we don't have WAL at all). - self.write_lsn = if state.commit_lsn == Lsn(0) { - Lsn(0) - } else { - find_end_of_wal(&self.timeline_dir, wal_seg_size, state.commit_lsn)? - }; - - self.write_record_lsn = self.write_lsn; - - // TODO: do we really know that write_lsn is fully flushed to disk? - // If not, maybe it's better to call fsync() here to be sure? - self.update_flush_lsn(); - - info!( - "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", - self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.peer_horizon_lsn, - ); - if self.flush_record_lsn < state.commit_lsn - || self.flush_record_lsn < state.peer_horizon_lsn - { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", self.zttid.timeline_id); - } - - Ok(()) - } - /// Write WAL to disk. fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. @@ -419,80 +379,83 @@ impl Storage for PhysicalStorage { // We have unflushed data (write_lsn != flush_lsn), but no file. // This should only happen if last file was fully written and flushed, // but haven't updated flush_lsn yet. 
- assert!(self.write_lsn.segment_offset(self.wal_seg_size.unwrap()) == 0); + if self.write_lsn.segment_offset(self.wal_seg_size) != 0 { + bail!( + "unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}", + self.write_lsn, + self.flush_record_lsn + ); + } } // everything is flushed now, let's update flush_lsn - self.update_flush_lsn(); + self.flush_record_lsn = self.write_record_lsn; Ok(()) } /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - assert!(self.write_lsn == Lsn(0) || self.write_lsn >= end_pos); + if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + bail!( + "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", + self.write_lsn, + end_pos + ); + } // Close previously opened file, if any if let Some(mut unflushed_file) = self.file.take() { self.fdatasync_file(&mut unflushed_file)?; } - let xlogoff = end_pos.segment_offset(wal_seg_size) as usize; - let segno = end_pos.segment_number(wal_seg_size); - let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize; + let segno = end_pos.segment_number(self.wal_seg_size); + + // Remove all segments after the given LSN. 
+ remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?; + + let (mut file, is_partial) = self.open_or_create(segno)?; // Fill end with zeroes file.seek(SeekFrom::Start(xlogoff as u64))?; - write_zeroes(&mut file, wal_seg_size - xlogoff)?; + write_zeroes(&mut file, self.wal_seg_size - xlogoff)?; self.fdatasync_file(&mut file)?; if !is_partial { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; fs::rename(&wal_file_path, &wal_file_partial_path)?; } - // Remove all subsequent segments - let mut segno = segno; - loop { - segno += 1; - let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; - // TODO: better use fs::try_exists which is currently available only in nightly build - if wal_file_path.exists() { - fs::remove_file(&wal_file_path)?; - } else if wal_file_partial_path.exists() { - fs::remove_file(&wal_file_partial_path)?; - } else { - break; - } - } - // Update LSNs self.write_lsn = end_pos; self.write_record_lsn = end_pos; - self.update_flush_lsn(); + self.flush_record_lsn = end_pos; Ok(()) } fn remove_up_to(&self) -> Box Result<()>> { let timeline_dir = self.timeline_dir.clone(); - let wal_seg_size = self.wal_seg_size.unwrap(); + let wal_seg_size = self.wal_seg_size; Box::new(move |segno_up_to: XLogSegNo| { - remove_up_to(&timeline_dir, wal_seg_size, segno_up_to) + remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) }) } } -/// Remove all WAL segments in timeline_dir <= given segno. -fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo) -> Result<()> { +/// Remove all WAL segments in timeline_dir that match the given predicate. 
+fn remove_segments_from_disk( + timeline_dir: &Path, + wal_seg_size: usize, + remove_predicate: impl Fn(XLogSegNo) -> bool, +) -> Result<()> { let mut n_removed = 0; + let mut min_removed = u64::MAX; + let mut max_removed = u64::MIN; + for entry in fs::read_dir(&timeline_dir)? { let entry = entry?; let entry_path = entry.path(); @@ -504,19 +467,21 @@ fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo continue; } let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); - if segno <= segno_up_to { + if remove_predicate(segno) { remove_file(entry_path)?; n_removed += 1; + min_removed = min(min_removed, segno); + max_removed = max(max_removed, segno); } } } - let segno_from = segno_up_to - n_removed + 1; - info!( - "removed {} WAL segments [{}; {}]", - n_removed, - XLogFileName(PG_TLI, segno_from, wal_seg_size), - XLogFileName(PG_TLI, segno_up_to, wal_seg_size) - ); + + if n_removed > 0 { + info!( + "removed {} WAL segments [{}; {}]", + n_removed, min_removed, max_removed + ); + } Ok(()) } @@ -526,8 +491,10 @@ pub struct WalReader { pos: Lsn, wal_segment: Option>>, - enable_remote_read: bool, // S3 will be used to read WAL if LSN is not available locally + enable_remote_read: bool, + + // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, } From 7863c4a702617b2af5917d6a273a675395455e69 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:37:06 +0300 Subject: [PATCH 087/166] Regenerate Hakari files, add a CI check for that --- .github/workflows/codestyle.yml | 10 +++++++++- Cargo.lock | 22 +++------------------- libs/postgres_ffi/wal_craft/Cargo.toml | 1 + workspace_hack/Cargo.toml | 6 ++---- 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 237cf81205..5220258ef0 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -30,7 +30,7 @@ jobs: # this is all we need to install 
our toolchain later via rust-toolchain.toml # so don't install any toolchain explicitly. os: [ubuntu-latest, macos-latest] - timeout-minutes: 60 + timeout-minutes: 90 name: check codestyle rust and postgres runs-on: ${{ matrix.os }} @@ -108,6 +108,14 @@ jobs: target key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust + # https://github.com/facebookincubator/cargo-guppy/tree/main/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check every project module is covered by Hakari + run: | + cargo install cargo-hakari + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + shell: bash -euxo pipefail {0} + - name: Run cargo clippy run: ./run_clippy.sh diff --git a/Cargo.lock b/Cargo.lock index 2f4a57b698..3ce0ce465f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -377,13 +377,9 @@ version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ - "ansi_term", - "atty", "bitflags", - "strsim 0.8.0", "textwrap 0.11.0", "unicode-width", - "vec_map", ] [[package]] @@ -396,7 +392,7 @@ dependencies = [ "bitflags", "clap_lex", "indexmap", - "strsim 0.10.0", + "strsim", "termcolor", "textwrap 0.15.0", ] @@ -746,7 +742,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim 0.10.0", + "strsim", "syn", ] @@ -3023,12 +3019,6 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - [[package]] name = "strsim" version = "0.10.0" @@ -3685,12 +3675,6 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "vec_map" 
-version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - [[package]] name = "version_check" version = "0.9.4" @@ -3709,6 +3693,7 @@ dependencies = [ "postgres", "postgres_ffi", "tempfile", + "workspace_hack", ] [[package]] @@ -3942,7 +3927,6 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 2.34.0", "either", "fail", "futures-channel", diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index f848ac1273..88466737ed 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -14,3 +14,4 @@ once_cell = "1.13.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres_ffi = { path = "../" } tempfile = "3.2" +workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 096b3a5d70..96594bbf96 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,7 +19,6 @@ anyhow = { version = "1", features = ["backtrace", "std"] } bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } -clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } @@ -46,16 +45,15 @@ regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features 
= ["alloc", "derive", "serde_derive", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } -tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["once_cell", "std", "valuable"] } +tracing-core = { version = "0.1", features = ["once_cell", "std"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } -clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } From a5019bf771e878b8e3f02563d7803580450ff39f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Sep 2022 12:38:47 +0300 Subject: [PATCH 088/166] Use a simpler way to set extra options for benchmark test. Commit 43a4f7173e fixed the case that there are extra options in the connection string, but broke it in the case when there are not. Fix that. But on second thoughts, it's more straightforward to set the options with ALTER DATABASE, so change the workflow yaml file to do that instead. 
--- .github/workflows/benchmarking.yml | 13 +++++++++---- test_runner/performance/test_perf_pgbench.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0430f0b9c0..4e28223c18 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -183,12 +183,9 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new) + neon-captest-new | neon-captest-prefetch) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; - neon-captest-prefetch) - CONNSTR=${{ steps.create-neon-project.outputs.dsn }}?options=-cenable_seqscan_prefetch%3Don%20-cseqscan_prefetch_buffers%3D10 - ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; @@ -204,6 +201,14 @@ jobs: env: PLATFORM: ${{ matrix.platform }} + - name: Set database options + if: matrix.platform == 'neon-captest-prefetch' + run: | + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10" + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index d9bf237e49..e167ddaafa 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -84,7 +84,7 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if workload_type == PgBenchLoadType.INIT: # Run initialize - options = "-cstatement_timeout=1h " + env.pg.default_options["options"] + options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: From 
4b25b9652a024dd876259088ef8fad56e708ba4d Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 11:06:31 -0700 Subject: [PATCH 089/166] Rename more zid-like idents (#2480) Follow-up to PR #2433 (b8eb908a). There are still a few more unresolved locations that have been left as-is for the same compatibility reasons as in the original PR. --- libs/utils/benches/benchmarks.rs | 12 +++---- libs/utils/src/pq_proto.rs | 46 +++++++++++++------------- pageserver/src/tenant.rs | 2 +- pgxn/neon/libpagestore.c | 12 +++---- pgxn/neon/pagestore_client.h | 6 ++-- pgxn/neon/pagestore_smgr.c | 12 +++---- safekeeper/src/control_file_upgrade.rs | 2 +- 7 files changed, 46 insertions(+), 46 deletions(-) diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index badcb5774e..98d839ca55 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -3,20 +3,20 @@ use criterion::{criterion_group, criterion_main, Criterion}; use utils::id; -pub fn bench_zid_stringify(c: &mut Criterion) { +pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. - let ztl = id::TenantTimelineId::generate(); + let ttid = id::TenantTimelineId::generate(); - c.bench_function("zid.to_string", |b| { + c.bench_function("id.to_string", |b| { b.iter(|| { // FIXME measurement overhead? 
//for _ in 0..1000 { - // ztl.tenant_id.to_string(); + // ttid.tenant_id.to_string(); //} - ztl.tenant_id.to_string(); + ttid.tenant_id.to_string(); }) }); } -criterion_group!(benches, bench_zid_stringify); +criterion_group!(benches, bench_id_stringify); criterion_main!(benches); diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index dde76039d7..21952ab87e 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -931,7 +931,7 @@ impl ReplicationFeedback { // Deserialize ReplicationFeedback message pub fn parse(mut buf: Bytes) -> ReplicationFeedback { - let mut zf = ReplicationFeedback::empty(); + let mut rf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); for _ in 0..nfields { let key = read_cstr(&mut buf).unwrap(); @@ -939,31 +939,31 @@ impl ReplicationFeedback { b"current_timeline_size" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.current_timeline_size = buf.get_u64(); + rf.current_timeline_size = buf.get_u64(); } b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_writelsn = buf.get_u64(); + rf.ps_writelsn = buf.get_u64(); } b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_flushlsn = buf.get_u64(); + rf.ps_flushlsn = buf.get_u64(); } b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_applylsn = buf.get_u64(); + rf.ps_applylsn = buf.get_u64(); } b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); if raw_time > 0 { - zf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); } else { - zf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } _ => { @@ -976,8 +976,8 @@ impl ReplicationFeedback { } } } - trace!("ReplicationFeedback parsed is {:?}", zf); - zf + trace!("ReplicationFeedback parsed is {:?}", rf); + rf } } @@ 
-987,29 +987,29 @@ mod tests { #[test] fn test_replication_feedback_serialization() { - let mut zf = ReplicationFeedback::empty(); - // Fill zf with some values - zf.current_timeline_size = 12345678; + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); - let zf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] fn test_replication_feedback_unknown_key() { - let mut zf = ReplicationFeedback::empty(); - // Fill zf with some values - zf.current_timeline_size = 12345678; + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. 
- zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { @@ -1021,8 +1021,8 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let zf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f56f10d7ea..204caf6dfa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -132,7 +132,7 @@ pub enum TenantState { /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { - /// Get Timeline handle for given zenith timeline ID. + /// Get Timeline handle for given Neon timeline ID. /// This function is idempotent. It doesn't change internal state in any way. pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { self.timelines diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 296865838d..9cd2a86941 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -183,7 +183,7 @@ pageserver_send(NeonRequest * request) if (!connected) pageserver_connect(); - req_buff = zm_pack_request(request); + req_buff = nm_pack_request(request); /* * Send request. 
@@ -204,7 +204,7 @@ pageserver_send(NeonRequest * request) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((NeonMessage *) request); + char *msg = nm_to_string((NeonMessage *) request); neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); @@ -230,12 +230,12 @@ pageserver_receive(void) else if (resp_buff.len == -2) neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); } - resp = zm_unpack_response(&resp_buff); + resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((NeonMessage *) resp); + char *msg = nm_to_string((NeonMessage *) resp); neon_log(PageStoreTrace, "got response: %s", msg); pfree(msg); @@ -282,9 +282,9 @@ page_server_api api = { static bool check_neon_id(char **newval, void **extra, GucSource source) { - uint8 zid[16]; + uint8 id[16]; - return **newval == '\0' || HexDecodeString(zid, *newval, 16); + return **newval == '\0' || HexDecodeString(id, *newval, 16); } static char * diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 633c7b465c..e0cda11b63 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -128,9 +128,9 @@ typedef struct * message */ } NeonErrorResponse; -extern StringInfoData zm_pack_request(NeonRequest * msg); -extern NeonResponse * zm_unpack_response(StringInfo s); -extern char *zm_to_string(NeonMessage * msg); +extern StringInfoData nm_pack_request(NeonRequest * msg); +extern NeonResponse * nm_unpack_response(StringInfo s); +extern char *nm_to_string(NeonMessage * msg); /* * API diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8e6dd373b0..1187550f2a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -160,7 +160,7 @@ page_server_request(void const *req) StringInfoData -zm_pack_request(NeonRequest * msg) +nm_pack_request(NeonRequest * msg) { StringInfoData s; @@ -235,7 +235,7 @@ 
zm_pack_request(NeonRequest * msg) } NeonResponse * -zm_unpack_response(StringInfo s) +nm_unpack_response(StringInfo s) { NeonMessageTag tag = pq_getmsgbyte(s); NeonResponse *resp = NULL; @@ -329,7 +329,7 @@ zm_unpack_response(StringInfo s) /* dump to json for debugging / error reporting purposes */ char * -zm_to_string(NeonMessage * msg) +nm_to_string(NeonMessage * msg) { StringInfoData s; @@ -632,7 +632,7 @@ neon_init(void) * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. */ static XLogRecPtr -zm_adjust_lsn(XLogRecPtr lsn) +nm_adjust_lsn(XLogRecPtr lsn) { /* * If lsn points to the beging of first record on page or segment, then @@ -685,7 +685,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - lsn = zm_adjust_lsn(lsn); + lsn = nm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1569,7 +1569,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ lsn = GetXLogInsertRecPtr(); - lsn = zm_adjust_lsn(lsn); + lsn = nm_adjust_lsn(lsn); /* * Flush it, too. 
We don't actually care about it here, but let's uphold diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 87204d6b49..d8434efb20 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -167,7 +167,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); - // migrate to hexing some zids + // migrate to hexing some ids } else if version == 2 { info!("reading safekeeper control file version {}", version); let oldstate = SafeKeeperStateV2::des(&buf[..buf.len()])?; From 4a3b3ff11d89d02300041e32f43847110637f2e0 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 11:28:12 -0700 Subject: [PATCH 090/166] Move testing pageserver libpq cmds to HTTP api (#2429) Closes #2422. The APIs have been feature gated with the `testing_api!` macro so that they return 400s when support hasn't been compiled in. --- .cargo/config.toml | 2 +- .github/workflows/build_and_test.yml | 4 +- README.md | 7 +- pageserver/Cargo.toml | 6 +- pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/http/models.rs | 18 +++ pageserver/src/http/routes.rs | 141 ++++++++++++++++++ pageserver/src/page_service.rs | 115 +------------- pageserver/src/repository.rs | 11 +- test_runner/README.md | 4 +- test_runner/fixtures/neon_fixtures.py | 57 +++++++ test_runner/regress/test_ancestor_branch.py | 7 +- test_runner/regress/test_basebackup_error.py | 3 +- test_runner/regress/test_branch_and_gc.py | 11 +- test_runner/regress/test_branch_behind.py | 13 +- test_runner/regress/test_broken_timeline.py | 3 +- test_runner/regress/test_gc_aggressive.py | 12 +- test_runner/regress/test_import.py | 5 +- test_runner/regress/test_old_request_lsn.py | 9 +- test_runner/regress/test_pitr_gc.py | 15 +- test_runner/regress/test_readonly_node.py | 3 +- test_runner/regress/test_recovery.py | 41 ++--- test_runner/regress/test_remote_storage.py | 5 +- 
test_runner/regress/test_tenant_detach.py | 23 +-- test_runner/regress/test_tenant_relocation.py | 30 ++-- test_runner/regress/test_tenants.py | 3 +- .../test_tenants_with_remote_storage.py | 2 +- test_runner/regress/test_timeline_size.py | 21 ++- test_runner/regress/test_wal_acceptor.py | 4 +- 29 files changed, 352 insertions(+), 227 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index d70d57a817..c40783bc1b 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -13,4 +13,4 @@ opt-level = 3 opt-level = 1 [alias] -build_testing = ["build", "--features", "failpoints"] +build_testing = ["build", "--features", "testing"] diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0b6cb21120..44db968753 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -100,11 +100,11 @@ jobs: run: | if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FEATURES="--features failpoints" + CARGO_FEATURES="--features testing" CARGO_FLAGS="--locked --timings $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FEATURES="--features failpoints,profiling" + CARGO_FEATURES="--features testing,profiling" CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV diff --git a/README.md b/README.md index 03ed57a0fa..dc469c36b1 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,12 @@ Ensure your dependencies are installed as described [here](https://github.com/ne ```sh git clone --recursive https://github.com/neondatabase/neon.git -make # builds also postgres and installs it to ./pg_install + +# either: +CARGO_BUILD_FLAGS="--features=testing" make +# or: +make debug + ./scripts/pytest ``` diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index ce09e788bd..85ece97d9b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ 
-5,10 +5,10 @@ edition = "2021" [features] default = [] +# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] -# Feature that enables a special API, fail_point! macro (adds some runtime cost) -# to run tests on outage conditions -failpoints = ["fail/failpoints"] profiling = ["pprof"] [dependencies] diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 92d5eab379..fb79ad3945 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -87,8 +87,8 @@ fn main() -> anyhow::Result<()> { if arg_matches.is_present("enabled-features") { let features: &[&str] = &[ - #[cfg(feature = "failpoints")] - "failpoints", + #[cfg(feature = "testing")] + "testing", #[cfg(feature = "profiling")] "profiling", ]; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index c0dc5b9677..2d7d560d2a 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -160,3 +160,21 @@ pub struct TimelineInfo { pub local: Option, pub remote: Option, } + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. 
+ pub actions: String, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineGcRequest { + pub gc_horizon: Option, +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2e49429f38..bfc9e4462b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -29,6 +29,12 @@ use utils::{ lsn::Lsn, }; +// Imports only used for testing APIs +#[cfg(feature = "testing")] +use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +#[cfg(feature = "testing")] +use crate::CheckpointConfig; + struct State { conf: &'static PageServerConf, auth: Option>, @@ -661,6 +667,103 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest( + "Cannot manage failpoints because pageserver was compiled without failpoints support" + .to_owned(), + )); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = if fp.actions == "exit" { + fail::cfg_callback(fp.name, || { + info!("Exit requested by failpoint"); + std::process::exit(1); + }) + } else { + fail::cfg(fp.name, &fp.actions) + }; + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(format!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} + +// Run GC immediately on given timeline. +// FIXME: This is just for tests. See test_runner/regress/test_gc.py. +// This probably should require special authentication or a global flag to +// enable, I don't think we want to or need to allow regular clients to invoke +// GC. 
+// @hllinnaka in commits ec44f4b29, 3aca717f3 +#[cfg(feature = "testing")] +async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + // FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX + let repo = tenant_mgr::get_tenant(tenant_id, false)?; + let gc_req: TimelineGcRequest = json_request(&mut request).await?; + + let _span_guard = + info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon()); + + // Use tenant's pitr setting + let pitr = repo.get_pitr_interval(); + let result = repo.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; + json_response(StatusCode::OK, result) +} + +// Run compaction immediately on given timeline. +// FIXME This is just for tests. Don't expect this to be exposed to +// the users or the api. +// @dhammika in commit a0781f229 +#[cfg(feature = "testing")] +async fn timeline_compact_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let repo = tenant_mgr::get_tenant(tenant_id, true)?; + // FIXME: currently this will return a 500 error on bad timeline id; it should be 4XX + let timeline = repo.get_timeline(timeline_id).with_context(|| { + format!("No timeline {timeline_id} in repository for tenant {tenant_id}") + })?; + timeline.compact()?; + + json_response(StatusCode::OK, ()) +} + +// Run checkpoint immediately on given timeline. 
+#[cfg(feature = "testing")] +async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let repo = tenant_mgr::get_tenant(tenant_id, true)?; + // FIXME: currently this will return a 500 error on bad timeline id; it should be 4XX + let timeline = repo.get_timeline(timeline_id).with_context(|| { + format!("No timeline {timeline_id} in repository for tenant {tenant_id}") + })?; + timeline.checkpoint(CheckpointConfig::Forced)?; + + json_response(StatusCode::OK, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -687,12 +790,38 @@ pub fn make_router( })) } + macro_rules! testing_api { + ($handler_desc:literal, $handler:path $(,)?) => {{ + #[cfg(not(feature = "testing"))] + async fn cfg_disabled(_req: Request) -> Result, ApiError> { + Err(ApiError::BadRequest( + concat!( + "Cannot ", + $handler_desc, + " because pageserver was compiled without testing APIs", + ) + .to_owned(), + )) + } + + #[cfg(feature = "testing")] + let handler = $handler; + #[cfg(not(feature = "testing"))] + let handler = cfg_disabled; + handler + }}; + } + Ok(router .data(Arc::new( State::new(conf, auth, remote_index, remote_storage) .context("Failed to initialize router state")?, )) .get("/v1/status", status_handler) + .put( + "/v1/failpoints", + testing_api!("manage failpoints", failpoints_handler), + ) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .get("/v1/tenant/:tenant_id", tenant_status) @@ -705,6 +834,18 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", + testing_api!("run timeline GC", timeline_gc_handler), + ) + .put( + 
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact", + testing_api!("run timeline compaction", timeline_compact_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + testing_api!("run timeline checkpoint", timeline_checkpoint_handler), + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1461a6d117..9e159f7391 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -27,7 +27,7 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, postgres_backend_async::{self, PostgresBackend}, - pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, + pq_proto::{BeMessage, FeMessage, RowDescriptor}, simple_rcu::RcuReadGuard, }; @@ -1005,31 +1005,6 @@ impl postgres_backend_async::Handler for PageServerHandler { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("failpoints ") { - ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); - - let (_, failpoints) = query_string.split_at("failpoints ".len()); - - for failpoint in failpoints.split(';') { - if let Some((name, actions)) = failpoint.split_once('=') { - info!("cfg failpoint: {} {}", name, actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - if actions == "exit" { - fail::cfg_callback(name, || { - info!("Exit requested by failpoint"); - std::process::exit(1); - }) - .unwrap(); - } else { - fail::cfg(name, actions).unwrap(); - } - } else { - bail!("Invalid failpoints format"); - } - } - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); 
@@ -1072,94 +1047,6 @@ impl postgres_backend_async::Handler for PageServerHandler { Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("do_gc ") { - // Run GC immediately on given timeline. - // FIXME: This is just for tests. See test_runner/regress/test_gc.py. - // This probably should require special authentication or a global flag to - // enable, I don't think we want to or need to allow regular clients to invoke - // GC. - - // do_gc - let re = Regex::new(r"^do_gc ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)([[:digit:]]+)?") - .unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - - let _span_guard = - info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); - - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - - let gc_horizon: u64 = caps - .get(4) - .map(|h| h.as_str().parse()) - .unwrap_or_else(|| Ok(tenant.get_gc_horizon()))?; - - // Use tenant's pitr setting - let pitr = tenant.get_pitr_interval(); - let result = tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; - pgb.write_message(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"layers_total"), - RowDescriptor::int8_col(b"layers_needed_by_cutoff"), - RowDescriptor::int8_col(b"layers_needed_by_pitr"), - RowDescriptor::int8_col(b"layers_needed_by_branches"), - RowDescriptor::int8_col(b"layers_not_updated"), - RowDescriptor::int8_col(b"layers_removed"), - RowDescriptor::int8_col(b"elapsed"), - ]))? 
- .write_message(&BeMessage::DataRow(&[ - Some(result.layers_total.to_string().as_bytes()), - Some(result.layers_needed_by_cutoff.to_string().as_bytes()), - Some(result.layers_needed_by_pitr.to_string().as_bytes()), - Some(result.layers_needed_by_branches.to_string().as_bytes()), - Some(result.layers_not_updated.to_string().as_bytes()), - Some(result.layers_removed.to_string().as_bytes()), - Some(result.elapsed.as_millis().to_string().as_bytes()), - ]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("compact ") { - // Run compaction immediately on given timeline. - // FIXME This is just for tests. Don't expect this to be exposed to - // the users or the api. - - // compact - let re = Regex::new(r"^compact ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("Invalid compact: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - timeline.compact()?; - - pgb.write_message(&SINGLE_COL_ROWDESC)? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("checkpoint ") { - // Run checkpoint immediately on given timeline. - - // checkpoint - let re = Regex::new(r"^checkpoint ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - - // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). - timeline.checkpoint(CheckpointConfig::Forced)?; - - pgb.write_message(&SINGLE_COL_ROWDESC)? 
- .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { // Locate LSN of last transaction with timestamp less or equal than sppecified // TODO lazy static diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index f6ea9d8c5d..cfcc87a2ed 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -176,7 +176,7 @@ impl Value { /// /// Result of performing GC /// -#[derive(Default)] +#[derive(Default, Serialize)] pub struct GcResult { pub layers_total: u64, pub layers_needed_by_cutoff: u64, @@ -185,9 +185,18 @@ pub struct GcResult { pub layers_not_updated: u64, pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. + #[serde(serialize_with = "serialize_duration_as_millis")] pub elapsed: Duration, } +// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds +fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error> +where + S: serde::Serializer, +{ + d.as_millis().serialize(serializer) +} + impl AddAssign for GcResult { fn add_assign(&mut self, other: Self) { self.layers_total += other.layers_total; diff --git a/test_runner/README.md b/test_runner/README.md index f17a4a5a5d..79b2418af6 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,9 +6,9 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - If you want to test tests with failpoints, you would need to add `--features failpoints` to Rust code build commands. + If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags.
- Usage example: `cargo build_testing --release` is equivalent to `cargo build --features failpoints --release` + Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables below to run from other directories. - The neon git repo, including the postgres submodule diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c03429f95..1e83ee3839 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -964,6 +964,24 @@ class NeonPageserverHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]) -> None: + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def tenant_list(self) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant") self.verbose_error(res) @@ -1061,6 +1079,45 @@ class NeonPageserverHttpClient(requests.Session): assert res_json is None return res_json + def timeline_gc( + self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] + ) -> dict[str, Any]: + log.info( + f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", + json={"gc_horizon": gc_horizon}, + ) + log.info(f"Got GC request response code: 
{res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): + log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" + ) + log.info(f"Got compact request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" + ) + log.info(f"Got checkpoint request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index cb2621ff02..d7aebfb938 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -9,6 +9,7 @@ from fixtures.utils import query_scalar # def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. 
@@ -23,7 +24,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - env.pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)") + pageserver_http.configure_failpoints(("flush-frozen-before-sync", "sleep(10000)")) pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() @@ -92,9 +93,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 300k rows: {lsn_300}") # Run compaction on branch1. - compact = f"compact {tenant} {branch1_timeline} {lsn_200}" + compact = f"compact {tenant} {branch1_timeline}" log.info(compact) - env.pageserver.safe_psql(compact) + pageserver_http.timeline_compact(tenant, branch1_timeline) assert query_scalar(branch0_cur, "SELECT count(*) FROM foo") == 100000 diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 81a46ee2f0..94d3999d17 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -9,9 +9,10 @@ from fixtures.neon_fixtures import NeonEnv def test_basebackup_error(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_basebackup_error", "empty") + pageserver_http = env.pageserver.http_client() # Introduce failpoint - env.pageserver.safe_psql("failpoints basebackup-before-control-file=return") + pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): env.postgres.create_start("test_basebackup_error") diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index c8c5929066..12debe50eb 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -47,6 +47,7 @@ from fixtures.utils import query_scalar # could not find data for key ... at LSN ..., for request at LSN ... 
def test_branch_and_gc(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() tenant, _ = env.neon_cli.create_tenant( conf={ @@ -84,7 +85,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. - env.pageserver.safe_psql(f"do_gc {tenant} {timeline_main} {lsn2 - lsn1 + 1024}") + pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 @@ -113,6 +114,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # For more details, see discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447. def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( conf={ @@ -147,10 +150,10 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. - env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)") + pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)")) def do_gc(): - env.pageserver.safe_psql(f"do_gc {tenant} {b0} 0") + pageserver_http_client.timeline_gc(tenant, b0, 0) thread = threading.Thread(target=do_gc, daemon=True) thread.start() @@ -161,7 +164,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): time.sleep(1.0) # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. 
- with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) thread.join() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index b0d0737172..0e2a8b346b 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,4 +1,3 @@ -import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -96,7 +95,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): assert pg.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) # branch at pre-ancestor lsn @@ -106,13 +105,11 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): ) # check that we cannot create branch based on garbage collected data - with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # call gc to advace latest_gc_cutoff_lsn - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + with env.pageserver.http_client() as pageserver_http: + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail env.neon_cli.create_branch( "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index fd81981b2b..7baa67935d 
100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -113,13 +113,14 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() tenant_id, _ = env.neon_cli.create_tenant() old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) # Introduce failpoint when creating a new timeline - env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") + pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) with pytest.raises(Exception, match="before-checkpoint-new-timeline"): _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 88d4ad8a6e..332bef225f 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -1,4 +1,5 @@ import asyncio +import concurrent.futures import random from fixtures.log_helper import log @@ -30,10 +31,15 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon async def gc(env: NeonEnv, timeline: TimelineId): - psconn = await env.pageserver.connect_async() + pageserver_http = env.pageserver.http_client() - while updates_performed < updates_to_perform: - await psconn.execute(f"do_gc {env.initial_tenant} {timeline} 0") + loop = asyncio.get_running_loop() + + with concurrent.futures.ThreadPoolExecutor() as pool: + while updates_performed < updates_to_perform: + await loop.run_in_executor( + pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + ) # At the same time, run UPDATEs and GC diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 7b61b03b97..885a0dc26f 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ 
-270,8 +270,7 @@ def _import( assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) # Check that gc works - psconn = env.pageserver.connect() - pscur = psconn.cursor() - pscur.execute(f"do_gc {tenant} {timeline} 0") + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_gc(tenant, timeline, 0) return tar_output_file diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index c99e13f45f..3e387bb6cc 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,4 +1,3 @@ -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TimelineId @@ -29,8 +28,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Get the timeline ID of our branch. We need it for the 'do_gc' command timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - psconn = env.pageserver.connect() - pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) + pageserver_http = env.pageserver.http_client() # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers. @@ -61,9 +59,8 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. 
for i in range(10): - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - gcrow = pscur.fetchone() - print_gc_result(gcrow) + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) for j in range(100): cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 57b2ee1c04..d8b7256577 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,3 @@ -from contextlib import closing - -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TimelineId @@ -54,13 +51,11 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 10000 rows: {debug_lsn} xid {debug_xid}") # run GC - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute(f"compact {env.initial_tenant} {timeline}") - # perform aggressive GC. Data still should be kept because of the PITR setting. - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_compact(env.initial_tenant, timeline) + # perform aggressive GC. Data still should be kept because of the PITR setting. 
+ gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 3be64e077f..dfa57aec25 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -106,6 +106,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() env.neon_cli.create_branch("test_timetravel", "empty") pg = env.postgres.create_start("test_timetravel") @@ -136,7 +137,7 @@ def test_timetravel(neon_simple_env: NeonEnv): wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to force a new layer file - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver env.postgres.stop_all() diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 08c15d8f09..d0ba96e8e0 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -1,7 +1,6 @@ import time from contextlib import closing -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -19,8 +18,8 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): f = env.neon_cli.pageserver_enabled_features() assert ( - "failpoints" in f["features"] - ), "Build pageserver with --features=failpoints option to run this test" + "testing" in f["features"] + ), "Build pageserver with --features=testing option to run this test" neon_env_builder.start() # Create a branch for us @@ -31,26 +30,28 @@ def 
test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): with closing(pg.connect()) as conn: with conn.cursor() as cur: - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # Create and initialize test table - cur.execute("CREATE TABLE foo(x bigint)") - cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") + with env.pageserver.http_client() as pageserver_http: + # Create and initialize test table + cur.execute("CREATE TABLE foo(x bigint)") + cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") - # Sleep for some time to let checkpoint create image layers - time.sleep(2) + # Sleep for some time to let checkpoint create image layers + time.sleep(2) - # Configure failpoints - pscur.execute( - "failpoints flush-frozen-before-sync=sleep(2000);checkpoint-after-sync=exit" - ) + # Configure failpoints + pageserver_http.configure_failpoints( + [ + ("flush-frozen-before-sync", "sleep(2000)"), + ("checkpoint-after-sync", "exit"), + ] + ) - # Do some updates until pageserver is crashed - try: - while True: - cur.execute("update foo set x=x+1") - except Exception as err: - log.info(f"Expected server crash {err}") + # Do some updates until pageserver is crashed + try: + while True: + cur.execute("update foo set x=x+1") + except Exception as err: + log.info(f"Expected server crash {err}") log.info("Wait before server restart") env.pageserver.stop() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index cbe74cad5c..3e775b10b0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -57,6 +57,7 @@ def test_remote_storage_backup_and_restore( ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") client = env.pageserver.http_client() @@ 
-80,7 +81,7 @@ def test_remote_storage_backup_and_restore( wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to be sure that data landed in remote storage - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) log.info(f"waiting for checkpoint {checkpoint_number} upload") # wait until pageserver successfully uploaded a checkpoint to remote storage @@ -99,7 +100,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.start() # Introduce failpoint in download - env.pageserver.safe_psql("failpoints remote-storage-download-pre-rename=return") + pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) client.tenant_attach(tenant_id) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index e3c9a091f9..f18e6867a9 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,16 +1,21 @@ from threading import Thread -import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + NeonPageserverApiException, + NeonPageserverHttpClient, +) from fixtures.types import TenantId, TimelineId -def do_gc_target(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): +def do_gc_target( + pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: - env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) except Exception as e: log.error("do_gc failed: %s", e) @@ -44,13 +49,13 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start with pytest.raises( - 
expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" + expected_exception=NeonPageserverApiException, match="gc target timeline does not exist" ): bogus_timeline_id = TimelineId.generate() - env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) # try to concurrently run gc and detach - gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) + gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id)) gc_thread.start() last_error = None @@ -73,6 +78,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id} not found" + expected_exception=NeonPageserverApiException, match=f"Tenant {tenant_id} not found" ): - env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index aa7d92f1fd..2b01546198 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -147,14 +147,13 @@ def populate_branch( def ensure_checkpoint( - pageserver_cur, pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage - pageserver_cur.execute(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -324,22 +323,19 @@ def test_tenant_relocation( # this requirement introduces a problem # if user creates a branch during migration # it wont 
appear on the new pageserver - with pg_cur(env.pageserver) as cur: - ensure_checkpoint( - cur, - pageserver_http=pageserver_http, - tenant_id=tenant_id, - timeline_id=timeline_id_main, - current_lsn=current_lsn_main, - ) + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_main, + current_lsn=current_lsn_main, + ) - ensure_checkpoint( - cur, - pageserver_http=pageserver_http, - tenant_id=tenant_id, - timeline_id=timeline_id_second, - current_lsn=current_lsn_second, - ) + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_second, + current_lsn=current_lsn_second, + ) log.info("inititalizing new pageserver") # bootstrap second pageserver diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4500395c8f..52b9e6369c 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -19,7 +19,8 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) initial_tenant_dirs = set([d for d in tenants_dir.iterdir()]) - neon_simple_env.pageserver.safe_psql("failpoints tenant-creation-before-tmp-rename=return") + pageserver_http = neon_simple_env.pageserver.http_client() + pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): _ = neon_simple_env.neon_cli.create_tenant() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 85f371c845..83affac062 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -91,5 +91,5 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) # run final checkpoint manually to flush all the data to remote storage - 
env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 83018f46f5..979d1a107f 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -238,6 +238,7 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") @@ -251,7 +252,7 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -264,6 +265,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") @@ -278,8 +280,8 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") - env.pageserver.safe_psql(f"compact {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + 
pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -290,6 +292,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") pg = env.postgres.create_start("test_timeline_physical_size_post_gc") @@ -304,7 +307,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pg.safe_psql( """ @@ -315,9 +318,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"do_gc {env.initial_tenant} {new_timeline_id} 0") + pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -326,6 +329,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Test the metrics. 
def test_timeline_size_metrics(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") pg = env.postgres.create_start("test_timeline_size_metrics") @@ -340,7 +344,7 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() @@ -382,6 +386,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): random.seed(100) env = neon_simple_env + pageserver_http = env.pageserver.http_client() client = env.pageserver.http_client() tenant, timeline = env.neon_cli.create_tenant() @@ -405,7 +410,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, tenant, timeline) - env.pageserver.safe_psql(f"checkpoint {tenant} {timeline}") + pageserver_http.timeline_checkpoint(tenant, timeline) timeline_total_size += get_timeline_physical_size(timeline) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 089ed91c98..931de0f1e3 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -59,9 +59,7 @@ def wait_lsn_force_checkpoint( ) # force checkpoint to advance remote_consistent_lsn - with closing(ps.connect(**pageserver_conn_options)) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) # ensure that remote_consistent_lsn is advanced wait_for_upload( From 6fc719db13a1feec1fef4bd227147ea19e56cf0f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 
2022 07:52:39 +0300 Subject: [PATCH 091/166] Merge timelines.rs with tenant.rs --- pageserver/src/http/routes.rs | 7 +- pageserver/src/lib.rs | 1 - pageserver/src/tenant.rs | 324 ++++++++++++++++++++++++---------- pageserver/src/timelines.rs | 168 ------------------ 4 files changed, 233 insertions(+), 267 deletions(-) delete mode 100644 pageserver/src/timelines.rs diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bfc9e4462b..0c6f7927fa 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -15,7 +15,7 @@ use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant::{TenantState, Timeline}; use crate::tenant_config::TenantConfOpt; -use crate::{config::PageServerConf, tenant_mgr, timelines}; +use crate::{config::PageServerConf, tenant_mgr}; use utils::{ auth::JwtAuth, http::{ @@ -166,10 +166,9 @@ async fn timeline_create_handler(mut request: Request) -> Result TenantId { + self.tenant_id + } + /// Get Timeline handle for given Neon timeline ID. /// This function is idempotent. It doesn't change internal state in any way. pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { @@ -142,8 +148,7 @@ impl Tenant { .with_context(|| { format!( "Timeline {} was not found for tenant {}", - timeline_id, - self.tenant_id() + timeline_id, self.tenant_id ) }) .map(Arc::clone) @@ -204,98 +209,67 @@ impl Tenant { Ok(new_timeline) } - /// Branch a timeline - pub fn branch_timeline( + /// Create a new timeline. + /// + /// Returns the new timeline ID and reference to its Timeline object. + /// + /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with + /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, + /// a new unique ID is generated. 
+ pub async fn create_timeline( &self, - src: TimelineId, - dst: TimelineId, - start_lsn: Option, - ) -> Result> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().unwrap(); + new_timeline_id: Option, + ancestor_timeline_id: Option, + mut ancestor_start_lsn: Option, + ) -> Result>> { + let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - // In order for the branch creation task to not wait for GC/compaction, - // we need to make sure that the starting LSN of the child branch is not out of scope midway by - // - // 1. holding the GC lock to prevent overwritting timeline's GC data - // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline - // - // Step 2 is to avoid initializing the new branch using data removed by past GC iterations - // or in-queue GC iterations. - - // XXX: keep the lock to avoid races during timeline creation - let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = timelines - .get(&src) - // message about timeline being remote is one .context up in the stack - .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; - - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - - // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN - let start_lsn = start_lsn.unwrap_or_else(|| { - let lsn = src_timeline.get_last_record_lsn(); - info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); - lsn - }); - - // Check if the starting LSN is out of scope because it is less than - // 1. the latest GC cutoff LSN or - // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. 
- src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context(format!( - "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, - ))?; + if self + .conf + .timeline_path(&new_timeline_id, &self.tenant_id) + .exists() { - let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); - if start_lsn < cutoff { - bail!(format!( - "invalid branch start lsn: less than planned GC cutoff {cutoff}" - )); - } + debug!("timeline {new_timeline_id} already exists"); + return Ok(None); } - // Determine prev-LSN for the new timeline. We can only determine it if - // the timeline was branched at the current end of the source timeline. - let RecordLsn { - last: src_last, - prev: src_prev, - } = src_timeline.get_last_record_rlsn(); - let dst_prev = if src_last == start_lsn { - Some(src_prev) - } else { - None + let loaded_timeline = match ancestor_timeline_id { + Some(ancestor_timeline_id) => { + let ancestor_timeline = self + .get_timeline(ancestor_timeline_id) + .context("Cannot branch off the timeline that's not present in pageserver")?; + + if let Some(lsn) = ancestor_start_lsn.as_mut() { + // Wait for the WAL to arrive and be processed on the parent branch up + // to the requested branch point. The repository code itself doesn't + // require it, but if we start to receive WAL on the new timeline, + // decoding the new WAL might need to look up previous pages, relation + // sizes etc. and that would get confused if the previous page versions + // are not in the repository yet. + *lsn = lsn.align(); + ancestor_timeline.wait_lsn(*lsn).await?; + + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > *lsn { + // can we safely just branch from the ancestor instead? 
+ anyhow::bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); + } + } + + self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + } + None => self.bootstrap_timeline(new_timeline_id)?, }; - // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); - crashsafe_dir::create_dir(&timelinedir)?; + // Have added new timeline into the tenant, now its background tasks are needed. + self.activate(true); - // Create the metadata file, noting the ancestor of the new timeline. - // There is initially no data in it, but all the read-calls know to look - // into the ancestor. - let metadata = TimelineMetadata::new( - start_lsn, - dst_prev, - Some(src), - start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read(), - src_timeline.initdb_lsn, - ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - - let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; - timelines.insert(dst, Arc::clone(&new_timeline)); - - info!("branched timeline {dst} from {src} at {start_lsn}"); - - Ok(new_timeline) + Ok(Some(loaded_timeline)) } /// perform one garbage collection iteration, removing old data files from disk. @@ -948,9 +922,171 @@ impl Tenant { Ok(totals) } - pub fn tenant_id(&self) -> TenantId { - self.tenant_id + fn branch_timeline( + &self, + src: TimelineId, + dst: TimelineId, + start_lsn: Option, + ) -> Result> { + // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn + // about timelines, so otherwise a race condition is possible, where we create new timeline and GC + // concurrently removes data that is needed by the new timeline. 
+ let _gc_cs = self.gc_cs.lock().unwrap(); + + // In order for the branch creation task to not wait for GC/compaction, + // we need to make sure that the starting LSN of the child branch is not out of scope midway by + // + // 1. holding the GC lock to prevent overwritting timeline's GC data + // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline + // + // Step 2 is to avoid initializing the new branch using data removed by past GC iterations + // or in-queue GC iterations. + + // XXX: keep the lock to avoid races during timeline creation + let mut timelines = self.timelines.lock().unwrap(); + let src_timeline = timelines + .get(&src) + // message about timeline being remote is one .context up in the stack + .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; + + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + + // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN + let start_lsn = start_lsn.unwrap_or_else(|| { + let lsn = src_timeline.get_last_record_lsn(); + info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + lsn + }); + + // Check if the starting LSN is out of scope because it is less than + // 1. the latest GC cutoff LSN or + // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + src_timeline + .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn, + ))?; + { + let gc_info = src_timeline.gc_info.read().unwrap(); + let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + if start_lsn < cutoff { + bail!(format!( + "invalid branch start lsn: less than planned GC cutoff {cutoff}" + )); + } + } + + // Determine prev-LSN for the new timeline. We can only determine it if + // the timeline was branched at the current end of the source timeline. 
+ let RecordLsn { + last: src_last, + prev: src_prev, + } = src_timeline.get_last_record_rlsn(); + let dst_prev = if src_last == start_lsn { + Some(src_prev) + } else { + None + }; + + // create a new timeline directory + let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); + crashsafe_dir::create_dir(&timelinedir)?; + + // Create the metadata file, noting the ancestor of the new timeline. + // There is initially no data in it, but all the read-calls know to look + // into the ancestor. + let metadata = TimelineMetadata::new( + start_lsn, + dst_prev, + Some(src), + start_lsn, + *src_timeline.latest_gc_cutoff_lsn.read(), + src_timeline.initdb_lsn, + ); + crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; + save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; + + let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; + timelines.insert(dst, Arc::clone(&new_timeline)); + + info!("branched timeline {dst} from {src} at {start_lsn}"); + + Ok(new_timeline) } + + /// - run initdb to init temporary instance and get bootstrap data + /// - after initialization complete, remove the temp dir. + fn bootstrap_timeline(&self, timeline_id: TimelineId) -> Result> { + // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` + // temporary directory for basebackup files for the given timeline. + let initdb_path = path_with_suffix_extension( + self.conf + .timelines_path(&self.tenant_id) + .join(format!("basebackup-{timeline_id}")), + TEMP_FILE_SUFFIX, + ); + + // Init temporarily repo to get bootstrap data + run_initdb(self.conf, &initdb_path)?; + let pgdata_path = initdb_path; + + let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); + + // Import the contents of the data directory at the initial checkpoint + // LSN, and any WAL after that. + // Initdb lsn will be equal to last_record_lsn which will be set after import. 
+ // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. + let timeline = self.create_empty_timeline(timeline_id, lsn)?; + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + + timeline.checkpoint(CheckpointConfig::Forced)?; + + info!( + "created root timeline {} timeline.lsn {}", + timeline_id, + timeline.get_last_record_lsn() + ); + + // Remove temp dir. We don't need it anymore + fs::remove_dir_all(pgdata_path)?; + + Ok(timeline) + } +} + +/// Create the cluster temporarily in 'initdbpath' directory inside the repository +/// to get bootstrap data for timeline initialization. +fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { + info!("running initdb in {}... ", initdbpath.display()); + + let initdb_path = conf.pg_bin_dir().join("initdb"); + let initdb_output = Command::new(initdb_path) + .args(&["-D", &initdbpath.to_string_lossy()]) + .args(&["-U", &conf.superuser]) + .args(&["-E", "utf8"]) + .arg("--no-instructions") + // This is only used for a temporary installation that is deleted shortly after, + // so no need to fsync it + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .stdout(Stdio::null()) + .output() + .context("failed to execute initdb")?; + if !initdb_output.status.success() { + bail!( + "initdb failed: '{}'", + String::from_utf8_lossy(&initdb_output.stderr) + ); + } + + Ok(()) } impl Drop for Tenant { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs deleted file mode 100644 index 88b26e18f4..0000000000 --- a/pageserver/src/timelines.rs +++ /dev/null @@ -1,168 +0,0 @@ -//! -//! 
Timeline management code -// - -use std::{ - fs, - path::Path, - process::{Command, Stdio}, - sync::Arc, -}; - -use anyhow::{bail, Context, Result}; -use tracing::*; - -use remote_storage::path_with_suffix_extension; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use crate::config::PageServerConf; -use crate::tenant::{Tenant, Timeline}; -use crate::tenant_mgr; -use crate::CheckpointConfig; -use crate::{import_datadir, TEMP_FILE_SUFFIX}; - -// Create the cluster temporarily in 'initdbpath' directory inside the repository -// to get bootstrap data for timeline initialization. -// -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { - info!("running initdb in {}... ", initdbpath.display()); - - let initdb_path = conf.pg_bin_dir().join("initdb"); - let initdb_output = Command::new(initdb_path) - .args(&["-D", &initdbpath.to_string_lossy()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) - .arg("--no-instructions") - // This is only used for a temporary installation that is deleted shortly after, - // so no need to fsync it - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) - .stdout(Stdio::null()) - .output() - .context("failed to execute initdb")?; - if !initdb_output.status.success() { - bail!( - "initdb failed: '{}'", - String::from_utf8_lossy(&initdb_output.stderr) - ); - } - - Ok(()) -} - -// -// - run initdb to init temporary instance and get bootstrap data -// - after initialization complete, remove the temp dir. -// -fn bootstrap_timeline( - conf: &'static PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, - tenant: &Tenant, -) -> Result> { - // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` - // temporary directory for basebackup files for the given timeline. 
- let initdb_path = path_with_suffix_extension( - conf.timelines_path(&tenant_id) - .join(format!("basebackup-{timeline_id}")), - TEMP_FILE_SUFFIX, - ); - - // Init temporarily repo to get bootstrap data - run_initdb(conf, &initdb_path)?; - let pgdata_path = initdb_path; - - let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); - - // Import the contents of the data directory at the initial checkpoint - // LSN, and any WAL after that. - // Initdb lsn will be equal to last_record_lsn which will be set after import. - // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = tenant.create_empty_timeline(timeline_id, lsn)?; - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - bail!("failpoint before-checkpoint-new-timeline"); - }); - - timeline.checkpoint(CheckpointConfig::Forced)?; - - info!( - "created root timeline {} timeline.lsn {}", - timeline_id, - timeline.get_last_record_lsn() - ); - - // Remove temp dir. We don't need it anymore - fs::remove_dir_all(pgdata_path)?; - - Ok(timeline) -} - -/// -/// Create a new timeline. -/// -/// Returns the new timeline ID and reference to its Timeline object. -/// -/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with -/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, -/// a new unique ID is generated. 
-/// -pub(crate) async fn create_timeline( - conf: &'static PageServerConf, - tenant_id: TenantId, - new_timeline_id: Option, - ancestor_timeline_id: Option, - mut ancestor_start_lsn: Option, -) -> Result>> { - let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - - if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - debug!("timeline {new_timeline_id} already exists"); - return Ok(None); - } - - let loaded_timeline = match ancestor_timeline_id { - Some(ancestor_timeline_id) => { - let ancestor_timeline = tenant - .get_timeline(ancestor_timeline_id) - .context("Cannot branch off the timeline that's not present in pageserver")?; - - if let Some(lsn) = ancestor_start_lsn.as_mut() { - // Wait for the WAL to arrive and be processed on the parent branch up - // to the requested branch point. The repository code itself doesn't - // require it, but if we start to receive WAL on the new timeline, - // decoding the new WAL might need to look up previous pages, relation - // sizes etc. and that would get confused if the previous page versions - // are not in the repository yet. - *lsn = lsn.align(); - ancestor_timeline.wait_lsn(*lsn).await?; - - let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); - if ancestor_ancestor_lsn > *lsn { - // can we safely just branch from the ancestor instead? - anyhow::bail!( - "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", - lsn, - ancestor_timeline_id, - ancestor_ancestor_lsn, - ); - } - } - - tenant.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? - } - None => bootstrap_timeline(conf, tenant_id, new_timeline_id, &tenant)?, - }; - - // Have added new timeline into the tenant, now its background tasks are needed. 
- tenant.activate(true); - - Ok(Some(loaded_timeline)) -} From 310c507303d642c97a778f9850b57e1593ba5717 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 07:58:06 +0300 Subject: [PATCH 092/166] Merge path retrieval methods in config.rs --- pageserver/src/config.rs | 17 +++++++++++++++++ pageserver/src/storage_sync.rs | 13 ++++--------- pageserver/src/storage_sync/download.rs | 15 +++++++-------- pageserver/src/storage_sync/upload.rs | 5 +++-- pageserver/src/tenant.rs | 9 ++++----- pageserver/src/tenant/metadata.rs | 17 +---------------- pageserver/src/tenant/timeline.rs | 4 ++-- pageserver/src/tenant_config.rs | 11 ----------- pageserver/src/tenant_mgr.rs | 12 +++++------- 9 files changed, 43 insertions(+), 60 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 75c71b09d2..945ee098ea 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -22,6 +22,10 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; +/// The name of the metadata file pageserver creates per timeline. +pub const METADATA_FILE_NAME: &str = "metadata"; +const TENANT_CONFIG_NAME: &str = "config"; + pub mod defaults { use crate::tenant_config::defaults::*; use const_format::formatcp; @@ -346,6 +350,12 @@ impl PageServerConf { self.tenants_path().join(tenant_id.to_string()) } + /// Points to a place in pageserver's local directory, + /// where certain tenant's tenantconf file should be located. + pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf { + self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) + } + pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } @@ -354,6 +364,13 @@ impl PageServerConf { self.timelines_path(tenant_id).join(timeline_id.to_string()) } + /// Points to a place in pageserver's local directory, + /// where certain timeline's metadata file should be located. 
+ pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { + self.timeline_path(&timeline_id, &tenant_id) + .join(METADATA_FILE_NAME) + } + // // Postgres distribution paths // diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 64e0f9a9e3..489d0ad4ed 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -169,13 +169,8 @@ use self::{ upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; use crate::{ - config::PageServerConf, - exponential_backoff, - storage_sync::index::RemoteIndex, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::{metadata_path, TimelineMetadata}, + config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr, + task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, tenant_mgr::attach_local_tenants, }; use crate::{ @@ -1012,7 +1007,7 @@ async fn update_local_metadata( }; let remote_lsn = remote_metadata.disk_consistent_lsn(); - let local_metadata_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); + let local_metadata_path = conf.metadata_path(sync_id.timeline_id, sync_id.tenant_id); let local_lsn = if local_metadata_path.exists() { let local_metadata = read_metadata_file(&local_metadata_path) .await @@ -1433,7 +1428,7 @@ mod test_utils { } fs::write( - metadata_path(harness.conf, timeline_id, harness.tenant_id), + harness.conf.metadata_path(timeline_id, harness.tenant_id), metadata.to_bytes()?, ) .await?; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 80d5ca5994..980001f95d 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -16,10 +16,7 @@ use tokio::{ }; use tracing::{debug, error, info, warn}; -use crate::{ - config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, - TEMP_FILE_SUFFIX, -}; +use 
crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::{ @@ -137,7 +134,8 @@ async fn download_index_part( storage: &GenericRemoteStorage, sync_id: TenantTimelineId, ) -> Result { - let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); let mut index_part_download = storage .download_storage_object(None, &index_part_path) @@ -620,9 +618,10 @@ mod tests { metadata.to_bytes()?, ); - let local_index_part_path = - metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME); + let local_index_part_path = harness + .conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME); let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; let index_part_local_path = PathBuf::from(index_part_remote_id.to_string()); fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index aa5a2232cf..75657915c0 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -15,7 +15,7 @@ use super::{ LayersUpload, SyncData, SyncQueue, }; use crate::metrics::NO_LAYERS_UPLOAD; -use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path}; +use crate::{config::PageServerConf, storage_sync::SyncTask}; /// Serializes and uploads the given index part data to the remote storage. 
pub(super) async fn upload_index_part( @@ -29,7 +29,8 @@ pub(super) async fn upload_index_part( let index_part_size = index_part_bytes.len(); let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); - let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); storage .upload_storage_object( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cf236a0a9c..b753c1979c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -41,7 +41,7 @@ use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::repository::GcResult; use crate::storage_sync::index::RemoteIndex; use crate::task_mgr; -use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; @@ -676,7 +676,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, ) -> anyhow::Result { - let target_config_path = TenantConf::path(conf, tenant_id); + let target_config_path = conf.tenant_config_path(tenant_id); let target_config_display = target_config_path.display(); info!("loading tenantconf from {target_config_display}"); @@ -1134,7 +1134,6 @@ pub mod harness { walredo::{WalRedoError, WalRedoManager}, }; - use super::metadata::metadata_path; use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; @@ -1270,7 +1269,7 @@ pub mod harness { timeline_id: TimelineId, tenant_id: TenantId, ) -> anyhow::Result { - let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_path = conf.metadata_path(timeline_id, tenant_id); let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { format!( "Failed to read metadata bytes from path {}", @@ -1316,8 +1315,8 @@ pub mod harness { 
#[cfg(test)] mod tests { - use super::metadata::METADATA_FILE_NAME; use super::*; + use crate::config::METADATA_FILE_NAME; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index ace4dc91e9..606acbf2f1 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -8,7 +8,6 @@ use std::fs::{File, OpenOptions}; use std::io::Write; -use std::path::PathBuf; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; @@ -29,9 +28,6 @@ use crate::STORAGE_FORMAT_VERSION; /// see PG_CONTROL_MAX_SAFE_SIZE const METADATA_MAX_SIZE: usize = 512; -/// The name of the metadata file pageserver creates per timeline. -pub const METADATA_FILE_NAME: &str = "metadata"; - /// Metadata stored on disk for each timeline /// /// The fields correspond to the values we hold in memory, in Timeline. @@ -166,17 +162,6 @@ impl TimelineMetadata { } } -/// Points to a place in pageserver's local directory, -/// where certain timeline's metadata file should be located. 
-pub fn metadata_path( - conf: &'static PageServerConf, - timeline_id: TimelineId, - tenant_id: TenantId, -) -> PathBuf { - conf.timeline_path(&timeline_id, &tenant_id) - .join(METADATA_FILE_NAME) -} - /// Save timeline metadata to file pub fn save_metadata( conf: &'static PageServerConf, @@ -186,7 +171,7 @@ pub fn save_metadata( first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timeline_id, tenant_id); + let path = conf.metadata_path(timeline_id, tenant_id); // use OpenOptions to ensure file presence is consistent with first_save let mut file = VirtualFile::open_with_options( &path, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8670e979ee..b80d023c7f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,12 +24,12 @@ use crate::tenant::{ image_layer::{ImageLayer, ImageLayerWriter}, inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, - metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME}, + metadata::{save_metadata, TimelineMetadata}, par_fsync, storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, }; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 4448ffc456..4c5d5cc3f3 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -8,14 +8,9 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! 
-use crate::config::PageServerConf; use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; -use std::path::PathBuf; use std::time::Duration; -use utils::id::TenantId; - -pub const TENANT_CONFIG_NAME: &str = "config"; pub mod defaults { // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB @@ -215,12 +210,6 @@ impl TenantConf { } } - /// Points to a place in pageserver's local directory, - /// where certain tenant's tenantconf file should be located. - pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf { - conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) - } - #[cfg(test)] pub fn dummy_conf() -> Self { TenantConf { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d6fa843305..2c6f5fa863 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -12,17 +12,15 @@ use tracing::*; use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::http::models::TenantInfo; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::{ - ephemeral_file::is_ephemeral_file, - metadata::{TimelineMetadata, METADATA_FILE_NAME}, - Tenant, TenantState, + ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState, }; -use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; @@ -246,7 +244,7 @@ fn create_tenant_files( &temporary_tenant_dir, )?; let temporary_tenant_config_path = rebase_directory( - &TenantConf::path(conf, tenant_id), + &conf.tenant_config_path(tenant_id), &target_tenant_directory, &temporary_tenant_dir, )?; @@ -343,7 +341,7 @@ pub fn 
update_tenant_config( ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; + Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; Ok(()) } From 6b8dcad1bbc02b0f045c0ee192629ef129dd5755 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:13:25 +0300 Subject: [PATCH 093/166] Unify timeline creation steps --- pageserver/src/tenant.rs | 73 ++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b753c1979c..40c9f1e9ad 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -185,27 +185,12 @@ impl Tenant { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } - // Create the timeline directory, and write initial metadata to file. 
- crashsafe_dir::create_dir_all(timeline_path)?; - let new_metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - save_metadata( - self.conf, - new_timeline_id, - self.tenant_id, - &new_metadata, - true, - )?; - let new_timeline = - self.initialize_new_timeline(new_timeline_id, new_metadata, &mut timelines)?; + self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - if let hash_map::Entry::Vacant(v) = timelines.entry(new_timeline_id) { - v.insert(Arc::clone(&new_timeline)); - } - Ok(new_timeline) } @@ -1004,12 +989,7 @@ impl Tenant { *src_timeline.latest_gc_cutoff_lsn.read(), src_timeline.initdb_lsn, ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - - let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; - timelines.insert(dst, Arc::clone(&new_timeline)); - + let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; info!("branched timeline {dst} from {src} at {start_lsn}"); Ok(new_timeline) @@ -1057,6 +1037,55 @@ impl Tenant { Ok(timeline) } + + fn create_initialized_timeline( + &self, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + timelines: &mut MutexGuard>>, + ) -> Result> { + crashsafe_dir::create_dir_all(self.conf.timeline_path(&new_timeline_id, &self.tenant_id)) + .with_context(|| { + format!( + "Failed to create timeline {}/{} directory", + new_timeline_id, self.tenant_id + ) + })?; + save_metadata( + self.conf, + new_timeline_id, + self.tenant_id, + &new_metadata, + true, + ) + .with_context(|| { + format!( + "Failed to create timeline {}/{} metadata", + new_timeline_id, self.tenant_id + ) + })?; + + let new_timeline = self + .initialize_new_timeline(new_timeline_id, new_metadata, timelines) + .with_context(|| { + format!( + "Failed to initialize 
timeline {}/{}", + new_timeline_id, self.tenant_id + ) + })?; + + match timelines.entry(new_timeline_id) { + hash_map::Entry::Occupied(_) => anyhow::bail!( + "Found freshly initialized timeline {} in the tenant map", + new_timeline_id + ), + hash_map::Entry::Vacant(v) => { + v.insert(Arc::clone(&new_timeline)); + } + } + + Ok(new_timeline) + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository From 8d7024a8c26d9f143202d28665ec2ae8a8e32ea1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Sep 2022 08:24:18 +0300 Subject: [PATCH 094/166] Move path manipulation function to utils --- Cargo.lock | 7 +-- libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/lib.rs | 47 ------------------ libs/remote_storage/src/local_fs.rs | 3 +- libs/utils/src/crashsafe_dir.rs | 49 ++++++++++++++++++- pageserver/src/storage_sync/download.rs | 7 ++- pageserver/src/tenant.rs | 6 +-- pageserver/src/tenant_mgr.rs | 4 +- .../src/walreceiver/connection_manager.rs | 2 +- pageserver/src/walredo.rs | 2 +- workspace_hack/Cargo.toml | 6 --- 11 files changed, 62 insertions(+), 72 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3ce0ce465f..fc4ef90b8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2444,6 +2444,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "utils", "workspace_hack", ] @@ -3929,13 +3930,7 @@ dependencies = [ "chrono", "either", "fail", - "futures-channel", - "futures-task", - "futures-util", - "generic-array", "hashbrown", - "hex", - "hyper", "indexmap", "itoa 0.4.8", "libc", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index b3485f274a..cec344a4ad 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = { version = "1.0", features = ["backtrace"] } async-trait = "0.1" metrics = { version = "0.1", path = "../metrics" } +utils = { version = "0.1", path = "../utils" } once_cell = "1.13.0" rusoto_core = "0.48" rusoto_s3 
= "0.48" diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 6b3fd29a0e..4bdd2b9608 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -9,9 +9,7 @@ mod local_fs; mod s3_bucket; use std::{ - borrow::Cow, collections::HashMap, - ffi::OsStr, fmt::{Debug, Display}, num::{NonZeroU32, NonZeroUsize}, ops::Deref, @@ -344,22 +342,6 @@ impl Debug for S3Config { } } -/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension, -/// or if there's no extension, creates one and puts a suffix there. -pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { - let new_extension = match original_path - .as_ref() - .extension() - .map(OsStr::to_string_lossy) - { - Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), - None => Cow::Borrowed(suffix), - }; - original_path - .as_ref() - .with_extension(new_extension.as_ref()) -} - impl RemoteStorageConfig { pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { let local_path = toml.get("local_path"); @@ -448,35 +430,6 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { mod tests { use super::*; - #[test] - fn test_path_with_suffix_extension() { - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), - "/foo/bar.temp" - ); - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.baz.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar.baz..temp" - ); - let p = PathBuf::from("/foo/bar/dir/"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar/dir..temp" - ); - } - #[test] 
fn object_name() { let k = RemoteObjectId("a/b/c".to_owned()); diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 3ffbf3cb39..5723a512f6 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -16,8 +16,9 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; -use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId}; +use crate::{Download, DownloadError, RemoteObjectId}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; diff --git a/libs/utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe_dir.rs index a7eab73a43..032ab0a916 100644 --- a/libs/utils/src/crashsafe_dir.rs +++ b/libs/utils/src/crashsafe_dir.rs @@ -1,7 +1,9 @@ use std::{ + borrow::Cow, + ffi::OsStr, fs::{self, File}, io, - path::Path, + path::{Path, PathBuf}, }; /// Similar to [`std::fs::create_dir`], except we fsync the @@ -74,6 +76,22 @@ pub fn create_dir_all(path: impl AsRef) -> io::Result<()> { Ok(()) } +/// Adds a suffix to the file(directory) name, either appending the suffix to the end of its extension, +/// or if there's no extension, creates one and puts a suffix there. 
+pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { + let new_extension = match original_path + .as_ref() + .extension() + .map(OsStr::to_string_lossy) + { + Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), + None => Cow::Borrowed(suffix), + }; + original_path + .as_ref() + .with_extension(new_extension.as_ref()) +} + #[cfg(test)] mod tests { use tempfile::tempdir; @@ -122,4 +140,33 @@ mod tests { let invalid_dir_path = file_path.join("folder"); create_dir_all(&invalid_dir_path).unwrap_err(); } + + #[test] + fn test_path_with_suffix_extension() { + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp").to_string_lossy(), + "/foo/bar.temp" + ); + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.baz.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar.baz..temp" + ); + let p = PathBuf::from("/foo/bar/dir/"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar/dir..temp" + ); + } } diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 980001f95d..3e850443d8 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -9,7 +9,7 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -17,7 +17,10 @@ use tokio::{ use tracing::{debug, error, info, warn}; use crate::{config::PageServerConf, storage_sync::SyncTask, 
TEMP_FILE_SUFFIX}; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::{ + crashsafe_dir::path_with_suffix_extension, + id::{TenantId, TenantTimelineId, TimelineId}, +}; use super::{ index::{IndexPart, RemoteTimeline}, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 40c9f1e9ad..ca97796870 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -14,6 +14,7 @@ use anyhow::{bail, ensure, Context, Result}; use tokio::sync::watch; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; use std::cmp::min; use std::collections::hash_map; @@ -45,7 +46,6 @@ use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; -use remote_storage::path_with_suffix_extension; use toml_edit; use utils::{ @@ -974,10 +974,6 @@ impl Tenant { None }; - // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); - crashsafe_dir::create_dir(&timelinedir)?; - // Create the metadata file, noting the ancestor of the new timeline. // There is initially no data in it, but all the read-calls know to look // into the ancestor. 
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 2c6f5fa863..fcb2c18b79 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use anyhow::Context; use tracing::*; -use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; +use remote_storage::GenericRemoteStorage; use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::http::models::TenantInfo; @@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use utils::crashsafe_dir; +use utils::crashsafe_dir::{self, path_with_suffix_extension}; use utils::id::{TenantId, TimelineId}; mod tenants_state { diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 799062e935..148372c9d0 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1358,7 +1358,7 @@ mod tests { const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - fn dummy_state(harness: &TenantHarness) -> WalreceiverState { + fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9faabfebda..79c2edc96e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,7 +21,6 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; -use remote_storage::path_with_suffix_extension; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -36,6 +35,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ diff --git 
a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 96594bbf96..dc4cbb5284 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -21,13 +21,7 @@ bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } -futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } -futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } -generic-array = { version = "0.14", default-features = false, features = ["more_lengths"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } -hex = { version = "0.4", features = ["alloc", "serde", "std"] } -hyper = { version = "0.14", features = ["client", "full", "h2", "http1", "http2", "runtime", "server", "socket2", "stream", "tcp"] } indexmap = { version = "1", default-features = false, features = ["std"] } itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } From 6f949e15563280cc791b02940c711a5641813891 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 20 Sep 2022 17:02:10 -0700 Subject: [PATCH 095/166] Improve pageserver/safekeepeer HTTP API errors (#2461) Part of the general work on improving pageserver logs. 
Brief summary of changes: * Remove `ApiError::from_err` * Remove `impl From<anyhow::Error> for ApiError` * Convert `ApiError::{BadRequest, NotFound}` to use `anyhow::Error` * Note: `NotFound` has more verbose formatting because it's more likely to have useful information for the receiving "user" * Explicitly convert from `tokio::task::JoinError`s into `InternalServerError`s where appropriate Also note: many of the places where errors were implicitly converted to 500s have now been updated to return a more appropriate error. Some places where it's not yet possible to distinguish the error types have been left as 500s. --- libs/utils/src/http/error.rs | 17 +-- libs/utils/src/http/json.rs | 13 +- libs/utils/src/http/request.rs | 13 +- pageserver/src/http/routes.rs | 220 +++++++++++++++++++++------------ safekeeper/src/http/routes.rs | 39 ++++-- 5 files changed, 195 insertions(+), 107 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index b3bbec0f1c..b0ecb746d9 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,12 +1,11 @@ -use anyhow::anyhow; use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; #[derive(Debug, Error)] pub enum ApiError { - #[error("Bad request: {0}")] - BadRequest(String), + #[error("Bad request: {0:#?}")] + BadRequest(anyhow::Error), #[error("Forbidden: {0}")] Forbidden(String), @@ -15,24 +14,20 @@ pub enum ApiError { Unauthorized(String), #[error("NotFound: {0}")] - NotFound(String), + NotFound(anyhow::Error), #[error("Conflict: {0}")] Conflict(String), #[error(transparent)] - InternalServerError(#[from] anyhow::Error), + InternalServerError(anyhow::Error), } impl ApiError { - pub fn from_err>(err: E) -> Self { - Self::InternalServerError(anyhow!(err)) - } - pub fn into_response(self) -> Response { match self { - ApiError::BadRequest(_) => HttpErrorBody::response_from_msg_and_status( - self.to_string(), + ApiError::BadRequest(err) =>
HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause StatusCode::BAD_REQUEST, ), ApiError::Forbidden(_) => { diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 08f2ac4205..8981fdd1dd 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -1,3 +1,4 @@ +use anyhow::Context; use bytes::Buf; use hyper::{header, Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; @@ -9,20 +10,24 @@ pub async fn json_request Deserialize<'de>>( ) -> Result { let whole_body = hyper::body::aggregate(request.body_mut()) .await - .map_err(ApiError::from_err)?; + .context("Failed to read request body") + .map_err(ApiError::BadRequest)?; serde_json::from_reader(whole_body.reader()) - .map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err))) + .context("Failed to parse json request") + .map_err(ApiError::BadRequest) } pub fn json_response( status: StatusCode, data: T, ) -> Result, ApiError> { - let json = serde_json::to_string(&data).map_err(ApiError::from_err)?; + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; let response = Response::builder() .status(status) .header(header::CONTENT_TYPE, "application/json") .body(Body::from(json)) - .map_err(ApiError::from_err)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 4984d695fd..7b96ccd584 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use super::error::ApiError; +use anyhow::anyhow; use hyper::{body::HttpBody, Body, Request}; use routerify::ext::RequestExt; @@ -10,9 +11,8 @@ pub fn get_request_param<'a>( ) -> Result<&'a str, ApiError> { match request.param(param_name) { Some(arg) => Ok(arg), - None => 
Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name + None => Err(ApiError::BadRequest(anyhow!( + "no {param_name} specified in path param", ))), } } @@ -23,16 +23,15 @@ pub fn parse_request_param( ) -> Result { match get_request_param(request, param_name)?.parse() { Ok(v) => Ok(v), - Err(_) => Err(ApiError::BadRequest(format!( - "failed to parse {}", - param_name + Err(_) => Err(ApiError::BadRequest(anyhow!( + "failed to parse {param_name}", ))), } } pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { - Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())), + Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), None => Ok(()), } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0c6f7927fa..c676dfacd2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,9 +1,10 @@ use std::sync::Arc; -use anyhow::{Context, Result}; +use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use remote_storage::GenericRemoteStorage; +use tokio::task::JoinError; use tracing::*; use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; @@ -166,7 +167,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; + let local_info = local_timeline_info_from_timeline(&new_timeline, false, false) + .map_err(ApiError::InternalServerError)?; Ok(Some(TimelineInfo { tenant_id, timeline_id: new_timeline.timeline_id, @@ -184,12 +186,11 @@ async fn timeline_create_handler(mut request: Request) -> Result Ok(None), // timeline already exists - Err(err) => Err(err), + Err(err) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) - .await - .map_err(ApiError::from_err)?; + .await?; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -207,10 +208,11 @@ async fn timeline_list_handler(request: Request) -> Result, let timelines = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines()) + let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + Ok(tenant.list_timelines()) }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let mut response_data = Vec::with_capacity(timelines.len()); for (timeline_id, timeline) in timelines { @@ -275,7 +277,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result((local_timeline_info, remote_timeline_info)) + Ok::<_, ApiError>((local_timeline_info, remote_timeline_info)) } .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) .await?; if local_timeline_info.is_none() && remote_timeline_info.is_none() { - Err(ApiError::NotFound(format!( + Err(ApiError::NotFound(anyhow!( "Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely" ))) } else { @@ -332,14 +334,14 @@ async fn tenant_attach_handler(request: 
Request) -> Result, info!("Handling tenant attach {tenant_id}"); - tokio::task::spawn_blocking(move || { - if tenant_mgr::get_tenant(tenant_id, false).is_ok() { - anyhow::bail!("Tenant is already present locally") - }; - Ok(()) + tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) { + Ok(_) => Err(ApiError::Conflict( + "Tenant is already present locally".to_owned(), + )), + Err(_) => Ok(()), }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let state = get_state(&request); let remote_index = &state.remote_index; @@ -364,12 +366,12 @@ async fn tenant_attach_handler(request: Request) -> Result, // download index parts for every tenant timeline let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await { Ok(Some(remote_timelines)) => remote_timelines, - Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())), + Ok(None) => return Err(ApiError::NotFound(anyhow!("Unknown remote tenant"))), Err(e) => { error!("Failed to retrieve remote tenant data: {:?}", e); - return Err(ApiError::NotFound( - "Failed to retrieve remote tenant".to_string(), - )); + return Err(ApiError::NotFound(anyhow!( + "Failed to retrieve remote tenant" + ))); } }; @@ -392,7 +394,8 @@ async fn tenant_attach_handler(request: Request) -> Result, for (timeline_id, mut remote_timeline) in remote_timelines { tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) .await - .context("Failed to create new timeline directory")?; + .context("Failed to create new timeline directory") + .map_err(ApiError::InternalServerError)?; remote_timeline.awaits_download = true; tenant_entry.insert(timeline_id, remote_timeline); @@ -438,7 +441,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, tenant_mgr::detach_tenant(conf, tenant_id) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) .await - 
.map_err(ApiError::from_err)?; + // FIXME: Errors from `detach_tenant` can be caused by both user and internal errors. + // Replace this with better handling once the error type permits it. + .map_err(ApiError::InternalServerError)?; let mut remote_index = state.remote_index.write().await; remote_index.remove_tenant_entry(&tenant_id); @@ -478,7 +486,7 @@ async fn tenant_list_handler(request: Request) -> Result, A crate::tenant_mgr::list_tenant_info(&remote_index) }) .await - .map_err(ApiError::from_err)?; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; json_response(StatusCode::OK, response_data) } @@ -490,7 +498,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro // if tenant is in progress of downloading it can be absent in global tenant map let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false)) .await - .map_err(ApiError::from_err)?; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; let state = get_state(&request); let remote_index = &state.remote_index; @@ -519,7 +527,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro let current_physical_size = match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false)) .await - .map_err(ApiError::from_err)? + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))? { Err(err) => { // Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded).
@@ -545,6 +553,16 @@ async fn tenant_status(request: Request) -> Result, ApiErro ) } +// Helper function to standardize the error messages we produce on bad durations +// +// Intended to be used with anyhow's `with_context`, e.g.: +// +// let value = result.with_context(bad_duration("name", &value))?; +// +fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { + move || format!("Cannot parse `{field_name}` duration {value:?}") +} + async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -553,25 +571,39 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result json_response(StatusCode::CREATED, TenantCreateResponse(id))?, @@ -618,24 +659,38 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result) -> Result, ApiError> { if !fail::has_failpoints() { - return Err(ApiError::BadRequest( + return Err(ApiError::BadRequest(anyhow!( "Cannot manage failpoints because pageserver was compiled without failpoints support" - .to_owned(), - )); + ))); } let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; @@ -691,7 +754,7 @@ async fn failpoints_handler(mut request: Request) -> Result }; if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(format!( + return Err(ApiError::BadRequest(anyhow!( "Failed to configure failpoints: {err_msg}" ))); } @@ -713,7 +776,7 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result {{ #[cfg(not(feature = "testing"))] async fn cfg_disabled(_req: Request) -> Result, ApiError> { - Err(ApiError::BadRequest( - concat!( - "Cannot ", - $handler_desc, - " because pageserver was compiled without testing APIs", - ) - .to_owned(), - )) + Err(ApiError::BadRequest(anyhow!(concat!( + "Cannot ", + $handler_desc, + " because pageserver was compiled without testing APIs", + )))) } #[cfg(feature = "testing")] diff --git 
a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 244325368b..43c0a17f84 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,12 +1,14 @@ use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode, Uri}; +use anyhow::Context; use once_cell::sync::Lazy; use serde::Serialize; use serde::Serializer; use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; +use tokio::task::JoinError; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; @@ -99,7 +101,12 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result Date: Wed, 21 Sep 2022 13:13:11 +0300 Subject: [PATCH 096/166] Use prebuilt image with Hakari for CI style checks (#2488) --- .github/workflows/codestyle.yml | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 5220258ef0..641943199e 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -108,20 +108,32 @@ jobs: target key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - # https://github.com/facebookincubator/cargo-guppy/tree/main/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check every project module is covered by Hakari - run: | - cargo install cargo-hakari - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - shell: bash -euxo pipefail {0} - - name: Run cargo clippy run: ./run_clippy.sh - name: Ensure all project builds run: cargo build --locked --all --all-targets + check-rust-dependencies: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + # 
https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check every project module is covered by Hakari + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + shell: bash -euxo pipefail {0} + check-codestyle-python: runs-on: [ self-hosted, Linux, k8s-runner ] steps: From b82e2e3f18cbeb08c45074015cbe4606d36c51c5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 21 Sep 2022 11:08:12 +0300 Subject: [PATCH 097/166] Bump postgres submodules and update docs/core_changes.md. The old change to downgrade a WARNING in postgres vacuumlazy.c was reverted. --- docs/core_changes.md | 25 ------------------------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 2 insertions(+), 27 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index 8f29dd9121..ea219adae9 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -148,31 +148,6 @@ relcache? 
(I think we do cache nblocks in relcache already, check why that's not Neon) -## Misc change in vacuumlazy.c - -``` -index 8aab6e324e..c684c4fbee 100644 ---- a/src/backend/access/heap/vacuumlazy.c -+++ b/src/backend/access/heap/vacuumlazy.c -@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) - else if (all_visible_according_to_vm && !PageIsAllVisible(page) - && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) - { -- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", -+ /* ZENITH-XXX: all visible hint is not wal-logged -+ * FIXME: Replay visibilitymap changes in pageserver -+ */ -+ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); - visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); -``` - - -Is this still needed? If that WARNING happens, it looks like potential corruption that we should -fix! 
- - ## Use buffer manager when extending VM or FSM ``` diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 796770565f..19d948fd47 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 796770565ff668b585e80733b8d679961ad50e93 +Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 34c47d6c99..5b8b3eeef5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 34c47d6c99415c94296d5e599ec5590d0001d6c2 +Subproject commit 5b8b3eeef5ec34c0cad9377833906a1387841d04 From 19fa410ff84ad41ce39fcbdedf1e8e7c158ef1b4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 21 Sep 2022 12:50:37 +0100 Subject: [PATCH 098/166] NeonCompare: switch to new pageserver HTTP API --- test_runner/fixtures/compare_fixtures.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index ceeeffc785..78a12c6c45 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -89,16 +89,13 @@ class NeonCompare(PgCompare): self.env = neon_simple_env self._zenbenchmark = zenbenchmark self._pg_bin = pg_bin + self.pageserver_http_client = self.env.pageserver.http_client() # We only use one branch and one timeline self.env.neon_cli.create_branch(branch_name, "empty") self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] - # Long-lived cursor, useful for flushing - self.psconn = self.env.pageserver.connect() - self.pscur = self.psconn.cursor() - @property def pg(self): return self._pg @@ -112,10 +109,10 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): - self.pscur.execute(f"do_gc {self.env.initial_tenant} {self.timeline} 0") + self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0) def compact(self): - 
self.pscur.execute(f"compact {self.env.initial_tenant} {self.timeline}") + self.pageserver_http_client.timeline_compact(self.env.initial_tenant, self.timeline) def report_peak_memory_use(self) -> None: self.zenbenchmark.record( From 7eebb45ea6635404d494563af3d58790a44a68eb Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 21 Sep 2022 18:13:30 +0200 Subject: [PATCH 099/166] Reduce metrics footprint in safekeeper (#2491) Fixes bugs with metrics in control_file and wal_storage, where we haven't deleted metrics for inactive timelines. --- safekeeper/src/control_file.rs | 24 +----- safekeeper/src/metrics.rs | 138 ++++++++++++++++++++++++++++++++- safekeeper/src/safekeeper.rs | 4 + safekeeper/src/timeline.rs | 1 + safekeeper/src/wal_storage.rs | 92 +++++----------------- 5 files changed, 162 insertions(+), 97 deletions(-) diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 22ed34cc00..6be3f9abb2 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,7 +2,6 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use once_cell::sync::Lazy; use std::fs::{self, File, OpenOptions}; use std::io::{Read, Write}; @@ -10,8 +9,8 @@ use std::ops::Deref; use std::path::{Path, PathBuf}; use crate::control_file_upgrade::upgrade_control_file; +use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; -use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -24,16 +23,6 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); -static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_persist_control_file_seconds", - 
"Seconds to persist and sync control file, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") -}); - /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. pub trait Storage: Deref { @@ -46,7 +35,6 @@ pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: PathBuf, conf: SafeKeeperConf, - persist_control_file_seconds: Histogram, /// Last state persisted to disk. state: SafeKeeperState, @@ -56,16 +44,12 @@ impl FileStorage { /// Initialize storage by loading state from disk. pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { let timeline_dir = conf.timeline_dir(ttid); - let tenant_id = ttid.tenant_id.to_string(); - let timeline_id = ttid.timeline_id.to_string(); let state = Self::load_control_file_conf(conf, ttid)?; Ok(FileStorage { timeline_dir, conf: conf.clone(), - persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS - .with_label_values(&[&tenant_id, &timeline_id]), state, }) } @@ -77,14 +61,10 @@ impl FileStorage { state: SafeKeeperState, ) -> Result { let timeline_dir = conf.timeline_dir(ttid); - let tenant_id = ttid.tenant_id.to_string(); - let timeline_id = ttid.timeline_id.to_string(); let store = FileStorage { timeline_dir, conf: conf.clone(), - persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS - .with_label_values(&[&tenant_id, &timeline_id]), state, }; @@ -175,7 +155,7 @@ impl Storage for FileStorage { /// persists state durably to underlying storage /// for description see https://lwn.net/Articles/457667/ fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { - let _timer = &self.persist_control_file_seconds.start_timer(); + let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); // write data to safekeeper.control.partial let 
control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 851a568aec..51138df776 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,12 +1,15 @@ -//! This module exports metrics for all active timelines. +//! Global safekeeper metrics and per-timeline safekeeper metrics. use std::time::{Instant, SystemTime}; +use ::metrics::{register_histogram, GaugeVec, Histogram, DISK_WRITE_SECONDS_BUCKETS}; +use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, proto::MetricFamily, Gauge, IntGaugeVec, }; +use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; use utils::{id::TenantTimelineId, lsn::Lsn}; @@ -16,6 +19,85 @@ use crate::{ GlobalTimelines, }; +// Global metrics across all timelines. +pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_write_wal_bytes", + "Bytes written to WAL in a single request", + vec![ + 1.0, + 10.0, + 100.0, + 1024.0, + 8192.0, + 128.0 * 1024.0, + 1024.0 * 1024.0, + 10.0 * 1024.0 * 1024.0 + ] + ) + .expect("Failed to register safekeeper_write_wal_bytes histogram") +}); +pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_write_wal_seconds", + "Seconds spent writing and syncing WAL to a disk in a single request", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_write_wal_seconds histogram") +}); +pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_flush_wal_seconds", + "Seconds spent syncing WAL to a disk", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_flush_wal_seconds histogram") +}); +pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_persist_control_file_seconds", + "Seconds to persist and sync control file", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + 
.expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") +}); + +/// Metrics for WalStorage in a single timeline. +#[derive(Clone, Default)] +pub struct WalStorageMetrics { + /// How many bytes were written in total. + write_wal_bytes: u64, + /// How much time spent writing WAL to disk, waiting for write(2). + write_wal_seconds: f64, + /// How much time spent syncing WAL to disk, waiting for fsync(2). + flush_wal_seconds: f64, +} + +impl WalStorageMetrics { + pub fn observe_write_bytes(&mut self, bytes: usize) { + self.write_wal_bytes += bytes as u64; + WRITE_WAL_BYTES.observe(bytes as f64); + } + + pub fn observe_write_seconds(&mut self, seconds: f64) { + self.write_wal_seconds += seconds; + WRITE_WAL_SECONDS.observe(seconds); + } + + pub fn observe_flush_seconds(&mut self, seconds: f64) { + self.flush_wal_seconds += seconds; + FLUSH_WAL_SECONDS.observe(seconds); + } +} + +/// Accepts a closure that returns a result, and returns the duration of the closure. +pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { + let start = std::time::Instant::now(); + closure()?; + Ok(start.elapsed().as_secs_f64()) +} + +/// Metrics for a single timeline. pub struct FullTimelineInfo { pub ttid: TenantTimelineId, pub replicas: Vec, @@ -29,8 +111,11 @@ pub struct FullTimelineInfo { pub persisted_state: SafeKeeperState, pub flush_lsn: Lsn, + + pub wal_storage: WalStorageMetrics, } +/// Collects metrics for all active timelines. 
pub struct TimelineCollector { descs: Vec, commit_lsn: GenericGaugeVec, @@ -46,6 +131,9 @@ pub struct TimelineCollector { connected_computes: IntGaugeVec, disk_usage: GenericGaugeVec, acceptor_term: GenericGaugeVec, + written_wal_bytes: GenericGaugeVec, + written_wal_seconds: GaugeVec, + flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, } @@ -186,6 +274,36 @@ impl TimelineCollector { .unwrap(); descs.extend(acceptor_term.desc().into_iter().cloned()); + let written_wal_bytes = GenericGaugeVec::new( + Opts::new( + "safekeeper_written_wal_bytes_total", + "Number of WAL bytes written to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_bytes.desc().into_iter().cloned()); + + let written_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_written_wal_seconds_total", + "Total time spent in write(2) writing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_seconds.desc().into_iter().cloned()); + + let flushed_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_flushed_wal_seconds_total", + "Total time spent in fsync(2) flushing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flushed_wal_seconds.desc().into_iter().cloned()); + let collect_timeline_metrics = Gauge::new( "safekeeper_collect_timeline_metrics_seconds", "Time spent collecting timeline metrics, including obtaining mutex lock for all timelines", @@ -208,6 +326,9 @@ impl TimelineCollector { connected_computes, disk_usage, acceptor_term, + written_wal_bytes, + written_wal_seconds, + flushed_wal_seconds, collect_timeline_metrics, } } @@ -235,6 +356,9 @@ impl Collector for TimelineCollector { self.connected_computes.reset(); self.disk_usage.reset(); self.acceptor_term.reset(); + self.written_wal_bytes.reset(); + self.written_wal_seconds.reset(); + self.flushed_wal_seconds.reset(); let timelines = 
GlobalTimelines::get_all(); @@ -292,6 +416,15 @@ impl Collector for TimelineCollector { self.acceptor_term .with_label_values(labels) .set(tli.persisted_state.acceptor_state.term as u64); + self.written_wal_bytes + .with_label_values(labels) + .set(tli.wal_storage.write_wal_bytes); + self.written_wal_seconds + .with_label_values(labels) + .set(tli.wal_storage.write_wal_seconds); + self.flushed_wal_seconds + .with_label_values(labels) + .set(tli.wal_storage.flush_wal_seconds); if let Some(feedback) = most_advanced { self.feedback_ps_write_lsn @@ -332,6 +465,9 @@ impl Collector for TimelineCollector { mfs.extend(self.connected_computes.collect()); mfs.extend(self.disk_usage.collect()); mfs.extend(self.acceptor_term.collect()); + mfs.extend(self.written_wal_bytes.collect()); + mfs.extend(self.written_wal_seconds.collect()); + mfs.extend(self.flushed_wal_seconds.collect()); // report time it took to collect all info let elapsed = start_collecting.elapsed().as_secs_f64(); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d34a77e02b..65340ac0ed 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -998,6 +998,10 @@ mod tests { fn remove_up_to(&self) -> Box Result<()>> { Box::new(move |_segno_up_to: XLogSegNo| Ok(())) } + + fn get_metrics(&self) -> crate::metrics::WalStorageMetrics { + crate::metrics::WalStorageMetrics::default() + } } #[test] diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4000815857..ec29e13931 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -534,6 +534,7 @@ impl Timeline { mem_state: state.sk.inmem.clone(), persisted_state: state.sk.state.clone(), flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), }) } else { None diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ea613dd0f1..692bd18342 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,11 
+8,11 @@ //! Note that last file has `.partial` suffix, that's different from postgres. use anyhow::{bail, Context, Result}; + use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; -use once_cell::sync::Lazy; use postgres_ffi::v14::xlog_utils::{ find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; @@ -27,6 +27,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; +use crate::metrics::{time_io_closure, WalStorageMetrics}; use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; @@ -36,67 +37,8 @@ use postgres_ffi::XLOG_BLCKSZ; use postgres_ffi::v14::waldecoder::WalStreamDecoder; -use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; - use tokio::io::{AsyncReadExt, AsyncSeekExt}; -// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). -// i64 is faster than f64, so update to u64 when available. -static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_write_wal_bytes", - "Bytes written to WAL in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - vec![ - 1.0, - 10.0, - 100.0, - 1024.0, - 8192.0, - 128.0 * 1024.0, - 1024.0 * 1024.0, - 10.0 * 1024.0 * 1024.0 - ] - ) - .expect("Failed to register safekeeper_write_wal_bytes histogram vec") -}); -static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_write_wal_seconds", - "Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_write_wal_seconds histogram vec") -}); -static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_flush_wal_seconds", - "Seconds spent syncing WAL to a disk, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register 
safekeeper_flush_wal_seconds histogram vec") -}); - -struct WalStorageMetrics { - write_wal_bytes: Histogram, - write_wal_seconds: Histogram, - flush_wal_seconds: Histogram, -} - -impl WalStorageMetrics { - fn new(ttid: &TenantTimelineId) -> Self { - let tenant_id = ttid.tenant_id.to_string(); - let timeline_id = ttid.timeline_id.to_string(); - Self { - write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), - write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - flush_wal_seconds: FLUSH_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - } - } -} - pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; @@ -113,6 +55,9 @@ pub trait Storage { /// Remove all segments <= given segno. Returns closure as we want to do /// that without timeline lock. fn remove_up_to(&self) -> Box Result<()>>; + + /// Get metrics for this timeline. + fn get_metrics(&self) -> WalStorageMetrics; } /// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes @@ -187,7 +132,7 @@ impl PhysicalStorage { } Ok(PhysicalStorage { - metrics: WalStorageMetrics::new(ttid), + metrics: WalStorageMetrics::default(), timeline_dir, conf: conf.clone(), wal_seg_size, @@ -200,28 +145,26 @@ impl PhysicalStorage { } /// Call fdatasync if config requires so. - fn fdatasync_file(&self, file: &mut File) -> Result<()> { + fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .flush_wal_seconds - .observe_closure_duration(|| file.sync_data())?; + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?); } Ok(()) } /// Call fsync if config requires so. 
- fn fsync_file(&self, file: &mut File) -> Result<()> { + fn fsync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .flush_wal_seconds - .observe_closure_duration(|| file.sync_all())?; + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?); } Ok(()) } /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. - fn open_or_create(&self, segno: XLogSegNo) -> Result<(File, bool)> { + fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; @@ -335,13 +278,10 @@ impl Storage for PhysicalStorage { ); } - { - let _timer = self.metrics.write_wal_seconds.start_timer(); - self.write_exact(startpos, buf)?; - } - + let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?; // WAL is written, updating write metrics - self.metrics.write_wal_bytes.observe(buf.len() as f64); + self.metrics.observe_write_seconds(write_seconds); + self.metrics.observe_write_bytes(buf.len()); // figure out last record's end lsn for reporting (if we got the // whole record) @@ -444,6 +384,10 @@ impl Storage for PhysicalStorage { remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) }) } + + fn get_metrics(&self) -> WalStorageMetrics { + self.metrics.clone() + } } /// Remove all WAL segments in timeline_dir that match the given predicate. From e9a103c09f4e24a70697a3187419b4a51b024209 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 21 Sep 2022 21:42:47 +0300 Subject: [PATCH 100/166] [proxy] Pass extra parameters to the console (#2467) With this change we now pass additional params to the console's auth methods. 
--- Cargo.lock | 6 ++ proxy/Cargo.toml | 8 +- proxy/src/auth.rs | 2 +- proxy/src/auth/backend.rs | 132 +++++++++++++++--------------- proxy/src/auth/backend/console.rs | 57 +++++++++---- proxy/src/auth/backend/link.rs | 6 +- proxy/src/config.rs | 10 +-- proxy/src/http.rs | 92 ++++++++++++++++----- proxy/src/http/server.rs | 27 ++++++ proxy/src/main.rs | 48 +++++------ proxy/src/proxy.rs | 24 +++--- proxy/src/url.rs | 12 +-- workspace_hack/Cargo.toml | 1 + 13 files changed, 259 insertions(+), 166 deletions(-) create mode 100644 proxy/src/http/server.rs diff --git a/Cargo.lock b/Cargo.lock index fc4ef90b8b..0579d381cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2283,6 +2283,7 @@ dependencies = [ "tokio-rustls", "url", "utils", + "uuid", "workspace_hack", "x509-parser", ] @@ -3663,6 +3664,10 @@ name = "uuid" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom", + "serde", +] [[package]] name = "valuable" @@ -3953,6 +3958,7 @@ dependencies = [ "tokio-util", "tracing", "tracing-core", + "uuid", ] [[package]] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5417f4f2b3..7d0449cd1a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,13 +11,14 @@ bstr = "0.2.17" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" futures = "0.3.13" +git-version = "0.3.5" hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" itertools = "0.10.3" -once_cell = "1.13.0" md5 = "0.7.0" +once_cell = "1.13.0" parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" @@ -35,14 +36,13 @@ tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" -git-version = "0.3.5" +uuid = { version = "0.8.2", features = ["v4", "serde"]} +x509-parser = "0.13.2" utils = { path = 
"../libs/utils" } metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } -x509-parser = "0.13.2" - [dev-dependencies] rcgen = "0.8.14" rstest = "0.12" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index a50d23e351..2df4f9d920 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,7 +1,7 @@ //! Client authentication mechanisms. pub mod backend; -pub use backend::{BackendType, DatabaseInfo}; +pub use backend::{BackendType, ConsoleReqExtra, DatabaseInfo}; mod credentials; pub use credentials::ClientCredentials; diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index de0719a196..7e93a32950 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -8,13 +8,12 @@ pub use console::{GetAuthInfoError, WakeComputeError}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, - compute, config, mgmt, - stream::PqStream, + compute, http, mgmt, stream, url, waiters::{self, Waiter, Waiters}, }; - use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; +use std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -75,6 +74,14 @@ impl From for tokio_postgres::Config { } } +/// Extra query params we'd like to pass to the console. +pub struct ConsoleReqExtra<'a> { + /// A unique identifier for a connection. + pub session_id: uuid::Uuid, + /// Name of client application, if set. + pub application_name: Option<&'a str>, +} + /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector @@ -83,53 +90,83 @@ impl From for tokio_postgres::Config { /// * However, when we substitute `T` with [`ClientCredentials`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum BackendType { +#[derive(Debug)] +pub enum BackendType<'a, T> { /// Current Cloud API (V2). 
- Console(T), + Console(Cow<'a, http::Endpoint>, T), /// Local mock of Cloud API (V2). - Postgres(T), + Postgres(Cow<'a, url::ApiUrl>, T), /// Authentication via a web browser. - Link, + Link(Cow<'a, url::ApiUrl>), } -impl BackendType { +impl std::fmt::Display for BackendType<'_, ()> { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use BackendType::*; + match self { + Console(endpoint, _) => fmt + .debug_tuple("Console") + .field(&endpoint.url().as_str()) + .finish(), + Postgres(endpoint, _) => fmt + .debug_tuple("Postgres") + .field(&endpoint.as_str()) + .finish(), + Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + } + } +} + +impl BackendType<'_, T> { + /// Very similar to [`std::option::Option::as_ref`]. + /// This helps us pass structured config to async tasks. + pub fn as_ref(&self) -> BackendType<'_, &T> { + use BackendType::*; + match self { + Console(c, x) => Console(Cow::Borrowed(c), x), + Postgres(c, x) => Postgres(Cow::Borrowed(c), x), + Link(c) => Link(Cow::Borrowed(c)), + } + } +} + +impl<'a, T> BackendType<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { use BackendType::*; match self { - Console(x) => Console(f(x)), - Postgres(x) => Postgres(f(x)), - Link => Link, + Console(c, x) => Console(c, f(x)), + Postgres(c, x) => Postgres(c, f(x)), + Link(c) => Link(c), } } } -impl BackendType> { +impl<'a, T, E> BackendType<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. 
- pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { - Console(x) => x.map(Console), - Postgres(x) => x.map(Postgres), - Link => Ok(Link), + Console(c, x) => x.map(|x| Console(c, x)), + Postgres(c, x) => x.map(|x| Postgres(c, x)), + Link(c) => Ok(Link(c)), } } } -impl BackendType> { +impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. pub async fn authenticate( mut self, - urls: &config::AuthUrls, - client: &mut PqStream, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, ) -> super::Result { use BackendType::*; - if let Console(creds) | Postgres(creds) = &mut self { + if let Console(_, creds) | Postgres(_, creds) = &mut self { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the project name. // We now expect to see a very specific payload in the place of password. @@ -145,15 +182,13 @@ impl BackendType> { creds.project = Some(payload.project.into()); let mut config = match &self { - Console(creds) => { - console::Api::new(&urls.auth_endpoint, creds) + Console(endpoint, creds) => { + console::Api::new(endpoint, extra, creds) .wake_compute() .await? } - Postgres(creds) => { - postgres::Api::new(&urls.auth_endpoint, creds) - .wake_compute() - .await? + Postgres(endpoint, creds) => { + postgres::Api::new(endpoint, creds).wake_compute().await? } _ => unreachable!("see the patterns above"), }; @@ -169,49 +204,18 @@ impl BackendType> { } match self { - Console(creds) => { - console::Api::new(&urls.auth_endpoint, &creds) + Console(endpoint, creds) => { + console::Api::new(&endpoint, extra, &creds) .handle_user(client) .await } - Postgres(creds) => { - postgres::Api::new(&urls.auth_endpoint, &creds) + Postgres(endpoint, creds) => { + postgres::Api::new(&endpoint, &creds) .handle_user(client) .await } // NOTE: this auth backend doesn't use client credentials. 
- Link => link::handle_user(&urls.auth_link_uri, client).await, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_backend_type_map() { - let values = [ - BackendType::Console(0), - BackendType::Postgres(0), - BackendType::Link, - ]; - - for value in values { - assert_eq!(value.map(|x| x), value); - } - } - - #[test] - fn test_backend_type_transpose() { - let values = [ - BackendType::Console(Ok::<_, ()>(0)), - BackendType::Postgres(Ok(0)), - BackendType::Link, - ]; - - for value in values { - assert_eq!(value.map(Result::unwrap), value.transpose().unwrap()); + Link(url) => link::handle_user(&url, client).await, } } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index e239320e9b..e5ee07813c 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,12 +1,12 @@ //! Cloud API V2. +use super::ConsoleReqExtra; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute::{self, ComputeConnCfg}, error::{io_error, UserFacingError}, - scram, + http, scram, stream::PqStream, - url::ApiUrl, }; use serde::{Deserialize, Serialize}; use std::future::Future; @@ -120,14 +120,23 @@ pub enum AuthInfo { #[must_use] pub(super) struct Api<'a> { - endpoint: &'a ApiUrl, + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, creds: &'a ClientCredentials<'a>, } impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. - pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { - Self { endpoint, creds } + pub(super) fn new( + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, + creds: &'a ClientCredentials, + ) -> Self { + Self { + endpoint, + extra, + creds, + } } /// Authenticate the existing user or throw an error. 
@@ -139,16 +148,22 @@ impl<'a> Api<'a> { } async fn get_auth_info(&self) -> Result { - let mut url = self.endpoint.clone(); - url.path_segments_mut().push("proxy_get_role_secret"); - url.query_pairs_mut() - .append_pair("project", self.creds.project().expect("impossible")) - .append_pair("role", self.creds.user); + let req = self + .endpoint + .get("proxy_get_role_secret") + .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ("role", Some(self.creds.user)), + ]) + .build()?; // TODO: use a proper logger - println!("cplane request: {url}"); + println!("cplane request: {}", req.url()); - let resp = reqwest::get(url.into_inner()).await?; + let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); } @@ -162,15 +177,21 @@ impl<'a> Api<'a> { /// Wake up the compute node and return the corresponding connection info. 
pub(super) async fn wake_compute(&self) -> Result { - let mut url = self.endpoint.clone(); - url.path_segments_mut().push("proxy_wake_compute"); - url.query_pairs_mut() - .append_pair("project", self.creds.project().expect("impossible")); + let req = self + .endpoint + .get("proxy_wake_compute") + .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ]) + .build()?; // TODO: use a proper logger - println!("cplane request: {url}"); + println!("cplane request: {}", req.url()); - let resp = reqwest::get(url.into_inner()).await?; + let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index d740a4c5c4..eefa246eba 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -29,7 +29,7 @@ impl UserFacingError for LinkAuthError { } } -fn hello_message(redirect_uri: &str, session_id: &str) -> String { +fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { format!( concat![ "Welcome to Neon!\n", @@ -46,11 +46,11 @@ pub fn new_psql_session_id() -> String { } pub async fn handle_user( - redirect_uri: &reqwest::Url, + link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { let psql_session_id = new_psql_session_id(); - let greeting = hello_message(redirect_uri.as_str(), &psql_session_id); + let greeting = hello_message(link_uri, &psql_session_id); let db_info = super::with_waiter(psql_session_id, |waiter| async { // Give user a URL to spawn a new database diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8835d660d5..031fa84509 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,16 +1,10 @@ -use crate::{auth, url::ApiUrl}; +use crate::auth; use 
anyhow::{ensure, Context}; use std::sync::Arc; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<()>, - pub auth_urls: AuthUrls, -} - -pub struct AuthUrls { - pub auth_endpoint: ApiUrl, - pub auth_link_uri: ApiUrl, + pub auth_backend: auth::BackendType<'static, ()>, } pub struct TlsConfig { diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 5a75718742..dbeb3dc784 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,27 +1,81 @@ -use anyhow::anyhow; -use hyper::{Body, Request, Response, StatusCode}; -use std::net::TcpListener; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +pub mod server; -async fn status_handler(_: Request) -> Result, ApiError> { - json_response(StatusCode::OK, "") +use crate::url::ApiUrl; + +/// Thin convenience wrapper for an API provided by an http endpoint. +#[derive(Debug, Clone)] +pub struct Endpoint { + /// API's base URL. + endpoint: ApiUrl, + /// Connection manager with built-in pooling. + client: reqwest::Client, } -fn make_router() -> RouterBuilder { - let router = endpoint::make_router(); - router.get("/v1/status", status_handler) -} - -pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { - scopeguard::defer! { - println!("http has shut down"); +impl Endpoint { + /// Construct a new HTTP endpoint wrapper. + pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { + Self { endpoint, client } } - let service = || RouterService::new(make_router().build()?); + pub fn url(&self) -> &ApiUrl { + &self.endpoint + } - hyper::Server::from_tcp(http_listener)? - .serve(service().map_err(|e| anyhow!(e))?) - .await?; + /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// appending a single `path` segment to the base endpoint URL. 
+ pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push(path); + self.client.get(url.into_inner()) + } - Ok(()) + /// Execute a [request](reqwest::Request). + pub async fn execute( + &self, + request: reqwest::Request, + ) -> Result { + self.client.execute(request).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn optional_query_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + // Validate that this pattern makes sense. + let req = endpoint + .get("frobnicate") + .query(&[ + ("foo", Some("10")), // should be just `foo=10` + ("bar", None), // shouldn't be passed at all + ]) + .build()?; + + assert_eq!(req.url().as_str(), "http://example.com/frobnicate?foo=10"); + + Ok(()) + } + + #[test] + fn uuid_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + let req = endpoint + .get("frobnicate") + .query(&[("session_id", uuid::Uuid::nil())]) + .build()?; + + assert_eq!( + req.url().as_str(), + "http://example.com/frobnicate?session_id=00000000-0000-0000-0000-000000000000" + ); + + Ok(()) + } } diff --git a/proxy/src/http/server.rs b/proxy/src/http/server.rs new file mode 100644 index 0000000000..5a75718742 --- /dev/null +++ b/proxy/src/http/server.rs @@ -0,0 +1,27 @@ +use anyhow::anyhow; +use hyper::{Body, Request, Response, StatusCode}; +use std::net::TcpListener; +use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; + +async fn status_handler(_: Request) -> Result, ApiError> { + json_response(StatusCode::OK, "") +} + +fn make_router() -> RouterBuilder { + let router = endpoint::make_router(); + router.get("/v1/status", status_handler) +} + +pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { + scopeguard::defer! 
{ + println!("http has shut down"); + } + + let service = || RouterService::new(make_router().build()?); + + hyper::Server::from_tcp(http_listener)? + .serve(service().map_err(|e| anyhow!(e))?) + .await?; + + Ok(()) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index efe45f6386..f2dc7425ba 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -23,7 +23,7 @@ use anyhow::{bail, Context}; use clap::{self, Arg}; use config::ProxyConfig; use futures::FutureExt; -use std::{future::Future, net::SocketAddr}; +use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use utils::project_git_version; @@ -36,23 +36,6 @@ async fn flatten_err( f.map(|r| r.context("join error").and_then(|x| x)).await } -/// A proper parser for auth backend parameter. -impl clap::ValueEnum for auth::BackendType<()> { - fn value_variants<'a>() -> &'a [Self] { - use auth::BackendType::*; - &[Console(()), Postgres(()), Link] - } - - fn to_possible_value<'a>(&self) -> Option> { - use auth::BackendType::*; - Some(clap::PossibleValue::new(match self { - Console(_) => "console", - Postgres(_) => "postgres", - Link => "link", - })) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let arg_matches = clap::App::new("Neon proxy/router") @@ -69,7 +52,7 @@ async fn main() -> anyhow::Result<()> { Arg::new("auth-backend") .long("auth-backend") .takes_value(true) - .value_parser(clap::builder::EnumValueParser::>::new()) + .possible_values(["console", "postgres", "link"]) .default_value("link"), ) .arg( @@ -135,23 +118,30 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; - let auth_backend = *arg_matches - .try_get_one::>("auth-backend")? 
- .unwrap(); - - let auth_urls = config::AuthUrls { - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, - auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + let auth_backend = match arg_matches.value_of("auth-backend").unwrap() { + "console" => { + let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + auth::BackendType::Console(Cow::Owned(endpoint), ()) + } + "postgres" => { + let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + auth::BackendType::Postgres(Cow::Owned(url), ()) + } + "link" => { + let url = arg_matches.value_of("uri").unwrap().parse()?; + auth::BackendType::Link(Cow::Owned(url)) + } + other => bail!("unsupported auth backend: {other}"), }; let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, - auth_urls, })); println!("Version: {GIT_VERSION}"); - println!("Authentication backend: {:?}", config.auth_backend); + println!("Authentication backend: {}", config.auth_backend); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); @@ -164,7 +154,7 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let tasks = [ - tokio::spawn(http::thread_main(http_listener)), + tokio::spawn(http::server::thread_main(http_listener)), tokio::spawn(proxy::thread_main(config, proxy_listener)), tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), ] diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 72cb822910..efb1b6f358 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,6 +1,6 @@ use crate::auth; use crate::cancellation::{self, CancelMap}; -use crate::config::{AuthUrls, ProxyConfig, TlsConfig}; +use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -99,6 +99,7 @@ 
async fn handle_client( let common_name = tls.and_then(|tls| tls.common_name.as_deref()); let result = config .auth_backend + .as_ref() .map(|_| auth::ClientCredentials::parse(&params, sni, common_name)) .transpose(); @@ -107,7 +108,7 @@ async fn handle_client( let client = Client::new(stream, creds, &params); cancel_map - .with_session(|session| client.connect_to_db(&config.auth_urls, session)) + .with_session(|session| client.connect_to_db(session)) .await } @@ -179,7 +180,7 @@ struct Client<'a, S> { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::BackendType>, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, /// KV-dictionary with PostgreSQL connection params. params: &'a StartupMessageParams, } @@ -188,7 +189,7 @@ impl<'a, S> Client<'a, S> { /// Construct a new connection context. fn new( stream: PqStream, - creds: auth::BackendType>, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, params: &'a StartupMessageParams, ) -> Self { Self { @@ -201,19 +202,22 @@ impl<'a, S> Client<'a, S> { impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. - async fn connect_to_db( - self, - urls: &AuthUrls, - session: cancellation::Session<'_>, - ) -> anyhow::Result<()> { + async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> { let Self { mut stream, creds, params, } = self; + let extra = auth::ConsoleReqExtra { + // Currently it's OK to generate a new UUID **here**, but + // it might be better to move this to `cancellation::Session`. + session_id: uuid::Uuid::new_v4(), + application_name: params.get("application_name"), + }; + // Authenticate and connect to a compute node.
- let auth = creds.authenticate(urls, &mut stream).await; + let auth = creds.authenticate(&extra, &mut stream).await; let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; let reported_auth_ok = node.reported_auth_ok; diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 76d6ad0e66..92c64bb8ad 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -1,8 +1,8 @@ use anyhow::bail; -use url::form_urlencoded::Serializer; /// A [url](url::Url) type with additional guarantees. -#[derive(Debug, Clone)] +#[repr(transparent)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ApiUrl(url::Url); impl ApiUrl { @@ -11,11 +11,6 @@ impl ApiUrl { self.0 } - /// See [`url::Url::query_pairs_mut`]. - pub fn query_pairs_mut(&mut self) -> Serializer<'_, url::UrlQuery<'_>> { - self.0.query_pairs_mut() - } - /// See [`url::Url::path_segments_mut`]. pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { // We've already verified that it works during construction. @@ -72,10 +67,7 @@ mod tests { let mut b = url.parse::().expect("unexpected parsing failure"); a.path_segments_mut().unwrap().push("method"); - a.query_pairs_mut().append_pair("key", "value"); - b.path_segments_mut().push("method"); - b.query_pairs_mut().append_pair("key", "value"); assert_eq!(a, b.into_inner()); } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index dc4cbb5284..3670ca5fea 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -43,6 +43,7 @@ tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["once_cell", "std"] } +uuid = { version = "0.8", features = ["getrandom", "serde", "std", "v4"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } From f3073a4db93e2d4e39e2bbef03ed6b742ef3afa0 Mon 
Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Sep 2022 08:35:06 +0300 Subject: [PATCH 101/166] R-Tree layer map (#2317) Replace the layer array and linear search with R-tree So far, the in-memory layer map that holds information about layer files that exist has used a simple Vec, in no particular order, to hold information about all the layers. That obviously doesn't scale very well; with thousands of layer files the linear search was consuming a lot of CPU. Replace it with a two-dimensional R-tree, with Key and LSN ranges as the dimensions. For the R-tree, use the 'rstar' crate. To be able to use that, we convert the Keys and LSNs into 256-bit integers. 64 bits would be enough to represent LSNs, and 128 bits would be enough to represent Keys. However, we use 256 bits, because rstar internally performs multiplication to calculate the area of rectangles, and the result of multiplying two 128-bit integers doesn't necessarily fit in 128 bits, causing integer overflow and, if overflow-checks are enabled, panic. To avoid that, we use 256-bit integers. Add a performance test that creates a lot of layer files, to demonstrate the benefit.
--- Cargo.lock | 222 +++++++++++++- pageserver/Cargo.toml | 3 + pageserver/src/repository.rs | 13 + pageserver/src/tenant/delta_layer.rs | 2 +- pageserver/src/tenant/layer_map.rs | 347 +++++++++++++++++----- pageserver/src/tenant/timeline.rs | 2 +- test_runner/performance/test_layer_map.py | 39 +++ workspace_hack/Cargo.toml | 3 +- 8 files changed, 548 insertions(+), 83 deletions(-) create mode 100644 test_runner/performance/test_layer_map.py diff --git a/Cargo.lock b/Cargo.lock index 0579d381cc..ddb10352b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,6 +37,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "amplify_num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d" + [[package]] name = "ansi_term" version = "0.12.1" @@ -135,6 +141,15 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-polyfill" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89" +dependencies = [ + "critical-section", +] + [[package]] name = "atty" version = "0.2.14" @@ -212,6 +227,21 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bare-metal" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3" +dependencies = [ + "rustc_version 0.2.3", +] + +[[package]] +name = "bare-metal" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603" + [[package]] name = "base64" version = "0.13.0" @@ -250,6 +280,18 @@ dependencies = [ "which", ] +[[package]] +name = "bit_field" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" + 
+[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + [[package]] name = "bitflags" version = "1.3.2" @@ -528,6 +570,18 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +[[package]] +name = "cortex-m" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0" +dependencies = [ + "bare-metal 0.2.5", + "bitfield", + "embedded-hal", + "volatile-register", +] + [[package]] name = "cpp_demangle" version = "0.3.5" @@ -552,7 +606,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version", + "rustc_version 0.4.0", ] [[package]] @@ -600,6 +654,18 @@ dependencies = [ "itertools", ] +[[package]] +name = "critical-section" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd" +dependencies = [ + "bare-metal 1.0.0", + "cfg-if", + "cortex-m", + "riscv", +] + [[package]] name = "crossbeam-channel" version = "0.5.6" @@ -844,6 +910,16 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" +[[package]] +name = "embedded-hal" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff" +dependencies = [ + "nb 0.1.3", + "void", +] + [[package]] name = "encoding_rs" version = "0.8.31" @@ -1165,6 +1241,15 @@ version = "1.8.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1174,6 +1259,19 @@ dependencies = [ "ahash", ] +[[package]] +name = "heapless" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version 0.4.0", + "spin 0.9.4", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.3.3" @@ -1491,6 +1589,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "libm" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" + [[package]] name = "lock_api" version = "0.4.7" @@ -1649,6 +1753,21 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nb" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f" +dependencies = [ + "nb 1.0.0", +] + +[[package]] +name = "nb" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae" + [[package]] name = "nix" version = "0.23.1" @@ -1716,6 +1835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1828,6 +1948,7 @@ checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" name = "pageserver" version = "0.1.0" 
dependencies = [ + "amplify_num", "anyhow", "async-stream", "async-trait", @@ -1852,6 +1973,7 @@ dependencies = [ "itertools", "metrics", "nix", + "num-traits", "once_cell", "postgres", "postgres-protocol", @@ -1861,6 +1983,7 @@ dependencies = [ "rand", "regex", "remote_storage", + "rstar", "scopeguard", "serde", "serde_json", @@ -2515,12 +2638,33 @@ dependencies = [ "cc", "libc", "once_cell", - "spin", + "spin 0.5.2", "untrusted", "web-sys", "winapi", ] +[[package]] +name = "riscv" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba" +dependencies = [ + "bare-metal 1.0.0", + "bit_field", + "riscv-target", +] + +[[package]] +name = "riscv-target" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "routerify" version = "3.0.0" @@ -2534,6 +2678,17 @@ dependencies = [ "regex", ] +[[package]] +name = "rstar" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + [[package]] name = "rstest" version = "0.12.0" @@ -2543,7 +2698,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "rustc_version", + "rustc_version 0.4.0", "syn", ] @@ -2565,7 +2720,7 @@ dependencies = [ "log", "rusoto_credential", "rusoto_signature", - "rustc_version", + "rustc_version 0.4.0", "serde", "serde_json", "tokio", @@ -2623,7 +2778,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rusoto_credential", - "rustc_version", + "rustc_version 0.4.0", "serde", "sha2 0.9.9", "tokio", @@ -2641,13 +2796,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver", + "semver 1.0.13", ] [[package]] @@ -2800,12 +2964,27 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + [[package]] name = "semver" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711" +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" version = "1.0.142" @@ -2999,6 +3178,15 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +dependencies = [ + "lock_api", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -3675,6 +3863,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcell" +version = "0.1.3" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" + [[package]] name = "vcpkg" version = "0.2.15" @@ -3687,6 +3881,21 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "volatile-register" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6" +dependencies = [ + "vcell", +] + [[package]] name = "wal_craft" version = "0.1.0" @@ -3952,6 +4161,7 @@ dependencies = [ "regex-syntax", "scopeguard", "serde", + "stable_deref_trait", "syn", "time 0.3.12", "tokio", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 85ece97d9b..1ec7ec4f98 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -54,6 +54,9 @@ once_cell = "1.13.0" crossbeam-utils = "0.8.5" fail = "0.5.0" git-version = "0.3.5" +rstar = "0.9.3" +num-traits = "0.2.15" +amplify_num = "0.4.1" postgres_ffi = { path = "../libs/postgres_ffi" } etcd_broker = { path = "../libs/etcd_broker" } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index cfcc87a2ed..0c2fedd7d5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -24,6 +24,19 @@ pub struct Key { pub const KEY_SIZE: usize = 18; impl Key { + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. 
+ /// As long as Neon does not support tablespace (because of lack of access to local file system), + /// we can assume that only some predefined namespace OIDs are used which can fit in u16 + pub fn to_i128(&self) -> i128 { + assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + (((self.field1 & 0xf) as i128) << 120) + | (((self.field2 & 0xFFFF) as i128) << 104) + | ((self.field3 as i128) << 72) + | ((self.field4 as i128) << 40) + | ((self.field5 as i128) << 32) + | self.field6 as i128 + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index 892000c20b..57c5be91a4 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -713,7 +713,7 @@ impl DeltaLayerWriter { for buf in block_buf.blocks { file.write_all(buf.as_ref())?; } - + assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 let summary = Summary { magic: DELTA_FILE_MAGIC, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 8abeebf54c..495833e3ae 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -15,9 +15,15 @@ use crate::repository::Key; use crate::tenant::inmemory_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; use crate::tenant::storage_layer::{range_eq, range_overlaps}; +use amplify_num::i256; use anyhow::Result; +use num_traits::identities::{One, Zero}; +use num_traits::{Bounded, Num, Signed}; +use rstar::{RTree, RTreeObject, AABB}; +use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; +use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; @@ -47,14 +53,163 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// All the historic layers are kept here + historic_layers: RTree, - /// TODO: This is a placeholder implementation of a data structure - /// 
to hold information about all the layer files on disk and in - /// S3. Currently, it's just a vector and all operations perform a - /// linear scan over it. That obviously becomes slow as the - /// number of layers grows. I'm imagining that an R-tree or some - /// other 2D data structure would be the long-term solution here. - historic_layers: Vec>, + /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. + /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + l0_delta_layers: Vec>, +} + +struct LayerRTreeObject { + layer: Arc, +} + +// Representation of Key as numeric type. +// We can not use native implementation of i128, because rstar::RTree +// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). +// Overflow will cause panic in debug mode and incorrect area calculation in release mode, +// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). +// By using i256 as the type, even though all the actual values would fit in i128, we can be +// sure that multiplication doesn't overflow. 
+// + +#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] +struct IntKey(i256); + +impl Copy for IntKey {} + +impl IntKey { + fn from(i: i128) -> Self { + IntKey(i256::from(i)) + } +} + +impl Bounded for IntKey { + fn min_value() -> Self { + IntKey(i256::MIN) + } + fn max_value() -> Self { + IntKey(i256::MAX) + } +} + +impl Signed for IntKey { + fn is_positive(&self) -> bool { + self.0 > i256::ZERO + } + fn is_negative(&self) -> bool { + self.0 < i256::ZERO + } + fn signum(&self) -> Self { + match self.0.cmp(&i256::ZERO) { + Ordering::Greater => IntKey(i256::ONE), + Ordering::Less => IntKey(-i256::ONE), + Ordering::Equal => IntKey(i256::ZERO), + } + } + fn abs(&self) -> Self { + IntKey(self.0.abs()) + } + fn abs_sub(&self, other: &Self) -> Self { + if self.0 <= other.0 { + IntKey(i256::ZERO) + } else { + IntKey(self.0 - other.0) + } + } +} + +impl Neg for IntKey { + type Output = Self; + fn neg(self) -> Self::Output { + IntKey(-self.0) + } +} + +impl Rem for IntKey { + type Output = Self; + fn rem(self, rhs: Self) -> Self::Output { + IntKey(self.0 % rhs.0) + } +} + +impl Div for IntKey { + type Output = Self; + fn div(self, rhs: Self) -> Self::Output { + IntKey(self.0 / rhs.0) + } +} + +impl Add for IntKey { + type Output = Self; + fn add(self, rhs: Self) -> Self::Output { + IntKey(self.0 + rhs.0) + } +} + +impl Sub for IntKey { + type Output = Self; + fn sub(self, rhs: Self) -> Self::Output { + IntKey(self.0 - rhs.0) + } +} + +impl Mul for IntKey { + type Output = Self; + fn mul(self, rhs: Self) -> Self::Output { + IntKey(self.0 * rhs.0) + } +} + +impl One for IntKey { + fn one() -> Self { + IntKey(i256::ONE) + } +} + +impl Zero for IntKey { + fn zero() -> Self { + IntKey(i256::ZERO) + } + fn is_zero(&self) -> bool { + self.0 == i256::ZERO + } +} + +impl Num for IntKey { + type FromStrRadixErr = ::FromStrRadixErr; + fn from_str_radix(str: &str, radix: u32) -> Result { + Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) + } +} + +impl PartialEq for 
LayerRTreeObject { + fn eq(&self, other: &Self) -> bool { + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. + #[allow(clippy::vtable_address_comparisons)] + Arc::ptr_eq(&self.layer, &other.layer) + } +} + +impl RTreeObject for LayerRTreeObject { + type Envelope = AABB<[IntKey; 2]>; + fn envelope(&self) -> Self::Envelope { + let key_range = self.layer.get_key_range(); + let lsn_range = self.layer.get_lsn_range(); + AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive + ) + } } /// Return value of LayerMap::search @@ -80,19 +235,24 @@ impl LayerMap { // Find the latest image layer that covers the given key let mut latest_img: Option> = None; let mut latest_img_lsn: Option = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0i128)], + [ + IntKey::from(key.to_i128()), + IntKey::from(end_lsn.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } + assert!(l.get_key_range().contains(&key)); let img_lsn = l.get_lsn_range().start; - - if img_lsn >= end_lsn { - // too new - continue; - } + assert!(img_lsn < end_lsn); if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match return Ok(Some(SearchResult { @@ -108,19 +268,24 @@ impl LayerMap { // Search the delta layers let mut latest_delta: Option> = None; - for l in self.historic_layers.iter() { + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + 
{ + let l = &e.layer; if !l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); if l.get_lsn_range().start >= end_lsn { - // too new - continue; + info!( + "Candidate delta layer {}..{} is too new for lsn {}", + l.get_lsn_range().start, + l.get_lsn_range().end, + end_lsn + ); } - + assert!(l.get_lsn_range().start < end_lsn); if l.get_lsn_range().end >= end_lsn { // this layer contains the requested point in the key/lsn space. // No need to search any further @@ -170,7 +335,10 @@ impl LayerMap { /// Insert an on-disk layer /// pub fn insert_historic(&mut self, layer: Arc) { - self.historic_layers.push(layer); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + self.l0_delta_layers.push(layer.clone()); + } + self.historic_layers.insert(LayerRTreeObject { layer }); NUM_ONDISK_LAYERS.inc(); } @@ -180,17 +348,22 @@ impl LayerMap { /// This should be called when the corresponding file on disk has been deleted. /// pub fn remove_historic(&mut self, layer: Arc) { - let len_before = self.historic_layers.len(); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + let len_before = self.l0_delta_layers.len(); - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - self.historic_layers - .retain(|other| !Arc::ptr_eq(other, &layer)); - - assert_eq!(self.historic_layers.len(), len_before - 1); + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. 
+ #[allow(clippy::vtable_address_comparisons)] + self.l0_delta_layers + .retain(|other| !Arc::ptr_eq(other, &layer)); + assert_eq!(self.l0_delta_layers.len(), len_before - 1); + } + assert!(self + .historic_layers + .remove(&LayerRTreeObject { layer }) + .is_some()); NUM_ONDISK_LAYERS.dec(); } @@ -207,15 +380,26 @@ impl LayerMap { loop { let mut made_progress = false; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [ + IntKey::from(range_remain.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(range_remain.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } let img_lsn = l.get_lsn_range().start; - if !l.is_incremental() - && l.get_key_range().contains(&range_remain.start) - && lsn_range.contains(&img_lsn) - { + if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { made_progress = true; let img_key_end = l.get_key_range().end; @@ -232,8 +416,8 @@ impl LayerMap { } } - pub fn iter_historic_layers(&self) -> impl Iterator> { - self.historic_layers.iter() + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + self.historic_layers.iter().map(|e| e.layer.clone()) } /// Find the last image layer that covers 'key', ignoring any image layers @@ -241,19 +425,22 @@ impl LayerMap { fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { let mut candidate_lsn = Lsn(0); let mut candidate = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0)], + [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + 
assert!(l.get_key_range().contains(&key)); let this_lsn = l.get_lsn_range().start; - if this_lsn > lsn { - continue; - } + assert!(this_lsn <= lsn); if this_lsn < candidate_lsn { // our previous candidate was better continue; @@ -279,10 +466,19 @@ impl LayerMap { lsn: Lsn, ) -> Result, Option>)>> { let mut points = vec![key_range.start]; - for l in self.historic_layers.iter() { - if l.get_lsn_range().start > lsn { - continue; - } + let envelope = AABB::from_corners( + [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], + [ + IntKey::from(key_range.end.to_i128()), + IntKey::from(lsn.0 as i128), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + assert!(l.get_lsn_range().start <= lsn); let range = l.get_key_range(); if key_range.contains(&range.start) { points.push(l.get_key_range().start); @@ -315,16 +511,29 @@ impl LayerMap { /// given key and LSN range. pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; - for l in self.historic_layers.iter() { + if lsn_range.start >= lsn_range.end { + return Ok(0); + } + let envelope = AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !range_overlaps(&l.get_lsn_range(), lsn_range) { - continue; - } - if !range_overlaps(&l.get_key_range(), key_range) { - continue; - } + assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); + assert!(range_overlaps(&l.get_key_range(), key_range)); // We ignore level0 delta layers. 
Unless the whole keyspace fits // into one partition @@ -341,17 +550,7 @@ impl LayerMap { /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - let mut deltas = Vec::new(); - for l in self.historic_layers.iter() { - if !l.is_incremental() { - continue; - } - if l.get_key_range() != (Key::MIN..Key::MAX) { - continue; - } - deltas.push(Arc::clone(l)); - } - Ok(deltas) + Ok(self.l0_delta_layers.clone()) } /// debugging function to print out the contents of the layer map @@ -370,8 +569,8 @@ impl LayerMap { } println!("historic_layers:"); - for layer in self.historic_layers.iter() { - layer.dump(verbose)?; + for e in self.historic_layers.iter() { + e.layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b80d023c7f..6de1d44876 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2050,7 +2050,7 @@ impl Timeline { l.filename().display(), l.is_incremental(), ); - layers_to_remove.push(Arc::clone(l)); + layers_to_remove.push(Arc::clone(&l)); } // Actually delete the layers from disk and remove them from the map. diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py new file mode 100644 index 0000000000..d71fb6d12c --- /dev/null +++ b/test_runner/performance/test_layer_map.py @@ -0,0 +1,39 @@ +import time + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# +# Benchmark searching the layer map, when there are a lot of small layer files. +# +def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): + + env = neon_env_builder.init_start() + n_iters = 10 + n_records = 100000 + + # We want to have a lot of lot of layer files to exercise the layer map. Make + # gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files. 
+ tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "100 m", + "gc_horizon": "1048576", + "checkpoint_distance": "8192", + "compaction_period": "1 s", + "compaction_threshold": "1", + "compaction_target_size": "8192", + } + ) + + env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) + pg = env.postgres.create_start("test_layer_map", tenant_id=tenant) + cur = pg.connect().cursor() + cur.execute("create table t(x integer)") + for i in range(n_iters): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") + time.sleep(1) + + cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): + cur.execute("SELECT count(*) from t") + assert cur.fetchone() == (n_iters * n_records,) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3670ca5fea..f37a42945e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,7 +30,7 @@ memchr = { version = "2", features = ["std", "use_std"] } nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } -num-traits = { version = "0.2", features = ["i128", "std"] } +num-traits = { version = "0.2", features = ["i128", "libm", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } @@ -38,6 +38,7 @@ regex-automata = { version = "0.1", features = ["regex-syntax", "std"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", 
"unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +stable_deref_trait = { version = "1", features = ["alloc", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } From e764c1e60fd8e7afaf346bc70f0b9269097e8a1a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 22 Sep 2022 01:02:53 +0300 Subject: [PATCH 102/166] remove self argument from several spans --- pageserver/src/page_service.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9e159f7391..7de6403b83 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -663,7 +663,7 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, @@ -680,7 +680,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, @@ -697,7 +697,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn 
handle_db_size_request( &self, timeline: &Timeline, @@ -717,7 +717,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, From 86bf4919817d34a2e56590596eb5f8270ce8b79e Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 17:09:28 +0300 Subject: [PATCH 103/166] Support pg 15 - Split postgres_ffi into two version specific files. - Preserve pg_version in timeline metadata. - Use pg_version in safekeeper code. Check for postgres major version mismatch. - Clean up the code to use DEFAULT_PG_VERSION constant everywhere, instead of hardcoding. - Parameterize python tests: use DEFAULT_PG_VERSION env and pg_version fixture. To run tests using a specific PostgreSQL version, pass the DEFAULT_PG_VERSION environment variable: 'DEFAULT_PG_VERSION='15' ./scripts/pytest test_runner/regress' Currently don't all tests pass, because rust code relies on the default version of PostgreSQL in a few places. 
--- control_plane/src/bin/neon_local.rs | 95 +++++++++++-- control_plane/src/compute.rs | 49 +++++-- control_plane/src/local_env.rs | 48 +++++-- control_plane/src/storage.rs | 22 ++- libs/postgres_ffi/src/lib.rs | 129 +++++++++++++++++- libs/postgres_ffi/src/nonrelfile_utils.rs | 2 +- libs/postgres_ffi/src/pg_constants.rs | 19 +-- libs/postgres_ffi/src/pg_constants_v14.rs | 5 + libs/postgres_ffi/src/pg_constants_v15.rs | 10 ++ libs/postgres_ffi/src/relfile_utils.rs | 25 ++-- libs/postgres_ffi/src/waldecoder.rs | 49 +------ libs/postgres_ffi/src/xlog_utils.rs | 38 +++++- pageserver/src/basebackup.rs | 82 +++++------ pageserver/src/bin/update_metadata.rs | 2 + pageserver/src/config.rs | 45 ++++-- pageserver/src/http/models.rs | 1 + pageserver/src/http/routes.rs | 1 + pageserver/src/import_datadir.rs | 20 +-- pageserver/src/lib.rs | 2 + pageserver/src/page_service.rs | 31 ++++- pageserver/src/pgdatadir_mapping.rs | 10 +- pageserver/src/reltag.rs | 6 +- pageserver/src/storage_sync.rs | 12 +- pageserver/src/storage_sync/index.rs | 23 +++- pageserver/src/tenant.rs | 49 ++++--- pageserver/src/tenant/metadata.rs | 9 ++ pageserver/src/tenant/timeline.rs | 17 ++- pageserver/src/walingest.rs | 83 +++++++---- .../src/walreceiver/connection_manager.rs | 2 +- .../src/walreceiver/walreceiver_connection.rs | 4 +- pageserver/src/walrecord.rs | 38 ++++-- pageserver/src/walredo.rs | 30 ++-- safekeeper/src/json_ctrl.rs | 11 +- safekeeper/src/safekeeper.rs | 19 ++- safekeeper/src/send_wal.rs | 2 +- safekeeper/src/wal_backup.rs | 3 +- safekeeper/src/wal_storage.rs | 10 +- test_runner/fixtures/neon_fixtures.py | 30 +++- test_runner/regress/test_import.py | 5 + test_runner/regress/test_pg_regress.py | 18 ++- test_runner/regress/test_wal_acceptor.py | 9 +- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 43 files changed, 777 insertions(+), 292 deletions(-) create mode 100644 libs/postgres_ffi/src/pg_constants_v14.rs create mode 100644 libs/postgres_ffi/src/pg_constants_v15.rs diff 
--git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e16fd8764a..92782ea235 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -39,6 +39,8 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); +const DEFAULT_PG_VERSION: &str = "14"; + fn default_conf(etcd_binary_path: &Path) -> String { format!( r#" @@ -105,6 +107,13 @@ fn main() -> Result<()> { .takes_value(true) .required(false); + let pg_version_arg = Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(false) + .takes_value(true) + .default_value(DEFAULT_PG_VERSION); + let port_arg = Arg::new("port") .long("port") .required(false) @@ -146,6 +155,7 @@ fn main() -> Result<()> { .required(false) .value_name("config"), ) + .arg(pg_version_arg.clone()) ) .subcommand( App::new("timeline") @@ -164,7 +174,9 @@ fn main() -> Result<()> { .subcommand(App::new("create") .about("Create a new blank timeline") .arg(tenant_id_arg.clone()) - .arg(branch_name_arg.clone())) + .arg(branch_name_arg.clone()) + .arg(pg_version_arg.clone()) + ) .subcommand(App::new("import") .about("Import timeline from basebackup directory") .arg(tenant_id_arg.clone()) @@ -178,7 +190,9 @@ fn main() -> Result<()> { .arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true) .help("Wal to add after base")) .arg(Arg::new("end-lsn").long("end-lsn").takes_value(true) - .help("Lsn the basebackup ends at"))) + .help("Lsn the basebackup ends at")) + .arg(pg_version_arg.clone()) + ) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) @@ -188,6 +202,7 @@ fn main() -> Result<()> { .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + 
.arg(pg_version_arg.clone()) ) .subcommand(App::new("config") .arg(tenant_id_arg.clone()) @@ -239,8 +254,9 @@ fn main() -> Result<()> { Arg::new("config-only") .help("Don't do basebackup, create compute node with only config files") .long("config-only") - .required(false) - )) + .required(false)) + .arg(pg_version_arg.clone()) + ) .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") .arg(pg_node_arg.clone()) @@ -248,7 +264,9 @@ fn main() -> Result<()> { .arg(branch_name_arg.clone()) .arg(timeline_id_arg.clone()) .arg(lsn_arg.clone()) - .arg(port_arg.clone())) + .arg(port_arg.clone()) + .arg(pg_version_arg.clone()) + ) .subcommand( App::new("stop") .arg(pg_node_arg.clone()) @@ -501,9 +519,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { default_conf(&EtcdBroker::locate_etcd()?) }; + let pg_version = init_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + let mut env = LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - env.init().context("Failed to initialize neon repository")?; + env.init(pg_version) + .context("Failed to initialize neon repository")?; let initial_tenant_id = env .default_tenant_id .expect("default_tenant_id should be generated by the `env.init()` call above"); @@ -515,6 +540,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { Some(initial_tenant_id), initial_timeline_id_arg, &pageserver_config_overrides(init_match), + pg_version, ) .unwrap_or_else(|e| { eprintln!("pageserver init failed: {e}"); @@ -557,8 +583,19 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; - let timeline_info = - pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?; + 
let pg_version = create_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + let timeline_info = pageserver.timeline_create( + new_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info .local @@ -607,7 +644,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_branch_name = create_match .value_of("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; - let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?; + + let pg_version = create_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + let timeline_info = + pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info @@ -655,7 +700,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?; println!("Creating node for imported timeline ..."); env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - cplane.new_node(tenant_id, name, timeline_id, None, None)?; + + let pg_version = import_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; println!("Done"); } Some(("branch", branch_match)) => { @@ -682,6 +734,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - None, start_lsn, Some(ancestor_timeline_id), + None, )?; let new_timeline_id = timeline_info.timeline_id; @@ -797,7 +850,14 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { Some(p) => 
Some(p.parse()?), None => None, }; - cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?; + + let pg_version = sub_args + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?; } "start" => { let port: Option = match sub_args.value_of("port") { @@ -835,16 +895,23 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .map(Lsn::from_str) .transpose() .context("Failed to parse Lsn from the request")?; + let pg_version = sub_args + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument println!( - "Starting new postgres {} on timeline {} ...", - node_name, timeline_id + "Starting new postgres (v{}) {} on timeline {} ...", + pg_version, node_name, timeline_id ); - let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; + + let node = + cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?; node.start(&auth_token)?; } } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index b678d620df..89994c5647 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -18,7 +18,7 @@ use utils::{ postgres_backend::AuthType, }; -use crate::local_env::LocalEnv; +use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; use crate::postgresql_conf::PostgresConf; use crate::storage::PageServerNode; @@ -81,6 +81,7 @@ impl ComputeControlPlane { timeline_id: TimelineId, lsn: Option, port: Option, + pg_version: u32, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { @@ -93,6 +94,7 @@ impl 
ComputeControlPlane { lsn, tenant_id, uses_wal_proposer: false, + pg_version, }); node.create_pgdata()?; @@ -118,6 +120,7 @@ pub struct PostgresNode { pub lsn: Option, // if it's a read-only node. None for primary pub tenant_id: TenantId, uses_wal_proposer: bool, + pg_version: u32, } impl PostgresNode { @@ -152,6 +155,14 @@ impl PostgresNode { let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); + // Read postgres version from PG_VERSION file to determine which postgres version binary to use. + // If it doesn't exist, assume broken data directory and use default pg version. + let pg_version_path = entry.path().join("PG_VERSION"); + + let pg_version_str = + fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); + let pg_version = u32::from_str(&pg_version_str)?; + // parse recovery_target_lsn, if any let recovery_target_lsn: Option = conf.parse_field_optional("recovery_target_lsn", &context)?; @@ -167,17 +178,24 @@ impl PostgresNode { lsn: recovery_target_lsn, tenant_id, uses_wal_proposer, + pg_version, }) } - fn sync_safekeepers(&self, auth_token: &Option) -> Result { - let pg_path = self.env.pg_bin_dir().join("postgres"); + fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { + let pg_path = self.env.pg_bin_dir(pg_version).join("postgres"); let mut cmd = Command::new(&pg_path); cmd.arg("--sync-safekeepers") .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version).to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version).to_str().unwrap(), + ) .env("PGDATA", self.pgdata().to_str().unwrap()) .stdout(Stdio::piped()) // Comment this to avoid capturing stderr (useful if command hangs) @@ -259,8 +277,8 @@ impl PostgresNode { }) } - // Connect to a 
page server, get base backup, and untar it to initialize a - // new data directory + // Write postgresql.conf with default configuration + // and PG_VERSION file to the data directory of a new node. fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); @@ -357,6 +375,9 @@ impl PostgresNode { let mut file = File::create(self.pgdata().join("postgresql.conf"))?; file.write_all(conf.to_string().as_bytes())?; + let mut file = File::create(self.pgdata().join("PG_VERSION"))?; + file.write_all(self.pg_version.to_string().as_bytes())?; + Ok(()) } @@ -368,7 +389,7 @@ impl PostgresNode { // latest data from the pageserver. That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again // when things would be more stable (TODO). - let lsn = self.sync_safekeepers(auth_token)?; + let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; if lsn == Lsn(0) { None } else { @@ -401,7 +422,7 @@ impl PostgresNode { } fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { - let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl"); + let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl"); let mut cmd = Command::new(pg_ctl_path); cmd.args( [ @@ -417,8 +438,14 @@ impl PostgresNode { .concat(), ) .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + ); if let Some(token) = auth_token { cmd.env("ZENITH_AUTH_TOKEN", token); } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 7afaad26dc..14bb4cf346 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,6 +20,8 @@ use utils::{ use 
crate::safekeeper::SafekeeperNode; +pub const DEFAULT_PG_VERSION: u32 = 14; + // // This data structures represents neon_local CLI config // @@ -195,12 +197,40 @@ impl Default for SafekeeperConf { } impl LocalEnv { - // postgres installation paths - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + let mut path = self.pg_distrib_dir.clone(); + + if pg_version != DEFAULT_PG_VERSION { + // step up to the parent directory + // We assume that the pg_distrib subdirs + // for different pg versions + // are located in the same directory + // and follow the naming convention: v14, v15, etc. + path.pop(); + + match pg_version { + 14 => return path.join(format!("v{pg_version}")), + 15 => return path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), + }; + } + + path } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + + pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("bin"), + 15 => self.pg_distrib_dir(pg_version).join("bin"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("lib"), + 15 => self.pg_distrib_dir(pg_version).join("lib"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } pub fn pageserver_bin(&self) -> anyhow::Result { @@ -290,6 +320,8 @@ impl LocalEnv { // Find postgres binaries. // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. 
if env.pg_distrib_dir == Path::new("") { if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { env.pg_distrib_dir = postgres_bin.into(); @@ -384,7 +416,7 @@ impl LocalEnv { // // Initialize a new Neon repository // - pub fn init(&mut self) -> anyhow::Result<()> { + pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; ensure!( @@ -397,10 +429,10 @@ impl LocalEnv { "directory '{}' already exists. Perhaps already initialized?", base_path.display() ); - if !self.pg_distrib_dir.join("bin/postgres").exists() { + if !self.pg_bin_dir(pg_version).join("postgres").exists() { bail!( "Can't find postgres binary at {}", - self.pg_distrib_dir.display() + self.pg_bin_dir(pg_version).display() ); } for binary in ["pageserver", "safekeeper"] { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 3bbbdc5865..95ade14fbf 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -112,11 +112,15 @@ impl PageServerNode { create_tenant: Option, initial_timeline_id: Option, config_overrides: &[&str], + pg_version: u32, ) -> anyhow::Result { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
- let pg_distrib_dir_param = - format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); + let pg_distrib_dir_param = format!( + "pg_distrib_dir='{}'", + self.env.pg_distrib_dir(pg_version).display() + ); + let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); let listen_http_addr_param = format!( "listen_http_addr='{}'", @@ -159,7 +163,7 @@ impl PageServerNode { self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?; let init_result = self - .try_init_timeline(create_tenant, initial_timeline_id) + .try_init_timeline(create_tenant, initial_timeline_id, pg_version) .context("Failed to create initial tenant and timeline for pageserver"); match &init_result { Ok(initial_timeline_id) => { @@ -175,10 +179,16 @@ impl PageServerNode { &self, new_tenant_id: Option, new_timeline_id: Option, + pg_version: u32, ) -> anyhow::Result { let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; - let initial_timeline_info = - self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; + let initial_timeline_info = self.timeline_create( + initial_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; Ok(initial_timeline_info.timeline_id) } @@ -497,6 +507,7 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, + pg_version: Option, ) -> anyhow::Result { self.http_request( Method::POST, @@ -506,6 +517,7 @@ impl PageServerNode { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, + pg_version, }) .send()? .error_from_body()? diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f43232ed0c..25e1f6029c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -7,6 +7,8 @@ // https://github.com/rust-lang/rust-bindgen/issues/1651 #![allow(deref_nullptr)] +use bytes::Bytes; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; macro_rules! 
postgres_ffi { @@ -24,11 +26,11 @@ macro_rules! postgres_ffi { stringify!($version), ".rs" )); + + include!(concat!("pg_constants_", stringify!($version), ".rs")); } pub mod controlfile_utils; pub mod nonrelfile_utils; - pub mod pg_constants; - pub mod relfile_utils; pub mod waldecoder; pub mod xlog_utils; @@ -44,6 +46,9 @@ macro_rules! postgres_ffi { postgres_ffi!(v14); postgres_ffi!(v15); +pub mod pg_constants; +pub mod relfile_utils; + // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; @@ -52,8 +57,11 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; +pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::bindings::{CheckPoint, ControlFileData}; + // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. 
pub const BLCKSZ: u16 = 8192; @@ -63,6 +71,50 @@ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; +// Export some version independent functions that are used outside of this mod +pub use v14::xlog_utils::encode_logical_message; +pub use v14::xlog_utils::get_current_timestamp; +pub use v14::xlog_utils::to_pg_timestamp; +pub use v14::xlog_utils::XLogFileName; + +pub use v14::bindings::DBState_DB_SHUTDOWNED; + +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { + if version == 14 { + bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0 + } else { + assert_eq!(version, 15); + bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0 + } +} + +pub fn generate_wal_segment( + segno: u64, + system_id: u64, + pg_version: u32, +) -> Result { + match pg_version { + 14 => v14::xlog_utils::generate_wal_segment(segno, system_id), + 15 => v15::xlog_utils::generate_wal_segment(segno, system_id), + _ => Err(SerializeError::BadInput), + } +} + +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, + pg_version: u32, +) -> anyhow::Result<(Bytes, u64)> { + match pg_version { + 14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + 15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + _ => anyhow::bail!("Unknown version {}", pg_version), + } +} + // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. // // NOTE: this is not to be confused with Neon timelines; different concept! 
@@ -74,7 +126,7 @@ pub const PG_TLI: u32 = 1; // See TransactionIdIsNormal in transam.h pub const fn transaction_id_is_normal(id: TransactionId) -> bool { - id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID + id > pg_constants::FIRST_NORMAL_TRANSACTION_ID } // See TransactionIdPrecedes in transam.c @@ -109,3 +161,74 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); } + +pub mod waldecoder { + + use crate::{v14, v15}; + use bytes::{Buf, Bytes, BytesMut}; + use std::num::NonZeroU32; + use thiserror::Error; + use utils::lsn::Lsn; + + pub enum State { + WaitingForRecord, + ReassemblingRecord { + recordbuf: BytesMut, + contlen: NonZeroU32, + }, + SkippingEverything { + skip_until_lsn: Lsn, + }, + } + + pub struct WalStreamDecoder { + pub lsn: Lsn, + pub pg_version: u32, + pub inputbuf: BytesMut, + pub state: State, + } + + #[derive(Error, Debug, Clone)] + #[error("{msg} at {lsn}")] + pub struct WalDecodeError { + pub msg: String, + pub lsn: Lsn, + } + + impl WalStreamDecoder { + pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder { + WalStreamDecoder { + lsn, + pg_version, + inputbuf: BytesMut::new(), + state: State::WaitingForRecord, + } + } + + // The latest LSN position fed to the decoder. 
+ pub fn available(&self) -> Lsn { + self.lsn + self.inputbuf.remaining() as u64 + } + + pub fn feed_bytes(&mut self, buf: &[u8]) { + self.inputbuf.extend_from_slice(buf); + } + + pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + match self.pg_version { + 14 => { + use self::v14::waldecoder::WalStreamDecoderHandler; + self.poll_decode_internal() + } + 15 => { + use self::v15::waldecoder::WalStreamDecoderHandler; + self.poll_decode_internal() + } + _ => Err(WalDecodeError { + msg: format!("Unknown version {}", self.pg_version), + lsn: self.lsn, + }), + } + } + } +} diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 1de1d367e0..01e5554b8a 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,7 +1,7 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use super::pg_constants; +use crate::pg_constants; use crate::transaction_id_precedes; use bytes::BytesMut; use log::*; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 8cc9fa7af6..6aaa739a69 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -1,14 +1,16 @@ //! //! Misc constants, copied from PostgreSQL headers. //! +//! Only place version-independent constants here. +//! //! TODO: These probably should be auto-generated using bindgen, //! rather than copied by hand. Although on the other hand, it's nice //! to have them all here in one place, and have the ability to add //! comments on them. //! 
-use super::bindings::{PageHeaderData, XLogRecord}; use crate::BLCKSZ; +use crate::{PageHeaderData, XLogRecord}; // // From pg_tablespace_d.h @@ -16,14 +18,6 @@ use crate::BLCKSZ; pub const DEFAULTTABLESPACE_OID: u32 = 1663; pub const GLOBALTABLESPACE_OID: u32 = 1664; -// -// Fork numbers, from relpath.h -// -pub const MAIN_FORKNUM: u8 = 0; -pub const FSM_FORKNUM: u8 = 1; -pub const VISIBILITYMAP_FORKNUM: u8 = 2; -pub const INIT_FORKNUM: u8 = 3; - // From storage_xlog.h pub const XLOG_SMGR_CREATE: u8 = 0x10; pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; @@ -114,7 +108,6 @@ pub const XLOG_NEXTOID: u8 = 0x30; pub const XLOG_SWITCH: u8 = 0x40; pub const XLOG_FPI_FOR_HINT: u8 = 0xA0; pub const XLOG_FPI: u8 = 0xB0; -pub const DB_SHUTDOWNED: u32 = 1; // From multixact.h pub const FIRST_MULTIXACT_ID: u32 = 1; @@ -169,10 +162,6 @@ pub const RM_HEAP_ID: u8 = 10; pub const XLR_INFO_MASK: u8 = 0x0F; pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; -// from dbcommands_xlog.h -pub const XLOG_DBASE_CREATE: u8 = 0x00; -pub const XLOG_DBASE_DROP: u8 = 0x10; - pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; @@ -197,8 +186,6 @@ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous /* Information stored in bimg_info */ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ -pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ -pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ /* From transam.h */ pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3; diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs new file mode 100644 index 0000000000..810898ee80 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -0,0 +1,5 @@ +pub const XLOG_DBASE_CREATE: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x10; + +pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ +pub const BKPIMAGE_APPLY: u8 = 0x04; /* page 
image should be restored during replay */ diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs new file mode 100644 index 0000000000..6fa5eb008c --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -0,0 +1,10 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index f3476acc9c..1dc9f367ff 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -1,10 +1,17 @@ //! //! Common utilities for dealing with PostgreSQL relation files. //! 
-use super::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; +// +// Fork numbers, from relpath.h +// +pub const MAIN_FORKNUM: u8 = 0; +pub const FSM_FORKNUM: u8 = 1; +pub const VISIBILITYMAP_FORKNUM: u8 = 2; +pub const INIT_FORKNUM: u8 = 3; + #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] @@ -23,10 +30,10 @@ impl From for FilePathError { pub fn forkname_to_number(forkname: Option<&str>) -> Result { match forkname { // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(pg_constants::MAIN_FORKNUM), - Some("fsm") => Ok(pg_constants::FSM_FORKNUM), - Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM), - Some("init") => Ok(pg_constants::INIT_FORKNUM), + None => Ok(MAIN_FORKNUM), + Some("fsm") => Ok(FSM_FORKNUM), + Some("vm") => Ok(VISIBILITYMAP_FORKNUM), + Some("init") => Ok(INIT_FORKNUM), Some(_) => Err(FilePathError::InvalidForkName), } } @@ -34,10 +41,10 @@ pub fn forkname_to_number(forkname: Option<&str>) -> Result { /// Convert Postgres fork number to the right suffix of the relation data file. pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { match forknum { - pg_constants::MAIN_FORKNUM => None, - pg_constants::FSM_FORKNUM => Some("fsm"), - pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"), - pg_constants::INIT_FORKNUM => Some("init"), + MAIN_FORKNUM => None, + FSM_FORKNUM => Some("fsm"), + VISIBILITYMAP_FORKNUM => Some("vm"), + INIT_FORKNUM => Some("init"), _ => Some("UNKNOWN FORKNUM"), } } diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 4d79e4b1d1..5b46d52321 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -8,6 +8,7 @@ //! to look deeper into the WAL records to also understand which blocks they modify, the code //! for that is in pageserver/src/walrecord.rs //! 
+use super::super::waldecoder::{State, WalDecodeError, WalStreamDecoder}; use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use super::xlog_utils::*; use crate::WAL_SEGMENT_SIZE; @@ -16,55 +17,19 @@ use crc32c::*; use log::*; use std::cmp::min; use std::num::NonZeroU32; -use thiserror::Error; use utils::lsn::Lsn; -enum State { - WaitingForRecord, - ReassemblingRecord { - recordbuf: BytesMut, - contlen: NonZeroU32, - }, - SkippingEverything { - skip_until_lsn: Lsn, - }, -} - -pub struct WalStreamDecoder { - lsn: Lsn, - inputbuf: BytesMut, - state: State, -} - -#[derive(Error, Debug, Clone)] -#[error("{msg} at {lsn}")] -pub struct WalDecodeError { - msg: String, - lsn: Lsn, +pub trait WalStreamDecoderHandler { + fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError>; + fn poll_decode_internal(&mut self) -> Result, WalDecodeError>; + fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError>; } // // WalRecordStream is a Stream that returns a stream of WAL records // FIXME: This isn't a proper rust stream // -impl WalStreamDecoder { - pub fn new(lsn: Lsn) -> WalStreamDecoder { - WalStreamDecoder { - lsn, - inputbuf: BytesMut::new(), - state: State::WaitingForRecord, - } - } - - // The latest LSN position fed to the decoder. - pub fn available(&self) -> Lsn { - self.lsn + self.inputbuf.remaining() as u64 - } - - pub fn feed_bytes(&mut self, buf: &[u8]) { - self.inputbuf.extend_from_slice(buf); - } - +impl WalStreamDecoderHandler for WalStreamDecoder { fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { let validate_impl = || { if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 { @@ -125,7 +90,7 @@ impl WalStreamDecoder { /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. 
/// - pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + fn poll_decode_internal(&mut self) -> Result, WalDecodeError> { // Run state machine that validates page headers, and reassembles records // that cross page boundaries. loop { diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index f8606b6e47..8389a6e971 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -9,12 +9,13 @@ use crc32c::crc32c_append; +use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ - CheckPoint, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, - XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, + CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, + XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; -use super::pg_constants; -use super::waldecoder::WalStreamDecoder; +use super::PG_MAJORVERSION; +use crate::pg_constants; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -113,6 +114,30 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, +) -> anyhow::Result<(Bytes, u64)> { + let mut pg_control = ControlFileData::decode(pg_control_bytes)?; + let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; + + // Generate new pg_control needed for bootstrap + checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; + + //reset some fields we don't want to preserve + //TODO Check this. + //We may need to determine the value from twophase data. 
+ checkpoint.oldestActiveXid = 0; + + //save new values in pg_control + pg_control.checkPoint = 0; + pg_control.checkPointCopy = checkpoint; + pg_control.state = DBState_DB_SHUTDOWNED; + + Ok((pg_control.encode(), pg_control.system_identifier)) +} + pub fn get_current_timestamp() -> TimestampTz { to_pg_timestamp(SystemTime::now()) } @@ -144,7 +169,10 @@ pub fn find_end_of_wal( let mut result = start_lsn; let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; - let mut decoder = WalStreamDecoder::new(start_lsn); + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + info!("find_end_of_wal PG_VERSION: {}", pg_version); + + let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); // loop over segments loop { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index eca6a3c87f..d0a57a473b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,10 +25,10 @@ use tracing::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; -use postgres_ffi::v14::{CheckPoint, ControlFileData}; +use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; use postgres_ffi::TransactionId; +use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; @@ -129,15 +129,15 @@ where // TODO include checksum // Create pgdata subdirs structure - for dir in pg_constants::PGDATA_SUBDIRS.iter() { + for dir in PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; self.ar.append(&header, &mut io::empty())?; } // Send empty config files. 
- for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() { + for filepath in PGDATA_SPECIAL_FILES.iter() { if *filepath == "pg_hba.conf" { - let data = pg_constants::PG_HBA.as_bytes(); + let data = PG_HBA.as_bytes(); let header = new_tar_header(filepath, data.len() as u64)?; self.ar.append(&header, data)?; } else { @@ -267,16 +267,12 @@ where None }; - // TODO pass this as a parameter - let pg_version = "14"; + if spcnode == GLOBALTABLESPACE_OID { + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; - if spcnode == pg_constants::GLOBALTABLESPACE_OID { - let version_bytes = pg_version.as_bytes(); - let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; - - let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace @@ -305,7 +301,7 @@ where return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); + ensure!(spcnode == DEFAULTTABLESPACE_OID); // Append dir path for each database let path = format!("base/{}", dbnode); @@ -314,9 +310,10 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_version.as_bytes(); - let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; @@ -348,30 +345,6 @@ where 
// Also send zenith.signal file with extra bootstrap data. // fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) - .context("failed get control bytes")?; - let mut pg_control = ControlFileData::decode(&pg_control_bytes)?; - let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - - // Generate new pg_control needed for bootstrap - checkpoint.redo = normalize_lsn(self.lsn, WAL_SEGMENT_SIZE).0; - - //reset some fields we don't want to preserve - //TODO Check this. - //We may need to determine the value from twophase data. - checkpoint.oldestActiveXid = 0; - - //save new values in pg_control - pg_control.checkPoint = 0; - pg_control.checkPointCopy = checkpoint; - pg_control.state = pg_constants::DB_SHUTDOWNED; - // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -388,8 +361,23 @@ where zenith_signal.as_bytes(), )?; + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn) + .context("failed get control bytes")?; + + let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + //send pg_control - let pg_control_bytes = pg_control.encode(); let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar.append(&header, &pg_control_bytes[..])?; @@ -398,8 +386,10 @@ where let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; - let wal_seg = generate_wal_segment(segno, pg_control.system_identifier) - .map_err(|e| 
anyhow!(e).context("Failed generating wal segment"))?; + + let wal_seg = + postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 16359c2532..e66049c457 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -50,6 +50,7 @@ fn main() -> Result<()> { meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), + meta.pg_version(), ); update_meta = true; } @@ -62,6 +63,7 @@ fn main() -> Result<()> { meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), + meta.pg_version(), ); update_meta = true; } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 945ee098ea..a4346c0190 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -21,6 +21,7 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::DEFAULT_PG_VERSION; /// The name of the metadata file pageserver creates per timeline. 
pub const METADATA_FILE_NAME: &str = "metadata"; @@ -209,7 +210,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join("pg_install/v14")), + .join(format!("pg_install/v{}", DEFAULT_PG_VERSION))), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), @@ -374,13 +375,40 @@ impl PageServerConf { // // Postgres distribution paths // + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + let mut path = self.pg_distrib_dir.clone(); - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + if pg_version != DEFAULT_PG_VERSION { + // step up to the parent directory + // We assume that the pg_distrib subdirs + // for different pg versions + // are located in the same directory + // and follow the naming convention: v14, v15, etc. + path.pop(); + + match pg_version { + 14 => return path.join(format!("v{pg_version}")), + 15 => return path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), + }; + } + + path } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("bin"), + 15 => self.pg_distrib_dir(pg_version).join("bin"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("lib"), + 15 => self.pg_distrib_dir(pg_version).join("lib"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, @@ -449,10 +477,11 @@ impl PageServerConf { ); } - if !conf.pg_distrib_dir.join("bin/postgres").exists() { + let pg_version = DEFAULT_PG_VERSION; + if 
!conf.pg_bin_dir(pg_version).join("postgres").exists() { bail!( "Can't find postgres binary at {}", - conf.pg_distrib_dir.display() + conf.pg_bin_dir(pg_version).display() ); } @@ -863,7 +892,7 @@ broker_endpoints = ['{broker_endpoint}'] let workdir = tempdir_path.join("workdir"); fs::create_dir_all(&workdir)?; - let pg_distrib_dir = tempdir_path.join("pg_distrib"); + let pg_distrib_dir = tempdir_path.join(format!("pg_distrib/v{DEFAULT_PG_VERSION}")); fs::create_dir_all(&pg_distrib_dir)?; let postgres_bin_dir = pg_distrib_dir.join("bin"); fs::create_dir_all(&postgres_bin_dir)?; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 2d7d560d2a..851fa881a0 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -21,6 +21,7 @@ pub struct TimelineCreateRequest { #[serde(default)] #[serde_as(as = "Option")] pub ancestor_start_lsn: Option, + pub pg_version: Option, } #[serde_as] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c676dfacd2..6892c0b391 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -173,6 +173,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. 
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index c1e736d552..23c4351b4e 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -16,11 +16,13 @@ use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; -use postgres_ffi::v14::relfile_utils::*; -use postgres_ffi::v14::waldecoder::*; -use postgres_ffi::v14::xlog_utils::*; -use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::ControlFileData; +use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; +use postgres_ffi::XLogFileName; use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; @@ -236,7 +238,7 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { - let mut waldecoder = WalStreamDecoder::new(startpoint); + let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); @@ -354,7 +356,7 @@ pub fn import_wal_from_tar( end_lsn: Lsn, ) -> Result<()> { // Set up walingest mutable state - let mut waldecoder = WalStreamDecoder::new(start_lsn); + let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; @@ -439,7 +441,7 @@ fn import_file( len: usize, ) -> Result> { if file_path.starts_with("global") { - let spcnode = pg_constants::GLOBALTABLESPACE_OID; + let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; match file_path @@ -467,7 +469,7 @@ fn import_file( debug!("imported relmap file") } "PG_VERSION" => { - debug!("ignored"); + debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len)?; @@ -495,7 +497,7 @@ fn import_file( debug!("imported relmap file") } "PG_VERSION" => { - debug!("ignored"); + debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len)?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index e918a39457..0bd5e242d3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -36,6 +36,8 @@ use crate::task_mgr::TaskKind; /// format, bump this! 
pub const STORAGE_FORMAT_VERSION: u16 = 3; +pub const DEFAULT_PG_VERSION: u32 = 14; + // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7de6403b83..fed5d0dcc4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -43,9 +43,9 @@ use crate::task_mgr::TaskKind; use crate::tenant::Timeline; use crate::tenant_mgr; use crate::CheckpointConfig; -use postgres_ffi::v14::xlog_utils::to_pg_timestamp; -use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::to_pg_timestamp; use postgres_ffi::BLCKSZ; // Wrapped in libpq CopyData @@ -498,12 +498,16 @@ impl PageServerHandler { timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, + pg_version: u32, ) -> anyhow::Result<()> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let timeline = tenant_mgr::get_tenant(tenant_id, true)? - .create_empty_timeline(timeline_id, base_lsn)?; + let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline( + timeline_id, + base_lsn, + pg_version, + )?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -958,16 +962,31 @@ impl postgres_backend_async::Handler for PageServerHandler { // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() == 4); + ensure!(params.len() >= 4); let tenant_id = TenantId::from_str(params[0])?; let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; + let pg_version = if params.len() == 5 { + u32::from_str(params[4])? + } else { + // If version is not provided, assume default. + // TODO: this may lead to weird errors if the version is wrong. + crate::DEFAULT_PG_VERSION + }; + self.check_permission(Some(tenant_id))?; match self - .handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn) + .handle_import_basebackup( + pgb, + tenant_id, + timeline_id, + base_lsn, + end_lsn, + pg_version, + ) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 9d4b438dc4..fc9867dc05 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,7 +13,7 @@ use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; -use postgres_ffi::v14::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; @@ -125,8 +125,7 @@ impl Timeline { return Ok(nblocks); } - if (tag.forknum == pg_constants::FSM_FORKNUM - || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) + if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) && !self.get_rel_exists(tag, lsn, latest)? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create @@ -1090,6 +1089,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // 03 misc // controlfile // checkpoint +// pg_version // // Below is a full list of the keyspace allocation: // @@ -1128,7 +1128,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // // Checkpoint: // 03 00000000 00000000 00000000 00 00000001 - //-- Section 01: relation data and metadata const DBDIR_KEY: Key = Key { @@ -1402,8 +1401,9 @@ fn is_slru_block_key(key: Key) -> bool { pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, + pg_version: u32, ) -> Result> { - let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index e3d08f8b3d..43d38bd986 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -2,8 +2,8 @@ use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::relfile_utils::forknumber_to_name; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::forknumber_to_name; use postgres_ffi::Oid; /// @@ -78,7 +78,7 @@ impl fmt::Display for RelTag { impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { - let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID { + let mut name = if self.spcnode == GLOBALTABLESPACE_OID { "global/".to_string() } else { format!("base/{}/", self.dbnode) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 489d0ad4ed..892a34a76f 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -1445,7 +1445,17 @@ mod test_utils { } pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> 
TimelineMetadata { - TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) + TimelineMetadata::new( + disk_consistent_lsn, + None, + None, + Lsn(0), + Lsn(0), + Lsn(0), + // Any version will do + // but it should be consistent with the one in the tests + crate::DEFAULT_PG_VERSION, + ) } } diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 13495ffefe..db37c7b411 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -341,13 +341,21 @@ mod tests { use super::*; use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::DEFAULT_PG_VERSION; #[test] fn index_part_conversion() { let harness = TenantHarness::create("index_part_conversion").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); - let metadata = - TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); let remote_timeline = RemoteTimeline { timeline_layers: HashSet::from([ timeline_path.join("layer_1"), @@ -464,8 +472,15 @@ mod tests { fn index_part_conversion_negatives() { let harness = TenantHarness::create("index_part_conversion_negatives").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); - let metadata = - TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); let conversion_result = IndexPart::from_remote_timeline( &timeline_path, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ca97796870..5860e13534 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -171,6 +171,7 @@ impl Tenant { &self, new_timeline_id: TimelineId, initdb_lsn: Lsn, + pg_version: u32, ) -> Result> { // XXX: keep the 
lock to avoid races during timeline creation let mut timelines = self.timelines.lock().unwrap(); @@ -186,7 +187,7 @@ impl Tenant { } let new_metadata = - TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); + TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn, pg_version,); let new_timeline = self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -387,6 +388,11 @@ impl Tenant { let mut timelines_accessor = self.timelines.lock().unwrap(); for (timeline_id, metadata) in sorted_timelines { + info!( + "Attaching timeline {} pg_version {}", + timeline_id, + metadata.pg_version() + ); let timeline = self .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; @@ -613,7 +619,7 @@ impl Tenant { }; let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); - + let pg_version = new_metadata.pg_version(); let new_timeline = Arc::new(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), @@ -623,6 +629,7 @@ impl Tenant { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, + pg_version, )); new_timeline @@ -984,6 +991,7 @@ impl Tenant { start_lsn, *src_timeline.latest_gc_cutoff_lsn.read(), src_timeline.initdb_lsn, + src_timeline.pg_version, ); let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; info!("branched timeline {dst} from {src} at {start_lsn}"); @@ -1319,6 +1327,7 @@ pub mod harness { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + _pg_version: u32, ) -> Result { let s = format!( "redo for {} to get to {}, with {} and {} records", @@ -1345,6 +1354,7 @@ mod tests { use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; use 
once_cell::sync::Lazy; @@ -1356,7 +1366,7 @@ mod tests { #[test] fn test_basic() -> Result<()> { let tenant = TenantHarness::create("test_basic")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1378,9 +1388,9 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); - let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -1404,7 +1414,7 @@ mod tests { #[test] fn test_branch() -> Result<()> { let tenant = TenantHarness::create("test_branch")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); use std::str::from_utf8; @@ -1499,7 +1509,7 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? 
.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -1529,7 +1539,7 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), @@ -1555,7 +1565,7 @@ mod tests { RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? .load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; @@ -1573,7 +1583,7 @@ mod tests { fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1590,7 +1600,7 @@ mod tests { fn test_parent_keeps_data_forever_after_branching() -> Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; 
make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1618,7 +1628,8 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } @@ -1638,7 +1649,7 @@ mod tests { // create two timelines { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; @@ -1674,7 +1685,7 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1711,7 +1722,7 @@ mod tests { #[test] fn test_images() -> Result<()> { let tenant = TenantHarness::create("test_images")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1761,7 +1772,7 @@ mod tests { #[test] fn test_bulk_insert() -> Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let mut lsn = Lsn(0x10); @@ -1801,7 +1812,7 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { let tenant = 
TenantHarness::create("test_random_updates")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 1000; @@ -1871,7 +1882,7 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 1000; @@ -1950,7 +1961,7 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 606acbf2f1..41790b4d11 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -63,6 +63,7 @@ struct TimelineMetadataBody { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, + pg_version: u32, } impl TimelineMetadata { @@ -73,6 +74,7 @@ impl TimelineMetadata { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, + pg_version: u32, ) -> Self { Self { hdr: TimelineMetadataHeader { @@ -87,6 +89,7 @@ impl TimelineMetadata { ancestor_lsn, latest_gc_cutoff_lsn, initdb_lsn, + pg_version, }, } } @@ -160,6 +163,10 @@ impl TimelineMetadata { pub fn initdb_lsn(&self) -> Lsn { self.body.initdb_lsn } + + pub fn pg_version(&self) -> u32 { + self.body.pg_version + } } /// Save timeline metadata to file @@ -212,6 +219,8 @@ mod tests { Lsn(0), Lsn(0), Lsn(0), + // Any version will do here, so use the default + crate::DEFAULT_PG_VERSION, ); let metadata_bytes = original_metadata diff --git 
a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6de1d44876..019de81d64 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -37,7 +37,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; -use postgres_ffi::v14::xlog_utils::to_pg_timestamp; +use postgres_ffi::to_pg_timestamp; use utils::{ id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, @@ -61,6 +61,8 @@ pub struct Timeline { pub tenant_id: TenantId, pub timeline_id: TimelineId, + pub pg_version: u32, + pub layers: RwLock, last_freeze_at: AtomicLsn, @@ -533,6 +535,7 @@ impl Timeline { tenant_id: TenantId, walredo_mgr: Arc, upload_layers: bool, + pg_version: u32, ) -> Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -541,6 +544,7 @@ impl Timeline { tenant_conf, timeline_id, tenant_id, + pg_version, layers: RwLock::new(LayerMap::default()), walredo_mgr, @@ -1260,6 +1264,7 @@ impl Timeline { self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, + self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -2133,9 +2138,13 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = - self.walredo_mgr - .request_redo(key, request_lsn, base_img, data.records)?; + let img = self.walredo_mgr.request_redo( + key, + request_lsn, + base_img, + data.records, + self.pg_version, + )?; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index bede4ac13e..1d5cab38b9 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,8 +34,9 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use 
postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; @@ -82,7 +83,8 @@ impl<'a> WalIngest<'a> { decoded: &mut DecodedWALRecord, ) -> Result<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded).context("failed decoding wal record")?; + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -113,18 +115,49 @@ impl<'a> WalIngest<'a> { let truncate = XlSmgrTruncate::decode(&mut buf); self.ingest_xlog_smgr_truncate(modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { - if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_CREATE - { - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_DROP - { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + debug!( + "handle RM_DBASE_ID for Postgres version {:?}", + self.timeline.pg_version + ); + if self.timeline.pg_version == 14 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + { + let createdb = XlCreateDatabase::decode(&mut buf); + debug!("XLOG_DBASE_CREATE v14"); + + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + 
modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } + } + } else if self.timeline.pg_version == 15 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG + { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY + { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { @@ -291,7 +324,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0 + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version) { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -392,7 +425,7 @@ impl<'a> WalIngest<'a> { // Clear the VM bits if required. 
if new_heap_blkno.is_some() || old_heap_blkno.is_some() { let vm_rel = RelTag { - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, @@ -568,7 +601,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::MAIN_FORKNUM, + forknum: MAIN_FORKNUM, }; self.put_rel_truncation(modification, rel, rec.blkno)?; } @@ -577,7 +610,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::FSM_FORKNUM, + forknum: FSM_FORKNUM, }; // FIXME: 'blkno' stored in the WAL record is the new size of the @@ -600,7 +633,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, }; // FIXME: Like with the FSM above, the logic to truncate the VM @@ -672,7 +705,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, @@ -1032,6 +1065,8 @@ mod tests { use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; + use crate::DEFAULT_PG_VERSION; + /// Arbitrary relation tag, for testing. 
const TESTREL_A: RelTag = RelTag { spcnode: 0, @@ -1059,7 +1094,7 @@ mod tests { #[test] fn test_relsize() -> Result<()> { let tenant = TenantHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1187,7 +1222,7 @@ mod tests { #[test] fn test_drop_extend() -> Result<()> { let tenant = TenantHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1227,7 +1262,7 @@ mod tests { #[test] fn test_truncate_extend() -> Result<()> { let tenant = TenantHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1315,7 +1350,7 @@ mod tests { #[test] fn test_large_rel() -> Result<()> { let tenant = TenantHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 148372c9d0..a82e69e5ba 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1366,7 +1366,7 @@ mod tests { }, timeline: harness .load() - .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) 
.expect("Failed to create an empty timeline for dummy wal connection manager"), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 29c4cea882..5ac9a3ef7a 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -29,7 +29,7 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; -use postgres_ffi::v14::waldecoder::WalStreamDecoder; +use postgres_ffi::waldecoder::WalStreamDecoder; use utils::id::TenantTimelineId; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; @@ -166,7 +166,7 @@ pub async fn handle_walreceiver_connection( let physical_stream = ReplicationStream::new(copy_stream); pin!(physical_stream); - let mut waldecoder = WalStreamDecoder::new(startpoint); + let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index dbf9bf9d33..258e1a445f 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -3,12 +3,11 @@ //! 
use anyhow::Result; use bytes::{Buf, Bytes}; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::XLOG_SIZE_OF_XLOG_RECORD; -use postgres_ffi::v14::XLogRecord; +use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; +use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::bin_ser::DeserializeError; @@ -390,6 +389,16 @@ impl XlXactParsedRecord { xid = buf.get_u32_le(); trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE"); } + + if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { + let nitems = buf.get_i32_le(); + debug!( + "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", + nitems + ); + //FIXME: do we need to handle dropped stats here? + } + XlXactParsedRecord { xid, info, @@ -517,6 +526,7 @@ impl XlMultiXactTruncate { pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, + pg_version: u32, ) -> Result<(), DeserializeError> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; @@ -610,9 +620,21 @@ pub fn decode_wal_record( blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); - blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0; + blk.apply_image = if pg_version == 14 { + (blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0 + } else { + assert_eq!(pg_version, 15); + (blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0 + }; - if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 { + let blk_img_is_compressed = + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); + + if blk_img_is_compressed { + debug!("compressed block image , pg_version = {}", pg_version); + } + + if blk_img_is_compressed { if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { blk.hole_length = buf.get_u16_le(); } else { @@ 
-665,9 +687,7 @@ pub fn decode_wal_record( * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED * flag is set. */ - if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0) - && blk.bimg_len == BLCKSZ - { + if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { // TODO /* report_invalid_record(state, @@ -683,7 +703,7 @@ pub fn decode_wal_record( * IS_COMPRESSED flag is set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0 + && !blk_img_is_compressed && blk.bimg_len != BLCKSZ { // TODO diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 79c2edc96e..15a9408dc9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -46,11 +46,12 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::v14::pg_constants; use postgres_ffi::BLCKSZ; /// @@ -82,6 +83,7 @@ pub trait WalRedoManager: Send + Sync { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result; } @@ -144,6 +146,7 @@ impl WalRedoManager for PostgresRedoManager { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result { if records.is_empty() { error!("invalid WAL redo request with no records"); @@ -166,6 +169,7 @@ impl WalRedoManager for PostgresRedoManager { img, &records[batch_start..i], self.conf.wal_redo_timeout, + pg_version, ) }; img = Some(result?); @@ -184,6 +188,7 @@ impl WalRedoManager for PostgresRedoManager { img, &records[batch_start..], self.conf.wal_redo_timeout, + pg_version, ) } } @@ -212,6 +217,7 @@ impl PostgresRedoManager { base_img: 
Option, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, + pg_version: u32, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; @@ -222,7 +228,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id)?; + let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -326,7 +332,7 @@ impl PostgresRedoManager { // sanity check that this is modifying the correct relation let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; assert!( - rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM, + rel.forknum == VISIBILITYMAP_FORKNUM, "ClearVisibilityMapFlags record on unexpected rel {}", rel ); @@ -570,7 +576,11 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenant_id: &TenantId) -> Result { + fn launch( + conf: &PageServerConf, + tenant_id: &TenantId, + pg_version: u32, + ) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. 
@@ -588,12 +598,12 @@ impl PostgresRedoProcess { fs::remove_dir_all(&datadir)?; } info!("running initdb in {}", datadir.display()); - let initdb = Command::new(conf.pg_bin_dir().join("initdb")) + let initdb = Command::new(conf.pg_bin_dir(pg_version).join("initdb")) .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .close_fds() .output() .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; @@ -619,14 +629,14 @@ impl PostgresRedoProcess { } // Start postgres itself - let mut child = Command::new(conf.pg_bin_dir().join("postgres")) + let mut child = Command::new(conf.pg_bin_dir(pg_version).join("postgres")) .arg("--wal-redo") .stdin(Stdio::piped()) .stderr(Stdio::piped()) .stdout(Stdio::piped()) .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .env("PGDATA", &datadir) // The redo process is not trusted, so it runs in seccomp mode // (see seccomp in zenith_wal_redo.c). 
We have to make sure it doesn't diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 2456eb0752..3de410d117 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -22,7 +22,7 @@ use crate::safekeeper::{ use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::Timeline; use crate::GlobalTimelines; -use postgres_ffi::v14::xlog_utils; +use postgres_ffi::encode_logical_message; use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ lsn::Lsn, @@ -47,6 +47,7 @@ pub struct AppendLogicalMessage { epoch_start_lsn: Lsn, begin_lsn: Lsn, truncate_lsn: Lsn, + pg_version: u32, } #[derive(Serialize, Deserialize)] @@ -68,7 +69,7 @@ pub fn handle_json_ctrl( info!("JSON_CTRL request: {:?}", append_request); // need to init safekeeper state before AppendRequest - let tli = prepare_safekeeper(spg.ttid)?; + let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?; // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { @@ -95,11 +96,11 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. -fn prepare_safekeeper(ttid: TenantTimelineId) -> Result> { +fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result> { GlobalTimelines::create( ttid, ServerInfo { - pg_version: 0, // unknown + pg_version, wal_seg_size: WAL_SEGMENT_SIZE as u32, system_id: 0, }, @@ -135,7 +136,7 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { - let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); + let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); let sk_state = tli.get_state().1; let begin_lsn = msg.begin_lsn; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 65340ac0ed..eec24faf2f 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -27,7 +27,7 @@ use utils::{ pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 6; const SK_PROTOCOL_VERSION: u32 = 2; -const UNKNOWN_SERVER_VERSION: u32 = 0; +pub const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. pub type Term = u64; @@ -594,15 +594,20 @@ where SK_PROTOCOL_VERSION ); } - // Postgres upgrade is not treated as fatal error - if msg.pg_version != self.state.server.pg_version + /* Postgres major version mismatch is treated as fatal error + * because safekeepers parse WAL headers and the format + * may change between versions. 
+ */ + if msg.pg_version / 10000 != self.state.server.pg_version / 10000 && self.state.server.pg_version != UNKNOWN_SERVER_VERSION { - warn!( + bail!( "incompatible server version {}, expected {}", - msg.pg_version, self.state.server.pg_version + msg.pg_version, + self.state.server.pg_version ); } + if msg.tenant_id != self.state.tenant_id { bail!( "invalid tenant ID, got {}, expected {}", @@ -634,6 +639,10 @@ where let mut state = self.state.clone(); state.server.system_id = msg.system_id; + state.server.wal_seg_size = msg.wal_seg_size; + if msg.pg_version != UNKNOWN_SERVER_VERSION { + state.server.pg_version = msg.pg_version; + } self.state.persist(&state)?; } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 5a38558e9c..2829c875ed 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -8,7 +8,7 @@ use crate::GlobalTimelines; use anyhow::{bail, Context, Result}; use bytes::Bytes; -use postgres_ffi::v14::xlog_utils::get_current_timestamp; +use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::min; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 0d5321fb3a..c82a003161 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,7 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; +use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; use remote_storage::GenericRemoteStorage; use tokio::fs::File; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 692bd18342..44dc313ef6 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -29,13 +29,14 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{time_io_closure, WalStorageMetrics}; use 
crate::safekeeper::SafeKeeperState; +use crate::safekeeper::UNKNOWN_SERVER_VERSION; use crate::wal_backup::read_object; use crate::SafeKeeperConf; -use postgres_ffi::v14::xlog_utils::XLogFileName; +use postgres_ffi::XLogFileName; use postgres_ffi::XLOG_BLCKSZ; -use postgres_ffi::v14::waldecoder::WalStreamDecoder; +use postgres_ffi::waldecoder::WalStreamDecoder; use tokio::io::{AsyncReadExt, AsyncSeekExt}; @@ -139,7 +140,7 @@ impl PhysicalStorage { write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(write_lsn), + decoder: WalStreamDecoder::new(write_lsn, UNKNOWN_SERVER_VERSION), file: None, }) } @@ -291,7 +292,8 @@ impl Storage for PhysicalStorage { self.decoder.available(), startpos, ); - self.decoder = WalStreamDecoder::new(startpos); + let pg_version = self.decoder.pg_version; + self.decoder = WalStreamDecoder::new(startpos, pg_version); } self.decoder.feed_bytes(buf); loop { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1e83ee3839..c1ebc6aa7d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,7 +59,7 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_POSTGRES_DIR = "pg_install/v14" +DEFAULT_PG_VERSION_DEFAULT = "14" DEFAULT_BRANCH_NAME = "main" BASE_PORT = 15000 @@ -71,6 +71,7 @@ base_dir = "" neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" +pg_version = "" def pytest_configure(config): @@ -100,12 +101,21 @@ def pytest_configure(config): Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. + global pg_version + pg_version = os.environ.get("DEFAULT_PG_VERSION", DEFAULT_PG_VERSION_DEFAULT) + global pg_distrib_dir + + # TODO get rid of the POSTGRES_DISTRIB_DIR env var ? 
+ # use DEFAULT_PG_VERSION instead to generate the path env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: - pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) + pg_distrib_dir = os.path.normpath( + os.path.join(base_dir, "pg_install/v{}".format(pg_version)) + ) + log.info(f"pg_distrib_dir is {pg_distrib_dir}") if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. @@ -1185,6 +1195,7 @@ class AbstractNeonCli(abc.ABC): env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) + env_vars["DEFAULT_PG_VERSION"] = str(pg_version) if self.env.rust_log_override is not None: env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): @@ -1251,6 +1262,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id), "--timeline-id", str(timeline_id), + "--pg-version", + pg_version, ] ) else: @@ -1262,6 +1275,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id), "--timeline-id", str(timeline_id), + "--pg-version", + pg_version, ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) @@ -1296,6 +1311,8 @@ class NeonCli(AbstractNeonCli): new_branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] res = self.raw_cli(cmd) @@ -1317,6 +1334,8 @@ class NeonCli(AbstractNeonCli): branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] res = self.raw_cli(cmd) @@ -1395,6 +1414,9 @@ class NeonCli(AbstractNeonCli): cmd = ["init", f"--config={tmp.name}"] if initial_timeline_id: cmd.extend(["--timeline-id", str(initial_timeline_id)]) + + cmd.extend(["--pg-version", pg_version]) + append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, @@ -1476,6 +1498,8 @@ class 
NeonCli(AbstractNeonCli): str(tenant_id or self.env.initial_tenant), "--branch-name", branch_name, + "--pg-version", + pg_version, ] if lsn is not None: args.extend(["--lsn", str(lsn)]) @@ -1500,6 +1524,8 @@ class NeonCli(AbstractNeonCli): "start", "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + pg_version, ] if lsn is not None: args.append(f"--lsn={lsn}") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 885a0dc26f..417595ae4d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -14,6 +14,7 @@ from fixtures.neon_fixtures import ( PgBin, Postgres, pg_distrib_dir, + pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -96,6 +97,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn, "--wal-tarfile", wal, + "--pg-version", + pg_version, ] ) @@ -248,6 +251,8 @@ def _import( str(lsn), "--base-tarfile", os.path.join(tar_output_file), + "--pg-version", + pg_version, ] ) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index aa5a65f446..4934fb9354 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,7 +5,13 @@ import os from pathlib import Path import pytest -from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir +from fixtures.neon_fixtures import ( + NeonEnv, + base_dir, + check_restored_datadir_content, + pg_distrib_dir, + pg_version, +) # Run the main PostgreSQL regression tests, in src/test/regress. @@ -26,8 +32,8 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") - src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(pg_version) bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -80,8 +86,8 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/isolation") - src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/isolation") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/isolation".format(pg_version)) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/isolation".format(pg_version)) bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -124,7 +130,7 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. 
# This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) src_path = os.path.join(base_dir, "test_runner/sql_regress") bindir = os.path.join(pg_distrib_dir, "bin") schedule = os.path.join(src_path, "parallel_schedule") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 931de0f1e3..73e26bd207 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -29,6 +29,7 @@ from fixtures.neon_fixtures import ( SafekeeperPort, available_remote_storages, neon_binpath, + pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -634,6 +635,9 @@ class ProposerPostgres(PgProtocol): } basepath = self.pg_bin.run_capture(command, env) + + log.info(f"postgres --sync-safekeepers output: {basepath}") + stdout_filename = basepath + ".stdout" with open(stdout_filename, "r") as stdout_f: @@ -662,7 +666,9 @@ class ProposerPostgres(PgProtocol): # insert wal in all safekeepers and run sync on proposer def test_sync_safekeepers( - neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor, ): # We don't really need the full environment for this test, just the @@ -699,6 +705,7 @@ def test_sync_safekeepers( "begin_lsn": int(begin_lsn), "epoch_start_lsn": int(epoch_start_lsn), "truncate_lsn": int(epoch_start_lsn), + "pg_version": int(pg_version) * 10000, }, ) lsn = Lsn(res["inserted_wal"]["end_lsn"]) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 19d948fd47..796770565f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 +Subproject commit 796770565ff668b585e80733b8d679961ad50e93 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 
5b8b3eeef5..9383aaa9c2 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 5b8b3eeef5ec34c0cad9377833906a1387841d04 +Subproject commit 9383aaa9c2616fd81cfafb058fe0d692f5e43ac3 From 9dfede81467aaaabf21518a949ce870d735155e5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 18:34:30 +0300 Subject: [PATCH 104/166] Handle backwards-compatibility of TimelineMetadata. This commit bumps TimelineMetadata format version and makes it independent from STORAGE_FORMAT_VERSION. --- pageserver/src/lib.rs | 6 +- pageserver/src/tenant/metadata.rs | 161 ++++++++++++++++++++++++++---- 2 files changed, 148 insertions(+), 19 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 0bd5e242d3..7937f72de7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -31,9 +31,11 @@ use crate::task_mgr::TaskKind; /// Current storage format version /// -/// This is embedded in the metadata file, and also in the header of all the -/// layer files. If you make any backwards-incompatible changes to the storage +/// This is embedded in the header of all the layer files. +/// If you make any backwards-incompatible changes to the storage /// format, bump this! +/// Note that TimelineMetadata uses its own version number to track +/// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; pub const DEFAULT_PG_VERSION: u32 = 14; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 41790b4d11..6d18153b4c 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -20,7 +20,12 @@ use utils::{ use crate::config::PageServerConf; use crate::virtual_file::VirtualFile; -use crate::STORAGE_FORMAT_VERSION; + +/// Use special format number to enable backward compatibility. +const METADATA_FORMAT_VERSION: u16 = 4; + +/// Previous supported format versions. 
+const METADATA_OLD_FORMAT_VERSION: u16 = 3; /// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. /// @@ -34,19 +39,19 @@ const METADATA_MAX_SIZE: usize = 512; #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, - body: TimelineMetadataBody, + body: TimelineMetadataBodyV2, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataHeader { checksum: u32, // CRC of serialized metadata body size: u16, // size of serialized metadata - format_version: u16, // storage format version (used for compatibility checks) + format_version: u16, // metadata format version (used for compatibility checks) } const METADATA_HDR_SIZE: usize = std::mem::size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -struct TimelineMetadataBody { +struct TimelineMetadataBodyV2 { disk_consistent_lsn: Lsn, // This is only set if we know it. We track it in memory when the page // server is running, but we only track the value corresponding to @@ -66,6 +71,26 @@ struct TimelineMetadataBody { pg_version: u32, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn, + // This is only set if we know it. We track it in memory when the page + // server is running, but we only track the value corresponding to + // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a + // lot. We only store it in the metadata file when we flush *all* the + // in-memory data so that 'last_record_lsn' is the same as + // 'disk_consistent_lsn'. That's OK, because after page server restart, as + // soon as we reprocess at least one record, we will have a valid + // 'prev_record_lsn' value in memory again. 
This is only really needed when + // doing a clean shutdown, so that there is no more WAL beyond + // 'disk_consistent_lsn' + prev_record_lsn: Option, + ancestor_timeline: Option, + ancestor_lsn: Lsn, + latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, +} + impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, @@ -80,9 +105,9 @@ impl TimelineMetadata { hdr: TimelineMetadataHeader { checksum: 0, size: 0, - format_version: STORAGE_FORMAT_VERSION, + format_version: METADATA_FORMAT_VERSION, }, - body: TimelineMetadataBody { + body: TimelineMetadataBodyV2 { disk_consistent_lsn, prev_record_lsn, ancestor_timeline, @@ -94,16 +119,43 @@ impl TimelineMetadata { } } + fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { + let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; + + // backward compatible only up to this version + ensure!( + hdr.format_version == METADATA_OLD_FORMAT_VERSION, + "unsupported metadata format version {}", + hdr.format_version + ); + + let metadata_size = hdr.size as usize; + + let body: TimelineMetadataBodyV1 = + TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + + let body = TimelineMetadataBodyV2 { + disk_consistent_lsn: body.disk_consistent_lsn, + prev_record_lsn: body.prev_record_lsn, + ancestor_timeline: body.ancestor_timeline, + ancestor_lsn: body.ancestor_lsn, + latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn, + initdb_lsn: body.initdb_lsn, + pg_version: 14, // All timelines created before this version had pg_version 14 + }; + + hdr.format_version = METADATA_FORMAT_VERSION; + + Ok(Self { hdr, body }) + } + pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result { ensure!( metadata_bytes.len() == METADATA_MAX_SIZE, "metadata bytes size is wrong" ); let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; - ensure!( - hdr.format_version == STORAGE_FORMAT_VERSION, - "format version mismatch" - ); + let metadata_size = hdr.size as usize; 
ensure!( metadata_size <= METADATA_MAX_SIZE, @@ -114,13 +166,20 @@ impl TimelineMetadata { hdr.checksum == calculated_checksum, "metadata checksum mismatch" ); - let body = TimelineMetadataBody::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; - ensure!( - body.disk_consistent_lsn.is_aligned(), - "disk_consistent_lsn is not aligned" - ); - Ok(TimelineMetadata { hdr, body }) + if hdr.format_version != METADATA_FORMAT_VERSION { + // If metadata has the old format, + // upgrade it and return the result + TimelineMetadata::upgrade_timeline_metadata(&metadata_bytes) + } else { + let body = + TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + ensure!( + body.disk_consistent_lsn.is_aligned(), + "disk_consistent_lsn is not aligned" + ); + Ok(TimelineMetadata { hdr, body }) + } } pub fn to_bytes(&self) -> anyhow::Result> { @@ -128,7 +187,7 @@ impl TimelineMetadata { let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { size: metadata_size as u16, - format_version: STORAGE_FORMAT_VERSION, + format_version: METADATA_FORMAT_VERSION, checksum: crc32c::crc32c(&body_bytes), }; let hdr_bytes = hdr.ser()?; @@ -235,4 +294,72 @@ mod tests { "Metadata that was serialized to bytes and deserialized back should not change" ); } + + // Generate old version metadata and read it with current code. 
+ // Ensure that it is upgraded correctly + #[test] + fn test_metadata_upgrade() { + #[derive(Debug, Clone, PartialEq, Eq)] + struct TimelineMetadataV1 { + hdr: TimelineMetadataHeader, + body: TimelineMetadataBodyV1, + } + + let metadata_v1 = TimelineMetadataV1 { + hdr: TimelineMetadataHeader { + checksum: 0, + size: 0, + format_version: METADATA_OLD_FORMAT_VERSION, + }, + body: TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn(0x200), + prev_record_lsn: Some(Lsn(0x100)), + ancestor_timeline: Some(TIMELINE_ID), + ancestor_lsn: Lsn(0), + latest_gc_cutoff_lsn: Lsn(0), + initdb_lsn: Lsn(0), + }, + }; + + impl TimelineMetadataV1 { + pub fn to_bytes(&self) -> anyhow::Result> { + let body_bytes = self.body.ser()?; + let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); + let hdr = TimelineMetadataHeader { + size: metadata_size as u16, + format_version: METADATA_OLD_FORMAT_VERSION, + checksum: crc32c::crc32c(&body_bytes), + }; + let hdr_bytes = hdr.ser()?; + let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; + metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); + metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); + Ok(metadata_bytes) + } + } + + let metadata_bytes = metadata_v1 + .to_bytes() + .expect("Should serialize correct metadata to bytes"); + + // This should deserialize to the latest version format + let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes) + .expect("Should deserialize its own bytes"); + + let expected_metadata = TimelineMetadata::new( + Lsn(0x200), + Some(Lsn(0x100)), + Some(TIMELINE_ID), + Lsn(0), + Lsn(0), + Lsn(0), + 14, // All timelines created before this version had pg_version 14 + ); + + assert_eq!( + deserialized_metadata.body, expected_metadata.body, + "Metadata of the old version {} should be upgraded to the latest version {}", + METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION + ); + } } From 03c606f7c5fbb1bcd2ba79ea0d21849d298c1400 Mon Sep 17 00:00:00 2001 From: 
Anastasia Lubennikova Date: Wed, 14 Sep 2022 19:40:04 +0300 Subject: [PATCH 105/166] Pass pg_version parameter to timeline import command. Add pg_version field to LocalTimelineInfo. Use pg_version in the export_import_between_pageservers script --- control_plane/src/bin/neon_local.rs | 12 ++++++------ control_plane/src/storage.rs | 6 ++++-- pageserver/src/http/models.rs | 1 + pageserver/src/http/routes.rs | 1 + pageserver/src/page_service.rs | 13 +++---------- scripts/export_import_between_pageservers.py | 4 +++- 6 files changed, 18 insertions(+), 19 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 92782ea235..93947d5326 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -695,18 +695,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - // TODO validate both or none are provided let pg_wal = end_lsn.zip(wal_tarfile); - let mut cplane = ComputeControlPlane::load(env.clone())?; - println!("Importing timeline into pageserver ..."); - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?; - println!("Creating node for imported timeline ..."); - env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - let pg_version = import_match .value_of("pg-version") .unwrap() .parse::() .context("Failed to parse postgres version from the argument string")?; + let mut cplane = ComputeControlPlane::load(env.clone())?; + println!("Importing timeline into pageserver ..."); + pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; + println!("Creating node for imported timeline ..."); + env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; + cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; println!("Done"); } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 95ade14fbf..9032f99971 100644 --- a/control_plane/src/storage.rs +++ 
b/control_plane/src/storage.rs @@ -547,6 +547,7 @@ impl PageServerNode { timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, + pg_version: u32, ) -> anyhow::Result<()> { let mut client = self.pg_connection_config.connect(NoTls).unwrap(); @@ -565,8 +566,9 @@ impl PageServerNode { }; // Import base - let import_cmd = - format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let import_cmd = format!( + "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" + ); let mut writer = client.copy_in(&import_cmd)?; io::copy(&mut base_reader, &mut writer)?; writer.finish()?; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index 851fa881a0..d5559653b2 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -138,6 +138,7 @@ pub struct LocalTimelineInfo { pub last_received_msg_lsn: Option, /// the timestamp (in microseconds) of the last received message pub last_received_msg_ts: Option, + pub pg_version: u32, } #[serde_as] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6892c0b391..a55c6c973e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -130,6 +130,7 @@ fn local_timeline_info_from_timeline( wal_source_connstr, last_received_msg_lsn, last_received_msg_ts, + pg_version: timeline.pg_version, }; Ok(info) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fed5d0dcc4..368b4c8bee 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -959,22 +959,15 @@ impl postgres_backend_async::Handler for PageServerHandler { // 1. Get start/end LSN from backup_manifest file // 2. 
Run: // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" + // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() >= 4); + ensure!(params.len() == 5); let tenant_id = TenantId::from_str(params[0])?; let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; - - let pg_version = if params.len() == 5 { - u32::from_str(params[4])? - } else { - // If version is not provided, assume default. - // TODO: this may lead to weird errors if the version is wrong. - crate::DEFAULT_PG_VERSION - }; + let pg_version = u32::from_str(params[4])?; self.check_permission(Some(tenant_id))?; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index af847be49e..0fccf5199d 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -470,9 +470,10 @@ def import_timeline( last_lsn, prev_lsn, tar_filename, + pg_version, ): # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}" + import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") @@ -594,6 +595,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, + timeline["local"]["pg_version"], ) # Re-export and compare From a4397d43e997247f703b28baa81d5ffa727a65bd Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 20:16:22 +0300 Subject: [PATCH 106/166] Rename waldecoder -> waldecoder_handler.rs. 
Add comments --- libs/postgres_ffi/src/lib.rs | 8 +++++--- .../src/{waldecoder.rs => waldecoder_handler.rs} | 11 +++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) rename libs/postgres_ffi/src/{waldecoder.rs => waldecoder_handler.rs} (95%) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 25e1f6029c..1a6620a180 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -31,7 +31,7 @@ macro_rules! postgres_ffi { } pub mod controlfile_utils; pub mod nonrelfile_utils; - pub mod waldecoder; + pub mod waldecoder_handler; pub mod xlog_utils; pub const PG_MAJORVERSION: &str = stringify!($version); @@ -216,12 +216,14 @@ pub mod waldecoder { pub fn poll_decode(&mut self) -> Result, WalDecodeError> { match self.pg_version { + // This is a trick to support both versions simultaneously. + // See WalStreamDecoderHandler comments. 14 => { - use self::v14::waldecoder::WalStreamDecoderHandler; + use self::v14::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() } 15 => { - use self::v15::waldecoder::WalStreamDecoderHandler; + use self::v15::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() } _ => Err(WalDecodeError { diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder_handler.rs similarity index 95% rename from libs/postgres_ffi/src/waldecoder.rs rename to libs/postgres_ffi/src/waldecoder_handler.rs index 5b46d52321..b4d50375bd 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder_handler.rs @@ -26,8 +26,15 @@ pub trait WalStreamDecoderHandler { } // -// WalRecordStream is a Stream that returns a stream of WAL records -// FIXME: This isn't a proper rust stream +// This is a trick to support several postgres versions simultaneously. +// +// Page decoding code depends on postgres bindings, so it is compiled for each version. +// Thus WalStreamDecoder implements several WalStreamDecoderHandler traits. 
+// WalStreamDecoder poll_decode() method dispatches to the right handler based on the postgres version. +// Other methods are internal and are not dispatched. +// +// It is similar to having several impl blocks for the same struct, +// but the impls here are in different modules, so need to use a trait. // impl WalStreamDecoderHandler for WalStreamDecoder { fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { From a69e060f0f0683683c33fa39128173aadc35a04b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 14 Sep 2022 20:38:59 +0300 Subject: [PATCH 107/166] fix clippy warning --- pageserver/src/tenant/metadata.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6d18153b4c..3fb9ccd936 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -170,7 +170,7 @@ impl TimelineMetadata { if hdr.format_version != METADATA_FORMAT_VERSION { // If metadata has the old format, // upgrade it and return the result - TimelineMetadata::upgrade_timeline_metadata(&metadata_bytes) + TimelineMetadata::upgrade_timeline_metadata(metadata_bytes) } else { let body = TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; From d45de3d58f12fb143963faf61cd874831e3cc6a9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 15 Sep 2022 17:27:10 +0300 Subject: [PATCH 108/166] update build scripts to match pg_distrib_dir versioning schema --- .github/actions/run-python-test-set/action.yml | 4 ++-- .github/workflows/pg_clients.yml | 4 ++-- Dockerfile | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index e69cb28df1..fc3b1c9c37 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -85,7 +85,7 @@ runs: # PLATFORM 
will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} - export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install/v14} + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -126,7 +126,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v14/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index d04d002811..0600f9234f 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -58,12 +58,12 @@ jobs: env: REMOTE_ENV: 1 BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install shell: bash -euxo pipefail {0} run: | # Test framework expects we have psql binary; # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql"; + mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql"; ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ --tb=short \ diff --git a/Dockerfile b/Dockerfile index 213934a844..876a20cc1a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -68,8 +68,8 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin -# v14 is default for now -COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ +COPY --from=pg-build 
/home/nonroot/pg_install/v14 /usr/local/v14/ +COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. @@ -78,7 +78,7 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ -c "broker_endpoints=['http://etcd:2379']" \ - -c "pg_distrib_dir='/usr/local'" \ + -c "pg_distrib_dir='/usr/local/'" \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" From 5dddeb8d88354621a6b1e690057b16ce1a5c6a79 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 15 Sep 2022 17:40:29 +0300 Subject: [PATCH 109/166] Use non-versioned pg_distrib dir --- control_plane/src/local_env.rs | 29 ++++++++++-------------- control_plane/src/storage.rs | 2 +- docs/settings.md | 2 ++ pageserver/src/config.rs | 40 ++++++++++------------------------ pageserver/src/http/routes.rs | 2 +- 5 files changed, 26 insertions(+), 49 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 14bb4cf346..f4fbc99420 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -197,25 +197,18 @@ impl Default for SafekeeperConf { } impl LocalEnv { + pub fn pg_distrib_dir_raw(&self) -> PathBuf { + self.pg_distrib_dir.clone() + } + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { - let mut path = self.pg_distrib_dir.clone(); + let path = self.pg_distrib_dir.clone(); - if pg_version != DEFAULT_PG_VERSION { - // step up to the parent directory - // We assume that the pg_distrib subdirs - // for different pg versions - // are located in the same directory - // and follow the naming convention: v14, v15, etc. 
- path.pop(); - - match pg_version { - 14 => return path.join(format!("v{pg_version}")), - 15 => return path.join(format!("v{pg_version}")), - _ => panic!("Unsupported postgres version: {}", pg_version), - }; + match pg_version { + 14 => path.join(format!("v{pg_version}")), + 15 => path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), } - - path } pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { @@ -319,7 +312,7 @@ impl LocalEnv { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". // Note that later in the code we assume, that distrib dirs follow the same pattern // for all postgres versions. if env.pg_distrib_dir == Path::new("") { @@ -327,7 +320,7 @@ impl LocalEnv { env.pg_distrib_dir = postgres_bin.into(); } else { let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install/v14") + env.pg_distrib_dir = cwd.join("pg_install") } } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 9032f99971..bfbd6e91c3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -118,7 +118,7 @@ impl PageServerNode { // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", - self.env.pg_distrib_dir(pg_version).display() + self.env.pg_distrib_dir_raw().display() ); let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); diff --git a/docs/settings.md b/docs/settings.md index 30db495dbe..878681fce1 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -155,6 +155,8 @@ for other files and for sockets for incoming connections. #### pg_distrib_dir A directory with Postgres installation to use during pageserver activities. 
+Since pageserver supports several postgres versions, `pg_distrib_dir` contains +a subdirectory for each version with naming convention `v{PG_MAJOR_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. The default distrib dir is `./pg_install/`. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a4346c0190..b75f8f8265 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -21,7 +21,6 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::DEFAULT_PG_VERSION; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -210,7 +209,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join(format!("pg_install/v{}", DEFAULT_PG_VERSION))), + .join(format!("pg_install",))), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), @@ -376,24 +375,13 @@ impl PageServerConf { // Postgres distribution paths // pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { - let mut path = self.pg_distrib_dir.clone(); + let path = self.pg_distrib_dir.clone(); - if pg_version != DEFAULT_PG_VERSION { - // step up to the parent directory - // We assume that the pg_distrib subdirs - // for different pg versions - // are located in the same directory - // and follow the naming convention: v14, v15, etc. 
- path.pop(); - - match pg_version { - 14 => return path.join(format!("v{pg_version}")), - 15 => return path.join(format!("v{pg_version}")), - _ => panic!("Unsupported postgres version: {}", pg_version), - }; + match pg_version { + 14 => path.join(format!("v{pg_version}")), + 15 => path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), } - - path } pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { @@ -477,14 +465,6 @@ impl PageServerConf { ); } - let pg_version = DEFAULT_PG_VERSION; - if !conf.pg_bin_dir(pg_version).join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - conf.pg_bin_dir(pg_version).display() - ); - } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); Ok(conf) @@ -654,6 +634,7 @@ mod tests { use tempfile::{tempdir, TempDir}; use super::*; + use crate::DEFAULT_PG_VERSION; const ALL_BASE_VALUES_TOML: &str = r#" # Initial configuration file created by 'pageserver --init' @@ -892,9 +873,10 @@ broker_endpoints = ['{broker_endpoint}'] let workdir = tempdir_path.join("workdir"); fs::create_dir_all(&workdir)?; - let pg_distrib_dir = tempdir_path.join(format!("pg_distrib/v{DEFAULT_PG_VERSION}")); - fs::create_dir_all(&pg_distrib_dir)?; - let postgres_bin_dir = pg_distrib_dir.join("bin"); + let pg_distrib_dir = tempdir_path.join("pg_distrib"); + let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); + fs::create_dir_all(&pg_distrib_dir_versioned)?; + let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); fs::create_dir_all(&postgres_bin_dir)?; fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a55c6c973e..72cbb0e819 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -174,7 +174,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. 
From 1255ef806feea438c45dc3ee808ab53deefca6c6 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Sun, 18 Sep 2022 21:10:00 +0300 Subject: [PATCH 110/166] pass version to wal_craft.rs --- libs/postgres_ffi/src/xlog_utils.rs | 3 ++- libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs | 15 ++++++++++++++- libs/postgres_ffi/wal_craft/src/lib.rs | 15 +++++++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 8389a6e971..038e0491a0 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -471,7 +471,8 @@ mod tests { .join("..") .join(".."); let cfg = Conf { - pg_distrib_dir: top_path.join(format!("pg_install/{PG_MAJORVERSION}")), + pg_version: PG_MAJORVERSION, + pg_distrib_dir: top_path.join(format!("pg_install")), datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 2a607db6dc..9b9f76de7c 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,9 +37,16 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install/v14)") + .help("Directory with Postgres distribution (bin and lib directories, e.g. 
pg_install)") .default_value("/usr/local") ) + .arg( + Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(true) + .takes_value(true) + ) ) .subcommand( App::new("in-existing") @@ -82,8 +89,14 @@ fn main() -> Result<()> { } Ok(()) } + Some(("with-initdb", arg_matches)) => { let cfg = Conf { + pg_version: arg_matches + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?, pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), datadir: arg_matches.value_of("datadir").unwrap().into(), }; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 2ad92d776d..7ffe19e209 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -15,6 +15,7 @@ use tempfile::{tempdir, TempDir}; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Conf { + pub pg_version: u32, pub pg_distrib_dir: PathBuf, pub datadir: PathBuf, } @@ -36,12 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { }); impl Conf { + pub fn pg_distrib_dir(&self) -> PathBuf { + let path = self.pg_distrib_dir.clone(); + + match self.pg_version { + 14 => path.join(format!("v{}", self.pg_version)), + 15 => path.join(format!("v{}", self.pg_version)), + _ => panic!("Unsupported postgres version: {}", self.pg_version), + } + } + fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + self.pg_distrib_dir().join("bin") } fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + self.pg_distrib_dir().join("lib") } pub fn wal_dir(&self) -> PathBuf { From 0fde59aa4628c3e25048e014e1519e3c83462092 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Sun, 18 Sep 2022 22:45:29 +0300 Subject: [PATCH 111/166] use pg_version in python tests --- scripts/export_import_between_pageservers.py | 33 ++++-- test_runner/fixtures/neon_fixtures.py | 106 ++++++++++++------- 
test_runner/regress/test_import.py | 5 +- test_runner/regress/test_pg_regress.py | 26 ++--- test_runner/regress/test_wal_acceptor.py | 3 +- test_runner/regress/test_wal_restore.py | 4 +- 6 files changed, 109 insertions(+), 68 deletions(-) diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 0fccf5199d..1285d0476b 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -80,11 +80,13 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path, pg_distrib_dir): + def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") + self.env["LD_LIBRARY_PATH"] = os.path.join( + str(pg_distrib_dir), "v{}".format(pg_version), "lib" + ) def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -484,7 +486,7 @@ def import_timeline( with open(stdout_filename, "w") as stdout_f: with open(stderr_filename2, "w") as stderr_f: print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) subprocess.run( full_cmd, stdout=stdout_f, @@ -503,7 +505,15 @@ def import_timeline( def export_timeline( - args, psql_path, pageserver_connstr, tenant_id, timeline_id, last_lsn, prev_lsn, tar_filename + args, + psql_path, + pageserver_connstr, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename, + pg_version, ): # Choose filenames incomplete_filename = tar_filename + ".incomplete" @@ -518,13 +528,13 @@ def export_timeline( with open(incomplete_filename, "w") as 
stdout_f: with open(stderr_filename, "w") as stderr_f: print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) subprocess.run( cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True ) # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin) # Log more info @@ -533,7 +543,8 @@ def export_timeline( def main(args: argparse.Namespace): - psql_path = str(Path(args.pg_distrib_dir) / "bin" / "psql") + # any psql version will do here. use current DEFAULT_PG_VERSION = 14 + psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql") old_pageserver_host = args.old_pageserver_host new_pageserver_host = args.new_pageserver_host @@ -566,6 +577,8 @@ def main(args: argparse.Namespace): args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" ) + pg_version = timeline["local"]["pg_version"] + # Export timeline from old pageserver if args.only_import is False: last_lsn, prev_lsn = get_rlsn( @@ -582,6 +595,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, + pg_version, ) # Import into new pageserver @@ -595,7 +609,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, - timeline["local"]["pg_version"], + pg_version, ) # Re-export and compare @@ -609,6 +623,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, re_export_filename, + pg_version, ) # Check the size is the same diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c1ebc6aa7d..3c60437426 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,8 +59,8 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" 
-DEFAULT_PG_VERSION_DEFAULT = "14" DEFAULT_BRANCH_NAME = "main" +DEFAULT_PG_VERSION_DEFAULT = "14" BASE_PORT = 15000 WORKER_PORT_NUM = 1000 @@ -71,7 +71,7 @@ base_dir = "" neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" -pg_version = "" +default_pg_version = "" def pytest_configure(config): @@ -101,29 +101,36 @@ def pytest_configure(config): Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. - global pg_version - pg_version = os.environ.get("DEFAULT_PG_VERSION", DEFAULT_PG_VERSION_DEFAULT) + global default_pg_version + log.info(f"default_pg_version is {default_pg_version}") + env_default_pg_version = os.environ.get("DEFAULT_PG_VERSION") + if env_default_pg_version: + default_pg_version = env_default_pg_version + log.info(f"default_pg_version is set to {default_pg_version}") + else: + default_pg_version = DEFAULT_PG_VERSION_DEFAULT global pg_distrib_dir - # TODO get rid of the POSTGRES_DISTRIB_DIR env var ? - # use DEFAULT_PG_VERSION instead to generate the path env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: - pg_distrib_dir = os.path.normpath( - os.path.join(base_dir, "pg_install/v{}".format(pg_version)) - ) + pg_distrib_dir = os.path.normpath(os.path.join(base_dir, "pg_install")) log.info(f"pg_distrib_dir is {pg_distrib_dir}") + psql_bin_path = os.path.join(pg_distrib_dir, "v{}".format(default_pg_version), "bin/psql") + postgres_bin_path = os.path.join( + pg_distrib_dir, "v{}".format(default_pg_version), "bin/postgres" + ) + if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. 
- if not os.path.exists(os.path.join(pg_distrib_dir, "bin/psql")): - raise Exception('psql not found at "{}"'.format(pg_distrib_dir)) + if not os.path.exists(psql_bin_path): + raise Exception('psql not found at "{}"'.format(psql_bin_path)) else: - if not os.path.exists(os.path.join(pg_distrib_dir, "bin/postgres")): - raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) + if not os.path.exists(postgres_bin_path): + raise Exception('postgres not found at "{}"'.format(postgres_bin_path)) if os.getenv("REMOTE_ENV"): # we are in remote env and do not have neon binaries locally @@ -549,6 +556,7 @@ class NeonEnvBuilder: self.env: Optional[NeonEnv] = None self.remote_storage_prefix: Optional[str] = None self.keep_remote_storage_contents: bool = True + self.pg_version = default_pg_version def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -761,6 +769,7 @@ class NeonEnv: self.broker = config.broker self.remote_storage = config.remote_storage self.remote_storage_users = config.remote_storage_users + self.pg_version = config.pg_version # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
@@ -1195,7 +1204,6 @@ class AbstractNeonCli(abc.ABC): env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) - env_vars["DEFAULT_PG_VERSION"] = str(pg_version) if self.env.rust_log_override is not None: env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): @@ -1263,7 +1271,7 @@ class NeonCli(AbstractNeonCli): "--timeline-id", str(timeline_id), "--pg-version", - pg_version, + self.env.pg_version, ] ) else: @@ -1276,7 +1284,7 @@ class NeonCli(AbstractNeonCli): "--timeline-id", str(timeline_id), "--pg-version", - pg_version, + self.env.pg_version, ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) @@ -1302,7 +1310,9 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[TenantId] = None + self, + new_branch_name: str, + tenant_id: Optional[TenantId] = None, ) -> TimelineId: cmd = [ "timeline", @@ -1312,7 +1322,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] res = self.raw_cli(cmd) @@ -1326,7 +1336,11 @@ class NeonCli(AbstractNeonCli): return TimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[TenantId] = None): + def create_root_branch( + self, + branch_name: str, + tenant_id: Optional[TenantId] = None, + ): cmd = [ "timeline", "create", @@ -1335,7 +1349,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] res = self.raw_cli(cmd) @@ -1405,7 +1419,9 @@ class NeonCli(AbstractNeonCli): return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[TimelineId] = None + self, + config_toml: str, + initial_timeline_id: Optional[TimelineId] = None, ) -> 
"subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1415,7 +1431,7 @@ class NeonCli(AbstractNeonCli): if initial_timeline_id: cmd.extend(["--timeline-id", str(initial_timeline_id)]) - cmd.extend(["--pg-version", pg_version]) + cmd.extend(["--pg-version", self.env.pg_version]) append_pageserver_param_overrides( params_to_update=cmd, @@ -1443,7 +1459,10 @@ class NeonCli(AbstractNeonCli): log.info(f"pageserver_enabled_features success: {res.stdout}") return json.loads(res.stdout) - def pageserver_start(self, overrides=()) -> "subprocess.CompletedProcess[str]": + def pageserver_start( + self, + overrides=(), + ) -> "subprocess.CompletedProcess[str]": start_args = ["pageserver", "start", *overrides] append_pageserver_param_overrides( params_to_update=start_args, @@ -1499,7 +1518,7 @@ class NeonCli(AbstractNeonCli): "--branch-name", branch_name, "--pg-version", - pg_version, + self.env.pg_version, ] if lsn is not None: args.extend(["--lsn", str(lsn)]) @@ -1525,7 +1544,7 @@ class NeonCli(AbstractNeonCli): "--tenant-id", str(tenant_id or self.env.initial_tenant), "--pg-version", - pg_version, + self.env.pg_version, ] if lsn is not None: args.append(f"--lsn={lsn}") @@ -1655,11 +1674,13 @@ def append_pageserver_param_overrides( class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path): + def __init__(self, log_dir: Path, pg_version: str): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") + self.pg_version = pg_version + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") + self.pg_lib_dir = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "lib") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") + self.env["LD_LIBRARY_PATH"] = self.pg_lib_dir def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -1714,8 +1735,8 @@ class PgBin: 
@pytest.fixture(scope="function") -def pg_bin(test_output_dir: Path) -> PgBin: - return PgBin(test_output_dir) +def pg_bin(test_output_dir: Path, pg_version: str) -> PgBin: + return PgBin(test_output_dir, pg_version) class VanillaPostgres(PgProtocol): @@ -1762,12 +1783,19 @@ class VanillaPostgres(PgProtocol): self.stop() +@pytest.fixture(scope="session") +def pg_version() -> str: + return default_pg_version + + @pytest.fixture(scope="function") def vanilla_pg( - test_output_dir: Path, port_distributor: PortDistributor + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, ) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" - pg_bin = PgBin(test_output_dir) + pg_bin = PgBin(test_output_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: yield vanilla_pg @@ -1803,8 +1831,8 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope="function") -def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: - pg_bin = PgBin(test_output_dir) +def remote_pg(test_output_dir: Path, pg_version: str) -> Iterator[RemotePostgres]: + pg_bin = PgBin(test_output_dir, pg_version) connstr = os.getenv("BENCHMARK_CONNSTR") if connstr is None: @@ -2533,7 +2561,11 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + pg: Postgres, +): # Get the timeline ID. 
We need it for the 'basebackup' command timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) @@ -2544,7 +2576,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir) + pg_bin = PgBin(test_output_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") cmd = rf""" @@ -2557,7 +2589,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} + psql_env = {"LD_LIBRARY_PATH": pg_bin.pg_lib_dir} result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True) # Print captured stdout/stderr if basebackup cmd failed. 
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 417595ae4d..c84d282a4d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( PgBin, Postgres, pg_distrib_dir, - pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -98,7 +97,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build "--wal-tarfile", wal, "--pg-version", - pg_version, + env.pg_version, ] ) @@ -252,7 +251,7 @@ def _import( "--base-tarfile", os.path.join(tar_output_file), "--pg-version", - pg_version, + env.pg_version, ] ) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 4934fb9354..f23811b671 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,13 +5,7 @@ import os from pathlib import Path import pytest -from fixtures.neon_fixtures import ( - NeonEnv, - base_dir, - check_restored_datadir_content, - pg_distrib_dir, - pg_version, -) +from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir # Run the main PostgreSQL regression tests, in src/test/regress. @@ -32,9 +26,9 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) - src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(pg_version) - bindir = os.path.join(pg_distrib_dir, "bin") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(env.pg_version) + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -86,9 +80,11 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/isolation".format(pg_version)) - src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/isolation".format(pg_version)) - bindir = os.path.join(pg_distrib_dir, "bin") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/isolation".format(env.pg_version)) + src_path = os.path.join( + base_dir, "vendor/postgres-v{}/src/test/isolation".format(env.pg_version) + ) + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -130,9 +126,9 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. 
# This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "../build/v{}/src/test/regress").format(pg_version) + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) src_path = os.path.join(base_dir, "test_runner/sql_regress") - bindir = os.path.join(pg_distrib_dir, "bin") + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 73e26bd207..d5a5ec2f36 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -29,7 +29,6 @@ from fixtures.neon_fixtures import ( SafekeeperPort, available_remote_storages, neon_binpath, - pg_version, wait_for_last_record_lsn, wait_for_upload, ) @@ -705,7 +704,7 @@ def test_sync_safekeepers( "begin_lsn": int(begin_lsn), "epoch_start_lsn": int(epoch_start_lsn), "truncate_lsn": int(epoch_start_lsn), - "pg_version": int(pg_version) * 10000, + "pg_version": int(env.pg_version) * 10000, }, ) lsn = Lsn(res["inserted_wal"]["end_lsn"]) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 21921a3bc2..db6f1e5137 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -26,11 +26,11 @@ def test_wal_restore( env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" - with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: + with VanillaPostgres(data_dir, PgBin(test_output_dir, env.pg_version), port) as restored: pg_bin.run_capture( [ os.path.join(base_dir, "libs/utils/scripts/restore_from_wal.sh"), - os.path.join(pg_distrib_dir, "bin"), + os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin"), str(test_output_dir / "repo" / "safekeepers" / 
"sk1" / str(tenant_id) / "*"), str(data_dir), str(port), From 8d890b3cbb150136dd6a7eab9556bd006fe18823 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 08:38:30 +0300 Subject: [PATCH 112/166] fix clippy warnings --- libs/postgres_ffi/src/xlog_utils.rs | 6 ++++-- pageserver/src/config.rs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 038e0491a0..2c16cc9cd9 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -466,13 +466,15 @@ mod tests { fn test_end_of_wal(test_name: &str) { use wal_craft::*; + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); let cfg = Conf { - pg_version: PG_MAJORVERSION, - pg_distrib_dir: top_path.join(format!("pg_install")), + pg_version, + pg_distrib_dir: top_path.join("pg_install"), datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b75f8f8265..a52a3e8262 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -209,7 +209,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join(format!("pg_install",))), + .join("pg_install")), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), From 862902f9e5846b7edef14b296557a926efec5264 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 14:38:51 +0300 Subject: [PATCH 113/166] Update readme and openapi spec --- pageserver/src/http/openapi_spec.yml | 3 +++ test_runner/README.md | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/pageserver/src/http/openapi_spec.yml 
b/pageserver/src/http/openapi_spec.yml index 1f2eba05ec..4e748207c8 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -307,6 +307,7 @@ paths: description: | Create a timeline. Returns new timeline id on success.\ If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. + If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. requestBody: content: application/json: @@ -322,6 +323,8 @@ paths: ancestor_start_lsn: type: string format: hex + pg_version: + type: integer responses: "201": description: TimelineInfo diff --git a/test_runner/README.md b/test_runner/README.md index 79b2418af6..d6ee5730ac 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -60,6 +60,12 @@ Useful environment variables: `NEON_BIN`: The directory where neon binaries can be found. `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found. +Since pageserver supports several postgres versions, `POSTGRES_DISTRIB_DIR` must contain +a subdirectory for each version with naming convention `v{PG_VERSION}/`. +Inside that dir, a `bin/postgres` binary should be present. +`DEFAULT_PG_VERSION`: The version of Postgres to use, +This is used to construct full path to the postgres binaries. +Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"` `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. 
From ed6b75e3018922f1110cb451de94e634d860e2ad Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 15:03:11 +0300 Subject: [PATCH 114/166] show pg_version in create_timeline info span --- pageserver/src/http/routes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 72cbb0e819..55429420a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -191,7 +191,7 @@ async fn timeline_create_handler(mut request: Request) -> Result Err(ApiError::InternalServerError(err)), } } - .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await?; Ok(match new_timeline_info { From 3618c242b9ffbf678f7e68472a5d256ad51cc538 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 19 Sep 2022 15:14:01 +0300 Subject: [PATCH 115/166] use version specific find_end_of_wal function --- safekeeper/src/wal_storage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 44dc313ef6..9e198fc148 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -14,7 +14,7 @@ use std::pin::Pin; use tokio::io::AsyncRead; use postgres_ffi::v14::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, + IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, }; use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::{max, min}; From d8d3cd49f4ad753443364574a31d84fc56557b46 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 15:31:05 +0300 Subject: [PATCH 116/166] Update libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs Co-authored-by: 
MMeent --- libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 9b9f76de7c..9563298cd8 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,7 +37,7 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install)") + .help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)") .default_value("/usr/local") ) .arg( From eba419fda360bdc4a2025474b2afcd92d0ff369b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:15:34 +0300 Subject: [PATCH 117/166] Clean up the pg_version choice code --- libs/postgres_ffi/src/lib.rs | 13 ++++++------- pageserver/src/walingest.rs | 2 +- pageserver/src/walrecord.rs | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 1a6620a180..95ecc7b061 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -79,14 +79,13 @@ pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; -pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { - if version == 14 { - bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0 - } else { - assert_eq!(version, 15); - bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { + match version { + 14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0), + 15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 - || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0 + || 
bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0), + _ => anyhow::bail!("Unknown version {}", version), } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1d5cab38b9..d3d2c6d9b2 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -324,7 +324,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version) + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? { // Extract page image from FPI record let img_len = blk.bimg_len as usize; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 258e1a445f..38fb9a4247 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -527,7 +527,7 @@ pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, pg_version: u32, -) -> Result<(), DeserializeError> { +) -> Result<()> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -628,7 +628,7 @@ pub fn decode_wal_record( }; let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; if blk_img_is_compressed { debug!("compressed block image , pg_version = {}", pg_version); From d098542ddeb1b01b5e05e299f4979ad1677f127a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:46:20 +0300 Subject: [PATCH 118/166] Make test_timeline_size_metrics more stable: Compare size with Vanilla postgres size instead of hardcoded value --- test_runner/regress/test_timeline_size.py | 38 +++++++++++++++++++---- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_timeline_size.py 
b/test_runner/regress/test_timeline_size.py index 979d1a107f..3a482be5db 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -3,6 +3,7 @@ import random import re import time from contextlib import closing +from pathlib import Path import psycopg2.errors import psycopg2.extras @@ -11,7 +12,10 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient, + PgBin, + PortDistributor, Postgres, + VanillaPostgres, assert_timeline_local, wait_for_last_flush_lsn, ) @@ -327,7 +331,12 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # The timeline logical and physical sizes are also exposed as prometheus metrics. # Test the metrics. -def test_timeline_size_metrics(neon_simple_env: NeonEnv): +def test_timeline_size_metrics( + neon_simple_env: NeonEnv, + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, +): env = neon_simple_env pageserver_http = env.pageserver.http_client() @@ -369,11 +378,28 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): assert matches tl_logical_size_metric = int(matches.group(1)) - # An empty database is around 8 MB. There at least 3 databases, 'postgres', - # 'template0', 'template1'. So the total size should be about 32 MB. This isn't - # very accurate and can change with different PostgreSQL versions, so allow a - # couple of MB of slack. 
- assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024) + pgdatadir = test_output_dir / "pgdata-vanilla" + pg_bin = PgBin(test_output_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + vanilla_pg.configure([f"port={port}"]) + vanilla_pg.start() + + # Create database based on template0 because we can't connect to template0 + vanilla_pg.safe_psql("CREATE TABLE foo (t text)") + vanilla_pg.safe_psql( + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""" + ) + vanilla_size_sum = vanilla_pg.safe_psql( + "select sum(pg_database_size(oid)) from pg_database" + )[0][0] + + # Compare the size with Vanilla postgres. + # Allow some slack, because the logical size metric includes some things like + # the SLRUs that are not included in pg_database_size(). + assert math.isclose(tl_logical_size_metric, vanilla_size_sum, abs_tol=2 * 1024 * 1024) # The sum of the sizes of all databases, as seen by pg_database_size(), should also # be close. 
Again allow some slack, the logical size metric includes some things like From 1fa7d6aebf4df5e55f4f4c98e9cdba507a7d2345 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 18:48:58 +0300 Subject: [PATCH 119/166] Use DEFAULT_PG_VERSION env in CI pytest --- .github/actions/run-python-test-set/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index fc3b1c9c37..bed0bc69dc 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -86,6 +86,7 @@ runs: # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 From 64f64d563777cf311624f7bbc23e06ab9a9b7b3d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 10:02:43 +0300 Subject: [PATCH 120/166] Fix after rebase: bump vendor/postgres-v14 to match main --- vendor/postgres-v14 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 796770565f..19d948fd47 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 796770565ff668b585e80733b8d679961ad50e93 +Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 From 2d012f0d324a0c764e3956171c981fa2e0455464 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 12:37:13 +0300 Subject: [PATCH 121/166] Fix rebase conflicts in pageserver code --- pageserver/src/tenant.rs | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5860e13534..ed41641277 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ 
-186,8 +186,15 @@ impl Tenant { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } - let new_metadata = - TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn, pg_version,); + let new_metadata = TimelineMetadata::new( + Lsn(0), + None, + None, + Lsn(0), + initdb_lsn, + initdb_lsn, + pg_version, + ); let new_timeline = self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); @@ -207,6 +214,7 @@ impl Tenant { new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, + pg_version: u32, ) -> Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); @@ -249,7 +257,7 @@ impl Tenant { self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? } - None => self.bootstrap_timeline(new_timeline_id)?, + None => self.bootstrap_timeline(new_timeline_id, pg_version)?, }; // Have added new timeline into the tenant, now its background tasks are needed. @@ -1001,7 +1009,11 @@ impl Tenant { /// - run initdb to init temporary instance and get bootstrap data /// - after initialization complete, remove the temp dir. - fn bootstrap_timeline(&self, timeline_id: TimelineId) -> Result> { + fn bootstrap_timeline( + &self, + timeline_id: TimelineId, + pg_version: u32, + ) -> Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. let initdb_path = path_with_suffix_extension( @@ -1012,7 +1024,7 @@ impl Tenant { ); // Init temporarily repo to get bootstrap data - run_initdb(self.conf, &initdb_path)?; + run_initdb(self.conf, &initdb_path, pg_version)?; let pgdata_path = initdb_path; let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); @@ -1021,7 +1033,7 @@ impl Tenant { // LSN, and any WAL after that. 
// Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = self.create_empty_timeline(timeline_id, lsn)?; + let timeline = self.create_empty_timeline(timeline_id, lsn, pg_version)?; import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; fail::fail_point!("before-checkpoint-new-timeline", |_| { @@ -1094,10 +1106,10 @@ impl Tenant { /// Create the cluster temporarily in 'initdbpath' directory inside the repository /// to get bootstrap data for timeline initialization. -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { +fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> { info!("running initdb in {}... ", initdbpath.display()); - let initdb_path = conf.pg_bin_dir().join("initdb"); + let initdb_path = conf.pg_bin_dir(pg_version).join("initdb"); let initdb_output = Command::new(initdb_path) .args(&["-D", &initdbpath.to_string_lossy()]) .args(&["-U", &conf.superuser]) @@ -1107,8 +1119,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // so no need to fsync it .arg("--no-sync") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .stdout(Stdio::null()) .output() .context("failed to execute initdb")?; From 5e151192f5b4bc1df4162914426fd026193fae0c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 12:43:11 +0300 Subject: [PATCH 122/166] Fix rebase conflicts in safekeeper code --- safekeeper/src/timeline.rs | 8 +++++++- safekeeper/src/wal_storage.rs | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 
ec29e13931..c16fc9f40c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -24,12 +24,12 @@ use utils::{ pq_proto::ReplicationFeedback, }; -use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, ServerInfo, }; use crate::send_wal::HotStandbyFeedback; +use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage; @@ -103,6 +103,10 @@ impl SharedState { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(*ttid)); + } + // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; @@ -270,6 +274,8 @@ pub enum TimelineError { AlreadyExists(TenantTimelineId), #[error("Timeline {0} is not initialized, wal_seg_size is zero")] UninitializedWalSegSize(TenantTimelineId), + #[error("Timeline {0} is not initialized, pg_version is unknown")] + UninitialinzedPgVersion(TenantTimelineId), } /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. 
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 9e198fc148..95ad71bbbd 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -13,9 +13,7 @@ use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; -use postgres_ffi::v14::xlog_utils::{ - IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, -}; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::{max, min}; @@ -29,7 +27,6 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{time_io_closure, WalStorageMetrics}; use crate::safekeeper::SafeKeeperState; -use crate::safekeeper::UNKNOWN_SERVER_VERSION; use crate::wal_backup::read_object; use crate::SafeKeeperConf; @@ -117,7 +114,19 @@ impl PhysicalStorage { let write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { - find_end_of_wal(&timeline_dir, wal_seg_size, state.commit_lsn)? + match state.server.pg_version / 10000 { + 14 => postgres_ffi::v14::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )?, + 15 => postgres_ffi::v15::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )?, + _ => bail!("unsupported postgres version"), + } }; // TODO: do we really know that write_lsn is fully flushed to disk? 
@@ -140,7 +149,7 @@ impl PhysicalStorage { write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(write_lsn, UNKNOWN_SERVER_VERSION), + decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), file: None, }) } From 262fa3be0911a5e8ed7c310012cb064e5e39f470 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 22 Sep 2022 17:07:08 +0300 Subject: [PATCH 123/166] pageserver pg proto: add missing auth checks (#2494) Fixes #1858 --- pageserver/src/page_service.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 368b4c8bee..758faa4d9a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1023,6 +1023,9 @@ impl postgres_backend_async::Handler for PageServerHandler { let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); let tenant_id = TenantId::from_str(params[0])?; + + self.check_permission(Some(tenant_id))?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), @@ -1067,14 +1070,14 @@ impl postgres_backend_async::Handler for PageServerHandler { let caps = re .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); + self.check_permission(Some(tenant_id))?; + + let timeline = get_local_timeline(tenant_id, timeline_id)?; pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"lsn", )]))?; From 7138db927947515aae31cca0132a16e9d98469d4 
Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 15:48:35 +0300 Subject: [PATCH 124/166] Fix paths to postgres binaries in the deploy script --- .github/ansible/get_binaries.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index f44a1ca50a..f96cff247f 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -24,7 +24,8 @@ tar -xzf postgres_install.tar.gz -C neon_install docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ -docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/ +docker cp ${ID}:/usr/local/v14/bin/postgres neon_install/bin/v14 +docker cp ${ID}:/usr/local/v15/bin/postgres neon_install/bin/v15 docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball From 8b42c184e77f1284902e60fe29c353b7d8322eb1 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 16:06:32 +0300 Subject: [PATCH 125/166] Update LD_LIBRARY_PATH in deploy scripts --- .github/ansible/deploy.yaml | 4 ++-- .github/ansible/systemd/pageserver.service | 2 +- .github/ansible/systemd/safekeeper.service | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 6982445558..7409051574 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -58,7 +58,7 @@ creates: "/storage/pageserver/data/tenants" environment: NEON_REPO_DIR: "/storage/pageserver/data" - LD_LIBRARY_PATH: "/usr/local/lib" + LD_LIBRARY_PATH: "/usr/local/v14/lib" become: true tags: - pageserver @@ -132,7 +132,7 @@ creates: "/storage/safekeeper/data/safekeeper.id" environment: NEON_REPO_DIR: "/storage/safekeeper/data" - LD_LIBRARY_PATH: "/usr/local/lib" + LD_LIBRARY_PATH: "/usr/local/v14/lib" become: true tags: - 
safekeeper diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service index bb78054fa3..688c7e7b87 100644 --- a/.github/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=pageserver -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index d5c6d00017..36af414761 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=safekeeper -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed From 7c1695e87d91f3ebac6c64ca699304c15568559d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 16:11:46 +0300 Subject: [PATCH 126/166] fix psql path in export_import_between_pageservers script --- scripts/export_import_between_pageservers.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 1285d0476b..6f6c3864dd 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -710,8 +710,8 @@ if __name__ == "__main__": "--psql-path", dest="psql_path", required=False, - default="/usr/local/bin/psql", - help="Path to the psql binary. Default: /usr/local/bin/psql", + default="/usr/local/v14/bin/psql", + help="Path to the psql binary. Default: /usr/local/v14/bin/psql", ) parser.add_argument( "--only-import", From eb9200abc82ba9634b9fdf229415df7dffb7a38b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 17:11:52 +0300 Subject: [PATCH 127/166] Use version-specific path in pytest CI script --- .github/actions/run-python-test-set/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index bed0bc69dc..f3531004a1 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -127,7 +127,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v14/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v{$DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. From c81ede8644ea8cdd71b102235f7cd2fffa2a53d2 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Sep 2022 20:51:31 +0300 Subject: [PATCH 128/166] Hotfix for safekeeper timelines with unknown pg_version. 
Assume DEFAULT_PG_VERSION = 14 --- safekeeper/src/wal_storage.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 95ad71bbbd..eee7c703f9 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -125,7 +125,17 @@ impl PhysicalStorage { wal_seg_size, state.commit_lsn, )?, - _ => bail!("unsupported postgres version"), + pg_majorversion => { + // This is a quik hack to work with old timelines that don't have + // pg_version in the control file. We can remove it after this is fixed properly. + const DEFAULT_PG_MAJOR_VERSION: u32 = 14; + warn!("unknown postgres version {pg_majorversion} assume {DEFAULT_PG_MAJOR_VERSION}"); + postgres_ffi::v14::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )? + } } }; From 43560506c070ae1c557c9bdd847ea0497dde1923 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 22 Sep 2022 17:23:02 +0300 Subject: [PATCH 129/166] remove duplicate walreceiver connection span --- pageserver/src/walreceiver/walreceiver_connection.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 5ac9a3ef7a..15cfad1dcd 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -16,7 +16,7 @@ use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; -use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info, trace, warn}; use super::TaskEvent; use crate::metrics::LIVE_CONNECTIONS_COUNT; @@ -112,8 +112,7 @@ pub async fn handle_walreceiver_connection( _ = connection_cancellation.changed() => info!("Connection cancelled"), } 
Ok(()) - } - .instrument(info_span!("walreceiver connection")), + }, ); // Immediately increment the gauge, then create a job to decrement it on task exit. From b0377f750a798f99e71b640d3a07ae76d480435f Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Fri, 23 Sep 2022 10:25:26 +0200 Subject: [PATCH 130/166] Add staging-test region to normal staging rollouts (#2500) --- .github/ansible/staging.hosts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index c470f8a814..f5accc188a 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -3,11 +3,15 @@ zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-ps-3 console_region_id=27 zenith-us-stage-ps-4 console_region_id=27 +zenith-us-stage-test-ps-1 console_region_id=28 [safekeepers] zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 zenith-us-stage-sk-6 console_region_id=27 +zenith-us-stage-test-sk-1 console_region_id=28 +zenith-us-stage-test-sk-2 console_region_id=28 +zenith-us-stage-test-sk-3 console_region_id=28 [storage:children] pageservers From 52819898e4c65bcc79206d4ff20af9f1f5f08396 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Fri, 23 Sep 2022 11:25:29 +0200 Subject: [PATCH 131/166] Extend image push step with production ECR (#2465) * Extend image push step with production ECR * Put copy step before auth change * Use correct name * Only push on main * Fix typo --- .github/workflows/build_and_test.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 44db968753..5f84e20452 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -588,7 +588,16 @@ jobs: - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust - - name: Configure 
docker login + - name: Push images to production ECR + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest + + - name: Configure Docker Hub login run: | # ECR Credential Helper & Docker Hub don't work together in config, hence reset echo "" > /github/home/.docker/config.json @@ -609,7 +618,7 @@ jobs: - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned - - name: Add latest tag to images + - name: Add latest tag to images in Docker Hub if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' From eb0c6bcf1a1b4eed35ba2bb439b5e30905e753f9 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 22 Sep 2022 17:31:16 +0300 Subject: [PATCH 132/166] reenable storage deployments --- .github/ansible/deploy.yaml | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 7409051574..e206f9d5ba 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -63,18 +63,18 @@ tags: - pageserver - # - name: update remote storage (s3) config - # lineinfile: - # path: /storage/pageserver/data/pageserver.toml - # line: "{{ item }}" - # loop: - # - "[remote_storage]" - # - "bucket_name = '{{ bucket_name }}'" - # - "bucket_region = '{{ bucket_region }}'" - # - "prefix_in_bucket = '{{ inventory_hostname }}'" - # become: true - # tags: - # - pageserver + - name: update 
remote storage (s3) config + lineinfile: + path: /storage/pageserver/data/pageserver.toml + line: "{{ item }}" + loop: + - "[remote_storage]" + - "bucket_name = '{{ bucket_name }}'" + - "bucket_region = '{{ bucket_region }}'" + - "prefix_in_bucket = '{{ inventory_hostname }}'" + become: true + tags: + - pageserver - name: upload systemd service definition ansible.builtin.template: @@ -87,15 +87,15 @@ tags: - pageserver - # - name: start systemd service - # ansible.builtin.systemd: - # daemon_reload: yes - # name: pageserver - # enabled: yes - # state: restarted - # become: true - # tags: - # - pageserver + - name: start systemd service + ansible.builtin.systemd: + daemon_reload: yes + name: pageserver + enabled: yes + state: restarted + become: true + tags: + - pageserver - name: post version to console when: console_mgmt_base_url is defined From 3e65209a067d7243162d9bd84841425e088a0d9b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Sep 2022 12:50:36 +0100 Subject: [PATCH 133/166] Nightly Benchmarks: use Postgres binaries from artifacts (#2501) --- .github/actions/download/action.yml | 9 ++++-- .../actions/run-python-test-set/action.yml | 2 +- .github/actions/upload/action.yml | 9 ++++-- .github/workflows/benchmarking.yml | 21 ++++++++++---- .github/workflows/build_and_test.yml | 29 +++++++++++++++++-- 5 files changed, 54 insertions(+), 16 deletions(-) diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 5aa45164e7..731ef6639d 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -12,6 +12,9 @@ inputs: description: "Allow to skip if file doesn't exist, fail otherwise" default: false required: false + prefix: + description: "S3 prefix. 
Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false runs: using: "composite" @@ -23,18 +26,18 @@ runs: TARGET: ${{ inputs.path }} ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev - PREFIX=artifacts/${GITHUB_RUN_ID} FILENAME=$(basename $ARCHIVE) - S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then echo '::set-output name=SKIPPED::true' exit 0 else - echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist" + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi fi diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index f3531004a1..cc6ab65b76 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -127,7 +127,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v{$DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. 
diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index de8df3230f..291a2cf3b0 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -7,6 +7,9 @@ inputs: path: description: "A directory or file to upload" required: true + prefix: + description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false runs: using: "composite" @@ -42,14 +45,14 @@ runs: env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev - PREFIX=artifacts/${GITHUB_RUN_ID} FILENAME=$(basename $ARCHIVE) FILESIZE=$(du -sh ${ARCHIVE} | cut -f1) - time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} + time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${FILENAME} # Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary - echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} + echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4e28223c18..4d91e9fa74 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -46,7 +46,8 @@ jobs: runs-on: [self-hosted, zenith-benchmarker] env: - POSTGRES_DISTRIB_DIR: "/usr/pgsql-14" + POSTGRES_DISTRIB_DIR: /tmp/pg_install + DEFAULT_PG_VERSION: 14 steps: - name: Checkout zenith repo @@ -71,7 +72,7 @@ jobs: echo Poetry poetry --version echo Pgbench - $POSTGRES_DISTRIB_DIR/bin/pgbench --version + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - name: Create Neon Project id: create-neon-project @@ -140,7 
+141,8 @@ jobs: env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: "10gb" - POSTGRES_DISTRIB_DIR: /usr + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} @@ -163,10 +165,17 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install Deps + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH run: | - sudo apt -y update - sudo apt install -y postgresql-14 + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project if: matrix.platform != 'neon-captest-reuse' diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5f84e20452..8a7cdec89c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -268,6 +268,32 @@ jobs: if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data + upload-latest-artifacts: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ regress-tests ] + if: github.ref_name == 'main' + steps: + - name: Copy Neon artifact to the latest directory + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/${{ github.run_id }} + run: | + for build_type in debug release; do + FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + if [ -z "${S3_KEY}" ]; then + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous 
attempts exist" + exit 1 + fi + + time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME} + done + benchmarks: runs-on: dev container: @@ -335,9 +361,6 @@ jobs: curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json ./scripts/pysync - # Workaround for https://github.com/neondatabase/cloud/issues/2188 - psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10 - DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json coverage-report: From bc3ba23e0a485e3fc5434ea093062bc4347915f1 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 23 Sep 2022 14:35:36 +0200 Subject: [PATCH 134/166] Fix extreme metrics bloat in storage sync (#2506) * Fix extreme metrics bloat in storage sync From 78 metrics per (timeline, tenant) pair down to (max) 10 metrics per (timeline, tenant) pair, plus another 117 metrics in a global histogram that replaces the previous per-timeline histogram. * Drop image sync operation metric series when dropping TimelineMetrics. 
--- pageserver/src/metrics.rs | 45 ++++++++++++++++++++++----- pageserver/src/storage_sync.rs | 56 +++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 32 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2f03943429..5c2f81d731 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,8 +1,9 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ - register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, - register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, + UIntGaugeVec, }; use once_cell::sync::Lazy; use utils::id::{TenantId, TimelineId}; @@ -204,12 +205,34 @@ pub static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { .expect("failed to register pageserver remote storage remaining sync items int gauge") }); -pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { +pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_remote_storage_image_sync_duration", + "Time spent to synchronize (up/download) a whole pageserver image", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register per-timeline pageserver image sync time vec") +}); + +pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"]; +pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"]; + +pub static IMAGE_SYNC_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_image_sync_count", + "Number of synchronization operations executed for pageserver images. 
\ + Grouped by tenant, timeline, operation_kind and status", + &["tenant_id", "timeline_id", "operation_kind", "status"] + ) + .expect("failed to register pageserver image sync count vec") +}); + +pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. \ - Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", - &["tenant_id", "timeline_id", "operation_kind", "status"], + Grouped by operation_kind and status", + &["operation_kind", "status"], vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] ) .expect("failed to register pageserver image sync time histogram vec") @@ -256,7 +279,7 @@ macro_rules! redo_histogram_time_buckets { () => { vec![ 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, - 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, ] }; } @@ -411,6 +434,14 @@ impl Drop for TimelineMetrics { for op in SMGR_QUERY_TIME_OPERATIONS { let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]); } + + for op in IMAGE_SYNC_OPERATION_KINDS { + for status in IMAGE_SYNC_STATUS { + let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]); + } + } + + let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]); } } diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 892a34a76f..776d9214d4 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -178,6 +178,7 @@ use crate::{ TenantTimelineValues, }; +use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use self::download::download_index_parts; @@ -835,7 +836,6 @@ async fn process_sync_task_batch( sync_id, 
upload_data, sync_start, - "upload", ) .await } @@ -879,7 +879,6 @@ async fn process_sync_task_batch( sync_id, download_data, sync_start, - "download", ) .await; } @@ -911,7 +910,6 @@ async fn process_sync_task_batch( sync_id, delete_data, sync_start, - "delete", ) .instrument(info_span!("delete_timeline_data")) .await; @@ -948,8 +946,9 @@ async fn download_timeline_data( sync_id: TenantTimelineId, new_download_data: SyncData, sync_start: Instant, - task_name: &str, ) -> DownloadStatus { + static TASK_NAME: &str = "download"; + match download_timeline_layers( conf, storage, @@ -961,19 +960,19 @@ async fn download_timeline_data( .await { DownloadedTimeline::Abort => { - register_sync_status(sync_id, sync_start, task_name, None); + register_sync_status(sync_id, sync_start, TASK_NAME, None); if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); } } DownloadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); } DownloadedTimeline::Successful(mut download_data) => { match update_local_metadata(conf, sync_id, current_remote_timeline).await { Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { Ok(()) => { - register_sync_status(sync_id, sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); return DownloadStatus::Downloaded; } Err(e) => { @@ -984,7 +983,7 @@ async fn download_timeline_data( error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; sync_queue.push(sync_id, SyncTask::Download(download_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); } } } @@ -1060,8 +1059,9 @@ async fn delete_timeline_data( sync_id: 
TenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, - task_name: &str, ) { + static TASK_NAME: &str = "delete"; + let timeline_delete = &mut new_delete_data.data; if !timeline_delete.deletion_registered { @@ -1077,14 +1077,14 @@ async fn delete_timeline_data( error!("Failed to update remote timeline {sync_id}: {e:?}"); new_delete_data.retries += 1; sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); return; } } timeline_delete.deletion_registered = true; let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; - register_sync_status(sync_id, sync_start, task_name, Some(sync_status)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status)); } async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { @@ -1103,8 +1103,8 @@ async fn upload_timeline_data( sync_id: TenantTimelineId, new_upload_data: SyncData, sync_start: Instant, - task_name: &str, ) -> UploadStatus { + static TASK_NAME: &str = "upload"; let mut uploaded_data = match upload_timeline_layers( storage, sync_queue, @@ -1115,7 +1115,7 @@ async fn upload_timeline_data( .await { UploadedTimeline::FailedAndRescheduled(e) => { - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); return UploadStatus::Failed(e); } UploadedTimeline::Successful(upload_data) => upload_data, @@ -1134,14 +1134,14 @@ async fn upload_timeline_data( .await { Ok(()) => { - register_sync_status(sync_id, sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); UploadStatus::Uploaded } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); - register_sync_status(sync_id, sync_start, 
task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); UploadStatus::Failed(e) } } @@ -1391,16 +1391,22 @@ fn register_sync_status( let tenant_id = sync_id.tenant_id.to_string(); let timeline_id = sync_id.timeline_id.to_string(); - match sync_status { - Some(true) => { - IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"]) - } - Some(false) => { - IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"]) - } - None => return, - } - .observe(secs_elapsed) + + let sync_status = match sync_status { + Some(true) => "success", + Some(false) => "failure", + None => "abort", + }; + + IMAGE_SYNC_TIME_HISTOGRAM + .with_label_values(&[sync_name, sync_status]) + .observe(secs_elapsed); + IMAGE_SYNC_TIME + .with_label_values(&[&tenant_id, &timeline_id]) + .add(secs_elapsed); + IMAGE_SYNC_COUNT + .with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status]) + .inc(); } #[cfg(test)] From ebab89ebd22fa77ff0cf6821ff22716642fe8a03 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Sep 2022 13:51:33 +0100 Subject: [PATCH 135/166] test_runner: pass password to pgbench via PGPASSWORD (#2468) --- test_runner/fixtures/log_helper.py | 13 -------- test_runner/fixtures/neon_fixtures.py | 5 +++ test_runner/performance/test_perf_pgbench.py | 34 +++++++++++++------- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 7d112fce89..17f2402391 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,6 +1,5 @@ import logging import logging.config -import re """ This file configures logging to use in python tests. 
@@ -30,17 +29,6 @@ LOGGING = { } -class PasswordFilter(logging.Filter): - """Filter out password from logs.""" - - # Good enough to filter our passwords produced by PgProtocol.connstr - FILTER = re.compile(r"(\s*)password=[^\s]+(\s*)") - - def filter(self, record: logging.LogRecord) -> bool: - record.msg = self.FILTER.sub(r"\1password=\2", str(record.msg)) - return True - - def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. @@ -50,6 +38,5 @@ def getLogger(name="root") -> logging.Logger: # default logger for tests log = getLogger() -log.addFilter(PasswordFilter()) logging.config.dictConfig(LOGGING) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3c60437426..aa9fd68df5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -283,10 +283,15 @@ class PgProtocol: return str(make_dsn(**self.conn_options(**kwargs))) def conn_options(self, **kwargs): + """ + Construct a dictionary of connection options from default values and extra parameters. + An option can be dropped from the returning dictionary by None-valued extra parameter. + """ result = self.default_options.copy() if "dsn" in kwargs: result.update(parse_dsn(kwargs["dsn"])) result.update(kwargs) + result = {k: v for k, v in result.items() if v is not None} # Individual statement timeout in seconds. 
2 minutes should be # enough for our tests, but if you need a longer, you can diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index e167ddaafa..656826d6a3 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -4,7 +4,7 @@ import os import timeit from datetime import datetime from pathlib import Path -from typing import List +from typing import Dict, List import pytest from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult @@ -24,14 +24,18 @@ def utc_now_timestamp() -> int: return calendar.timegm(datetime.utcnow().utctimetuple()) -def init_pgbench(env: PgCompare, cmdline): +def init_pgbench(env: PgCompare, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + # calculate timestamps and durations separately # timestamp is intended to be used for linking to grafana and logs # duration is actually a metric and uses float instead of int for timestamp start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() with env.record_pageserver_writes("init.pageserver_writes"): - out = env.pg_bin.run_capture(cmdline) + out = env.pg_bin.run_capture(cmdline, env=environ) env.flush() duration = timeit.default_timer() - t0 @@ -48,13 +52,15 @@ def init_pgbench(env: PgCompare, cmdline): env.zenbenchmark.record_pg_bench_init_result("init", res) -def run_pgbench(env: PgCompare, prefix: str, cmdline): +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + with env.record_pageserver_writes(f"{prefix}.pageserver_writes"): run_start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() - out = env.pg_bin.run_capture( - cmdline, - ) + out = env.pg_bin.run_capture(cmdline, env=environ) run_duration = timeit.default_timer() - t0 run_end_timestamp = 
utc_now_timestamp() env.flush() @@ -82,10 +88,14 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline): def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType): env.zenbenchmark.record("scale", scale, "", MetricReport.TEST_PARAM) + password = env.pg.default_options.get("password", None) + options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + if workload_type == PgBenchLoadType.INIT: # Run initialize - options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") - init_pgbench(env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options=options)]) + init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload @@ -99,8 +109,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P f"-T{duration}", "-P2", "--progress-timestamp", - env.pg.connstr(), + connstr, ], + password=password, ) if workload_type == PgBenchLoadType.SELECT_ONLY: @@ -115,8 +126,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P f"-T{duration}", "-P2", "--progress-timestamp", - env.pg.connstr(), + connstr, ], + password=password, ) env.report_size() From 1dffba9de6a0e30e1cb63c9462c88c2f6587d2f0 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 23 Sep 2022 18:30:44 +0300 Subject: [PATCH 136/166] Write more tests for the proxy... (#1918) And change a few more things in the process. 
--- proxy/src/auth/backend/console.rs | 12 ++++++++ proxy/src/auth/credentials.rs | 11 +++----- proxy/src/cancellation.rs | 46 +++++++++++++++++++++++++++++++ proxy/src/parse.rs | 28 ++++++++++++++++++- 4 files changed, 89 insertions(+), 8 deletions(-) diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index e5ee07813c..a351b82c6a 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -259,3 +259,15 @@ fn parse_host_port(input: &str) -> Option<(&str, u16)> { let (host, port) = input.split_once(':')?; Some((host, port.parse().ok()?)) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index ea71eba010..e43bcf8791 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -54,13 +54,10 @@ impl<'a> ClientCredentials<'a> { let dbname = get_param("database")?; // Project name might be passed via PG's command-line options. - let project_a = params.options_raw().and_then(|options| { - for opt in options { - if let Some(value) = opt.strip_prefix("project=") { - return Some(Cow::Borrowed(value)); - } - } - None + let project_a = params.options_raw().and_then(|mut options| { + options + .find_map(|opt| opt.strip_prefix("project=")) + .map(Cow::Borrowed) }); // Alternative project name is in fact a subdomain from SNI. 
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index b7412b6f5b..92f8e35dab 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -52,6 +52,16 @@ impl CancelMap { let session = Session::new(key, self); f(session).await } + + #[cfg(test)] + fn contains(&self, session: &Session) -> bool { + self.0.lock().contains_key(&session.key) + } + + #[cfg(test)] + fn is_empty(&self) -> bool { + self.0.lock().is_empty() + } } /// This should've been a [`std::future::Future`], but @@ -104,3 +114,39 @@ impl<'a> Session<'a> { self.key } } + +#[cfg(test)] +mod tests { + use super::*; + use once_cell::sync::Lazy; + + #[tokio::test] + async fn check_session_drop() -> anyhow::Result<()> { + static CANCEL_MAP: Lazy = Lazy::new(Default::default); + + let (tx, rx) = tokio::sync::oneshot::channel(); + let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move { + assert!(CANCEL_MAP.contains(&session)); + + tx.send(()).expect("failed to send"); + let () = futures::future::pending().await; // sleep forever + + Ok(()) + })); + + // Wait until the task has been spawned. + let () = rx.await.context("failed to hear from the task")?; + + // Drop the session's entry by cancelling the task. + task.abort(); + let error = task.await.expect_err("task should have failed"); + if !error.is_cancelled() { + anyhow::bail!(error); + } + + // Check that the session has been dropped. + assert!(CANCEL_MAP.is_empty()); + + Ok(()) + } +} diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 8a05ff9c82..cbd48d91e9 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -1,6 +1,5 @@ //! Small parsing helpers. -use std::convert::TryInto; use std::ffi::CStr; pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { @@ -10,9 +9,36 @@ pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other)) } +/// See . 
pub fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { (bytes.len() >= N).then(|| { let (head, tail) = bytes.split_at(N); (head.try_into().unwrap(), tail) }) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_split_cstr() { + assert!(split_cstr(b"").is_none()); + assert!(split_cstr(b"foo").is_none()); + + let (cstr, rest) = split_cstr(b"\0").expect("uh-oh"); + assert_eq!(cstr.to_bytes(), b""); + assert_eq!(rest, b""); + + let (cstr, rest) = split_cstr(b"foo\0bar").expect("uh-oh"); + assert_eq!(cstr.to_bytes(), b"foo"); + assert_eq!(rest, b"bar"); + } + + #[test] + fn test_split_at_const() { + assert!(split_at_const::<0>(b"").is_some()); + assert!(split_at_const::<1>(b"").is_none()); + assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k")))); + } +} From 5ccd54c699a3953486ce200c6f8ad3a9e39b8eb0 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 23 Sep 2022 13:08:05 +0300 Subject: [PATCH 137/166] Add support for h3-pg and re-enable plv8 --- Dockerfile.compute-node-v14 | 50 ++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 8ddf752191..f3773868d0 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -8,9 +8,12 @@ ARG TAG=pinned # Layer "build-deps" # FROM debian:bullseye-slim AS build-deps +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev + libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev # # Layer "pg-build" @@ -37,7 +40,7 @@ RUN cd postgres && \ FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install 
-y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ tar xvzf postgis-3.3.0.tar.gz && \ @@ -59,15 +62,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ # Build plv8 # FROM build-deps AS plv8-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 # https://github.com/plv8/plv8/issues/475 # Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ - apt update && \ +RUN apt update && \ apt install -y --no-install-recommends -t testing binutils RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -79,12 +80,45 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +# +# Layer "h3-pg-build" +# Build h3_pg +# +FROM build-deps AS h3-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# packaged cmake is too old +RUN apt update && \ + apt install -y --no-install-recommends -t testing cmake + +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ + tar xvzf h3.tgz && \ + cd h3-4.0.1 && \ + mkdir build && \ + cd build && \ + cmake .. 
-DCMAKE_BUILD_TYPE=Release && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 make install && \ + cp -R /h3/usr / && \ + rm -rf build + +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ + tar xvzf h3-pg.tgz && \ + cd h3-pg-4.0.1 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + # # Layer "neon-pg-ext-build" # compile neon extensions # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -132,8 +166,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ chmod 0750 /var/db/postgres/compute && \ echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig -# TODO: Check if we can make the extension setup more modular versus a linear build -# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl From 805bb198c287a5a1ac3e28627165313335c69cc9 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Fri, 23 Sep 2022 11:49:28 -0700 Subject: [PATCH 138/166] Miscellaneous small fixups (#2503) Changes are: * Correct typo "firts" -> "first" * Change to * Fix weird indentation that rustfmt was failing to handle * Use existing `anyhow::{anyhow,bail}!` as `{anyhow,bail}!` if it's already in scope * Spell `Result` as `anyhow::Result` * In general, closer to matching the rest of the codebase * Change usages of `hash_map::Entry` to `Entry` when it's already in 
scope * A quick search shows our style on this one varies across the files it's used in --- pageserver/src/tenant.rs | 23 +++++++++++------------ pageserver/src/tenant/timeline.rs | 8 +++++--- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ed41641277..c9ad3bf232 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -17,7 +17,6 @@ use tracing::*; use utils::crashsafe_dir::path_with_suffix_extension; use std::cmp::min; -use std::collections::hash_map; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -246,12 +245,12 @@ impl Tenant { let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { // can we safely just branch from the ancestor instead? - anyhow::bail!( - "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", - lsn, - ancestor_timeline_id, - ancestor_ancestor_lsn, - ); + bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); } } @@ -406,11 +405,11 @@ impl Tenant { .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; match timelines_accessor.entry(timeline.timeline_id) { - hash_map::Entry::Occupied(_) => anyhow::bail!( + Entry::Occupied(_) => bail!( "Found freshly initialized timeline {} in the tenant map", timeline.timeline_id ), - hash_map::Entry::Vacant(v) => { + Entry::Vacant(v) => { v.insert(timeline); } } @@ -768,7 +767,7 @@ impl Tenant { }) .with_context(|| { format!( - "Failed to fsync on firts save for config {}", + "Failed to fsync on first save for config {}", target_config_path.display() ) })?; @@ -1091,11 +1090,11 @@ impl Tenant { })?; match timelines.entry(new_timeline_id) { - hash_map::Entry::Occupied(_) => anyhow::bail!( + Entry::Occupied(_) => bail!( "Found freshly initialized timeline {} in the tenant map", 
new_timeline_id ), - hash_map::Entry::Vacant(v) => { + Entry::Vacant(v) => { v.insert(Arc::clone(&new_timeline)); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 019de81d64..74e873e632 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -343,7 +343,9 @@ impl Timeline { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn + Ordering::Greater => { + unreachable!("the returned lsn should never be after the requested lsn") + } } Some((cached_lsn, cached_img)) } @@ -726,10 +728,10 @@ impl Timeline { Ok(()) } - pub fn layer_removal_guard(&self) -> Result, anyhow::Error> { + pub fn layer_removal_guard(&self) -> anyhow::Result> { self.layer_removal_cs .try_lock() - .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + .map_err(|e| anyhow!("cannot lock compaction critical section {e}")) } /// Retrieve current logical size of the timeline. 
From 093264a69523c5f8f007b35cf26be4e0b11c1de9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 23 Sep 2022 19:59:27 +0300 Subject: [PATCH 139/166] Fix deploy bin and lib paths for postgres --- .github/ansible/get_binaries.sh | 4 ++-- Dockerfile | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index f96cff247f..dbbd5b454a 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -24,8 +24,8 @@ tar -xzf postgres_install.tar.gz -C neon_install docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ -docker cp ${ID}:/usr/local/v14/bin/postgres neon_install/bin/v14 -docker cp ${ID}:/usr/local/v15/bin/postgres neon_install/bin/v15 +docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ +docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball diff --git a/Dockerfile b/Dockerfile index 876a20cc1a..69402919ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,9 +19,8 @@ COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ - && rm -rf pg_install/v14/build \ - && rm -rf pg_install/v15/build \ - && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . + && rm -rf pg_install/build \ + && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . 
# Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build From 1165686201db64f5c58dbfcb791462f85a513352 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 23 Sep 2022 20:13:58 +0300 Subject: [PATCH 140/166] fix deploy lib paths for postgres --- .github/ansible/get_binaries.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index dbbd5b454a..b2f1fb38e6 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -26,6 +26,8 @@ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ +docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/ +docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/ docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball From 367cc012903a7dc60d061a17ab61227f97598120 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 26 Sep 2022 10:07:18 +0300 Subject: [PATCH 141/166] Fix deploy paths --- .github/ansible/get_binaries.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index b2f1fb38e6..a484bfb0a0 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -21,6 +21,7 @@ docker pull --quiet neondatabase/neon:${DOCKER_TAG} ID=$(docker create neondatabase/neon:${DOCKER_TAG}) docker cp ${ID}:/data/postgres_install.tar.gz . 
tar -xzf postgres_install.tar.gz -C neon_install +mkdir neon_install/bin/ docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ From df45c0d0e57477768097c13c2c3299e634f963b8 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 26 Sep 2022 12:16:52 +0300 Subject: [PATCH 142/166] Disable plv8 again --- Dockerfile.compute-node-v14 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index f3773868d0..ed57b29009 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -116,7 +116,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +# plv8 still sometimes crashes during the creation +# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ From d15116f2cc4b26ad36f9cf28c5cf9f9343269cc3 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 23 Sep 2022 14:36:08 +0000 Subject: [PATCH 143/166] Update pg_version for old timelines --- safekeeper/src/control_file_upgrade.rs | 12 ++++++++++++ safekeeper/src/safekeeper.rs | 3 +-- safekeeper/src/timeline.rs | 2 ++ safekeeper/src/wal_storage.rs | 16 +++++----------- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index d8434efb20..1ce9186085 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -248,6 +248,18 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result oldstate.timeline_start_lsn = Lsn(1); oldstate.local_start_lsn = Lsn(1); + return Ok(oldstate); 
+ } else if version == 6 { + info!("reading safekeeper control file version {}", version); + let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + if oldstate.server.pg_version != 0 { + return Ok(oldstate); + } + + // set pg_version to the default v14 + info!("setting pg_version to 140005"); + oldstate.server.pg_version = 140005; + return Ok(oldstate); } bail!("unsupported safekeeper control file version {}", version) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index eec24faf2f..7869aa8b3a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -25,7 +25,7 @@ use utils::{ }; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 6; +pub const SK_FORMAT_VERSION: u32 = 7; const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; @@ -639,7 +639,6 @@ where let mut state = self.state.clone(); state.server.system_id = msg.system_id; - state.server.wal_seg_size = msg.wal_seg_size; if msg.pg_version != UNKNOWN_SERVER_VERSION { state.server.pg_version = msg.pg_version; } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index c16fc9f40c..dc7503af65 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -314,6 +314,8 @@ impl Timeline { ttid: TenantTimelineId, wal_backup_launcher_tx: Sender, ) -> Result { + let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); + let shared_state = SharedState::restore(&conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(shared_state.sk.state.commit_lsn); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index eee7c703f9..8fbd479d95 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -111,6 +111,10 @@ impl PhysicalStorage { // Find out where stored WAL ends, starting at commit_lsn which is a // known recent record boundary (unless we don't have WAL at all). 
+ // + // NB: find_end_of_wal MUST be backwards compatible with the previously + // written WAL. If find_end_of_wal fails to read any WAL written by an + // older version of the code, we could lose data forever. let write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { @@ -125,17 +129,7 @@ impl PhysicalStorage { wal_seg_size, state.commit_lsn, )?, - pg_majorversion => { - // This is a quik hack to work with old timelines that don't have - // pg_version in the control file. We can remove it after this is fixed properly. - const DEFAULT_PG_MAJOR_VERSION: u32 = 14; - warn!("unknown postgres version {pg_majorversion} assume {DEFAULT_PG_MAJOR_VERSION}"); - postgres_ffi::v14::xlog_utils::find_end_of_wal( - &timeline_dir, - wal_seg_size, - state.commit_lsn, - )? - } + _ => bail!("unsupported postgres version: {}", state.server.pg_version), } }; From fb68d01449edb4be9a0d064d69a442dd3688783e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 26 Sep 2022 23:57:02 +0300 Subject: [PATCH 144/166] Preserve task result in TaskHandle by keeping join handle around (#2521) * Preserve task result in TaskHandle by keeping join handle around The solution is not great, but it should help to debug staging issue. I tried to do it in a least destructive way. TaskHandle used only in one place so it is ok to use something less generic unless we want to extend its usage across the codebase. In its current form for its single usage place it looks too abstract. Some problems around this code: 1. Task can drop event sender and continue running 2. Task cannot be joined several times (probably not needed, but still, can be surprising) 3. Had to split task event into two types because anyhow::Error does not implement clone. So TaskContinueEvent derives clone but usual task event does not. Clone requirement appears because we clone the current value in next_task_event. Taking it by reference is complicated. 4.
Split between Init and Started is artificial and comes from watch::channel requirement to have some initial value. To summarize from 3 and 4. It may be a better idea to use RWLock or a bounded channel instead --- pageserver/src/walreceiver.rs | 76 ++++++++++++++----- .../src/walreceiver/connection_manager.rs | 43 ++++++----- .../src/walreceiver/walreceiver_connection.rs | 16 ++-- 3 files changed, 89 insertions(+), 46 deletions(-) diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index deac299747..c7de24080a 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -31,7 +31,6 @@ use etcd_broker::Client; use itertools::Itertools; use once_cell::sync::OnceCell; use std::future::Future; -use std::sync::Arc; use tokio::sync::watch; use tracing::*; use url::Url; @@ -88,37 +87,44 @@ pub fn is_etcd_client_initialized() -> bool { /// That may lead to certain events not being observed by the listener. #[derive(Debug)] pub struct TaskHandle { - events_receiver: watch::Receiver>, + join_handle: Option>>, + events_receiver: watch::Receiver>, cancellation: watch::Sender<()>, } -#[derive(Debug, Clone)] pub enum TaskEvent { + Update(TaskStateUpdate), + End(anyhow::Result<()>), +} + +#[derive(Debug, Clone)] +pub enum TaskStateUpdate { + Init, Started, - NewEvent(E), - End, + Progress(E), } impl TaskHandle { /// Initializes the task, starting it immediately after the creation. 
pub fn spawn( - task: impl FnOnce(Arc>>, watch::Receiver<()>) -> Fut + Send + 'static, + task: impl FnOnce(watch::Sender>, watch::Receiver<()>) -> Fut + + Send + + 'static, ) -> Self where - Fut: Future> + Send, - E: Sync + Send + 'static, + Fut: Future> + Send, + E: Send + Sync + 'static, { let (cancellation, cancellation_receiver) = watch::channel(()); - let (events_sender, events_receiver) = watch::channel(TaskEvent::Started); - let events_sender = Arc::new(events_sender); + let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); - let sender = Arc::clone(&events_sender); - let _ = WALRECEIVER_RUNTIME.spawn(async move { - events_sender.send(TaskEvent::Started).ok(); - task(sender, cancellation_receiver).await + let join_handle = WALRECEIVER_RUNTIME.spawn(async move { + events_sender.send(TaskStateUpdate::Started).ok(); + task(events_sender, cancellation_receiver).await }); TaskHandle { + join_handle: Some(join_handle), events_receiver, cancellation, } @@ -126,15 +132,45 @@ impl TaskHandle { async fn next_task_event(&mut self) -> TaskEvent { match self.events_receiver.changed().await { - Ok(()) => self.events_receiver.borrow().clone(), - Err(_task_channel_part_dropped) => TaskEvent::End, + Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), + Err(_task_channel_part_dropped) => { + TaskEvent::End(match self.join_handle.take() { + Some(jh) => { + if !jh.is_finished() { + warn!("sender is dropped while join handle is still alive"); + } + + jh.await + .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) + .and_then(|x| x) + } + None => { + // Another option is to have an enum, join handle or result and give away the reference to it + Err(anyhow::anyhow!("Task was joined more than once")) + } + }) + } } } /// Aborts current task, waiting for it to finish. 
- pub async fn shutdown(mut self) { - self.cancellation.send(()).ok(); - // wait until the sender is dropped - while self.events_receiver.changed().await.is_ok() {} + pub async fn shutdown(self) { + match self.join_handle { + Some(jh) => { + self.cancellation.send(()).ok(); + match jh.await { + Ok(Ok(())) => debug!("Shutdown success"), + Ok(Err(e)) => error!("Shutdown task error: {e:?}"), + Err(join_error) => { + if join_error.is_cancelled() { + error!("Shutdown task was cancelled"); + } else { + error!("Shutdown task join error: {join_error}") + } + } + } + } + None => {} + } } } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index a82e69e5ba..29179e9871 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,10 +16,10 @@ use std::{ time::Duration, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; use crate::tenant::Timeline; +use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -145,19 +145,26 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Started => {}, - TaskEvent::NewEvent(status) => { - if status.has_processed_wal { - // We have advanced last_record_lsn by processing the WAL received - // from this safekeeper. This is good enough to clean unsuccessful - // retries history and allow reconnecting to this safekeeper without - // sleeping for a long time. 
- walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + TaskEvent::Update(c) => { + match c { + TaskStateUpdate::Init | TaskStateUpdate::Started => {}, + TaskStateUpdate::Progress(status) => { + if status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. + walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + } + wal_connection.status = status.to_owned(); + } } - wal_connection.status = status; }, - TaskEvent::End => { - debug!("WAL receiving task finished"); + TaskEvent::End(walreceiver_task_result) => { + match walreceiver_task_result { + Ok(()) => debug!("WAL receiving task finished"), + Err(e) => error!("wal receiver task finished with an error: {e:?}"), + } walreceiver_state.drop_old_connection(false).await; }, } @@ -363,13 +370,13 @@ impl WalreceiverState { async move { super::walreceiver_connection::handle_walreceiver_connection( timeline, - &new_wal_source_connstr, - events_sender.as_ref(), + new_wal_source_connstr, + events_sender, cancellation, connect_timeout, ) .await - .map_err(|e| format!("walreceiver connection handling failure: {e:#}")) + .context("walreceiver connection handling failure") } .instrument(info_span!("walreceiver_connection", id = %id)) }); @@ -885,7 +892,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ -1145,7 +1152,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ 
-1233,7 +1240,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 15cfad1dcd..ef5baeb570 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -18,8 +18,7 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; -use super::TaskEvent; -use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -55,8 +54,8 @@ pub struct WalConnectionStatus { /// messages as we go. pub async fn handle_walreceiver_connection( timeline: Arc, - wal_source_connstr: &str, - events_sender: &watch::Sender>, + wal_source_connstr: String, + events_sender: watch::Sender>, mut cancellation: watch::Receiver<()>, connect_timeout: Duration, ) -> anyhow::Result<()> { @@ -81,7 +80,7 @@ pub async fn handle_walreceiver_connection( streaming_lsn: None, commit_lsn: None, }; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); return Ok(()); } @@ -133,7 +132,7 @@ pub async fn handle_walreceiver_connection( connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + 
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); return Ok(()); } @@ -201,7 +200,7 @@ pub async fn handle_walreceiver_connection( } &_ => {} }; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } @@ -267,7 +266,8 @@ pub async fn handle_walreceiver_connection( if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { // We have successfully processed at least one WAL record. connection_status.has_processed_wal = true; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) + { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } From 2233ca2a391e25699b459c76669b7cb5a1396b5f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 22 Sep 2022 12:46:20 +0200 Subject: [PATCH 145/166] seqwait.rs unit tests don't check return value --- libs/utils/src/seqwait.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 467b900a13..bf330a482c 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -240,7 +240,6 @@ where mod tests { use super::*; use std::sync::Arc; - use std::thread::sleep; use std::time::Duration; impl MonotonicCounter for i32 { @@ -258,17 +257,19 @@ mod tests { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - tokio::task::spawn(async move { + let jh1 = tokio::task::spawn(async move { seq2.wait_for(42).await.expect("wait_for 42"); let old = 
seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).await.expect_err("no 999"); + seq2.wait_for_timeout(999, Duration::from_millis(100)) + .await + .expect_err("no 999"); }); - tokio::task::spawn(async move { + let jh2 = tokio::task::spawn(async move { seq3.wait_for(42).await.expect("wait_for 42"); seq3.wait_for(0).await.expect("wait_for 0"); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_millis(200)).await; let old = seq.advance(99); assert_eq!(old, 0); seq.wait_for(100).await.expect("wait_for 100"); @@ -277,6 +278,9 @@ mod tests { assert_eq!(seq.advance(98), 100); assert_eq!(seq.load(), 100); + jh1.await.unwrap(); + jh2.await.unwrap(); + seq.shutdown(); } @@ -284,15 +288,18 @@ mod tests { async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - tokio::task::spawn(async move { + let jh = tokio::task::spawn(async move { let timeout = Duration::from_millis(1); let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - tokio::time::sleep(Duration::from_secs(1)).await; + tokio::time::sleep(Duration::from_millis(200)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. 
let old = seq.advance(99); - assert_eq!(old, 0) + assert_eq!(old, 0); + jh.await.unwrap(); + + seq.shutdown(); } } From fc7087b16f79a3c0c04f8ea8c6fdc2cd74472f81 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 27 Sep 2022 10:57:59 +0200 Subject: [PATCH 146/166] Add metric for loaded safekeeper timelines (#2509) --- safekeeper/src/metrics.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 51138df776..095d80623a 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -2,7 +2,7 @@ use std::time::{Instant, SystemTime}; -use ::metrics::{register_histogram, GaugeVec, Histogram, DISK_WRITE_SECONDS_BUCKETS}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, @@ -135,6 +135,7 @@ pub struct TimelineCollector { written_wal_seconds: GaugeVec, flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, + timelines_count: IntGauge, } impl Default for TimelineCollector { @@ -311,6 +312,13 @@ impl TimelineCollector { .unwrap(); descs.extend(collect_timeline_metrics.desc().into_iter().cloned()); + let timelines_count = IntGauge::new( + "safekeeper_timelines", + "Total number of timelines loaded in-memory", + ) + .unwrap(); + descs.extend(timelines_count.desc().into_iter().cloned()); + TimelineCollector { descs, commit_lsn, @@ -330,6 +338,7 @@ impl TimelineCollector { written_wal_seconds, flushed_wal_seconds, collect_timeline_metrics, + timelines_count, } } } @@ -361,6 +370,7 @@ impl Collector for TimelineCollector { self.flushed_wal_seconds.reset(); let timelines = GlobalTimelines::get_all(); + let timelines_count = timelines.len(); for arc_tli in timelines { let tli = arc_tli.info_for_metrics(); @@ -474,6 +484,10 @@ impl Collector for TimelineCollector { self.collect_timeline_metrics.set(elapsed); 
mfs.extend(self.collect_timeline_metrics.collect()); + // report total number of timelines + self.timelines_count.set(timelines_count as i64); + mfs.extend(self.timelines_count.collect()); + mfs } } From dabb6d2675717dad380805434e1984a7d0a73f96 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 27 Sep 2022 12:36:17 +0200 Subject: [PATCH 147/166] Fix log level for sk startup logs (#2526) --- libs/postgres_ffi/src/xlog_utils.rs | 6 +++--- safekeeper/src/wal_storage.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 2c16cc9cd9..fbd8468a93 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -170,7 +170,7 @@ pub fn find_end_of_wal( let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); - info!("find_end_of_wal PG_VERSION: {}", pg_version); + debug!("find_end_of_wal PG_VERSION: {}", pg_version); let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); @@ -182,7 +182,7 @@ pub fn find_end_of_wal( match open_wal_segment(&seg_file_path)? { None => { // no more segments - info!( + debug!( "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist", result, seg_file_path ); @@ -205,7 +205,7 @@ pub fn find_end_of_wal( match decoder.poll_decode() { Ok(Some(record)) => result = record.0, Err(e) => { - info!( + debug!( "find_end_of_wal reached end at {:?}, decode error: {:?}", result, e ); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 8fbd479d95..bc5e2d7b24 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -137,7 +137,7 @@ impl PhysicalStorage { // If not, maybe it's better to call fsync() here to be sure? 
let flush_lsn = write_lsn; - info!( + debug!( "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); From 7b2f9dc9080821985525fd81fd33e10967062fb1 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Oct 2022 13:33:55 +0300 Subject: [PATCH 148/166] Reuse existing tenants during attach (#2540) --- pageserver/src/storage_sync.rs | 1 + pageserver/src/tenant.rs | 46 ++++----- pageserver/src/tenant_mgr.rs | 27 +++--- .../test_tenants_with_remote_storage.py | 96 +++++++++++++++++++ 4 files changed, 136 insertions(+), 34 deletions(-) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 776d9214d4..bee460d173 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -639,6 +639,7 @@ pub fn spawn_storage_sync_task( (storage, remote_index_clone, sync_queue), max_sync_errors, ) + .instrument(info_span!("storage_sync_loop")) .await; Ok(()) }, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c9ad3bf232..672ee3a488 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -400,16 +400,19 @@ impl Tenant { timeline_id, metadata.pg_version() ); - let timeline = self - .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) - .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; - - match timelines_accessor.entry(timeline.timeline_id) { - Entry::Occupied(_) => bail!( - "Found freshly initialized timeline {} in the tenant map", - timeline.timeline_id + let ancestor = metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) + .cloned(); + match timelines_accessor.entry(timeline_id) { + Entry::Occupied(_) => warn!( + "Timeline {}/{} already exists in the tenant map, skipping its initialization", + self.tenant_id, timeline_id ), Entry::Vacant(v) => { + let timeline = self + 
.initialize_new_timeline(timeline_id, metadata, ancestor) + .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; v.insert(timeline); } } @@ -609,21 +612,14 @@ impl Tenant { &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard>>, + ancestor: Option>, ) -> anyhow::Result> { - let ancestor = match new_metadata.ancestor_timeline() { - Some(ancestor_timeline_id) => Some( - timelines - .get(&ancestor_timeline_id) - .cloned() - .with_context(|| { - format!( - "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" - ) - })?, - ), - None => None, - }; + if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { + anyhow::ensure!( + ancestor.is_some(), + "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" + ) + } let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); let pg_version = new_metadata.pg_version(); @@ -1080,8 +1076,12 @@ impl Tenant { ) })?; + let ancestor = new_metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id)) + .cloned(); let new_timeline = self - .initialize_new_timeline(new_timeline_id, new_metadata, timelines) + .initialize_new_timeline(new_timeline_id, new_metadata, ancestor) .with_context(|| { format!( "Failed to initialize timeline {}/{}", diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index fcb2c18b79..1efd3d4af4 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -107,6 +107,9 @@ pub fn init_tenant_mgr( /// Ignores other timelines that might be present for tenant, but were not passed as a parameter. /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", /// and the load continues. 
+/// +/// Attach happens on startup and sucessful timeline downloads +/// (some subset of timeline files, always including its metadata, after which the new one needs to be registered). pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, @@ -122,18 +125,20 @@ pub fn attach_local_tenants( ); debug!("Timelines to attach: {local_timelines:?}"); - let tenant = load_local_tenant(conf, tenant_id, remote_index); - { - match tenants_state::write_tenants().entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); - continue; - } - hash_map::Entry::Vacant(v) => { - v.insert(Arc::clone(&tenant)); - } + let mut tenants_accessor = tenants_state::write_tenants(); + let tenant = match tenants_accessor.entry(tenant_id) { + hash_map::Entry::Occupied(o) => { + info!("Tenant {tenant_id} was found in pageserver's memory"); + Arc::clone(o.get()) } - } + hash_map::Entry::Vacant(v) => { + info!("Tenant {tenant_id} was not found in pageserver's memory, loading it"); + let tenant = load_local_tenant(conf, tenant_id, remote_index); + v.insert(Arc::clone(&tenant)); + tenant + } + }; + drop(tenants_accessor); if tenant.current_state() == TenantState::Broken { warn!("Skipping timeline load for broken tenant {tenant_id}") diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 83affac062..d8424e22c8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -7,19 +7,25 @@ # import asyncio +import os +from pathlib import Path from typing import List, Tuple import pytest +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserverHttpClient, Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload, + wait_until, ) from 
fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar async def tenant_workload(env: NeonEnv, pg: Postgres): @@ -93,3 +99,93 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem # run final checkpoint manually to flush all the data to remote storage pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenants_attached_after_download( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="remote_storage_kind", + ) + + data_id = 1 + data_secret = "very secret secret" + + ##### First start, insert secret data and upload it to the remote storage + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + for checkpoint_number in range(1, 3): + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE t{checkpoint_number}(id int primary key, secret text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + log.info(f"waiting for checkpoint {checkpoint_number} upload") + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, 
timeline_id, current_lsn) + log.info(f"upload of checkpoint {checkpoint_number} is done") + + ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 + env.postgres.stop_all() + env.pageserver.stop() + + timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + local_layer_deleted = False + for path in Path.iterdir(timeline_dir): + if path.name.startswith("00000"): + # Looks like a layer file. Remove it + os.remove(path) + local_layer_deleted = True + break + assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}" + + ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + env.pageserver.start() + client = env.pageserver.http_client() + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_tenant_to_download_timeline(client, tenant_id), + ) + + restored_timelines = client.timeline_list(tenant_id) + assert ( + len(restored_timelines) == 1 + ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + retored_timeline = restored_timelines[0] + assert retored_timeline["timeline_id"] == str( + timeline_id + ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + + +def expect_tenant_to_download_timeline( + client: NeonPageserverHttpClient, + tenant_id: TenantId, +): + for tenant in client.tenant_list(): + if tenant["id"] == str(tenant_id): + assert not tenant.get( + "has_in_progress_downloads", True + ), f"Tenant {tenant_id} should have no downloads in progress" + return + assert False, f"Tenant {tenant_id} is missing on pageserver" From 4f2ac51bdd21ada43efc2b30ad2b3724ed9331cf Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 30 Sep 2022 12:21:56 +0300 Subject: [PATCH 149/166] Bump rustc to 1.61 --- .github/workflows/build_and_test.yml | 6 +++--- .github/workflows/codestyle.yml | 2 +- 
rust-toolchain.toml | 9 ++++----- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8a7cdec89c..22042489a8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -127,8 +127,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - v8-${{ runner.os }}-${{ matrix.build_type }}-cargo- + v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + v9-${{ runner.os }}-${{ matrix.build_type }}-cargo- - name: Cache postgres v14 build id: cache_pg_14 @@ -389,7 +389,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 641943199e..6d39958bab 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -106,7 +106,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust + key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - name: Run cargo clippy run: ./run_clippy.sh diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 1a27e92fec..5aa0f8d4e5 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,11 +1,10 @@ [toolchain] # We try to stick to a toolchain version that is widely available on popular distributions, so that most people # can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later -# version, we can consider updating. 
As of this writing, 1.60 is available on Debian 'experimental' but not yet on -# 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach -# 'testing' soon (and similarly for the other distributions). -# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. -channel = "1.60" # do update GitHub CI cache values for rust builds, when changing this value +# version, we can consider updating. +# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package, +# we use "unstable" version number as the highest version used in the project by default. +channel = "1.61" # do update GitHub CI cache values for rust builds, when changing this value profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 31123d1fa89f445581826559e8ed440455f01cff Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Oct 2022 17:44:17 +0300 Subject: [PATCH 150/166] Silence clippies, minor doc fix (#2543) * doc: remove stray backtick * chore: clippy::let_unit_value * chore: silence useless_transmute, duplicate_mod * chore: remove allowing deref_nullptr not needed since bindgen 0.60.0. * chore: remove repeated allowed lints they are already allowed from the crate root. 
--- docs/sourcetree.md | 2 +- libs/postgres_ffi/src/lib.rs | 8 +++++--- libs/postgres_ffi/src/xlog_utils.rs | 6 ------ pageserver/src/tenant/timeline.rs | 2 +- proxy/src/cancellation.rs | 4 ++-- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 8043450a55..c468134b81 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -96,7 +96,7 @@ A single virtual environment with all dependencies is described in the single `P sudo apt install python3.9 ``` - Install `poetry` - - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`. + - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation). - Install dependencies via `./scripts/pysync`. - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) so if you have different version some linting tools can yield different result locally vs in the CI. diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 95ecc7b061..f3dad159be 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -3,9 +3,11 @@ #![allow(non_snake_case)] // bindgen creates some unsafe code with no doc comments. #![allow(clippy::missing_safety_doc)] -// suppress warnings on rust 1.53 due to bindgen unit tests. -// https://github.com/rust-lang/rust-bindgen/issues/1651 -#![allow(deref_nullptr)] +// noted at 1.63 that in many cases there are u32 -> u32 transmutes in bindgen code. +#![allow(clippy::useless_transmute)] +// modules included with the postgres_ffi macro depend on the types of the specific version's +// types, and trigger a too eager lint.
+#![allow(clippy::duplicate_mod)] use bytes::Bytes; use utils::bin_ser::SerializeError; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index fbd8468a93..953723a8f0 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -57,12 +57,10 @@ pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; /// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG. const XID_CHECKPOINT_INTERVAL: u32 = 1024; -#[allow(non_snake_case)] pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo } -#[allow(non_snake_case)] pub fn XLogSegNoOffsetToRecPtr( segno: XLogSegNo, offset: u32, @@ -71,7 +69,6 @@ pub fn XLogSegNoOffsetToRecPtr( segno * (wal_segsz_bytes as u64) + (offset as u64) } -#[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { format!( "{:>08X}{:>08X}{:>08X}", @@ -81,7 +78,6 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize ) } -#[allow(non_snake_case)] pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; @@ -89,12 +85,10 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) } -#[allow(non_snake_case)] pub fn IsXLogFileName(fname: &str) -> bool { return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); } -#[allow(non_snake_case)] pub fn IsPartialXLogFileName(fname: &str) -> bool { fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 74e873e632..247e076230 100644 --- a/pageserver/src/tenant/timeline.rs +++ 
b/pageserver/src/tenant/timeline.rs @@ -627,7 +627,7 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); - let _ = spawn_connection_manager_task( + spawn_connection_manager_task( self.conf.broker_etcd_prefix.clone(), self_clone, walreceiver_connect_timeout, diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 92f8e35dab..eb9312e6bb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -129,13 +129,13 @@ mod tests { assert!(CANCEL_MAP.contains(&session)); tx.send(()).expect("failed to send"); - let () = futures::future::pending().await; // sleep forever + futures::future::pending::<()>().await; // sleep forever Ok(()) })); // Wait until the task has been spawned. - let () = rx.await.context("failed to hear from the task")?; + rx.await.context("failed to hear from the task")?; // Drop the session's entry by cancelling the task. task.abort(); From 537b2c1ae6d9c61ae7ed4a02c04a370354b3bcdb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Oct 2022 10:49:39 +0300 Subject: [PATCH 151/166] Remove unnecessary check for open PostgreSQL TCP port. The loop checked if the TCP port is open for connections, by trying to connect to it. That seems unnecessary. By the time the postmaster.pid file says that it's ready, the port should be open. Remove that check. 
--- compute_tools/src/compute.rs | 9 +-------- compute_tools/src/pg_helpers.rs | 22 ++++++++-------------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 58469b1c97..1e848627e3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -258,14 +258,7 @@ impl ComputeNode { .spawn() .expect("cannot start postgres process"); - // Try default Postgres port if it is not provided - let port = self - .spec - .cluster - .settings - .find("port") - .unwrap_or_else(|| "5432".to_string()); - wait_for_postgres(&mut pg, &port, pgdata_path)?; + wait_for_postgres(&mut pg, pgdata_path)?; // If connection fails, // it may be the old node with `zenith_admin` superuser. diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ac065fa60c..8802dae639 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,11 +1,9 @@ use std::fmt::Write; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::net::{SocketAddr, TcpStream}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; -use std::str::FromStr; use std::{fs, thread, time}; use anyhow::{bail, Result}; @@ -230,21 +228,16 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { Ok(postgres_dbs) } -/// Wait for Postgres to become ready to accept connections: -/// - state should be `ready` in the `pgdata/postmaster.pid` -/// - and we should be able to connect to 127.0.0.1:5432 -pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> { +/// Wait for Postgres to become ready to accept connections. It's ready to +/// accept connections when the state-field in `pgdata/postmaster.pid` says +/// 'ready'. 
+pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); let mut slept: u64 = 0; // ms let pause = time::Duration::from_millis(100); - let timeout = time::Duration::from_millis(10); - let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap(); - loop { - // Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout, - // but postgres starts listening almost immediately, even if it is not really - // ready to accept connections). + // Sleep POSTGRES_WAIT_TIMEOUT at max if slept >= POSTGRES_WAIT_TIMEOUT { bail!("timed out while waiting for Postgres to start"); } @@ -263,10 +256,9 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<() // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); - let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); // Now Postgres is ready to accept connections - if status == "ready" && can_connect { + if status == "ready" { break; } } @@ -276,6 +268,8 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<() slept += 100; } + log::info!("PostgreSQL is now running, continuing to configure it"); + Ok(()) } From 9b9bbad462160bf75df7ee69bc83a4da9eee2b38 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Oct 2022 13:00:15 +0300 Subject: [PATCH 152/166] Use 'notify' crate to wait for PostgreSQL startup. Compute node startup time is very important. After launching PostgreSQL, use 'notify' to be notified immediately when it has updated the PID file, instead of polling. The polling loop had 100 ms interval so this shaves up to 100 ms from the startup time. 
--- Cargo.lock | 70 +++++++++++++++++++++++++++++++++ compute_tools/Cargo.toml | 2 + compute_tools/src/pg_helpers.rs | 62 +++++++++++++++++++++++------ workspace_hack/Cargo.toml | 1 + 4 files changed, 124 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ddb10352b8..69a8fa19ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -497,8 +497,10 @@ dependencies = [ "chrono", "clap 3.2.16", "env_logger", + "futures", "hyper", "log", + "notify", "postgres", "regex", "serde", @@ -1072,6 +1074,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + [[package]] name = "futures" version = "0.3.21" @@ -1493,6 +1504,26 @@ dependencies = [ "str_stack", ] +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "instant" version = "0.1.12" @@ -1552,6 +1583,26 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kqueue" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6112e8f37b59803ac47a42d14f1f3a59bbf72fc6857ffc5be455e28a691f8e" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +dependencies = [ + "bitflags", + "libc", +] + [[package]] name = "kstring" version 
= "1.0.6" @@ -1797,6 +1848,24 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "notify" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2c66da08abae1c024c01d635253e402341b4060a12e99b31c7594063bf490a" +dependencies = [ + "bitflags", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "mio", + "walkdir", + "winapi", +] + [[package]] name = "num-bigint" version = "0.4.3" @@ -4142,6 +4211,7 @@ dependencies = [ "bstr", "bytes", "chrono", + "crossbeam-utils", "either", "fail", "hashbrown", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b13f7f191d..43cf7ae2dd 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -8,8 +8,10 @@ anyhow = "1.0" chrono = "0.4" clap = "3.0" env_logger = "0.9" +futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } +notify = "5.0.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 8802dae639..769dbfac73 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,16 +1,19 @@ use std::fmt::Write; +use std::fs; use std::fs::File; use std::io::{BufRead, BufReader}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; -use std::{fs, thread, time}; +use std::time::{Duration, Instant}; use anyhow::{bail, Result}; use postgres::{Client, Transaction}; use serde::Deserialize; -const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds +use notify::{RecursiveMode, Watcher}; + +const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Rust representation of Postgres role info with only those fields /// that matter for us. 
@@ -233,29 +236,63 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// 'ready'. pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); - let mut slept: u64 = 0; // ms - let pause = time::Duration::from_millis(100); + // PostgreSQL writes line "ready" to the postmaster.pid file, when it has + // completed initialization and is ready to accept connections. We want to + // react quickly and perform the rest of our initialization as soon as + // PostgreSQL starts accepting connections. Use 'notify' to be notified + // whenever the PID file is changed, and whenever it changes, read it to + // check if it's now "ready". + // + // You cannot actually watch a file before it exists, so we first watch the + // data directory, and once the postmaster.pid file appears, we switch to + // watch the file instead. We also wake up every 100 ms to poll, just in + // case we miss some events for some reason. Not strictly necessary, but + // better safe than sorry. + let (tx, rx) = std::sync::mpsc::channel(); + let mut watcher = notify::recommended_watcher(move |res| { + let _ = tx.send(res); + })?; + watcher.watch(pgdata, RecursiveMode::NonRecursive)?; + + let started_at = Instant::now(); + let mut postmaster_pid_seen = false; loop { - // Sleep POSTGRES_WAIT_TIMEOUT at max - if slept >= POSTGRES_WAIT_TIMEOUT { - bail!("timed out while waiting for Postgres to start"); - } - if let Ok(Some(status)) = pg.try_wait() { // Postgres exited, that is not what we expected, bail out earlier. let code = status.code().unwrap_or(-1); bail!("Postgres exited unexpectedly with code {}", code); } + let res = rx.recv_timeout(Duration::from_millis(100)); + log::debug!("woken up by notify: {res:?}"); + // If there are multiple events in the channel already, we only need to + // check once. Swallow the extra events before we go ahead to check the + // pid file.
+ while let Ok(res) = rx.try_recv() { + log::debug!("swallowing extra event: {res:?}"); + } + // Check that we can open pid file first. if let Ok(file) = File::open(&pid_path) { + if !postmaster_pid_seen { + log::debug!("postmaster.pid appeared"); + watcher + .unwatch(pgdata) + .expect("Failed to remove pgdata dir watch"); + watcher + .watch(&pid_path, RecursiveMode::NonRecursive) + .expect("Failed to add postmaster.pid file watch"); + postmaster_pid_seen = true; + } + let file = BufReader::new(file); let last_line = file.lines().last(); // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); + log::debug!("last line of postmaster.pid: {status:?}"); // Now Postgres is ready to accept connections if status == "ready" { @@ -264,8 +301,11 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { } } - thread::sleep(pause); - slept += 100; + // Give up after POSTGRES_WAIT_TIMEOUT. + let duration = started_at.elapsed(); + if duration >= POSTGRES_WAIT_TIMEOUT { + bail!("timed out while waiting for Postgres to start"); + } } log::info!("PostgreSQL is now running, continuing to configure it"); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f37a42945e..6977665c7d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,6 +19,7 @@ anyhow = { version = "1", features = ["backtrace", "std"] } bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } +crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } hashbrown = { version = "0.12", features = ["ahash", 
"inline-more", "raw"] } From 5cf53786f9196c9461119ed5a0653707b7804e96 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 26 Sep 2022 21:47:08 +0300 Subject: [PATCH 153/166] Improve pytest ergonomics 1. Disable perf tests by default 2. Add instruction to run tests in parallel --- pytest.ini | 1 + test_runner/README.md | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/pytest.ini b/pytest.ini index bfa07e520b..7197b078c6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,6 +5,7 @@ filterwarnings = ignore:record_property is incompatible with junit_family:pytest.PytestWarning addopts = -m 'not remote_cluster' + --ignore=test_runner/performance markers = remote_cluster testpaths = diff --git a/test_runner/README.md b/test_runner/README.md index d6ee5730ac..e066ac3235 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -56,6 +56,14 @@ If you want to run all tests that have the string "bench" in their names: `./scripts/pytest -k bench` +To run tests in parellel we utilize `pytest-xdist` plugin. By default everything runs single threaded. Number of workers can be specified with `-n` argument: + +`./scripts/pytest -n4` + +By default performance tests are excluded. To run them explicitly pass performance tests selection to the script: + +`./scripts/pytest test_runner/performance` + Useful environment variables: `NEON_BIN`: The directory where neon binaries can be found. 
From 231dfbaed630963e709166677908fff0b558e35e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Oct 2022 22:13:26 +0300 Subject: [PATCH 154/166] Do not remove empty timelines/ directory for tenants --- pageserver/src/tenant_mgr.rs | 44 ++++++++++++++++++----------- test_runner/regress/test_tenants.py | 37 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 1efd3d4af4..0e8ee8c067 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -108,6 +108,10 @@ pub fn init_tenant_mgr( /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", /// and the load continues. /// +/// For successful tenant attach, it first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully. +/// If either of the conditions fails, the tenant will be added to memory with [`TenantState::Broken`] state, otherwise we start to load its timelines. +/// Alternatively, tenant is considered loaded successfully, if it's already in pageserver's memory (i.e. was loaded already before). +/// /// Attach happens on startup and sucessful timeline downloads /// (some subset of timeline files, always including its metadata, after which the new one needs to be registered). 
pub fn attach_local_tenants( @@ -173,16 +177,28 @@ fn load_local_tenant( remote_index.clone(), conf.remote_storage_config.is_some(), )); - match Tenant::load_tenant_config(conf, tenant_id) { - Ok(tenant_conf) => { - tenant.update_tenant_config(tenant_conf); - tenant.activate(false); - } - Err(e) => { - error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); - tenant.set_state(TenantState::Broken); + + let tenant_timelines_dir = conf.timelines_path(&tenant_id); + if !tenant_timelines_dir.is_dir() { + error!( + "Tenant {} has no timelines directory at {}", + tenant_id, + tenant_timelines_dir.display() + ); + tenant.set_state(TenantState::Broken); + } else { + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } } } + tenant } @@ -630,14 +646,10 @@ fn collect_timelines_for_tenant( } if tenant_timelines.is_empty() { - match remove_if_empty(&timelines_dir) { - Ok(true) => info!( - "Removed empty tenant timelines directory {}", - timelines_dir.display() - ), - Ok(false) => (), - Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"), - } + // this is normal, we've removed all broken, empty and temporary timeline dirs + // but should allow the tenant to stay functional and allow creating new timelines + // on a restart, we require tenants to have the timelines dir, so leave it on disk + debug!("Tenant {tenant_id} has no timelines loaded"); } Ok((tenant_id, tenant_timelines)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 52b9e6369c..ba5109a16f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,4 +1,5 @@ import os +import shutil from contextlib import closing from datetime import datetime from pathlib 
import Path @@ -201,3 +202,39 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde post_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) assert post_detach_samples == set() + + +def test_pageserver_with_empty_tenants(neon_simple_env: NeonEnv): + env = neon_simple_env + client = env.pageserver.http_client() + + tenant_without_timelines_dir = env.initial_tenant + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") + + tenant_with_empty_timelines_dir = client.tenant_create() + for timeline_dir_entry in Path.iterdir( + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + ): + if timeline_dir_entry.is_dir(): + shutil.rmtree(timeline_dir_entry) + else: + timeline_dir_entry.unlink() + + env.postgres.stop_all() + for _ in range(0, 3): + env.pageserver.stop() + env.pageserver.start() + + client = env.pageserver.http_client() + tenants = client.tenant_list() + + assert ( + len(tenants) == 1 + ), "Pageserver should attach only tenants with empty timelines/ dir on restart" + loaded_tenant = tenants[0] + assert loaded_tenant["id"] == str( + tenant_with_empty_timelines_dir + ), f"Tenant {tenant_with_empty_timelines_dir} should be loaded as the only one with tenants/ directory" + assert loaded_tenant["state"] == { + "Active": {"background_jobs_running": False} + }, "Empty tenant should be loaded and ready for timeline creation" From d823e84ed5497c61ff04b9a4f689470c62ec2e9a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Oct 2022 23:14:39 +0300 Subject: [PATCH 155/166] Allow attaching tenants with zero timelines --- pageserver/src/http/routes.rs | 13 ++++-- test_runner/fixtures/neon_fixtures.py | 7 +++- test_runner/regress/test_tenants.py | 57 ++++++++++++++++++++------- 3 files changed, 59 insertions(+), 18 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 55429420a8..a1bd65c308 100644 
--- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -337,9 +337,16 @@ async fn tenant_attach_handler(request: Request) -> Result, info!("Handling tenant attach {tenant_id}"); tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) { - Ok(_) => Err(ApiError::Conflict( - "Tenant is already present locally".to_owned(), - )), + Ok(tenant) => { + if tenant.list_timelines().is_empty() { + info!("Attaching to tenant {tenant_id} with zero timelines"); + Ok(()) + } else { + Err(ApiError::Conflict( + "Tenant is already present locally".to_owned(), + )) + } + } Err(_) => Ok(()), }) .await diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index aa9fd68df5..5c2c3edbd8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -455,6 +455,9 @@ class RemoteStorageKind(enum.Enum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" + # Pass to tests that are generic to remote storage + # to ensure the test pass with or without the remote storage + NOOP = "noop" def available_remote_storages() -> List[RemoteStorageKind]: @@ -583,7 +586,9 @@ class NeonEnvBuilder: test_name: str, force_enable: bool = True, ): - if remote_storage_kind == RemoteStorageKind.LOCAL_FS: + if remote_storage_kind == RemoteStorageKind.NOOP: + return + elif remote_storage_kind == RemoteStorageKind.LOCAL_FS: self.enable_local_fs_remote_storage(force_enable=force_enable) elif remote_storage_kind == RemoteStorageKind.MOCK_S3: self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index ba5109a16f..f49b6fccb9 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -8,8 +8,13 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics 
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, TenantId +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + RemoteStorageKind, + available_remote_storages, +) +from fixtures.types import Lsn, TenantId, TimelineId from prometheus_client.samples import Sample @@ -204,26 +209,50 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde assert post_detach_samples == set() -def test_pageserver_with_empty_tenants(neon_simple_env: NeonEnv): - env = neon_simple_env +# Check that empty tenants work with or without the remote storage +@pytest.mark.parametrize( + "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP] +) +def test_pageserver_with_empty_tenants( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_pageserver_with_empty_tenants", + ) + + env = neon_env_builder.init_start() client = env.pageserver.http_client() tenant_without_timelines_dir = env.initial_tenant + log.info( + f"Tenant {tenant_without_timelines_dir} becomes broken: it abnormally looses tenants/ directory and is expected to be completely ignored when pageserver restarts" + ) shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") tenant_with_empty_timelines_dir = client.tenant_create() - for timeline_dir_entry in Path.iterdir( - Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" - ): - if timeline_dir_entry.is_dir(): - shutil.rmtree(timeline_dir_entry) - else: - timeline_dir_entry.unlink() + log.info( + f"Tenant {tenant_with_empty_timelines_dir} gets all of its timelines deleted: still should be functional" + ) + temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir) + for temp_timeline in temp_timelines: + client.timeline_delete( + tenant_with_empty_timelines_dir, 
TimelineId(temp_timeline["timeline_id"]) + ) + files_in_timelines_dir = sum( + 1 + for _p in Path.iterdir( + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + ) + ) + assert ( + files_in_timelines_dir == 0 + ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" + # Trigger timeline reinitialization after pageserver restart env.postgres.stop_all() - for _ in range(0, 3): - env.pageserver.stop() - env.pageserver.start() + env.pageserver.stop() + env.pageserver.start() client = env.pageserver.http_client() tenants = client.tenant_list() From 580584c8fce303da90d898d81703ab54e81e39b9 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 4 Oct 2022 19:14:45 +0100 Subject: [PATCH 156/166] Remove control_plane deps on pageserver/safekeeper (#2513) Creates new `pageserver_api` and `safekeeper_api` crates to serve as the shared dependencies. Should reduce both recompile times and cold compile times. Decreases the size of the optimized `neon_local` binary: 380M -> 179M. No significant changes for anything else (mostly as expected). 
--- Cargo.lock | 28 +++++++++++++++++-- control_plane/Cargo.toml | 6 ++-- control_plane/src/bin/neon_local.rs | 6 ++-- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage.rs | 2 +- libs/pageserver_api/Cargo.toml | 12 ++++++++ libs/pageserver_api/src/lib.rs | 9 ++++++ .../pageserver_api/src}/models.rs | 12 +++++++- libs/safekeeper_api/Cargo.toml | 12 ++++++++ libs/safekeeper_api/src/lib.rs | 10 +++++++ .../safekeeper_api/src}/models.rs | 0 pageserver/Cargo.toml | 1 + pageserver/src/config.rs | 8 +++--- pageserver/src/http/mod.rs | 3 +- pageserver/src/tenant.rs | 13 +-------- safekeeper/Cargo.toml | 1 + safekeeper/src/http/mod.rs | 3 +- safekeeper/src/lib.rs | 9 +++--- 18 files changed, 104 insertions(+), 33 deletions(-) create mode 100644 libs/pageserver_api/Cargo.toml create mode 100644 libs/pageserver_api/src/lib.rs rename {pageserver/src/http => libs/pageserver_api/src}/models.rs (90%) create mode 100644 libs/safekeeper_api/Cargo.toml create mode 100644 libs/safekeeper_api/src/lib.rs rename {safekeeper/src/http => libs/safekeeper_api/src}/models.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 69a8fa19ab..ab508c7109 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -542,11 +542,11 @@ dependencies = [ "git-version", "nix", "once_cell", - "pageserver", + "pageserver_api", "postgres", "regex", "reqwest", - "safekeeper", + "safekeeper_api", "serde", "serde_with", "tar", @@ -2044,6 +2044,7 @@ dependencies = [ "nix", "num-traits", "once_cell", + "pageserver_api", "postgres", "postgres-protocol", "postgres-types", @@ -2072,6 +2073,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_api" +version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "parking_lot" version = "0.11.2" @@ -2960,6 +2972,7 @@ dependencies = [ "postgres_ffi", "regex", "remote_storage", + "safekeeper_api", "serde", "serde_json", "serde_with", @@ -2975,6 +2988,17 @@ dependencies = [ 
"workspace_hack", ] +[[package]] +name = "safekeeper_api" +version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "same-file" version = "1.0.6" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index ab9df8534c..ee8481e141 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -19,7 +19,9 @@ thiserror = "1" nix = "0.23" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } -pageserver = { path = "../pageserver" } -safekeeper = { path = "../safekeeper" } +# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api +# instead, so that recompile times are better. +pageserver_api = { path = "../libs/pageserver_api" } +safekeeper_api = { path = "../libs/safekeeper_api" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 93947d5326..0c26842b34 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -12,12 +12,12 @@ use control_plane::local_env::{EtcdBroker, LocalEnv}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage::PageServerNode; use control_plane::{etcd, local_env}; -use pageserver::config::defaults::{ +use pageserver_api::models::TimelineInfo; +use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; -use pageserver::http::models::TimelineInfo; -use safekeeper::defaults::{ +use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 600a9ffe05..34b2f3000a 100644 --- a/control_plane/src/safekeeper.rs +++ 
b/control_plane/src/safekeeper.rs @@ -12,7 +12,7 @@ use nix::unistd::Pid; use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; -use safekeeper::http::models::TimelineCreateRequest; +use safekeeper_api::models::TimelineCreateRequest; use thiserror::Error; use utils::{ connstring::connection_address, diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index bfbd6e91c3..59cb3d7efb 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{ +use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; use postgres::{Config, NoTls}; diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml new file mode 100644 index 0000000000..be8762100c --- /dev/null +++ b/libs/pageserver_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "pageserver_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs new file mode 100644 index 0000000000..a36c1692a9 --- /dev/null +++ b/libs/pageserver_api/src/lib.rs @@ -0,0 +1,9 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/pageserver/src/http/models.rs b/libs/pageserver_api/src/models.rs similarity index 90% 
rename from pageserver/src/http/models.rs rename to libs/pageserver_api/src/models.rs index d5559653b2..43059ead84 100644 --- a/pageserver/src/http/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,7 +7,17 @@ use utils::{ lsn::Lsn, }; -use crate::tenant::TenantState; +/// A state of a tenant in pageserver's memory. +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TenantState { + /// Tenant is fully operational, its background jobs might be running or not. + Active { background_jobs_running: bool }, + /// A tenant is recognized by pageserver, but not yet ready to operate: + /// e.g. not present locally and being downloaded or being read into memory from the file system. + Paused, + /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. + Broken, +} #[serde_as] #[derive(Serialize, Deserialize)] diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml new file mode 100644 index 0000000000..852d643f30 --- /dev/null +++ b/libs/safekeeper_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "safekeeper_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs new file mode 100644 index 0000000000..0a391478da --- /dev/null +++ b/libs/safekeeper_api/src/lib.rs @@ -0,0 +1,10 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/safekeeper/src/http/models.rs 
b/libs/safekeeper_api/src/models.rs similarity index 100% rename from safekeeper/src/http/models.rs rename to libs/safekeeper_api/src/models.rs diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 1ec7ec4f98..88430f3a86 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -58,6 +58,7 @@ rstar = "0.9.3" num-traits = "0.2.15" amplify_num = "0.4.1" +pageserver_api = { path = "../libs/pageserver_api" } postgres_ffi = { path = "../libs/postgres_ffi" } etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a52a3e8262..6e3c7baad8 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,10 +30,10 @@ pub mod defaults { use crate::tenant_config::defaults::*; use const_format::formatcp; - pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; - pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + pub use pageserver_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs index 4c0be17ecd..1c083bd382 100644 --- a/pageserver/src/http/mod.rs +++ b/pageserver/src/http/mod.rs @@ -1,3 +1,4 @@ -pub mod models; pub mod routes; pub use routes::make_router; + +pub use pageserver_api::models; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 672ee3a488..c2fb9ef242 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -45,6 +45,7 @@ use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; +pub use 
pageserver_api::models::TenantState; use toml_edit; use utils::{ @@ -118,18 +119,6 @@ pub struct Tenant { upload_layers: bool, } -/// A state of a tenant in pageserver's memory. -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub enum TenantState { - /// Tenant is fully operational, its background jobs might be running or not. - Active { background_jobs_running: bool }, - /// A tenant is recognized by pageserver, but not yet ready to operate: - /// e.g. not present locally and being downloaded or being read into memory from the file system. - Paused, - /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. - Broken, -} - /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 87ee63d1df..cb1cecade9 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -33,6 +33,7 @@ toml_edit = { version = "0.13", features = ["easy"] } thiserror = "1" parking_lot = "0.12.1" +safekeeper_api = { path = "../libs/safekeeper_api" } postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 4c0be17ecd..1831470007 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,3 +1,4 @@ -pub mod models; pub mod routes; pub use routes::make_router; + +pub use safekeeper_api::models; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 58a237a5d3..e38a5a4633 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -27,14 +27,13 @@ mod timelines_global_map; pub use timelines_global_map::GlobalTimelines; pub mod defaults { - use const_format::formatcp; use std::time::Duration; - pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; - pub const 
DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + pub use safekeeper_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10); pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; } From b99bed510d742babc061097a528f2dc09284c681 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 5 Oct 2022 16:14:09 +0300 Subject: [PATCH 157/166] Move proxies to neon-proxy namespace (#2555) --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 22042489a8..4f2f8f0833 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -768,5 +768,5 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s From f25dd75be9539e44b4d3d8c5864f73cea910f897 Mon Sep 17 
00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 6 Oct 2022 01:07:02 +0300 Subject: [PATCH 158/166] Fix deadlock in safekeeper metrics (#2566) We had a problem where almost all of the threads were waiting on a futex syscall. More specifically: - `/metrics` handler was inside `TimelineCollector::collect()`, waiting on a mutex for a single Timeline - This exact timeline was inside `control_file::FileStorage::persist()`, waiting on a mutex for Lazy initialization of `PERSIST_CONTROL_FILE_SECONDS` - `PERSIST_CONTROL_FILE_SECONDS: Lazy` was blocked on `prometheus::register` - `prometheus::register` calls `DEFAULT_REGISTRY.write().register()` to take a write lock on Registry and add a new metric - `DEFAULT_REGISTRY` lock was already taken inside `DEFAULT_REGISTRY.gather()`, which was called by `/metrics` handler to collect all metrics This commit creates another Registry with a separate lock, to avoid deadlock in a case where `TimelineCollector` triggers registration of new metrics inside default registry. --- libs/metrics/src/lib.rs | 19 +++++++++++++++++-- libs/utils/src/http/endpoint.rs | 9 ++++++++- safekeeper/src/bin/safekeeper.rs | 3 +-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 920d3fd17e..e290828d37 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,7 +3,7 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. 
use once_cell::sync::Lazy; -use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec}; +use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; pub use prometheus::opts; pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; @@ -17,6 +17,7 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; +use prometheus::{Registry, Result}; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; @@ -32,13 +33,27 @@ macro_rules! register_uint_gauge_vec { }}; } +/// Special internal registry, to collect metrics independently from the default registry. +/// Was introduced to fix deadlock with lazy registration of metrics in the default registry. +static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); + +/// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector +/// while holding the lock. +pub fn register_internal(c: Box) -> Result<()> { + INTERNAL_REGISTRY.register(c) +} + /// Gathers all Prometheus metrics and records the I/O stats just before that. /// /// Metrics gathering is a relatively simple and standalone operation, so /// it might be fine to do it this way to keep things simple. 
pub fn gather() -> Vec { update_rusage_metrics(); - prometheus::gather() + let mut mfs = prometheus::gather(); + let mut internal_mfs = INTERNAL_REGISTRY.gather(); + mfs.append(&mut internal_mfs); + mfs } static DISK_IO_BYTES: Lazy = Lazy::new(|| { diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4066791e2b..7a519929cf 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -9,6 +9,7 @@ use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; +use tokio::task::JoinError; use tracing::info; use std::future::Future; @@ -35,7 +36,13 @@ async fn prometheus_metrics_handler(_req: Request) -> Result, init: bo // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. - let registry = metrics::default_registry(); let timeline_collector = safekeeper::metrics::TimelineCollector::new(); - registry.register(Box::new(timeline_collector))?; + metrics::register_internal(Box::new(timeline_collector))?; let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; From ff8c481777ecb82c6553a9235f79199154a5a8b3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Oct 2022 09:01:56 +0300 Subject: [PATCH 159/166] Normalize last_record LSN in wal receiver (#2529) * Add test for branching on page boundary * Normalize start recovery point Co-authored-by: Heikki Linnakangas Co-authored-by: Thang Pham --- .../src/walreceiver/walreceiver_connection.rs | 10 +++++ test_runner/regress/test_branching.py | 38 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index ef5baeb570..a4a6af455c 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ 
b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -12,6 +12,8 @@ use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; use postgres::{SimpleQueryMessage, SimpleQueryRow}; +use postgres_ffi::v14::xlog_utils::normalize_lsn; +use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; @@ -156,6 +158,14 @@ pub async fn handle_walreceiver_connection( // There might be some padding after the last full record, skip it. startpoint += startpoint.calc_padding(8u32); + // If the starting point is at a WAL page boundary, skip past the page header. We don't need the page headers + // for anything, and in some corner cases, the compute node might have never generated the WAL for page headers. + // That happens if you create a branch at page boundary: the start point of the branch is at the page boundary, + // but when the compute node first starts on the branch, we normalize the first REDO position to just after the page + // header (see generate_pg_control()), so the WAL for the page header is never streamed from the compute node + // to the safekeepers. 
+ startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); + info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 0c1490294d..3b78700e9f 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -6,6 +6,8 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres +from fixtures.types import Lsn +from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix @@ -88,3 +90,39 @@ def test_branching_with_pgbench( for pg in pgs: res = pg.safe_psql("SELECT count(*) from pgbench_accounts") assert res[0] == (100000 * scale,) + + +# Test branching from an "unnormalized" LSN. +# +# Context: +# When doing basebackup for a newly created branch, pageserver generates +# 'pg_control' file to bootstrap WAL segment by specifying the redo position +# as a "normalized" LSN based on the timeline's starting LSN: +# +# checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0; +# +# This test checks if the pageserver is able to handle an "unnormalized" starting LSN. +# +# Related: see discussion in https://github.com/neondatabase/neon/pull/2143#issuecomment-1209092186 +def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBin): + XLOG_BLCKSZ = 8192 + + env = neon_simple_env + + env.neon_cli.create_branch("b0") + pg0 = env.postgres.create_start("b0") + + pg_bin.run_capture(["pgbench", "-i", pg0.connstr()]) + + with pg0.cursor() as cur: + curr_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # Specify the `start_lsn` as a number that is divisible by `XLOG_BLCKSZ` + # and is smaller than `curr_lsn`. 
+ start_lsn = Lsn((int(curr_lsn) - XLOG_BLCKSZ) // XLOG_BLCKSZ * XLOG_BLCKSZ) + + log.info(f"Branching b1 from b0 starting at lsn {start_lsn}...") + env.neon_cli.create_branch("b1", "b0", ancestor_start_lsn=start_lsn) + pg1 = env.postgres.create_start("b1") + + pg_bin.run_capture(["pgbench", "-i", pg1.connstr()]) From c5a428a61a7d60b7f75b062a18b9257d2fe6896d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 4 Oct 2022 21:27:18 +0300 Subject: [PATCH 160/166] Update Dockerfile.compute-node-v15 to match v14 version. Fix build script to promote the image for v15 to neon dockerhub --- .github/workflows/build_and_test.yml | 4 +-- Dockerfile.compute-node-v15 | 51 +++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4f2f8f0833..72018a12a8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -564,7 +564,7 @@ jobs: promote-images: runs-on: dev - needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-tools-image ] + needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: @@ -573,7 +573,7 @@ jobs: # compute-node uses postgres 14, which is default now # cloud repo depends on this image name, thus duplicating it # remove compute-node when cloud repo is updated - name: [ neon, compute-node, compute-node-v14, compute-tools ] + name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ] steps: - name: Promote image to latest diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index f949ef7680..7e33a0d7c8 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -13,9 +13,12 @@ ARG TAG=pinned # Layer "build-deps" # FROM debian:bullseye-slim AS build-deps +RUN echo "deb http://ftp.debian.org/debian 
testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev + libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev # # Layer "pg-build" @@ -42,7 +45,7 @@ RUN cd postgres && \ FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ tar xvzf postgis-3.3.0.tar.gz && \ @@ -64,15 +67,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ # Build plv8 # FROM build-deps AS plv8-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 # https://github.com/plv8/plv8/issues/475 # Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ - apt update && \ +RUN apt update && \ apt install -y --no-install-recommends -t testing binutils RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -84,12 +85,46 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +# +# Layer 
"h3-pg-build" +# Build h3_pg +# +FROM build-deps AS h3-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# packaged cmake is too old +RUN apt update && \ + apt install -y --no-install-recommends -t testing cmake + +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ + tar xvzf h3.tgz && \ + cd h3-4.0.1 && \ + mkdir build && \ + cd build && \ + cmake .. -DCMAKE_BUILD_TYPE=Release && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 make install && \ + cp -R /h3/usr / && \ + rm -rf build + +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ + tar xvzf h3-pg.tgz && \ + cd h3-pg-4.0.1 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + # # Layer "neon-pg-ext-build" # compile neon extensions # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +# plv8 still sometimes crashes during the creation +# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -137,8 +172,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ chmod 0750 /var/db/postgres/compute && \ echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig -# TODO: Check if we can make the extension setup more modular versus a linear build -# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl From 4a216c5f7f3735c34bae9810501a662559e666c8 Mon Sep 17 00:00:00 2001 From: Anastasia 
Lubennikova Date: Wed, 5 Oct 2022 11:06:13 +0300 Subject: [PATCH 161/166] Use PostGIS 3.3.1 that is compatible with pg 15 --- Dockerfile.compute-node-v15 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index 7e33a0d7c8..bdb4330c4f 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -5,7 +5,7 @@ ARG TAG=pinned # apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.0 +# ARG POSTGIS_VERSION=3.3.1 # ARG PLV8_VERSION=3.1.4 # ARG PG_VERSION=v15 @@ -47,9 +47,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ - tar xvzf postgis-3.3.0.tar.gz && \ - cd postgis-3.3.0 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ + tar xvzf postgis-3.3.1.tar.gz && \ + cd postgis-3.3.1 && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure && \ From ed85d97f1754c8ce64958c5c73d02bf017a8f81c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 5 Oct 2022 14:22:41 +0300 Subject: [PATCH 162/166] bump vendor/postgres-v15. 
Rebase it to Stamp 15rc2 --- vendor/postgres-v15 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9383aaa9c2..ff18cec1ee 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9383aaa9c2616fd81cfafb058fe0d692f5e43ac3 +Subproject commit ff18cec1ee9b80055accd9c76b040875329b11ed From 254cb7dc4f8968373a154c020d1c843559000551 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 5 Oct 2022 19:02:11 +0300 Subject: [PATCH 163/166] Update CI script to push compute-node-v15 to dockerhub --- .github/workflows/build_and_test.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 72018a12a8..6556fb6c9b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -608,6 +608,9 @@ jobs: - name: Pull compute node v14 image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14 + - name: Pull compute node v15 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15 + - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust @@ -638,6 +641,9 @@ jobs: - name: Push compute node v14 image to Docker Hub run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Push compute node v15 image to Docker Hub + run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned @@ -650,6 +656,7 @@ jobs: crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} 
latest + crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] From e8b195acb7bccb564c014dcbdc887ebeb52f1a51 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Oct 2022 11:13:40 +0300 Subject: [PATCH 164/166] fix: apply notify workaround on m1 mac docker (#2564) workaround as discussed in the notify repository. --- compute_tools/src/pg_helpers.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 769dbfac73..ad7ea0abc8 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -250,9 +250,36 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { // case we miss some events for some reason. Not strictly necessary, but // better safe than sorry. let (tx, rx) = std::sync::mpsc::channel(); - let mut watcher = notify::recommended_watcher(move |res| { + let (mut watcher, rx): (Box, _) = match notify::recommended_watcher(move |res| { let _ = tx.send(res); - })?; + }) { + Ok(watcher) => (Box::new(watcher), rx), + Err(e) => { + match e.kind { + notify::ErrorKind::Io(os) if os.raw_os_error() == Some(38) => { + // docker on m1 macs does not support recommended_watcher + // but return "Function not implemented (os error 38)" + // see https://github.com/notify-rs/notify/issues/423 + let (tx, rx) = std::sync::mpsc::channel(); + + // let's poll it faster than what we check the results for (100ms) + let config = + notify::Config::default().with_poll_interval(Duration::from_millis(50)); + + let watcher = notify::PollWatcher::new( + move |res| { + let _ = tx.send(res); + }, + config, + )?; + + (Box::new(watcher), rx) + } + _ => return Err(e.into()), + } + } + }; + watcher.watch(pgdata, RecursiveMode::NonRecursive)?; let started_at = Instant::now(); From 47bae68a2eb889375b332b726e31597d2a06f0a4 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Andr=C3=A9s?= Date: Thu, 6 Oct 2022 11:42:50 +0200 Subject: [PATCH 165/166] Make get_lsn_by_timestamp available in mgmt API (#2536) (#2560) Co-authored-by: andres --- pageserver/src/http/openapi_spec.yml | 56 ++++++++++++++++++++++++ pageserver/src/http/routes.rs | 49 +++++++++++++++++++++ pageserver/src/page_service.rs | 30 ------------- test_runner/fixtures/neon_fixtures.py | 13 ++++++ test_runner/regress/test_lsn_mapping.py | 57 +++++++++++-------------- 5 files changed, 144 insertions(+), 61 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4e748207c8..97fdcd7bbd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -207,6 +207,62 @@ paths: schema: $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + get: + description: Get LSN by a timestamp + parameters: + - name: timestamp + in: query + required: true + schema: + type: string + format: date-time + description: A timestamp to get the LSN + responses: + "200": + description: OK + content: + application/json: + schema: + type: string + "400": + description: Error when no tenant id found in path, no timeline id or invalid timestamp + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/attach: parameters: - name: tenant_id diff 
--git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a1bd65c308..e743f27aff 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,6 +12,7 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; +use crate::pgdatadir_mapping::LsnForTimestamp; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant::{TenantState, Timeline}; @@ -265,6 +266,23 @@ fn query_param_present(request: &Request, param: &str) -> bool { .unwrap_or(false) } +fn get_query_param(request: &Request, param_name: &str) -> Result { + request.uri().query().map_or( + Err(ApiError::BadRequest(anyhow!("empty query in request"))), + |v| { + url::form_urlencoded::parse(v.as_bytes()) + .into_owned() + .find(|(k, _)| k == param_name) + .map_or( + Err(ApiError::BadRequest(anyhow!( + "no {param_name} specified in query parameters" + ))), + |(_, v)| Ok(v), + ) + }, + ) +} + async fn timeline_detail_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -329,6 +347,33 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let timestamp_raw = get_query_param(&request, "timestamp")?; + let timestamp = humantime::parse_rfc3339(timestamp_raw.as_str()) + .with_context(|| format!("Invalid time: {:?}", timestamp_raw)) + .map_err(ApiError::BadRequest)?; + let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); + + let timeline = tenant_mgr::get_tenant(tenant_id, true) + .and_then(|tenant| tenant.get_timeline(timeline_id)) + .with_context(|| format!("No 
timeline {timeline_id} in repository for tenant {tenant_id}")) + .map_err(ApiError::NotFound)?; + let result = match timeline + .find_lsn_for_timestamp(timestamp_pg) + .map_err(ApiError::InternalServerError)? + { + LsnForTimestamp::Present(lsn) => format!("{}", lsn), + LsnForTimestamp::Future(_lsn) => "future".into(), + LsnForTimestamp::Past(_lsn) => "past".into(), + LsnForTimestamp::NoData(_lsn) => "nodata".into(), + }; + json_response(StatusCode::OK, result) +} + // TODO makes sense to provide tenant config right away the same way as it handled in tenant_create async fn tenant_attach_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; @@ -908,6 +953,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", + get_lsn_by_timestamp_handler, + ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", testing_api!("run timeline GC", timeline_gc_handler), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 758faa4d9a..795a99058d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -12,7 +12,6 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use futures::{Stream, StreamExt}; -use regex::Regex; use std::io; use std::net::TcpListener; use std::str; @@ -35,7 +34,6 @@ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; -use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; use crate::task_mgr; @@ -45,7 +43,6 @@ use crate::tenant_mgr; use crate::CheckpointConfig; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; -use postgres_ffi::to_pg_timestamp; use 
postgres_ffi::BLCKSZ; // Wrapped in libpq CopyData @@ -1062,33 +1059,6 @@ impl postgres_backend_async::Handler for PageServerHandler { Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("get_lsn_by_timestamp ") { - // Locate LSN of last transaction with timestamp less or equal than sppecified - // TODO lazy static - let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$") - .unwrap(); - let caps = re - .captures(query_string) - .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; - let timestamp_pg = to_pg_timestamp(timestamp); - - self.check_permission(Some(tenant_id))?; - - let timeline = get_local_timeline(tenant_id, timeline_id)?; - pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( - b"lsn", - )]))?; - let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? 
{ - LsnForTimestamp::Present(lsn) => format!("{}", lsn), - LsnForTimestamp::Future(_lsn) => "future".into(), - LsnForTimestamp::Past(_lsn) => "past".into(), - LsnForTimestamp::NoData(_lsn) => "nodata".into(), - }; - pgb.write_message(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5c2c3edbd8..38d818b3d8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1136,6 +1136,19 @@ class NeonPageserverHttpClient(requests.Session): assert res_json is None return res_json + def timeline_get_lsn_by_timestamp( + self, tenant_id: TenantId, timeline_id: TimelineId, timestamp + ): + log.info( + f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" + ) + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index ef99954a76..c5a49a6704 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -15,7 +15,6 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): pgmain = env.postgres.create_start("test_lsn_mapping") log.info("postgres is running on 'test_lsn_mapping' branch") - ps_cur = env.pageserver.connect().cursor() cur = pgmain.connect().cursor() # Create table, and insert rows, each in a separate transaction # Disable synchronous_commit to make this initialization go faster. 
@@ -38,37 +37,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Wait until WAL is received by pageserver wait_for_last_flush_lsn(env, pgmain, env.initial_tenant, new_timeline_id) - # Check edge cases: timestamp in the future - probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = query_scalar( - ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", - ) - assert result == "future" - - # timestamp too the far history - probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = query_scalar( - ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", - ) - assert result == "past" - - # Probe a bunch of timestamps in the valid range - for i in range(1, len(tbl), 100): - probe_timestamp = tbl[i][1] - - # Call get_lsn_by_timestamp to get the LSN - lsn = query_scalar( - ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant} {new_timeline_id} '{probe_timestamp.isoformat()}Z'", + with env.pageserver.http_client() as client: + # Check edge cases: timestamp in the future + probe_timestamp = tbl[-1][1] + timedelta(hours=1) + result = client.timeline_get_lsn_by_timestamp( + env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z" ) + assert result == "future" - # Launch a new read-only node at that LSN, and check that only the rows - # that were supposed to be committed at that point in time are visible. 
- pg_here = env.postgres.create_start( - branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn + # timestamp too the far history + probe_timestamp = tbl[0][1] - timedelta(hours=10) + result = client.timeline_get_lsn_by_timestamp( + env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z" ) - assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i + assert result == "past" - pg_here.stop_and_destroy() + # Probe a bunch of timestamps in the valid range + for i in range(1, len(tbl), 100): + probe_timestamp = tbl[i][1] + lsn = client.timeline_get_lsn_by_timestamp( + env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z" + ) + # Call get_lsn_by_timestamp to get the LSN + # Launch a new read-only node at that LSN, and check that only the rows + # that were supposed to be committed at that point in time are visible. + pg_here = env.postgres.create_start( + branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn + ) + assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i + + pg_here.stop_and_destroy() From 687ba81366491c84250e4df9c20e31e595b34476 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 6 Oct 2022 16:53:52 +0300 Subject: [PATCH 166/166] Display sync safekeepers output in compute_ctl (#2571) Pipe postgres output to compute_ctl stdout and create a test to check that compute_ctl works and prints postgres logs. 
--- compute_tools/src/compute.rs | 7 +- test_runner/fixtures/neon_fixtures.py | 10 ++ test_runner/regress/test_compute_ctl.py | 203 ++++++++++++++++++++++++ 3 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 test_runner/regress/test_compute_ctl.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1e848627e3..bfdd2340ec 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -178,7 +178,6 @@ impl ComputeNode { .args(&["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .stdout(Stdio::piped()) - .stderr(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); @@ -191,10 +190,10 @@ impl ComputeNode { if !sync_output.status.success() { anyhow::bail!( - "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}", + "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}", sync_output.status, - String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"), - String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"), + String::from_utf8(sync_output.stdout) + .expect("postgres --sync-safekeepers exited, and stdout is not utf-8"), ); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 38d818b3d8..28c65223ba 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1200,6 +1200,7 @@ class AbstractNeonCli(abc.ABC): arguments: List[str], extra_env_vars: Optional[Dict[str, str]] = None, check_return_code=True, + timeout=None, ) -> "subprocess.CompletedProcess[str]": """ Run the command with the specified arguments. 
@@ -1246,6 +1247,7 @@ class AbstractNeonCli(abc.ABC): universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + timeout=timeout, ) if not res.returncode: log.info(f"Run success: {res.stdout}") @@ -1619,6 +1621,14 @@ class WalCraft(AbstractNeonCli): res.check_returncode() +class ComputeCtl(AbstractNeonCli): + """ + A typed wrapper around the `compute_ctl` CLI tool. + """ + + COMMAND = "compute_ctl" + + class NeonPageserver(PgProtocol): """ An object representing a running pageserver. diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py new file mode 100644 index 0000000000..01b64b8b17 --- /dev/null +++ b/test_runner/regress/test_compute_ctl.py @@ -0,0 +1,203 @@ +import os +from subprocess import TimeoutExpired + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ComputeCtl, NeonEnvBuilder, PgBin + + +# Test that compute_ctl works and prints "--sync-safekeepers" logs. +def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + ctl = ComputeCtl(env) + + env.neon_cli.create_branch("test_compute_ctl", "main") + pg = env.postgres.create_start("test_compute_ctl") + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + + with open(pg.config_file_path(), "r") as f: + cfg_lines = f.readlines() + cfg_map = {} + for line in cfg_lines: + if "=" in line: + k, v = line.split("=") + cfg_map[k] = v.strip("\n '\"") + log.info(f"postgres config: {cfg_map}") + pgdata = pg.pg_data_dir_path() + pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres") + + pg.stop_and_destroy() + + spec = ( + """ +{ + "format_version": 1.0, + + "timestamp": "2021-05-23T18:25:43.511Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", + + "cluster": { + "cluster_id": "test-cluster-42", + "name": "Neon Test", + "state": "restarted", + "roles": [ + ], + "databases": [ + ], + "settings": [ + { + "name": 
"fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "replica", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "neon.safekeepers", + "value": """ + + f'"{cfg_map["neon.safekeepers"]}"' + + """, + "vartype": "string" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "shared_buffers", + "value": "32768", + "vartype": "integer" + }, + { + "name": "port", + "value": """ + + f'"{cfg_map["port"]}"' + + """, + "vartype": "integer" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "wal_sender_timeout", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "maintenance_work_mem", + "value": "65536", + "vartype": "integer" + }, + { + "name": "max_parallel_workers", + "value": "8", + "vartype": "integer" + }, + { + "name": "max_worker_processes", + "value": "8", + "vartype": "integer" + }, + { + "name": "neon.tenant_id", + "value": """ + + f'"{cfg_map["neon.tenant_id"]}"' + + """, + "vartype": "string" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "neon.timeline_id", + "value": """ + + f'"{cfg_map["neon.timeline_id"]}"' + + """, + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": """ + + f'"{cfg_map["neon.pageserver_connstring"]}"' + + """, + "vartype": "string" + } + ] + }, + "delta_operations": [ + ] +} +""" 
+ ) + + ps_connstr = cfg_map["neon.pageserver_connstring"] + log.info(f"ps_connstr: {ps_connstr}, pgdata: {pgdata}") + + # run compute_ctl and wait for 10s + try: + ctl.raw_cli( + ["--connstr", ps_connstr, "--pgdata", pgdata, "--spec", spec, "--pgbin", pg_bin_path], + timeout=10, + ) + except TimeoutExpired as exc: + ctl_logs = exc.stderr.decode("utf-8") + log.info("compute_ctl output:\n" + ctl_logs) + + start = "starting safekeepers syncing" + end = "safekeepers synced at LSN" + start_pos = ctl_logs.index(start) + assert start_pos != -1 + end_pos = ctl_logs.index(end, start_pos) + assert end_pos != -1 + sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] + log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) + + # assert that --sync-safekeepers logs are present in the output + assert "connecting with node" in sync_safekeepers_logs + assert "connected with node" in sync_safekeepers_logs + assert "proposer connected to quorum (2)" in sync_safekeepers_logs + assert "got votes from majority (2)" in sync_safekeepers_logs + assert "sending elected msg to node" in sync_safekeepers_logs