From 2aceb6a3095bf0ee6cf7ef3ecc1bb182864abccb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 May 2022 20:58:27 +0300 Subject: [PATCH 01/50] Fix garbage collection to not remove image layers that are still needed. The logic would incorrectly remove an image layer, if a new image layer existed, even though the older image layer was still needed by some delta layers after it. See example given in the comment this adds. Without this fix, I was getting a lot of "could not find data for key 010000000000000000000000000000000000" errors from GC, with the new test case being added in PR #1735. Fixes #707 --- pageserver/src/layered_repository.rs | 24 ++++++++++++------- .../src/layered_repository/layer_map.rs | 13 ++++------ 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index fc4ab942f6..a83907430e 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -18,7 +18,7 @@ use itertools::Itertools; use lazy_static::lazy_static; use tracing::*; -use std::cmp::{max, Ordering}; +use std::cmp::{max, min, Ordering}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::collections::{BTreeSet, HashSet}; @@ -2165,7 +2165,7 @@ impl LayeredTimeline { let gc_info = self.gc_info.read().unwrap(); let retain_lsns = &gc_info.retain_lsns; - let cutoff = gc_info.cutoff; + let cutoff = min(gc_info.cutoff, disk_consistent_lsn); let pitr = gc_info.pitr; // Calculate pitr cutoff point. @@ -2294,12 +2294,20 @@ impl LayeredTimeline { // is 102, then it might not have been fully flushed to disk // before crash. // - // FIXME: This logic is wrong. See https://github.com/zenithdb/zenith/issues/707 - if !layers.newer_image_layer_exists( - &l.get_key_range(), - l.get_lsn_range().end, - disk_consistent_lsn + 1, - )? { + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? + { debug!( "keeping {} because it is the latest layer", l.filename().display() diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 7491294c03..f7f51bf21f 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -201,18 +201,14 @@ impl LayerMap { NUM_ONDISK_LAYERS.dec(); } - /// Is there a newer image layer for given key-range? + /// Is there a newer image layer for given key- and LSN-range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. 
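
For illustration, a minimal standalone sketch of the retention rule this patch introduces (plain integers for LSNs, key ranges omitted for brevity; this is not the real LayerMap::image_layer_exists signature). It mirrors the A-E example from the commit message, assuming an image layer's LSN range ends just past its LSN:

    // Illustrative only: a layer below the GC horizon may be dropped only if a
    // newer image layer replaces it at some LSN in [layer_end_lsn, gc_cutoff).
    fn removable(layer_end_lsn: u64, gc_cutoff: u64, image_layer_lsns: &[u64]) -> bool {
        image_layer_lsns
            .iter()
            .any(|&img_lsn| (layer_end_lsn..gc_cutoff).contains(&img_lsn))
    }

    fn main() {
        // Image layers from the example sit at LSNs 1000, 2000 and 3000;
        // the GC horizon is 2500.
        let images = [1000, 2000, 3000];
        assert!(removable(1001, 2500, &images)); // image A: the image at 2000 replaces it
        assert!(removable(2000, 2500, &images)); // delta B (1000-2000): the image at 2000 covers it
        assert!(!removable(2001, 2500, &images)); // image C: no newer image below the horizon
    }
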
- /// We ignore layers newer than disk_consistent_lsn because they will be removed at restart - /// We also only look at historic layers - //#[allow(dead_code)] - pub fn newer_image_layer_exists( + pub fn image_layer_exists( &self, key_range: &Range, - lsn: Lsn, - disk_consistent_lsn: Lsn, + lsn_range: &Range, ) -> Result { let mut range_remain = key_range.clone(); @@ -225,8 +221,7 @@ impl LayerMap { let img_lsn = l.get_lsn_range().start; if !l.is_incremental() && l.get_key_range().contains(&range_remain.start) - && img_lsn > lsn - && img_lsn < disk_consistent_lsn + && lsn_range.contains(&img_lsn) { made_progress = true; let img_key_end = l.get_key_range().end; From 8346aa3a29daf6088689076d35a9c99df3c9e4ce Mon Sep 17 00:00:00 2001 From: KlimentSerafimov Date: Tue, 24 May 2022 04:55:38 -0400 Subject: [PATCH 02/50] Potential fix to #1626. Fixed typo is Makefile. (#1781) * Potential fix to #1626. Fixed typo is Makefile. * Completed fix to #1626. Summary: changed 'error' to 'bail' in start_pageserver and start_safekeeper. --- Makefile | 2 +- pageserver/src/bin/pageserver.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 5eca7fb094..fdfc64f6fa 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) else -$(error Bad build type `$(BUILD_TYPE)', see Makefile for options) + $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif # macOS with brew-installed openssl requires explicit paths diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 00864056cb..ac90500b97 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -254,7 +254,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // Otherwise, the coverage data will be damaged. match daemonize.exit_action(|| exit_now(0)).start() { Ok(_) => info!("Success, daemonized"), - Err(err) => error!(%err, "could not daemonize"), + Err(err) => bail!("{err}. could not daemonize. bailing."), } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 61d2f558f2..a5ffc013e2 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -245,7 +245,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b // Otherwise, the coverage data will be damaged. match daemonize.exit_action(|| exit_now(0)).start() { Ok(_) => info!("Success, daemonized"), - Err(e) => error!("Error, {}", e), + Err(err) => bail!("Error: {err}. could not daemonize. bailing."), } } From 541ec258758309b1ef98c24b5afe79169406d3b9 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 24 May 2022 17:56:37 +0300 Subject: [PATCH 03/50] Properly shutdown test mock S3 server --- .circleci/config.yml | 2 +- test_runner/fixtures/zenith_fixtures.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index eb2bf0172b..41f7693726 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -361,7 +361,7 @@ jobs: when: always command: | du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! 
-name "*.metrics" -delete + find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete du -sh /tmp/test_output/* - store_artifacts: path: /tmp/test_output diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 17d932c968..8f9bf1c11b 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -393,7 +393,10 @@ class MockS3Server: ): self.port = port - self.subprocess = subprocess.Popen([f'poetry run moto_server s3 -p{port}'], shell=True) + # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. + # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux + # if a process is started from the shell process. + self.subprocess = subprocess.Popen(['poetry', 'run', 'moto_server', 's3', f'-p{port}']) error = None try: return_code = self.subprocess.poll() @@ -403,7 +406,7 @@ class MockS3Server: error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" if error is not None: log.error(error) - self.subprocess.kill() + self.kill() raise RuntimeError("failed to start s3 mock server") def endpoint(self) -> str: From d32b491a5300d99c9e2d7811944160185e23730c Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 25 May 2022 11:31:10 +0400 Subject: [PATCH 04/50] Add zenith-us-stage-sk-6 to deploy (#1728) --- .circleci/ansible/staging.hosts | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index 8e89e843d9..d99ffa6dac 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -6,6 +6,7 @@ zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-sk-1 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 +zenith-us-stage-sk-6 console_region_id=27 [storage:children] pageservers From 2b265fd6dc38b58a684ee6d584714a87705936b1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 25 May 2022 14:16:44 +0400 Subject: [PATCH 05/50] Disable restart_after_crash in neon_local. It is pointless when basebackup is invalid. --- control_plane/src/compute.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 92d0e080d8..350cf74b7c 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -274,6 +274,8 @@ impl PostgresNode { conf.append("listen_addresses", &self.address.ip().to_string()); conf.append("port", &self.address.port().to_string()); conf.append("wal_keep_size", "0"); + // walproposer panics when basebackup is invalid, it is pointless to restart in this case. 
+ conf.append("restart_after_crash", "off"); // Configure the node to fetch pages from pageserver let pageserver_connstr = { From 703f691df8fb82fdfd3d2febc892748eb7317126 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 25 May 2022 14:30:50 +0300 Subject: [PATCH 06/50] production inventory update (#1779) --- .circleci/ansible/production.hosts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index 2ed8f517f7..6cefd724d8 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -1,5 +1,6 @@ [pageservers] -zenith-1-ps-1 console_region_id=1 +#zenith-1-ps-1 console_region_id=1 +zenith-1-ps-2 console_region_id=1 [safekeepers] zenith-1-sk-1 console_region_id=1 @@ -15,4 +16,4 @@ console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 etcd_endpoints = etcd-release.local:2379 -safekeeper_enable_s3_offload = true +safekeeper_enable_s3_offload = false From 6f1f33ef42a63c0047442e8057b9223793424edb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 May 2022 14:33:06 +0300 Subject: [PATCH 07/50] Improve error messages on seccomp loading errors. Bump vendor/postgres for https://github.com/neondatabase/postgres/pull/166 --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 79af2faf08..038b2b98e5 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 79af2faf08d9bec1b1664a72936727dcca36d253 +Subproject commit 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91 From 9ab52e2186e9330d4098b27372d8a0a2d5f0ac1e Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 25 May 2022 15:41:18 +0300 Subject: [PATCH 08/50] helm repository name fix for production proxy deploy (#1790) --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 41f7693726..5346e35c01 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -685,7 +685,7 @@ jobs: name: Setup helm v3 command: | curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add zenithdb https://neondatabase.github.io/helm-charts + helm repo add neondatabase https://neondatabase.github.io/helm-charts - run: name: Re-deploy proxy command: | From 24d2313d0b8d1b6279f8a01376f55111427c9b19 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 May 2022 16:57:45 +0300 Subject: [PATCH 09/50] Set --quota-backend-bytes when launching etcd in tests. By default, etcd makes a huge 10 GB mmap() allocation when it starts up. It doesn't actually use that much memory, it's just address space, but it caused me grief when I tried to use 'rr' to debug a python test run. Apparently, when you replay the 'rr' trace, it does allocate memory for all that address space. The size of the initial mmap depends on the --quota-backend-bytes setting. Our etcd clusters are very small, so let's set --quota-backend-bytes to keep the virtual memory size small, to make debugging with 'rr' easier. 
See https://github.com/etcd-io/etcd/issues/7910 and https://github.com/etcd-io/etcd/commit/5e4b0081065925ab9d04009cd4fb559c4cceb304 --- control_plane/src/etcd.rs | 4 ++++ test_runner/fixtures/zenith_fixtures.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index df657dd1be..bc39b7dea3 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -48,6 +48,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { format!("--data-dir={}", etcd_data_dir.display()), format!("--listen-client-urls={client_urls}"), format!("--advertise-client-urls={client_urls}"), + // Set --quota-backend-bytes to keep the etcd virtual memory + // size smaller. Our test etcd clusters are very small. + // See https://github.com/etcd-io/etcd/issues/7910 + "--quota-backend-bytes=100000000".to_string(), ]) .stdout(Stdio::from(etcd_stdout_file)) .stderr(Stdio::from(etcd_stderr_file)) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 8f9bf1c11b..7f5b2ad2aa 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1893,7 +1893,11 @@ class Etcd: f"--data-dir={self.datadir}", f"--listen-client-urls={client_url}", f"--advertise-client-urls={client_url}", - f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}" + f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}", + # Set --quota-backend-bytes to keep the etcd virtual memory + # size smaller. Our test etcd clusters are very small. + # See https://github.com/etcd-io/etcd/issues/7910 + f"--quota-backend-bytes=100000000" ] self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) From 7997fc2932465b1c8854a64c2c053041eacdf80a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 May 2022 18:14:44 +0300 Subject: [PATCH 10/50] Fix error handling with 'basebackup' command. If the 'basebackup' command failed in the middle of building the tar archive, the client would not report the error, but would attempt to to start up postgres with the partial contents of the data directory. That fails because the control file is missing (it's added to the archive last, precisly to make sure that you cannot start postgres from a partial archive). But the client doesn't see the proper error message that caused the basebackup to fail in the server, which is confusing. Two issues conspired to cause that: 1. The tar::Builder object that we use in the pageserver to construct the tar stream has a Drop handler that automatically writes a valid end-of-archive marker on drop. Because of that, the resulting tarball looks complete, even if an error happens while we're building it. The pageserver does send an ErrorResponse after the seemingly-valid tarball, but: 2. The client stops reading the Copy stream, as soon as it sees the tar end-of-archive marker. Therefore, it doesn't read the ErrorResponse that comes after it. We have two clients that call 'basebackup', one in `control_plane` used by the `neon_local` binary, and another one in `compute_tools`. Both had the same issue. This PR fixes both issues, even though fixing either one would be enough to fix the problem at hand. The pageserver now doesn't send the end-of-archive marker on error, and the client now reads the copy stream to the end, even if it sees an end-of-archive marker. Fixes github issue #1715 In the passing, change Basebackup to use generic Write rather than 'dyn'. 
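
The server-side half of issue 1 is easy to reproduce in isolation. A minimal sketch (illustrative only) against the tar crate used here, showing that dropping an unfinished Builder still emits a valid end-of-archive marker:

    fn main() {
        let mut buf: Vec<u8> = Vec::new();
        {
            let mut ar = tar::Builder::new(&mut buf);
            let mut header = tar::Header::new_gnu();
            header.set_path("partial.txt").unwrap();
            header.set_size(4);
            header.set_cksum();
            ar.append(&header, &b"data"[..]).unwrap();
            // Suppose an error occurs here and we bail out without ar.finish().
        } // Builder::drop() still writes the two 512-byte end-of-archive blocks.

        // 512 (header) + 512 (padded data) from the entry, plus 1024 emitted on
        // drop: the truncated archive still looks complete to the client.
        assert_eq!(buf.len(), 2048);
    }

This is why a seemingly valid tarball reaches the client even when the server fails halfway, and why the pageserver now suppresses the marker on error.
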
--- compute_tools/src/compute.rs | 8 +- control_plane/Cargo.toml | 2 +- control_plane/src/compute.rs | 9 +- pageserver/src/basebackup.rs | 90 +++++++++++++++++-- pageserver/src/page_service.rs | 3 +- .../batch_others/test_basebackup_error.py | 20 +++++ 6 files changed, 119 insertions(+), 13 deletions(-) create mode 100644 test_runner/batch_others/test_basebackup_error.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a8422fb2b2..fd60b80305 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -146,8 +146,14 @@ impl ComputeNode { _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; - let mut ar = tar::Archive::new(copyreader); + // Read the archive directly from the `CopyOutReader` + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); ar.unpack(&self.pgdata)?; self.metrics.basebackup_ms.store( diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 41417aab9a..21311eea9a 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -tar = "0.4.33" +tar = "0.4.38" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 350cf74b7c..045acd7519 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -231,8 +231,13 @@ impl PostgresNode { .context("page server 'basebackup' command failed")?; // Read the archive directly from the `CopyOutReader` - tar::Archive::new(copyreader) - .unpack(&self.pgdata()) + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata()) .context("extracting base backup failed")?; Ok(()) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 92d35130d8..46d824b2e2 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,8 +10,9 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; +use fail::fail_point; use std::fmt::Write as FmtWrite; use std::io; use std::io::Write; @@ -30,11 +31,16 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a> { - ar: Builder<&'a mut dyn Write>, +pub struct Basebackup<'a, W> +where + W: Write, +{ + ar: Builder>, timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, + + finished: bool, } // Create basebackup with non-rel data in it. Omit relational data. @@ -44,12 +50,15 @@ pub struct Basebackup<'a> { // * When working without safekeepers. 
In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. -impl<'a> Basebackup<'a> { +impl<'a, W> Basebackup<'a, W> +where + W: Write, +{ pub fn new( - write: &'a mut dyn Write, + write: W, timeline: &'a Arc, req_lsn: Option, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -90,14 +99,15 @@ impl<'a> Basebackup<'a> { ); Ok(Basebackup { - ar: Builder::new(write), + ar: Builder::new(AbortableWrite::new(write)), timeline, lsn: backup_lsn, prev_record_lsn: backup_prev, + finished: false, }) } - pub fn send_tarball(&mut self) -> anyhow::Result<()> { + pub fn send_tarball(mut self) -> anyhow::Result<()> { // Create pgdata subdirs structure for dir in pg_constants::PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; @@ -135,9 +145,14 @@ impl<'a> Basebackup<'a> { self.add_twophase_file(xid)?; } + fail_point!("basebackup-before-control-file", |_| { + bail!("failpoint basebackup-before-control-file") + }); + // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file()?; self.ar.finish()?; + self.finished = true; debug!("all tarred up!"); Ok(()) } @@ -331,6 +346,19 @@ impl<'a> Basebackup<'a> { } } +impl<'a, W> Drop for Basebackup<'a, W> +where + W: Write, +{ + /// If the basebackup was not finished, prevent the Archive::drop() from + /// writing the end-of-archive marker. + fn drop(&mut self) { + if !self.finished { + self.ar.get_mut().abort(); + } + } +} + // // Create new tarball entry header // @@ -366,3 +394,49 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result
{ header.set_cksum(); Ok(header) } + +/// A wrapper that passes through all data to the underlying Write, +/// until abort() is called. +/// +/// tar::Builder has an annoying habit of finishing the archive with +/// a valid tar end-of-archive marker (two 512-byte sectors of zeros), +/// even if an error occurs and we don't finish building the archive. +/// We'd rather abort writing the tarball immediately than construct +/// a seemingly valid but incomplete archive. This wrapper allows us +/// to swallow the end-of-archive marker that Builder::drop() emits, +/// without writing it to the underlying sink. +/// +struct AbortableWrite { + w: W, + aborted: bool, +} + +impl AbortableWrite { + pub fn new(w: W) -> Self { + AbortableWrite { w, aborted: false } + } + + pub fn abort(&mut self) { + self.aborted = true; + } +} + +impl Write for AbortableWrite +where + W: Write, +{ + fn write(&mut self, data: &[u8]) -> io::Result { + if self.aborted { + Ok(data.len()) + } else { + self.w.write(data) + } + } + fn flush(&mut self) -> io::Result<()> { + if self.aborted { + Ok(()) + } else { + self.w.flush() + } + } +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 03264c9782..f54cd550b3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -593,7 +593,8 @@ impl PageServerHandler { /* Send a tarball of the latest layer on the timeline */ { let mut writer = CopyDataSink { pgb }; - let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; + + let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; span.record("lsn", &basebackup.lsn.to_string().as_str()); basebackup.send_tarball()?; } diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/batch_others/test_basebackup_error.py new file mode 100644 index 0000000000..4b8b8a746c --- /dev/null +++ b/test_runner/batch_others/test_basebackup_error.py @@ -0,0 +1,20 @@ +import pytest +from contextlib import closing + +from fixtures.zenith_fixtures import ZenithEnv +from fixtures.log_helper import log + + +# +# Test error handling, if the 'basebackup' command fails in the middle +# of building the tar archive. +# +def test_basebackup_error(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + env.zenith_cli.create_branch("test_basebackup_error", "empty") + + # Introduce failpoint + env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return") + + with pytest.raises(Exception, match="basebackup-before-control-file"): + pg = env.postgres.create_start('test_basebackup_error') From c584d90bb96bb7bd390bc5345ec8f667e765c299 Mon Sep 17 00:00:00 2001 From: chaitanya sharma <86035+phoenix24@users.noreply.github.com> Date: Mon, 23 May 2022 15:52:21 +0000 Subject: [PATCH 11/50] initial commit, renamed znodeid to nodeid. 
--- control_plane/src/local_env.rs | 10 +++++----- control_plane/src/safekeeper.rs | 8 ++++---- libs/etcd_broker/src/lib.rs | 16 ++++++++-------- libs/utils/src/zid.rs | 4 ++-- neon_local/src/main.rs | 10 +++++----- pageserver/src/config.rs | 16 ++++++++-------- pageserver/src/http/models.rs | 4 ++-- safekeeper/src/bin/safekeeper.rs | 12 ++++++------ safekeeper/src/broker.rs | 4 ++-- safekeeper/src/http/models.rs | 4 ++-- safekeeper/src/http/routes.rs | 6 +++--- safekeeper/src/lib.rs | 6 +++--- safekeeper/src/safekeeper.rs | 18 +++++++++--------- safekeeper/src/timeline.rs | 10 +++++----- 14 files changed, 64 insertions(+), 64 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index c73af7d338..015b33f591 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -15,7 +15,7 @@ use std::process::{Command, Stdio}; use utils::{ auth::{encode_from_key_file, Claims, Scope}, postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use crate::safekeeper::SafekeeperNode; @@ -136,7 +136,7 @@ impl EtcdBroker { #[serde(default)] pub struct PageServerConf { // node id - pub id: ZNodeId, + pub id: NodeId, // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, @@ -151,7 +151,7 @@ pub struct PageServerConf { impl Default for PageServerConf { fn default() -> Self { Self { - id: ZNodeId(0), + id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), auth_type: AuthType::Trust, @@ -163,7 +163,7 @@ impl Default for PageServerConf { #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { - pub id: ZNodeId, + pub id: NodeId, pub pg_port: u16, pub http_port: u16, pub sync: bool, @@ -172,7 +172,7 @@ pub struct SafekeeperConf { impl Default for SafekeeperConf { fn default() -> Self { Self { - id: ZNodeId(0), + id: NodeId(0), pg_port: 0, http_port: 0, sync: true, diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index d5b6251209..303d6850df 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -18,7 +18,7 @@ use thiserror::Error; use utils::{ connstring::connection_address, http::error::HttpErrorBody, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::local_env::{LocalEnv, SafekeeperConf}; @@ -65,7 +65,7 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct SafekeeperNode { - pub id: ZNodeId, + pub id: NodeId, pub conf: SafekeeperConf, @@ -100,7 +100,7 @@ impl SafekeeperNode { .unwrap() } - pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf { + pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref()) } @@ -286,7 +286,7 @@ impl SafekeeperNode { &self, tenant_id: ZTenantId, timeline_id: ZTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result<()> { Ok(self .http_request( diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 76181f9ba1..271f657f43 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -16,7 +16,7 @@ use tokio::{sync::mpsc, task::JoinHandle}; use tracing::*; use utils::{ lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; /// Default value to use for prefixing to all etcd keys with. 
@@ -25,7 +25,7 @@ pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; #[derive(Debug, Deserialize, Serialize)] struct SafekeeperTimeline { - safekeeper_id: ZNodeId, + safekeeper_id: NodeId, info: SkTimelineInfo, } @@ -71,7 +71,7 @@ pub enum BrokerError { /// A way to control the data retrieval from a certain subscription. pub struct SkTimelineSubscription { safekeeper_timeline_updates: - mpsc::UnboundedReceiver>>, + mpsc::UnboundedReceiver>>, kind: SkTimelineSubscriptionKind, watcher_handle: JoinHandle>, watcher: Watcher, @@ -81,7 +81,7 @@ impl SkTimelineSubscription { /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet. pub async fn fetch_data( &mut self, - ) -> Option>> { + ) -> Option>> { self.safekeeper_timeline_updates.recv().await } @@ -221,7 +221,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates( break; } - let mut timeline_updates: HashMap> = HashMap::new(); + let mut timeline_updates: HashMap> = HashMap::new(); // Keep track that the timeline data updates from etcd arrive in the right order. // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering. @@ -299,18 +299,18 @@ fn parse_etcd_key_value( parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?, ), - ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), ), SubscriptionKind::Tenant(tenant_id) => ( ZTenantTimelineId::new( tenant_id, parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, ), - ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), ), SubscriptionKind::Timeline(zttid) => ( zttid, - ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), + NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), ), }; diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index 44d81cda50..02f781c49a 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -226,9 +226,9 @@ impl fmt::Display for ZTenantTimelineId { // by the console. #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] #[serde(transparent)] -pub struct ZNodeId(pub u64); +pub struct NodeId(pub u64); -impl fmt::Display for ZNodeId { +impl fmt::Display for NodeId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index f04af9cfdd..8d39fe5d0d 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -22,14 +22,14 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, project_git_version, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use pageserver::timelines::TimelineInfo; // Default id of a safekeeper node, if not specified on the command line. 
-const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); -const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); +const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); @@ -860,7 +860,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result { +fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result { if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { Ok(SafekeeperNode::from_env(env, node)) } else { @@ -876,7 +876,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul // All the commands take an optional safekeeper name argument let sk_id = if let Some(id_str) = sub_args.value_of("id") { - ZNodeId(id_str.parse().context("while parsing safekeeper id")?) + NodeId(id_str.parse().context("while parsing safekeeper id")?) } else { DEFAULT_SAFEKEEPER_ID }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a9215c0701..6c045d77ae 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -16,7 +16,7 @@ use toml_edit::{Document, Item}; use url::Url; use utils::{ postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::layered_repository::TIMELINES_SEGMENT_NAME; @@ -78,7 +78,7 @@ pub mod defaults { pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers // can safely distinguish different pageservers - pub id: ZNodeId, + pub id: NodeId, /// Example (default): 127.0.0.1:64000 pub listen_pg_addr: String, @@ -180,7 +180,7 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, + id: BuilderValue, profiling: BuilderValue, broker_etcd_prefix: BuilderValue, @@ -276,7 +276,7 @@ impl PageServerConfigBuilder { self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix) } - pub fn id(&mut self, node_id: ZNodeId) { + pub fn id(&mut self, node_id: NodeId) { self.id = BuilderValue::Set(node_id) } @@ -399,7 +399,7 @@ impl PageServerConf { "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; } - "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), + "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), "broker_endpoints" => builder.broker_endpoints( @@ -550,7 +550,7 @@ impl PageServerConf { #[cfg(test)] pub fn dummy_conf(repo_dir: PathBuf) -> Self { PageServerConf { - id: ZNodeId(0), + id: NodeId(0), wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, @@ -693,7 +693,7 @@ id = 10 assert_eq!( parsed_config, PageServerConf { - id: ZNodeId(10), + id: NodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, @@ -737,7 +737,7 @@ id = 10 assert_eq!( parsed_config, PageServerConf { - id: ZNodeId(10), + id: NodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), wait_lsn_timeout: Duration::from_secs(111), diff --git a/pageserver/src/http/models.rs 
b/pageserver/src/http/models.rs index e9aaa72416..e00ccda2a1 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; #[serde_as] @@ -42,7 +42,7 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId #[derive(Serialize)] pub struct StatusResponse { - pub id: ZNodeId, + pub id: NodeId, } impl TenantCreateRequest { diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index a5ffc013e2..290b7c738a 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -24,7 +24,7 @@ use safekeeper::{broker, callmemaybe}; use safekeeper::{http, s3_offload}; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, - zid::ZNodeId, + zid::NodeId, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; @@ -167,7 +167,7 @@ fn main() -> anyhow::Result<()> { let mut given_id = None; if let Some(given_id_str) = arg_matches.value_of("id") { - given_id = Some(ZNodeId( + given_id = Some(NodeId( given_id_str .parse() .context("failed to parse safekeeper id")?, @@ -192,7 +192,7 @@ fn main() -> anyhow::Result<()> { start_safekeeper(conf, given_id, arg_matches.is_present("init")) } -fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { +fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = logging::init("safekeeper.log", conf.daemonize)?; info!("version: {GIT_VERSION}"); @@ -345,14 +345,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b } /// Determine safekeeper id and set it in config. -fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { +fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { let id_file_path = conf.workdir.join(ID_FILE_NAME); - let my_id: ZNodeId; + let my_id: NodeId; // If ID exists, read it in; otherwise set one passed match fs::read(&id_file_path) { Ok(id_serialized) => { - my_id = ZNodeId( + my_id = NodeId( std::str::from_utf8(&id_serialized) .context("failed to parse safekeeper id")? 
.parse() diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index d7217be20a..59d282d378 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -12,7 +12,7 @@ use tokio::{runtime, time::sleep}; use tracing::*; use crate::{timeline::GlobalTimelines, SafeKeeperConf}; -use utils::zid::{ZNodeId, ZTenantTimelineId}; +use utils::zid::{NodeId, ZTenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; @@ -36,7 +36,7 @@ pub fn thread_main(conf: SafeKeeperConf) { fn timeline_safekeeper_path( broker_etcd_prefix: String, zttid: ZTenantTimelineId, - sk_id: ZNodeId, + sk_id: NodeId, ) -> String { format!( "{}/{sk_id}", diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index ca18e64096..77efc0cc21 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,9 +1,9 @@ use serde::{Deserialize, Serialize}; -use utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use utils::zid::{NodeId, ZTenantId, ZTimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, - pub peer_ids: Vec, + pub peer_ids: Vec, } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 62fbd2ff2f..3f6ade970d 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -20,14 +20,14 @@ use utils::{ RequestExt, RouterBuilder, }, lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::models::TimelineCreateRequest; #[derive(Debug, Serialize)] struct SafekeeperStatus { - id: ZNodeId, + id: NodeId, } /// Healthcheck handler. @@ -178,7 +178,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result, pub recall_period: Duration, - pub my_id: ZNodeId, + pub my_id: NodeId, pub broker_endpoints: Vec, pub broker_etcd_prefix: String, pub s3_offload_enabled: bool, @@ -79,7 +79,7 @@ impl Default for SafeKeeperConf { listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), ttl: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, - my_id: ZNodeId(0), + my_id: NodeId(0), broker_endpoints: Vec::new(), broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), s3_offload_enabled: true, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fff1c269b6..b8b969929d 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -26,7 +26,7 @@ use utils::{ bin_ser::LeSer, lsn::Lsn, pq_proto::{SystemId, ZenithFeedback}, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; pub const SK_MAGIC: u32 = 0xcafeceefu32; @@ -164,7 +164,7 @@ impl PeerInfo { // vector-based node id -> peer state map with very limited functionality we // need/ #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>); +pub struct Peers(pub Vec<(NodeId, PeerInfo)>); /// Persistent information stored on safekeeper node /// On disk data is prefixed by magic and format version and followed by checksum. 
@@ -224,7 +224,7 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { tenant_id: zttid.tenant_id, timeline_id: zttid.timeline_id, @@ -277,7 +277,7 @@ pub struct ProposerGreeting { #[derive(Debug, Serialize)] pub struct AcceptorGreeting { term: u64, - node_id: ZNodeId, + node_id: NodeId, } /// Vote request sent from proposer to safekeepers @@ -531,7 +531,7 @@ pub struct SafeKeeper { pub wal_store: WAL, - node_id: ZNodeId, // safekeeper's node id + node_id: NodeId, // safekeeper's node id } impl SafeKeeper @@ -544,7 +544,7 @@ where ztli: ZTimelineId, state: CTRL, mut wal_store: WAL, - node_id: ZNodeId, + node_id: NodeId, ) -> Result> { if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); @@ -1013,7 +1013,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1028,7 +1028,7 @@ mod tests { let storage = InMemoryState { persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap(); + sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1045,7 +1045,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2bb7771aac..0953439bd8 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -21,7 +21,7 @@ use tracing::*; use utils::{ lsn::Lsn, pq_proto::ZenithFeedback, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; @@ -99,7 +99,7 @@ impl SharedState { fn create( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; @@ -448,7 +448,7 @@ impl Timeline { } /// Update timeline state with peer safekeeper data. 
- pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> { + pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: NodeId) -> Result<()> { let mut shared_state = self.mutex.lock().unwrap(); shared_state.sk.record_safekeeper_info(sk_info)?; self.notify_wal_senders(&mut shared_state); @@ -551,7 +551,7 @@ impl GlobalTimelines { mut state: MutexGuard, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result> { match state.timelines.get(&zttid) { Some(_) => bail!("timeline {} already exists", zttid), @@ -576,7 +576,7 @@ impl GlobalTimelines { pub fn create( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result> { let state = TIMELINES_STATE.lock().unwrap(); GlobalTimelines::create_internal(state, conf, zttid, peer_ids) From 887b0e14d9285bdf64eab3e44eb7000cdb55b44b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 25 May 2022 21:07:49 +0300 Subject: [PATCH 12/50] Run basic checks on PRs and pushes to main only --- .github/workflows/testing.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 79b2ba05d0..281c893403 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -1,8 +1,10 @@ name: Build and Test on: - pull_request: push: + branches: + - main + pull_request: jobs: regression-check: From 06f5e017a1b0d380e0e082e906cd52b7a885b100 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 25 May 2022 21:12:17 +0300 Subject: [PATCH 13/50] Move rustfmt check to GH Action --- .circleci/config.yml | 10 ---------- .github/workflows/testing.yml | 6 +++++- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5346e35c01..624d367053 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,15 +11,6 @@ executors: - image: zimg/rust:1.58 jobs: - check-codestyle-rust: - executor: neon-xlarge-executor - steps: - - checkout - - run: - name: rustfmt - when: always - command: cargo fmt --all -- --check - # A job to build postgres build-postgres: executor: neon-xlarge-executor @@ -740,7 +731,6 @@ jobs: workflows: build_and_test: jobs: - - check-codestyle-rust - check-codestyle-python - build-postgres: name: build-postgres-<< matrix.build_type >> diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 281c893403..1ce1b64a49 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -25,13 +25,17 @@ jobs: submodules: true fetch-depth: 2 - - name: install rust toolchain ${{ matrix.rust_toolchain }} + - name: Install rust toolchain ${{ matrix.rust_toolchain }} uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: ${{ matrix.rust_toolchain }} + components: rustfmt, clippy override: true + - name: Check formatting + run: cargo fmt --all -- --check + - name: Install Ubuntu postgres dependencies if: matrix.os == 'ubuntu-latest' run: | From 5a5737278e637245d0b7b89a20b47040d2572a0e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 25 May 2022 23:10:44 +0300 Subject: [PATCH 14/50] add simple metrics for remote storage operations track number of operations and number of their failures --- Cargo.lock | 2 + libs/remote_storage/Cargo.toml | 11 ++- libs/remote_storage/src/s3_bucket.rs | 109 +++++++++++++++++++++++++-- 3 files changed, 113 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6acad6dac8..840953f645 100644 
--- a/Cargo.lock +++ b/Cargo.lock @@ -2394,6 +2394,8 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "metrics", + "once_cell", "rusoto_core", "rusoto_s3", "serde", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 291f6e50ac..5c62e28fda 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -5,14 +5,17 @@ edition = "2021" [dependencies] anyhow = { version = "1.0", features = ["backtrace"] } -tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } -tokio-util = { version = "0.7", features = ["io"] } -tracing = "0.1.27" +async-trait = "0.1" + +metrics = { version = "0.1", path = "../metrics" } +once_cell = "1.8.0" rusoto_core = "0.48" rusoto_s3 = "0.48" serde = { version = "1.0", features = ["derive"] } serde_json = "1" -async-trait = "0.1" +tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } +tokio-util = { version = "0.7", features = ["io"] } +tracing = "0.1.27" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 01aaf7ca7e..80d6966494 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -23,6 +23,71 @@ use crate::{strip_path_prefix, RemoteStorage, S3Config}; use super::StorageMetadata; +pub(super) mod metrics { + use metrics::{register_int_counter_vec, IntCounterVec}; + use once_cell::sync::Lazy; + + static S3_REQUESTS_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_requests_count", + "Number of s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + static S3_REQUESTS_FAIL_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_failures_count", + "Number of failed s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + pub fn inc_get_object() { + S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc(); + } + + pub fn inc_get_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["get_object"]) + .inc(); + } + + pub fn inc_put_object() { + S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc(); + } + + pub fn inc_put_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["put_object"]) + .inc(); + } + + pub fn inc_delete_object() { + S3_REQUESTS_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_delete_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_list_objects() { + S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); + } + + pub fn inc_list_objects_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["list_objects"]) + .inc(); + } +} + const S3_PREFIX_SEPARATOR: char = '/'; #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] @@ -152,6 +217,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 list")?; + + metrics::inc_list_objects(); + let fetch_response = self .client .list_objects_v2(ListObjectsV2Request { @@ -160,7 +228,11 @@ impl RemoteStorage for S3Bucket { continuation_token, ..ListObjectsV2Request::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + })?; document_keys.extend( fetch_response .contents @@ -190,6 +262,8 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed 
during S3 upload")?; + + metrics::inc_put_object(); self.client .put_object(PutObjectRequest { body: Some(StreamingBody::new_with_size( @@ -201,7 +275,11 @@ impl RemoteStorage for S3Bucket { metadata: metadata.map(|m| m.0), ..PutObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_put_object_fail(); + e + })?; Ok(()) } @@ -215,6 +293,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 download")?; + + metrics::inc_get_object(); + let object_output = self .client .get_object(GetObjectRequest { @@ -222,7 +303,11 @@ impl RemoteStorage for S3Bucket { key: from.key().to_owned(), ..GetObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_get_object_fail(); + e + })?; if let Some(body) = object_output.body { let mut from = io::BufReader::new(body.into_async_read()); @@ -251,6 +336,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 range download")?; + + metrics::inc_get_object(); + let object_output = self .client .get_object(GetObjectRequest { @@ -259,7 +347,11 @@ impl RemoteStorage for S3Bucket { range, ..GetObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_get_object_fail(); + e + })?; if let Some(body) = object_output.body { let mut from = io::BufReader::new(body.into_async_read()); @@ -275,13 +367,20 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 delete")?; + + metrics::inc_delete_object(); + self.client .delete_object(DeleteObjectRequest { bucket: self.bucket_name.clone(), key: path.key().to_owned(), ..DeleteObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_delete_object_fail(); + e + })?; Ok(()) } } From 38f2d165b778834d927ed6c549c3285ecfbbe576 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 26 May 2022 12:06:05 +0300 Subject: [PATCH 15/50] allow TLS 1.2 in proxy to be compatible with older client libraries --- proxy/src/config.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 077a07beb9..6f1b56bfe4 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -61,7 +61,8 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result Date: Thu, 19 May 2022 14:27:28 +0300 Subject: [PATCH 16/50] Initialize last_freeze_at with disk consistent LSN to avoid creation of small L0 delta layer on startup refer #1736 --- pageserver/src/layered_repository.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a83907430e..d10c795214 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1230,7 +1230,7 @@ impl LayeredTimeline { }), disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), - last_freeze_at: AtomicLsn::new(0), + last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), From 72a7220dc8c7a247ea411f3e381c8710f99617b7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 26 May 2022 16:48:32 +0300 Subject: [PATCH 17/50] Tidy up some log messages * turn println into an info with proper message * rename new_local_timeline to load_local_timeline because it does not create new timeline, it registers timeline that exists on disk in pageserver in-memory structures --- 
pageserver/src/tenant_mgr.rs | 10 +++++----- pageserver/src/timelines.rs | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index bbe66d7f80..bba67394c3 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -327,8 +327,8 @@ pub fn get_local_timeline_with_load( return Ok(Arc::clone(page_tline)); } - let page_tline = new_local_timeline(&tenant.repo, timeline_id) - .with_context(|| format!("Failed to create new local timeline for tenant {tenant_id}"))?; + let page_tline = load_local_timeline(&tenant.repo, timeline_id) + .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; tenant .local_timelines .insert(timeline_id, Arc::clone(&page_tline)); @@ -365,7 +365,7 @@ pub fn detach_timeline( Ok(()) } -fn new_local_timeline( +fn load_local_timeline( repo: &RepositoryImpl, timeline_id: ZTimelineId, ) -> anyhow::Result>> { @@ -458,8 +458,8 @@ fn apply_timeline_remote_sync_status_updates( bail!("Local timeline {timeline_id} already registered") } Entry::Vacant(v) => { - v.insert(new_local_timeline(repo, timeline_id).with_context(|| { - format!("Failed to register new local timeline for tenant {tenant_id}") + v.insert(load_local_timeline(repo, timeline_id).with_context(|| { + format!("Failed to register add local timeline for tenant {tenant_id}") })?); } }, diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index eadf5bf4e0..408eca6501 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -302,8 +302,8 @@ fn bootstrap_timeline( import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; page_tline.tline.checkpoint(CheckpointConfig::Forced)?; - println!( - "created initial timeline {} timeline.lsn {}", + info!( + "created root timeline {} timeline.lsn {}", tli, page_tline.tline.get_last_record_lsn() ); From 7d565aa4b93836127de209eca5ceb1a98167b4f7 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Thu, 26 May 2022 12:21:15 -0400 Subject: [PATCH 18/50] Reduce the logging level when PG client disconnected to `INFO` (#1713) Fixes #1683. --- pageserver/src/page_service.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f54cd550b3..1c07b63072 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -305,7 +305,29 @@ fn page_service_conn_main( let mut conn_handler = PageServerHandler::new(conf, auth); let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - pgbackend.run(&mut conn_handler) + match pgbackend.run(&mut conn_handler) { + Ok(()) => { + // we've been requested to shut down + Ok(()) + } + Err(err) => { + let root_cause_io_err_kind = err + .root_cause() + .downcast_ref::() + .map(|e| e.kind()); + + // `ConnectionReset` error happens when the Postgres client closes the connection. + // As this disconnection happens quite often and is expected, + // we decided to downgrade the logging level to `INFO`. + // See: https://github.com/neondatabase/neon/issues/1683. 
+ if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) { + info!("Postgres client disconnected"); + Ok(()) + } else { + Err(err) + } + } + } } #[derive(Debug)] From 1d71949c51f06cd0eaf313f0ac595af3209ef57a Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 26 May 2022 14:59:03 -0400 Subject: [PATCH 19/50] Change proxy welcome message (#1808) Remove zenith sun and outdated instructions around .pgpass --- proxy/src/auth_backend/link.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/proxy/src/auth_backend/link.rs b/proxy/src/auth_backend/link.rs index 9bdb9e21c4..8e5fcb32a9 100644 --- a/proxy/src/auth_backend/link.rs +++ b/proxy/src/auth_backend/link.rs @@ -5,12 +5,9 @@ use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ - "☀️ Welcome to Neon!\n", - "To proceed with database creation, open the following link:\n\n", + "Welcome to Neon!\n", + "Authenticate by visiting:\n", " {redirect_uri}{session_id}\n\n", - "It needs to be done once and we will send you '.pgpass' file,\n", - "which will allow you to access or create ", - "databases without opening your web browser." ], redirect_uri = redirect_uri, session_id = session_id, From 0e1bd57c533165dbe4bead8fa23baefa09c97b82 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 27 Apr 2022 00:24:59 -0700 Subject: [PATCH 20/50] Add WAL offloading to s3 on safekeepers. Separate task is launched for each timeline and stopped when timeline doesn't need offloading. Decision who offloads is done through etcd leader election; currently there is no pre condition for participating, that's a TODO. neon_local and tests infrastructure for remote storage in safekeepers added, along with the test itself. 
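
As a rough sketch of the per-timeline task lifecycle described above — hypothetical code, not the actual wal_backup module; the real implementation also handles the etcd election and the S3 uploads themselves:

    use std::collections::HashMap;
    use tokio::task::JoinHandle;
    use tokio::time::{sleep, Duration};

    type TimelineKey = (u128, u128); // stand-in for ZTenantTimelineId

    #[derive(Default)]
    struct WalBackupTasks {
        tasks: HashMap<TimelineKey, JoinHandle<()>>,
    }

    impl WalBackupTasks {
        /// Launch an offload task for the timeline if none is running yet.
        fn ensure_running(&mut self, key: TimelineKey) {
            self.tasks.entry(key).or_insert_with(|| {
                tokio::spawn(async move {
                    loop {
                        // The real task would check the election result and
                        // upload finished WAL segments to remote storage here.
                        sleep(Duration::from_secs(1)).await;
                    }
                })
            });
        }

        /// Stop the task once the timeline no longer needs offloading.
        fn stop(&mut self, key: TimelineKey) {
            if let Some(handle) = self.tasks.remove(&key) {
                handle.abort();
            }
        }
    }
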
ref #1009 Co-authored-by: Anton Shyrabokau --- .circleci/ansible/production.hosts | 1 - .circleci/ansible/staging.hosts | 1 - .circleci/ansible/systemd/safekeeper.service | 2 +- Cargo.lock | 9 +- control_plane/src/lib.rs | 9 + control_plane/src/local_env.rs | 5 + control_plane/src/safekeeper.rs | 10 +- control_plane/src/storage.rs | 11 +- libs/etcd_broker/src/lib.rs | 4 +- libs/remote_storage/Cargo.toml | 2 +- libs/remote_storage/src/lib.rs | 88 +++- libs/utils/src/lsn.rs | 9 + pageserver/src/config.rs | 87 +--- safekeeper/Cargo.toml | 4 + safekeeper/src/bin/safekeeper.rs | 72 +-- safekeeper/src/broker.rs | 129 +++++- safekeeper/src/control_file_upgrade.rs | 8 +- safekeeper/src/http/routes.rs | 18 +- safekeeper/src/lib.rs | 15 +- safekeeper/src/receive_wal.rs | 22 +- safekeeper/src/remove_wal.rs | 2 +- safekeeper/src/s3_offload.rs | 107 ----- safekeeper/src/safekeeper.rs | 69 ++- safekeeper/src/send_wal.rs | 2 +- safekeeper/src/timeline.rs | 307 +++++++++---- safekeeper/src/wal_backup.rs | 418 ++++++++++++++++++ test_runner/batch_others/test_wal_acceptor.py | 54 ++- test_runner/fixtures/zenith_fixtures.py | 110 +++-- 28 files changed, 1146 insertions(+), 429 deletions(-) delete mode 100644 safekeeper/src/s3_offload.rs create mode 100644 safekeeper/src/wal_backup.rs diff --git a/.circleci/ansible/production.hosts b/.circleci/ansible/production.hosts index 6cefd724d8..03c6cf57e0 100644 --- a/.circleci/ansible/production.hosts +++ b/.circleci/ansible/production.hosts @@ -16,4 +16,3 @@ console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 etcd_endpoints = etcd-release.local:2379 -safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index d99ffa6dac..cf5b98eaa1 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -17,4 +17,3 @@ console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 etcd_endpoints = etcd-staging.local:2379 -safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index 55088db859..a6b443c3e7 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }} +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote_storage='{bucket_name={{bucket_name}}, bucket_region={{bucket_region}}, prefix_in_bucket=wal}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/Cargo.lock b/Cargo.lock index 840953f645..e39375c221 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1722,9 +1722,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" [[package]] name = "oorandom" @@ -2403,6 +2403,7 @@ dependencies = [ "tempfile", "tokio", "tokio-util 0.7.0", + "toml_edit", "tracing", "workspace_hack", ] @@ -2654,6 +2655,7 @@ name = "safekeeper" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "byteorder", "bytes", "clap 3.0.14", @@ -2662,12 +2664,14 @@ dependencies = [ "daemonize", "etcd_broker", "fs2", + "futures", "git-version", "hex", "humantime", "hyper", "lazy_static", "metrics", + "once_cell", "postgres", "postgres-protocol", "postgres_ffi", @@ -2681,6 +2685,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util 0.7.0", + "toml_edit", "tracing", "url", "utils", diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index c3469c3350..4dfca588ad 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -49,3 +49,12 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { cmd } } + +fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { + for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { + if let Ok(value) = std::env::var(env_key) { + cmd = cmd.env(env_key, value); + } + } + cmd +} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 015b33f591..2623f65242 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -167,6 +167,8 @@ pub struct SafekeeperConf { pub pg_port: u16, pub http_port: u16, pub sync: bool, + pub remote_storage: Option, + pub backup_threads: Option, } impl Default for SafekeeperConf { @@ -176,6 +178,8 @@ impl Default for SafekeeperConf { pg_port: 0, http_port: 0, sync: true, + remote_storage: None, + backup_threads: None, } } } @@ -377,6 +381,7 @@ impl LocalEnv { base_path != Path::new(""), "repository base path is missing" ); + ensure!( !base_path.exists(), "directory '{}' already exists. 
Perhaps already initialized?", diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 303d6850df..972b6d48ae 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -23,7 +23,7 @@ use utils::{ use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; -use crate::{fill_rust_env_vars, read_pidfile}; +use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { @@ -143,6 +143,14 @@ impl SafekeeperNode { if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { cmd.args(&["--broker-etcd-prefix", prefix]); } + if let Some(threads) = self.conf.backup_threads { + cmd.args(&["--backup-threads", threads.to_string().as_ref()]); + } + if let Some(ref remote_storage) = self.conf.remote_storage { + cmd.args(&["--remote-storage", remote_storage]); + } + + fill_aws_secrets_vars(&mut cmd); if !cmd.status()?.success() { bail!( diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 355c7c250d..24cdbce8f3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -25,7 +25,7 @@ use utils::{ }; use crate::local_env::LocalEnv; -use crate::{fill_rust_env_vars, read_pidfile}; +use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; use pageserver::tenant_mgr::TenantInfo; #[derive(Error, Debug)] @@ -493,12 +493,3 @@ impl PageServerNode { Ok(timeline_info_response) } } - -fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { - for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { - if let Ok(value) = std::env::var(env_key) { - cmd = cmd.env(env_key, value); - } - } - cmd -} diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 271f657f43..7fe142502b 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -43,10 +43,10 @@ pub struct SkTimelineInfo { #[serde_as(as = "Option")] #[serde(default)] pub commit_lsn: Option, - /// LSN up to which safekeeper offloaded WAL to s3. + /// LSN up to which safekeeper has backed WAL. #[serde_as(as = "Option")] #[serde(default)] - pub s3_wal_lsn: Option, + pub backup_lsn: Option, /// LSN of last checkpoint uploaded by pageserver. 
#[serde_as(as = "Option")] #[serde(default)] diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 5c62e28fda..b11b3cf371 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -6,7 +6,6 @@ edition = "2021" [dependencies] anyhow = { version = "1.0", features = ["backtrace"] } async-trait = "0.1" - metrics = { version = "0.1", path = "../metrics" } once_cell = "1.8.0" rusoto_core = "0.48" @@ -15,6 +14,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } tokio-util = { version = "0.7", features = ["io"] } +toml_edit = { version = "0.13", features = ["easy"] } tracing = "0.1.27" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 8092e4fc49..0889cb720c 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -16,8 +16,10 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::Context; +use anyhow::{bail, Context}; + use tokio::io; +use toml_edit::Item; use tracing::info; pub use self::{ @@ -203,6 +205,90 @@ pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) .with_extension(new_extension.as_ref()) } +impl RemoteStorageConfig { + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + let local_path = toml.get("local_path"); + let bucket_name = toml.get("bucket_name"); + let bucket_region = toml.get("bucket_region"); + + let max_concurrent_syncs = NonZeroUsize::new( + parse_optional_integer("max_concurrent_syncs", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), + ) + .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; + + let max_sync_errors = NonZeroU32::new( + parse_optional_integer("max_sync_errors", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), + ) + .context("Failed to parse 'max_sync_errors' as a positive integer")?; + + let concurrency_limit = NonZeroUsize::new( + parse_optional_integer("concurrency_limit", toml)? 
+ .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + ) + .context("Failed to parse 'concurrency_limit' as a positive integer")?; + + let storage = match (local_path, bucket_name, bucket_region) { + (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), + (_, Some(_), None) => { + bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") + } + (_, None, Some(_)) => { + bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") + } + (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { + bucket_name: parse_toml_string("bucket_name", bucket_name)?, + bucket_region: parse_toml_string("bucket_region", bucket_region)?, + prefix_in_bucket: toml + .get("prefix_in_bucket") + .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) + .transpose()?, + endpoint: toml + .get("endpoint") + .map(|endpoint| parse_toml_string("endpoint", endpoint)) + .transpose()?, + concurrency_limit, + }), + (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( + parse_toml_string("local_path", local_path)?, + )), + (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), + }; + + Ok(RemoteStorageConfig { + max_concurrent_syncs, + max_sync_errors, + storage, + }) + } +} + +// Helper functions to parse a toml Item +fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> +where + I: TryFrom, + E: std::error::Error + Send + Sync + 'static, +{ + let toml_integer = match item.get(name) { + Some(item) => item + .as_integer() + .with_context(|| format!("configure option {name} is not an integer"))?, + None => return Ok(None), + }; + + I::try_from(toml_integer) + .map(Some) + .with_context(|| format!("configure option {name} is too large")) +} + +fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { + let s = item + .as_str() + .with_context(|| format!("configure option {name} is not a string"))?; + Ok(s.to_string()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index c09d8c67ce..3dab2a625c 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -26,6 +26,9 @@ impl Lsn { /// Maximum possible value for an LSN pub const MAX: Lsn = Lsn(u64::MAX); + /// Invalid value for InvalidXLogRecPtr, as defined in xlogdefs.h + pub const INVALID: Lsn = Lsn(0); + /// Subtract a number, returning None on overflow. pub fn checked_sub>(self, other: T) -> Option { let other: u64 = other.into(); @@ -103,6 +106,12 @@ impl Lsn { pub fn is_aligned(&self) -> bool { *self == self.align() } + + /// Return if the LSN is valid + /// mimics postgres XLogRecPtrIsInvalid macro + pub fn is_valid(self) -> bool { + self != Lsn::INVALID + } } impl From for Lsn { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6c045d77ae..dc9d7161a2 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,9 +5,9 @@ //! See also `settings.md` for better description on every parameter. 
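The parser above is shared by the pageserver config and, later in this patch, by the safekeeper CLI; a minimal sketch of the wrap-in-a-key trick the safekeeper uses to feed it an inline table from the command line (bucket values are placeholders):

    use remote_storage::RemoteStorageConfig;
    use toml_edit::Document;

    fn parse_cli_remote_storage(arg: &str) -> anyhow::Result<RemoteStorageConfig> {
        // toml_edit does not accept a bare inline table as a document,
        // so wrap it in a key and peel the key off again after parsing.
        let doc = format!("remote_storage = {arg}").parse::<Document>()?;
        let (_, item) = doc.iter().next().unwrap();
        RemoteStorageConfig::from_toml(item)
    }

    // e.g. parse_cli_remote_storage(
    //     r#"{bucket_name="my-bucket", bucket_region="us-east-1", prefix_in_bucket="wal"}"#)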
use anyhow::{anyhow, bail, ensure, Context, Result}; -use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config}; +use remote_storage::RemoteStorageConfig; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; + use std::path::{Path, PathBuf}; use std::str::FromStr; use std::time::Duration; @@ -394,7 +394,7 @@ impl PageServerConf { )), "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) + builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?)) } "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; @@ -484,64 +484,6 @@ impl PageServerConf { Ok(t_conf) } - /// subroutine of parse_config(), to parse the `[remote_storage]` table. - fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result { - let local_path = toml.get("local_path"); - let bucket_name = toml.get("bucket_name"); - let bucket_region = toml.get("bucket_region"); - - let max_concurrent_syncs = NonZeroUsize::new( - parse_optional_integer("max_concurrent_syncs", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), - ) - .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; - - let max_sync_errors = NonZeroU32::new( - parse_optional_integer("max_sync_errors", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), - ) - .context("Failed to parse 'max_sync_errors' as a positive integer")?; - - let concurrency_limit = NonZeroUsize::new( - parse_optional_integer("concurrency_limit", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), - ) - .context("Failed to parse 'concurrency_limit' as a positive integer")?; - - let storage = match (local_path, bucket_name, bucket_region) { - (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), - (_, Some(_), None) => { - bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") - } - (_, None, Some(_)) => { - bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") - } - (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) - .transpose()?, - endpoint: toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?, - concurrency_limit, - }), - (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( - parse_toml_string("local_path", local_path)?, - )), - (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), - }; - - Ok(RemoteStorageConfig { - max_concurrent_syncs, - max_sync_errors, - storage, - }) - } - #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> PathBuf { PathBuf::from(format!("../tmp_check/test_{test_name}")) @@ -592,23 +534,6 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } -fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> -where - I: TryFrom, - E: std::error::Error + Send + Sync + 'static, -{ - let toml_integer = match item.get(name) { - Some(item) => item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?, - None => return Ok(None), - }; - - I::try_from(toml_integer) - .map(Some) - 
.with_context(|| format!("configure option {name} is too large")) -} - fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -651,8 +576,12 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { #[cfg(test)] mod tests { - use std::fs; + use std::{ + fs, + num::{NonZeroU32, NonZeroUsize}, + }; + use remote_storage::{RemoteStorageKind, S3Config}; use tempfile::{tempdir, TempDir}; use super::*; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 417cf58cd5..373108c61b 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,10 @@ const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-util = { version = "0.7", features = ["io"] } git-version = "0.3.5" +async-trait = "0.1" +once_cell = "1.10.0" +futures = "0.3.13" +toml_edit = { version = "0.13", features = ["easy"] } postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 290b7c738a..a7628482d9 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -6,22 +6,27 @@ use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; use fs2::FileExt; +use remote_storage::RemoteStorageConfig; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; use tokio::sync::mpsc; +use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; use safekeeper::control_file::{self}; -use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; +use safekeeper::defaults::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, +}; +use safekeeper::http; use safekeeper::remove_wal; use safekeeper::timeline::GlobalTimelines; +use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; use safekeeper::{broker, callmemaybe}; -use safekeeper::{http, s3_offload}; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, zid::NodeId, @@ -71,12 +76,6 @@ fn main() -> anyhow::Result<()> { .long("pageserver") .takes_value(true), ) - .arg( - Arg::new("ttl") - .long("ttl") - .takes_value(true) - .help("interval for keeping WAL at safekeeper node, after which them will be uploaded to S3 and removed locally"), - ) .arg( Arg::new("recall") .long("recall") @@ -118,12 +117,20 @@ fn main() -> anyhow::Result<()> { .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), ) .arg( - Arg::new("enable-s3-offload") - .long("enable-s3-offload") + Arg::new("wal-backup-threads").long("backup-threads").takes_value(true).help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), + ).arg( + Arg::new("remote-storage") + .long("remote-storage") + .takes_value(true) + .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") + ) + .arg( + Arg::new("enable-wal-backup") + .long("enable-wal-backup") .takes_value(true) .default_value("true") .default_missing_value("true") - .help("Enable/disable s3 offloading. 
When disabled, safekeeper removes WAL ignoring s3 WAL horizon."), + .help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."), ) .get_matches(); @@ -157,10 +164,6 @@ fn main() -> anyhow::Result<()> { conf.listen_http_addr = addr.to_owned(); } - if let Some(ttl) = arg_matches.value_of("ttl") { - conf.ttl = Some(humantime::parse_duration(ttl)?); - } - if let Some(recall) = arg_matches.value_of("recall") { conf.recall_period = humantime::parse_duration(recall)?; } @@ -182,9 +185,21 @@ fn main() -> anyhow::Result<()> { conf.broker_etcd_prefix = prefix.to_string(); } + if let Some(backup_threads) = arg_matches.value_of("wal-backup-threads") { + conf.backup_runtime_threads = backup_threads + .parse() + .with_context(|| format!("Failed to parse backup threads {}", backup_threads))?; + } + if let Some(storage_conf) = arg_matches.value_of("remote-storage") { + // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse + let storage_conf_toml = format!("remote_storage = {}", storage_conf); + let parsed_toml = storage_conf_toml.parse::()?; // parse + let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again + conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?); + } // Seems like there is no better way to accept bool values explicitly in clap. - conf.s3_offload_enabled = arg_matches - .value_of("enable-s3-offload") + conf.wal_backup_enabled = arg_matches + .value_of("enable-wal-backup") .unwrap() .parse() .context("failed to parse bool enable-s3-offload bool")?; @@ -252,7 +267,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); - GlobalTimelines::set_callmemaybe_tx(callmemaybe_tx); + let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + GlobalTimelines::init(callmemaybe_tx, wal_backup_launcher_tx); let conf_ = conf.clone(); threads.push( @@ -270,17 +286,6 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo })?, ); - if conf.ttl.is_some() { - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("S3 offload thread".into()) - .spawn(|| { - s3_offload::thread_main(conf_); - })?, - ); - } - let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) @@ -330,6 +335,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo })?, ); + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("wal backup launcher thread".into()) + .spawn(move || { + wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx); + })?, + ); + // TODO: put more thoughts into handling of failed threads // We probably should restart them. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 59d282d378..676719b60d 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,5 +1,6 @@ //! Communication with etcd, providing safekeeper peers and pageserver coordination. 
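The broker module below gains minimal etcd election primitives used by WAL backup to pick a single offloader per timeline; a sketch of the intended call pattern, assuming the Election, get_leader and check_am_i API added in this file (the election and candidate names here are simplified placeholders):

    use safekeeper::broker::{get_leader, Election};

    async fn offload_only_if_elected(endpoints: Vec<url::Url>) -> anyhow::Result<()> {
        let election = Election::new(
            "WAL_BACKUP".to_string(), // election name
            "id_1".to_string(),       // this safekeeper's candidate name
            endpoints,
        );
        let mut leader = get_leader(&election).await?;
        if leader
            .check_am_i("WAL_BACKUP".to_string(), "id_1".to_string())
            .await?
        {
            // ...this node is the offloader and may upload WAL segments...
        }
        // No explicit resignation; the lease simply expires after give_up.
        leader.give_up().await;
        Ok(())
    }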
+use anyhow::anyhow; use anyhow::Context; use anyhow::Error; use anyhow::Result; @@ -7,9 +8,11 @@ use etcd_broker::Client; use etcd_broker::PutOptions; use etcd_broker::SkTimelineSubscriptionKind; use std::time::Duration; +use tokio::spawn; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use url::Url; use crate::{timeline::GlobalTimelines, SafeKeeperConf}; use utils::zid::{NodeId, ZTenantTimelineId}; @@ -44,6 +47,118 @@ fn timeline_safekeeper_path( ) } +pub struct Election { + pub election_name: String, + pub candidate_name: String, + pub broker_endpoints: Vec, +} + +impl Election { + pub fn new(election_name: String, candidate_name: String, broker_endpoints: Vec) -> Self { + Self { + election_name, + candidate_name, + broker_endpoints, + } + } +} + +pub struct ElectionLeader { + client: Client, + keep_alive: JoinHandle>, +} + +impl ElectionLeader { + pub async fn check_am_i( + &mut self, + election_name: String, + candidate_name: String, + ) -> Result { + let resp = self.client.leader(election_name).await?; + + let kv = resp.kv().ok_or(anyhow!("failed to get leader response"))?; + let leader = kv.value_str()?; + + Ok(leader == candidate_name) + } + + pub async fn give_up(self) { + // self.keep_alive.abort(); + // TODO: it'll be wise to resign here but it'll happen after lease expiration anyway + // should we await for keep alive termination? + let _ = self.keep_alive.await; + } +} + +pub async fn get_leader(req: &Election) -> Result { + let mut client = Client::connect(req.broker_endpoints.clone(), None) + .await + .context("Could not connect to etcd")?; + + let lease = client + .lease_grant(LEASE_TTL_SEC, None) + .await + .context("Could not acquire a lease"); + + let lease_id = lease.map(|l| l.id()).unwrap(); + + let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id)); + + if let Err(e) = client + .campaign( + req.election_name.clone(), + req.candidate_name.clone(), + lease_id, + ) + .await + { + keep_alive.abort(); + let _ = keep_alive.await; + return Err(e.into()); + } + + Ok(ElectionLeader { client, keep_alive }) +} + +async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> { + let (mut keeper, mut ka_stream) = client + .lease_keep_alive(lease_id) + .await + .context("failed to create keepalive stream")?; + + loop { + let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); + + keeper + .keep_alive() + .await + .context("failed to send LeaseKeepAliveRequest")?; + + ka_stream + .message() + .await + .context("failed to receive LeaseKeepAliveResponse")?; + + sleep(push_interval).await; + } +} + +pub fn get_campaign_name( + election_name: String, + broker_prefix: String, + timeline_id: &ZTenantTimelineId, +) -> String { + return format!( + "{}/{}", + SkTimelineSubscriptionKind::timeline(broker_prefix, *timeline_id).watch_key(), + election_name + ); +} + +pub fn get_candiate_name(system_id: NodeId) -> String { + format!("id_{}", system_id) +} + /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; @@ -59,7 +174,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. 
for zttid in GlobalTimelines::get_active_timelines() { - if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { + if let Some(tli) = GlobalTimelines::get_loaded(zttid) { let sk_info = tli.get_public_info(&conf)?; let put_opts = PutOptions::new().with_lease(lease.id()); client @@ -106,12 +221,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { // note: there are blocking operations below, but it's considered fine for now if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { for (safekeeper_id, info) in sk_info { - tli.record_safekeeper_info(&info, safekeeper_id)? + tli.record_safekeeper_info(&info, safekeeper_id).await? } } } } None => { + // XXX it means we lost connection with etcd, error is consumed inside sub object debug!("timeline updates sender closed, aborting the pull loop"); return Ok(()); } @@ -142,11 +258,12 @@ async fn main_loop(conf: SafeKeeperConf) { }, res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => { // was it panic or normal error? - let err = match res { - Ok(res_internal) => res_internal.unwrap_err(), - Err(err_outer) => err_outer.into(), + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("pull task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("pull task panicked: {:?}", err_outer) } }; - warn!("pull task failed: {:?}", err); pull_handle = None; }, _ = ticker.tick() => { diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 22716de1a0..8d36472540 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -165,7 +165,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -188,7 +188,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -211,7 +211,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -234,7 +234,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, peer_horizon_lsn: oldstate.peer_horizon_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 3f6ade970d..b0197a9a2a 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -70,19 +70,19 @@ struct TimelineStatus { timeline_id: ZTimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] + flush_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] timeline_start_lsn: Lsn, #[serde(serialize_with = "display_serialize")] local_start_lsn: Lsn, #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] - s3_wal_lsn: Lsn, + backup_lsn: Lsn, #[serde(serialize_with = 
"display_serialize")] peer_horizon_lsn: Lsn, #[serde(serialize_with = "display_serialize")] remote_consistent_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - flush_lsn: Lsn, } /// Report info about timeline. @@ -107,13 +107,13 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, pub recall_period: Duration, + pub remote_storage: Option, + pub backup_runtime_threads: usize, + pub wal_backup_enabled: bool, pub my_id: NodeId, pub broker_endpoints: Vec, pub broker_etcd_prefix: String, - pub s3_offload_enabled: bool, } impl SafeKeeperConf { @@ -77,12 +81,13 @@ impl Default for SafeKeeperConf { no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - ttl: None, + remote_storage: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, my_id: NodeId(0), broker_endpoints: Vec::new(), broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), - s3_offload_enabled: true, + backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + wal_backup_enabled: true, } } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 0ef335c9ed..88b7816912 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -85,16 +85,10 @@ impl<'pg> ReceiveWalConn<'pg> { _ => bail!("unexpected message {:?} instead of greeting", next_msg), } - // Register the connection and defer unregister. - spg.timeline - .get() - .on_compute_connect(self.pageserver_connstr.as_ref())?; - let _guard = ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), - }; - let mut next_msg = Some(next_msg); + let mut first_time_through = true; + let mut _guard: Option = None; loop { if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) { // poll AppendRequest's without blocking and write WAL to disk without flushing, @@ -122,6 +116,18 @@ impl<'pg> ReceiveWalConn<'pg> { self.write_msg(&reply)?; } } + if first_time_through { + // Register the connection and defer unregister. Do that only + // after processing first message, as it sets wal_seg_size, + // wanted by many. + spg.timeline + .get() + .on_compute_connect(self.pageserver_connstr.as_ref())?; + _guard = Some(ComputeConnectionGuard { + timeline: Arc::clone(spg.timeline.get()), + }); + first_time_through = false; + } // blocking wait for the next message if next_msg.is_none() { diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 3278d51bd3..004c0243f9 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) { let active_tlis = GlobalTimelines::get_active_timelines(); for zttid in &active_tlis { if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) { + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { warn!( "failed to remove WAL for tenant {} timeline {}: {}", tli.zttid.tenant_id, tli.zttid.timeline_id, e diff --git a/safekeeper/src/s3_offload.rs b/safekeeper/src/s3_offload.rs deleted file mode 100644 index 2851c0b8a0..0000000000 --- a/safekeeper/src/s3_offload.rs +++ /dev/null @@ -1,107 +0,0 @@ -// -// Offload old WAL segments to S3 and remove them locally -// Needs `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to be set -// if no IAM bucket access is used. 
-// - -use anyhow::{bail, Context}; -use postgres_ffi::xlog_utils::*; -use remote_storage::{ - GenericRemoteStorage, RemoteStorage, RemoteStorageConfig, S3Bucket, S3Config, S3ObjectKey, -}; -use std::collections::HashSet; -use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; -use std::path::Path; -use std::time::SystemTime; -use tokio::fs::{self, File}; -use tokio::io::BufReader; -use tokio::runtime; -use tokio::time::sleep; -use tracing::*; -use walkdir::WalkDir; - -use crate::SafeKeeperConf; - -pub fn thread_main(conf: SafeKeeperConf) { - // Create a new thread pool - // - // FIXME: keep it single-threaded for now, make it easier to debug with gdb, - // and we're not concerned with performance yet. - //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - info!("Starting S3 offload task"); - - runtime.block_on(async { - main_loop(&conf).await.unwrap(); - }); -} - -async fn offload_files( - remote_storage: &S3Bucket, - listing: &HashSet, - dir_path: &Path, - conf: &SafeKeeperConf, -) -> anyhow::Result { - let horizon = SystemTime::now() - conf.ttl.unwrap(); - let mut n: u64 = 0; - for entry in WalkDir::new(dir_path) { - let entry = entry?; - let path = entry.path(); - - if path.is_file() - && IsXLogFileName(entry.file_name().to_str().unwrap()) - && entry.metadata().unwrap().created().unwrap() <= horizon - { - let remote_path = remote_storage.remote_object_id(path)?; - if !listing.contains(&remote_path) { - let file = File::open(&path).await?; - let file_length = file.metadata().await?.len() as usize; - remote_storage - .upload(BufReader::new(file), file_length, &remote_path, None) - .await?; - - fs::remove_file(&path).await?; - n += 1; - } - } - } - Ok(n) -} - -async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> { - let remote_storage = match GenericRemoteStorage::new( - conf.workdir.clone(), - &RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new(10).unwrap(), - max_sync_errors: NonZeroU32::new(1).unwrap(), - storage: remote_storage::RemoteStorageKind::AwsS3(S3Config { - bucket_name: "zenith-testbucket".to_string(), - bucket_region: env::var("S3_REGION").context("S3_REGION env var is not set")?, - prefix_in_bucket: Some("walarchive/".to_string()), - endpoint: Some(env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?), - concurrency_limit: NonZeroUsize::new(20).unwrap(), - }), - }, - )? { - GenericRemoteStorage::Local(_) => { - bail!("Unexpected: got local storage for the remote config") - } - GenericRemoteStorage::S3(remote_storage) => remote_storage, - }; - - loop { - let listing = remote_storage - .list() - .await? - .into_iter() - .collect::>(); - let n = offload_files(&remote_storage, &listing, &conf.workdir, conf).await?; - info!("Offload {n} files to S3"); - sleep(conf.ttl.unwrap()).await; - } -} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index b8b969929d..9a07127771 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -19,6 +19,7 @@ use lazy_static::lazy_static; use crate::control_file; use crate::send_wal::HotStandbyFeedback; + use crate::wal_storage; use metrics::{register_gauge_vec, Gauge, GaugeVec}; use postgres_ffi::xlog_utils::MAX_SEND_SIZE; @@ -141,7 +142,7 @@ pub struct ServerInfo { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PeerInfo { /// LSN up to which safekeeper offloaded WAL to s3. - s3_wal_lsn: Lsn, + backup_lsn: Lsn, /// Term of the last entry. 
term: Term, /// LSN of the last record. @@ -153,7 +154,7 @@ pub struct PeerInfo { impl PeerInfo { fn new() -> Self { Self { - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, term: INVALID_TERM, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -193,9 +194,9 @@ pub struct SafeKeeperState { /// Part of WAL acknowledged by quorum and available locally. Always points /// to record boundary. pub commit_lsn: Lsn, - /// First LSN not yet offloaded to s3. Useful to persist to avoid finding - /// out offloading progress on boot. - pub s3_wal_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn /// of last record streamed to everyone). Persisting it helps skipping /// recovery in walproposer, generally we compute it from peers. In @@ -217,7 +218,7 @@ pub struct SafeKeeperState { // are not flushed yet. pub struct SafekeeperMemState { pub commit_lsn: Lsn, - pub s3_wal_lsn: Lsn, // TODO: keep only persistent version + pub backup_lsn: Lsn, pub peer_horizon_lsn: Lsn, pub remote_consistent_lsn: Lsn, pub proposer_uuid: PgUuid, @@ -241,7 +242,7 @@ impl SafeKeeperState { timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: Lsn(0), - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, peer_horizon_lsn: Lsn(0), remote_consistent_lsn: Lsn(0), peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), @@ -559,7 +560,7 @@ where epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { commit_lsn: state.commit_lsn, - s3_wal_lsn: state.s3_wal_lsn, + backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, proposer_uuid: state.proposer_uuid, @@ -649,7 +650,6 @@ where self.state.persist(&state)?; } - // pass wal_seg_size to read WAL and find flush_lsn self.wal_store.init_storage(&self.state)?; info!( @@ -764,6 +764,14 @@ where self.inmem.commit_lsn = commit_lsn; self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); + // We got our first commit_lsn, which means we should sync + // everything to disk, to initialize the state. + if self.state.commit_lsn == Lsn::INVALID && commit_lsn != Lsn::INVALID { + self.inmem.backup_lsn = self.inmem.commit_lsn; // initialize backup_lsn + self.wal_store.flush_wal()?; + self.persist_control_file()?; + } + // If new commit_lsn reached epoch switch, force sync of control // file: walproposer in sync mode is very interested when this // happens. Note: this is for sync-safekeepers mode only, as @@ -775,22 +783,14 @@ where self.persist_control_file()?; } - // We got our first commit_lsn, which means we should sync - // everything to disk, to initialize the state. - if self.state.commit_lsn == Lsn(0) && commit_lsn > Lsn(0) { - self.wal_store.flush_wal()?; - self.persist_control_file()?; - } - Ok(()) } /// Persist in-memory state to the disk. 
fn persist_control_file(&mut self) -> Result<()> { let mut state = self.state.clone(); - state.commit_lsn = self.inmem.commit_lsn; - state.s3_wal_lsn = self.inmem.s3_wal_lsn; + state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; state.proposer_uuid = self.inmem.proposer_uuid; @@ -898,11 +898,11 @@ where self.update_commit_lsn()?; } } - if let Some(s3_wal_lsn) = sk_info.s3_wal_lsn { - let new_s3_wal_lsn = max(s3_wal_lsn, self.inmem.s3_wal_lsn); + if let Some(backup_lsn) = sk_info.backup_lsn { + let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn); sync_control_file |= - self.state.s3_wal_lsn + (self.state.server.wal_seg_size as u64) < new_s3_wal_lsn; - self.inmem.s3_wal_lsn = new_s3_wal_lsn; + self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; + self.inmem.backup_lsn = new_backup_lsn; } if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { let new_remote_consistent_lsn = @@ -930,29 +930,23 @@ where /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. - pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo { - let s3_offload_horizon = if s3_offload_enabled { - self.state.s3_wal_lsn - } else { - Lsn(u64::MAX) - }; - let horizon_lsn = min( - min( - self.state.remote_consistent_lsn, - self.state.peer_horizon_lsn, - ), - s3_offload_horizon, + pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { + let mut horizon_lsn = min( + self.state.remote_consistent_lsn, + self.state.peer_horizon_lsn, ); + if wal_backup_enabled { + horizon_lsn = min(horizon_lsn, self.state.backup_lsn); + } horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) } } #[cfg(test)] mod tests { - use std::ops::Deref; - use super::*; use crate::wal_storage::Storage; + use std::ops::Deref; // fake storage for tests struct InMemoryState { @@ -1013,6 +1007,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok @@ -1028,6 +1023,7 @@ mod tests { let storage = InMemoryState { persisted_state: state, }; + sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok @@ -1045,6 +1041,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index d52dd6ea57..a89ed18071 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -315,7 +315,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().check_deactivate(replica_id)? { + if spg.timeline.get().stop_walsender(replica_id)? { // Shut down, timeline is suspended. 
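To make the reworked get_horizon_segno above concrete, a toy calculation with made-up LSNs showing why enabling backup can hold back WAL removal:

    use std::cmp::min;

    // Made-up values, purely to illustrate the trimming rule.
    fn horizon_example() -> u64 {
        let remote_consistent_lsn = 0x3000_u64; // pageserver has consumed up to here
        let peer_horizon_lsn = 0x2800_u64;      // all peers hold at least this much
        let backup_lsn = 0x2000_u64;            // offloaded to s3 only up to here
        let wal_backup_enabled = true;

        let mut horizon = min(remote_consistent_lsn, peer_horizon_lsn);
        if wal_backup_enabled {
            // Anything not yet offloaded must stay on local disk.
            horizon = min(horizon, backup_lsn);
        }
        horizon // 0x2000: removal stops at the oldest not-yet-backed-up segment
    }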
// TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 0953439bd8..74a61410fd 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -8,6 +8,7 @@ use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; use serde::Serialize; +use tokio::sync::watch; use std::cmp::{max, min}; use std::collections::HashMap; @@ -15,7 +16,7 @@ use std::fs::{self}; use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use std::time::Duration; -use tokio::sync::mpsc::UnboundedSender; +use tokio::sync::mpsc::{Sender, UnboundedSender}; use tracing::*; use utils::{ @@ -25,13 +26,13 @@ use utils::{ }; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; - use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, }; use crate::send_wal::HotStandbyFeedback; + use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; @@ -81,10 +82,14 @@ struct SharedState { notified_commit_lsn: Lsn, /// State of replicas replicas: Vec>, - /// Inactive clusters shouldn't occupy any resources, so timeline is - /// activated whenever there is a compute connection or pageserver is not - /// caughtup (it must have latest WAL for new compute start) and suspended - /// otherwise. + /// True when WAL backup launcher oversees the timeline, making sure WAL is + /// offloaded, allows to bother launcher less. + wal_backup_active: bool, + /// True whenever there is at least some pending activity on timeline: live + /// compute connection, pageserver is not caughtup (it must have latest WAL + /// for new compute start) or WAL backuping is not finished. Practically it + /// means safekeepers broadcast info to peers about the timeline, old WAL is + /// trimmed. /// /// TODO: it might be better to remove tli completely from GlobalTimelines /// when tli is inactive instead of having this flag. @@ -103,6 +108,7 @@ impl SharedState { ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; @@ -110,6 +116,7 @@ impl SharedState { notified_commit_lsn: Lsn(0), sk, replicas: Vec::new(), + wal_backup_active: false, active: false, num_computes: 0, pageserver_connstr: None, @@ -129,15 +136,62 @@ impl SharedState { notified_commit_lsn: Lsn(0), sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, replicas: Vec::new(), + wal_backup_active: false, active: false, num_computes: 0, pageserver_connstr: None, last_removed_segno: 0, }) } + fn is_active(&self) -> bool { + self.is_wal_backup_required() + // FIXME: add tracking of relevant pageservers and check them here individually, + // otherwise migration won't work (we suspend too early). + || self.sk.inmem.remote_consistent_lsn <= self.sk.inmem.commit_lsn + } - /// Activate the timeline: start/change walsender (via callmemaybe). - fn activate( + /// Mark timeline active/inactive and return whether s3 offloading requires + /// start/stop action. + fn update_status(&mut self) -> bool { + self.active = self.is_active(); + self.is_wal_backup_action_pending() + } + + /// Should we run s3 offloading in current state? 
+ fn is_wal_backup_required(&self) -> bool { + let seg_size = self.get_wal_seg_size(); + self.num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (self.sk.inmem.commit_lsn.segment_number(seg_size) > + self.sk.inmem.backup_lsn.segment_number(seg_size)) + } + + /// Is current state of s3 offloading is not what it ought to be? + fn is_wal_backup_action_pending(&self) -> bool { + let res = self.wal_backup_active != self.is_wal_backup_required(); + if res { + let action_pending = if self.is_wal_backup_required() { + "start" + } else { + "stop" + }; + trace!( + "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", + self.sk.state.timeline_id, action_pending, self.num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn + ); + } + res + } + + /// Returns whether s3 offloading is required and sets current status as + /// matching. + fn wal_backup_attend(&mut self) -> bool { + self.wal_backup_active = self.is_wal_backup_required(); + self.wal_backup_active + } + + /// start/change walsender (via callmemaybe). + fn callmemaybe_sub( &mut self, zttid: &ZTenantTimelineId, pageserver_connstr: Option<&String>, @@ -179,42 +233,42 @@ impl SharedState { ); } self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned()); - self.active = true; Ok(()) } /// Deactivate the timeline: stop callmemaybe. - fn deactivate( + fn callmemaybe_unsub( &mut self, zttid: &ZTenantTimelineId, callmemaybe_tx: &UnboundedSender, ) -> Result<()> { - if self.active { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Unsubscribe request to callmemaybe thread {}", - e - ); - }); - info!( - "timeline {} is unsubscribed from callmemaybe to {}", - zttid.timeline_id, - self.pageserver_connstr.as_ref().unwrap() - ); - } - self.active = false; + if let Some(ref pageserver_connstr) = self.pageserver_connstr { + let subscription_key = SubscriptionStateKey::new( + zttid.tenant_id, + zttid.timeline_id, + pageserver_connstr.to_owned(), + ); + callmemaybe_tx + .send(CallmeEvent::Unsubscribe(subscription_key)) + .unwrap_or_else(|e| { + error!( + "failed to send Unsubscribe request to callmemaybe thread {}", + e + ); + }); + info!( + "timeline {} is unsubscribed from callmemaybe to {}", + zttid.timeline_id, + self.pageserver_connstr.as_ref().unwrap() + ); } Ok(()) } + fn get_wal_seg_size(&self) -> usize { + self.sk.state.server.wal_seg_size as usize + } + /// Get combined state of all alive replicas pub fn get_replicas_state(&self) -> ReplicaState { let mut acc = ReplicaState::new(); @@ -278,6 +332,13 @@ impl SharedState { pub struct Timeline { pub zttid: ZTenantTimelineId, pub callmemaybe_tx: UnboundedSender, + /// Sending here asks for wal backup launcher attention (start/stop + /// offloading). Sending zttid instead of concrete command allows to do + /// sending without timeline lock. + wal_backup_launcher_tx: Sender, + commit_lsn_watch_tx: watch::Sender, + /// For breeding receivers. 
+ commit_lsn_watch_rx: watch::Receiver, mutex: Mutex, /// conditional variable used to notify wal senders cond: Condvar, @@ -287,11 +348,17 @@ impl Timeline { fn new( zttid: ZTenantTimelineId, callmemaybe_tx: UnboundedSender, + wal_backup_launcher_tx: Sender, shared_state: SharedState, ) -> Timeline { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = + watch::channel(shared_state.sk.inmem.commit_lsn); Timeline { zttid, callmemaybe_tx, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, mutex: Mutex::new(shared_state), cond: Condvar::new(), } @@ -301,13 +368,21 @@ impl Timeline { /// not running yet. /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes += 1; - // FIXME: currently we always adopt latest pageserver connstr, but we - // should have kind of generations assigned by compute to distinguish - // the latest one or even pass it through consensus to reliably deliver - // to all safekeepers. - shared_state.activate(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.num_computes += 1; + is_wal_backup_action_pending = shared_state.update_status(); + // FIXME: currently we always adopt latest pageserver connstr, but we + // should have kind of generations assigned by compute to distinguish + // the latest one or even pass it through consensus to reliably deliver + // to all safekeepers. + shared_state.callmemaybe_sub(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; + } + // Wake up wal backup launcher, if offloading not started yet. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + } Ok(()) } @@ -315,38 +390,43 @@ impl Timeline { /// pageserver doesn't need catchup. /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_disconnect(&self) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes -= 1; - // If there is no pageserver, can suspend right away; otherwise let - // walsender do that. - if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() { - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.num_computes -= 1; + is_wal_backup_action_pending = shared_state.update_status(); + } + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.blocking_send(self.zttid)?; } Ok(()) } - /// Deactivate tenant if there is no computes and pageserver is caughtup, - /// assuming the pageserver status is in replica_id. - /// Returns true if deactivated. - pub fn check_deactivate(&self, replica_id: usize) -> Result { + /// Whether we still need this walsender running? + /// TODO: check this pageserver is actually interested in this timeline. 
+ pub fn stop_walsender(&self, replica_id: usize) -> Result { let mut shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { - // already suspended - return Ok(true); - } if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); - let deactivate = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet - (replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. - replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn); - if deactivate { - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + let stop = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet + (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. + replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); + if stop { + shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; return Ok(true); } } Ok(false) } + /// Returns whether s3 offloading is required and sets current status as + /// matching it. + pub fn wal_backup_attend(&self) -> bool { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.wal_backup_attend() + } + /// Deactivates the timeline, assuming it is being deleted. /// Returns whether the timeline was already active. /// @@ -354,10 +434,14 @@ impl Timeline { /// will stop by themselves eventually (possibly with errors, but no panics). There should be no /// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but /// we're deleting the timeline anyway. - pub fn deactivate_for_delete(&self) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); - let was_active = shared_state.active; - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + pub async fn deactivate_for_delete(&self) -> Result { + let was_active: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + was_active = shared_state.active; + shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; + } + self.wal_backup_launcher_tx.send(self.zttid).await?; Ok(was_active) } @@ -391,6 +475,7 @@ impl Timeline { } // Notify caught-up WAL senders about new WAL data received + // TODO: replace-unify it with commit_lsn_watch. fn notify_wal_senders(&self, shared_state: &mut MutexGuard) { if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn { shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn; @@ -398,12 +483,17 @@ impl Timeline { } } + pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { + self.commit_lsn_watch_rx.clone() + } + /// Pass arrived message to the safekeeper. pub fn process_msg( &self, msg: &ProposerAcceptorMessage, ) -> Result> { let mut rmsg: Option; + let commit_lsn: Lsn; { let mut shared_state = self.mutex.lock().unwrap(); rmsg = shared_state.sk.process_msg(msg)?; @@ -419,15 +509,31 @@ impl Timeline { // Ping wal sender that new data might be available. 
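The commit_lsn watch channel added to Timeline gives background tasks a cheap way to wait for WAL to become committed; a minimal sketch of a consumer, assuming the get_commit_lsn_watch_rx accessor above (the helper itself is hypothetical):

    use tokio::sync::watch;
    use utils::lsn::Lsn;

    // Wait until the quorum-committed LSN reaches `target`.
    async fn wait_for_commit_lsn(mut rx: watch::Receiver<Lsn>, target: Lsn) -> anyhow::Result<Lsn> {
        loop {
            let lsn = *rx.borrow();
            if lsn >= target {
                return Ok(lsn);
            }
            // Wakes whenever Timeline sends a new commit_lsn on the channel.
            rx.changed().await?;
        }
    }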
self.notify_wal_senders(&mut shared_state); + commit_lsn = shared_state.sk.inmem.commit_lsn; } + self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } + pub fn get_wal_seg_size(&self) -> usize { + self.mutex.lock().unwrap().get_wal_seg_size() + } + pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { let shared_state = self.mutex.lock().unwrap(); (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) } + pub fn get_wal_backup_lsn(&self) -> Lsn { + self.mutex.lock().unwrap().sk.inmem.backup_lsn + } + + pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) { + self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn; + // we should check whether to shut down offloader, but this will be done + // soon by peer communication anyway. + } + /// Prepare public safekeeper info for reporting. pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result { let shared_state = self.mutex.lock().unwrap(); @@ -436,7 +542,6 @@ impl Timeline { flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), // note: this value is not flushed to control file yet and can be lost commit_lsn: Some(shared_state.sk.inmem.commit_lsn), - s3_wal_lsn: Some(shared_state.sk.inmem.s3_wal_lsn), // TODO: rework feedbacks to avoid max here remote_consistent_lsn: Some(max( shared_state.get_replicas_state().remote_consistent_lsn, @@ -444,14 +549,35 @@ impl Timeline { )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), safekeeper_connection_string: Some(conf.listen_pg_addr.clone()), + backup_lsn: Some(shared_state.sk.inmem.backup_lsn), }) } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: NodeId) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.sk.record_safekeeper_info(sk_info)?; - self.notify_wal_senders(&mut shared_state); + pub async fn record_safekeeper_info( + &self, + sk_info: &SkTimelineInfo, + _sk_id: NodeId, + ) -> Result<()> { + let is_wal_backup_action_pending: bool; + let commit_lsn: Lsn; + { + let mut shared_state = self.mutex.lock().unwrap(); + // WAL seg size not initialized yet (no message from compute ever + // received), can't do much without it. + if shared_state.get_wal_seg_size() == 0 { + return Ok(()); + } + shared_state.sk.record_safekeeper_info(sk_info)?; + self.notify_wal_senders(&mut shared_state); + is_wal_backup_action_pending = shared_state.update_status(); + commit_lsn = shared_state.sk.inmem.commit_lsn; + } + self.commit_lsn_watch_tx.send(commit_lsn)?; + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.send(self.zttid).await?; + } Ok(()) } @@ -476,16 +602,16 @@ impl Timeline { shared_state.sk.wal_store.flush_lsn() } - pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> { + pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { let shared_state = self.mutex.lock().unwrap(); // WAL seg size not initialized yet, no WAL exists. 
- if shared_state.sk.state.server.wal_seg_size == 0 { + if shared_state.get_wal_seg_size() == 0 { return Ok(()); } - horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled); + horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); @@ -522,12 +648,14 @@ impl TimelineTools for Option> { struct GlobalTimelinesState { timelines: HashMap>, callmemaybe_tx: Option>, + wal_backup_launcher_tx: Option>, } lazy_static! { static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - callmemaybe_tx: None + callmemaybe_tx: None, + wal_backup_launcher_tx: None, }); } @@ -541,10 +669,15 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn set_callmemaybe_tx(callmemaybe_tx: UnboundedSender) { + pub fn init( + callmemaybe_tx: UnboundedSender, + wal_backup_launcher_tx: Sender, + ) { let mut state = TIMELINES_STATE.lock().unwrap(); assert!(state.callmemaybe_tx.is_none()); state.callmemaybe_tx = Some(callmemaybe_tx); + assert!(state.wal_backup_launcher_tx.is_none()); + state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); } fn create_internal( @@ -559,12 +692,14 @@ impl GlobalTimelines { // TODO: check directory existence let dir = conf.timeline_dir(&zttid); fs::create_dir_all(dir)?; + let shared_state = SharedState::create(conf, &zttid, peer_ids) .context("failed to create shared state")?; let new_tli = Arc::new(Timeline::new( zttid, state.callmemaybe_tx.as_ref().unwrap().clone(), + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); state.timelines.insert(zttid, Arc::clone(&new_tli)); @@ -594,8 +729,7 @@ impl GlobalTimelines { match state.timelines.get(&zttid) { Some(result) => Ok(Arc::clone(result)), None => { - let shared_state = - SharedState::restore(conf, &zttid).context("failed to restore shared state"); + let shared_state = SharedState::restore(conf, &zttid); let shared_state = match shared_state { Ok(shared_state) => shared_state, @@ -617,6 +751,7 @@ impl GlobalTimelines { let new_tli = Arc::new(Timeline::new( zttid, state.callmemaybe_tx.as_ref().unwrap().clone(), + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); state.timelines.insert(zttid, Arc::clone(&new_tli)); @@ -625,6 +760,12 @@ impl GlobalTimelines { } } + /// Get loaded timeline, if it exists. + pub fn get_loaded(zttid: ZTenantTimelineId) -> Option> { + let state = TIMELINES_STATE.lock().unwrap(); + state.timelines.get(&zttid).map(Arc::clone) + } + /// Get ZTenantTimelineIDs of all active timelines. pub fn get_active_timelines() -> Vec { let state = TIMELINES_STATE.lock().unwrap(); @@ -665,22 +806,23 @@ impl GlobalTimelines { /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. /// TODO: ensure all of the above never happens. 
- pub fn delete_force( + pub async fn delete_force( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, ) -> Result { info!("deleting timeline {}", zttid); - let was_active = match TIMELINES_STATE.lock().unwrap().timelines.remove(zttid) { - None => false, - Some(tli) => tli.deactivate_for_delete()?, - }; + let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); + let mut was_active = false; + if let Some(tli) = timeline { + was_active = tli.deactivate_for_delete().await?; + } GlobalTimelines::delete_force_internal(conf, zttid, was_active) } /// Deactivates and deletes all timelines for the tenant, see `delete()`. /// Returns map of all timelines which the tenant had, `true` if a timeline was active. /// There may be a race if new timelines are created simultaneously. - pub fn delete_force_all_for_tenant( + pub async fn delete_force_all_for_tenant( conf: &SafeKeeperConf, tenant_id: &ZTenantId, ) -> Result> { @@ -691,14 +833,15 @@ impl GlobalTimelines { let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; for (&zttid, tli) in timelines.iter() { if zttid.tenant_id == *tenant_id { - to_delete.insert(zttid, tli.deactivate_for_delete()?); + to_delete.insert(zttid, tli.clone()); } } // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); } let mut deleted = HashMap::new(); - for (zttid, was_active) in to_delete { + for (zttid, timeline) in to_delete { + let was_active = timeline.deactivate_for_delete().await?; deleted.insert( zttid, GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs new file mode 100644 index 0000000000..ef8ebe14e1 --- /dev/null +++ b/safekeeper/src/wal_backup.rs @@ -0,0 +1,418 @@ +use anyhow::{Context, Result}; +use tokio::task::JoinHandle; + +use std::cmp::min; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::Duration; + +use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI}; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use tokio::fs::File; +use tokio::runtime::Builder; + +use tokio::select; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use tokio::sync::watch; +use tokio::time::sleep; +use tracing::*; + +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + +use crate::broker::{Election, ElectionLeader}; +use crate::timeline::{GlobalTimelines, Timeline}; +use crate::{broker, SafeKeeperConf}; + +use once_cell::sync::OnceCell; + +const BACKUP_ELECTION_NAME: &str = "WAL_BACKUP"; + +const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000; + +const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; +const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; + +pub fn wal_backup_launcher_thread_main( + conf: SafeKeeperConf, + wal_backup_launcher_rx: Receiver, +) { + let rt = Builder::new_multi_thread() + .worker_threads(conf.backup_runtime_threads) + .enable_all() + .build() + .expect("failed to create wal backup runtime"); + + rt.block_on(async { + wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await; + }); +} + +/// Check whether wal backup is required for timeline and mark that launcher is +/// aware of current status (if timeline exists). 
+fn is_wal_backup_required(zttid: ZTenantTimelineId) -> bool { + if let Some(tli) = GlobalTimelines::get_loaded(zttid) { + tli.wal_backup_attend() + } else { + false + } +} + +struct WalBackupTaskHandle { + shutdown_tx: Sender<()>, + handle: JoinHandle<()>, +} + +/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup +/// tasks. Having this in separate task simplifies locking, allows to reap +/// panics and separate elections from offloading itself. +async fn wal_backup_launcher_main_loop( + conf: SafeKeeperConf, + mut wal_backup_launcher_rx: Receiver, +) { + info!( + "wal backup launcher started, remote config {:?}", + conf.remote_storage + ); + + let conf_ = conf.clone(); + REMOTE_STORAGE.get_or_init(|| { + conf_.remote_storage.as_ref().map(|c| { + GenericRemoteStorage::new(conf_.workdir, c).expect("failed to create remote storage") + }) + }); + + let mut tasks: HashMap = HashMap::new(); + + loop { + // channel is never expected to get closed + let zttid = wal_backup_launcher_rx.recv().await.unwrap(); + let is_wal_backup_required = is_wal_backup_required(zttid); + if conf.remote_storage.is_none() || !conf.wal_backup_enabled { + continue; /* just drain the channel and do nothing */ + } + // do we need to do anything at all? + if is_wal_backup_required != tasks.contains_key(&zttid) { + if is_wal_backup_required { + // need to start the task + info!("starting wal backup task for {}", zttid); + + // TODO: decide who should offload in launcher itself by simply checking current state + let election_name = broker::get_campaign_name( + BACKUP_ELECTION_NAME.to_string(), + conf.broker_etcd_prefix.clone(), + &zttid, + ); + let my_candidate_name = broker::get_candiate_name(conf.my_id); + let election = broker::Election::new( + election_name, + my_candidate_name, + conf.broker_endpoints.clone(), + ); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&zttid); + + let handle = tokio::spawn( + backup_task_main(zttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup", zttid = %zttid)), + ); + + tasks.insert( + zttid, + WalBackupTaskHandle { + shutdown_tx, + handle, + }, + ); + } else { + // need to stop the task + info!("stopping wal backup task for {}", zttid); + + let wb_handle = tasks.remove(&zttid).unwrap(); + // Tell the task to shutdown. Error means task exited earlier, that's ok. + let _ = wb_handle.shutdown_tx.send(()).await; + // Await the task itself. TODO: restart panicked tasks earlier. + // Hm, why I can't await on reference to handle? + if let Err(e) = wb_handle.handle.await { + warn!("WAL backup task for {} panicked: {}", zttid, e); + } + } + } + } +} + +struct WalBackupTask { + timeline: Arc, + timeline_dir: PathBuf, + wal_seg_size: usize, + commit_lsn_watch_rx: watch::Receiver, + leader: Option, + election: Election, +} + +/// Offload single timeline. +async fn backup_task_main( + zttid: ZTenantTimelineId, + timeline_dir: PathBuf, + mut shutdown_rx: Receiver<()>, + election: Election, +) { + info!("started"); + let timeline: Arc = if let Some(tli) = GlobalTimelines::get_loaded(zttid) { + tli + } else { + /* Timeline could get deleted while task was starting, just exit then. 
*/ + info!("no timeline, exiting"); + return; + }; + + let mut wb = WalBackupTask { + wal_seg_size: timeline.get_wal_seg_size(), + commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(), + timeline, + timeline_dir, + leader: None, + election, + }; + + // task is spinned up only when wal_seg_size already initialized + assert!(wb.wal_seg_size > 0); + + let mut canceled = false; + select! { + _ = wb.run() => {} + _ = shutdown_rx.recv() => { + canceled = true; + } + } + if let Some(l) = wb.leader { + l.give_up().await; + } + info!("task {}", if canceled { "canceled" } else { "terminated" }); +} + +impl WalBackupTask { + async fn run(&mut self) { + let mut backup_lsn = Lsn(0); + + // election loop + loop { + let mut retry_attempt = 0u32; + + if let Some(l) = self.leader.take() { + l.give_up().await; + } + + match broker::get_leader(&self.election).await { + Ok(l) => { + self.leader = Some(l); + } + Err(e) => { + error!("error during leader election {:?}", e); + sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await; + continue; + } + } + + // offload loop + loop { + if retry_attempt == 0 { + // wait for new WAL to arrive + if let Err(e) = self.commit_lsn_watch_rx.changed().await { + // should never happen, as we hold Arc to timeline. + error!("commit_lsn watch shut down: {:?}", e); + return; + } + } else { + // or just sleep if we errored previously + let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; + if let Some(backoff_delay) = + UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt) + { + retry_delay = min(retry_delay, backoff_delay); + } + sleep(Duration::from_millis(retry_delay)).await; + } + + let commit_lsn = *self.commit_lsn_watch_rx.borrow(); + assert!( + commit_lsn >= backup_lsn, + "backup lsn should never pass commit lsn" + ); + + if backup_lsn.segment_number(self.wal_seg_size) + == commit_lsn.segment_number(self.wal_seg_size) + { + continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ + } + // Perhaps peers advanced the position, check shmem value. 
+ backup_lsn = self.timeline.get_wal_backup_lsn(); + if backup_lsn.segment_number(self.wal_seg_size) + == commit_lsn.segment_number(self.wal_seg_size) + { + continue; + } + + if let Some(l) = self.leader.as_mut() { + // Optimization idea for later: + // Avoid checking election leader every time by returning current lease grant expiration time + // Re-check leadership only after expiration time, + // such approach woud reduce overhead on write-intensive workloads + + match l + .check_am_i( + self.election.election_name.clone(), + self.election.candidate_name.clone(), + ) + .await + { + Ok(leader) => { + if !leader { + info!("leader has changed"); + break; + } + } + Err(e) => { + warn!("error validating leader, {:?}", e); + break; + } + } + } + + match backup_lsn_range( + backup_lsn, + commit_lsn, + self.wal_seg_size, + &self.timeline_dir, + ) + .await + { + Ok(backup_lsn_result) => { + backup_lsn = backup_lsn_result; + self.timeline.set_wal_backup_lsn(backup_lsn_result); + retry_attempt = 0; + } + Err(e) => { + error!( + "failed while offloading range {}-{}: {:?}", + backup_lsn, commit_lsn, e + ); + + retry_attempt = min(retry_attempt + 1, u32::MAX); + } + } + } + } + } +} + +pub async fn backup_lsn_range( + start_lsn: Lsn, + end_lsn: Lsn, + wal_seg_size: usize, + timeline_dir: &Path, +) -> Result { + let mut res = start_lsn; + let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + for s in &segments { + backup_single_segment(s, timeline_dir) + .await + .with_context(|| format!("offloading segno {}", s.seg_no))?; + + res = s.end_lsn; + } + info!( + "offloaded segnos {:?} up to {}, previous backup_lsn {}", + segments.iter().map(|&s| s.seg_no).collect::>(), + end_lsn, + start_lsn, + ); + Ok(res) +} + +async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> { + let segment_file_name = seg.file_path(timeline_dir)?; + + backup_object(&segment_file_name, seg.size()).await?; + debug!("Backup of {} done", segment_file_name.display()); + + Ok(()) +} + +#[derive(Debug, Copy, Clone)] +pub struct Segment { + seg_no: XLogSegNo, + start_lsn: Lsn, + end_lsn: Lsn, +} + +impl Segment { + pub fn new(seg_no: u64, start_lsn: Lsn, end_lsn: Lsn) -> Self { + Self { + seg_no, + start_lsn, + end_lsn, + } + } + + pub fn object_name(self) -> String { + XLogFileName(PG_TLI, self.seg_no, self.size()) + } + + pub fn file_path(self, timeline_dir: &Path) -> Result { + Ok(timeline_dir.join(self.object_name())) + } + + pub fn size(self) -> usize { + (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize + } +} + +fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { + let first_seg = start.segment_number(seg_size); + let last_seg = end.segment_number(seg_size); + + let res: Vec = (first_seg..last_seg) + .map(|s| { + let start_lsn = XLogSegNoOffsetToRecPtr(s, 0, seg_size); + let end_lsn = XLogSegNoOffsetToRecPtr(s + 1, 0, seg_size); + Segment::new(s, Lsn::from(start_lsn), Lsn::from(end_lsn)) + }) + .collect(); + res +} + +static REMOTE_STORAGE: OnceCell> = OnceCell::new(); + +async fn backup_object(source_file: &Path, size: usize) -> Result<()> { + let storage = REMOTE_STORAGE.get().expect("failed to get remote storage"); + + let file = File::open(&source_file).await?; + + // Storage is initialized by launcher at ths point. 
+ match storage.as_ref().unwrap() { + GenericRemoteStorage::Local(local_storage) => { + let destination = local_storage.remote_object_id(source_file)?; + + debug!( + "local upload about to start from {} to {}", + source_file.display(), + destination.display() + ); + local_storage.upload(file, size, &destination, None).await + } + GenericRemoteStorage::S3(s3_storage) => { + let s3key = s3_storage.remote_object_id(source_file)?; + + debug!( + "S3 upload about to start from {} to {:?}", + source_file.display(), + s3key + ); + s3_storage.upload(file, size, &s3key, None).await + } + }?; + + Ok(()) +} diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index e1b7bd91ee..fc192c28e8 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -12,7 +12,7 @@ from contextlib import closing from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path -from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol +from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -401,7 +401,7 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): http_cli = env.safekeepers[0].http_client() # Pretend WAL is offloaded to s3. - http_cli.record_safekeeper_info(tenant_id, timeline_id, {'s3_wal_lsn': 'FFFFFFFF/FEFFFFFF'}) + http_cli.record_safekeeper_info(tenant_id, timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) # wait till first segment is removed on all safekeepers started_at = time.time() @@ -414,6 +414,56 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): time.sleep(0.5) +@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) +def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): + zenith_env_builder.num_safekeepers = 3 + if storage_type == 'local_fs': + zenith_env_builder.enable_local_fs_remote_storage() + elif storage_type == 'mock_s3': + zenith_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup') + else: + raise RuntimeError(f'Unknown storage type: {storage_type}') + zenith_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + + env = zenith_env_builder.init_start() + + env.zenith_cli.create_branch('test_safekeepers_wal_backup') + pg = env.postgres.create_start('test_safekeepers_wal_backup') + + # learn zenith timeline from compute + tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + + pg_conn = pg.connect() + cur = pg_conn.cursor() + cur.execute('create table t(key int, value text)') + + # Shut down subsequently each of safekeepers and fill a segment while sk is + # down; ensure segment gets offloaded by others. 
+ offloaded_seg_end = ['0/2000000', '0/3000000', '0/4000000'] + for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): + victim.stop() + # roughly fills one segment + cur.execute("insert into t select generate_series(1,250000), 'payload'") + live_sk = [sk for sk in env.safekeepers if sk != victim][0] + http_cli = live_sk.http_client() + + started_at = time.time() + while True: + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"live sk status is {tli_status}") + + if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s segment ending at {seg_end} get offloaded") + time.sleep(0.5) + + victim.start() + + class ProposerPostgres(PgProtocol): """Object for running postgres without ZenithEnv""" def __init__(self, diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 7f5b2ad2aa..a2e8c82d30 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import field +from enum import Flag, auto import textwrap from cached_property import cached_property import asyncpg @@ -421,10 +422,51 @@ class MockS3Server: def secret_key(self) -> str: return 'test' + def access_env_vars(self) -> Dict[Any, Any]: + return { + 'AWS_ACCESS_KEY_ID': self.access_key(), + 'AWS_SECRET_ACCESS_KEY': self.secret_key(), + } + def kill(self): self.subprocess.kill() +@dataclass +class LocalFsStorage: + local_path: Path + + +@dataclass +class S3Storage: + bucket_name: str + bucket_region: str + endpoint: Optional[str] + + +RemoteStorage = Union[LocalFsStorage, S3Storage] + + +# serialize as toml inline table +def remote_storage_to_toml_inline_table(remote_storage): + if isinstance(remote_storage, LocalFsStorage): + res = f"local_path='{remote_storage.local_path}'" + elif isinstance(remote_storage, S3Storage): + res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'" + if remote_storage.endpoint is not None: + res += f", endpoint='{remote_storage.endpoint}'" + else: + raise Exception(f'Unknown storage configuration {remote_storage}') + else: + raise Exception("invalid remote storage type") + return f"{{{res}}}" + + +class RemoteStorageUsers(Flag): + PAGESERVER = auto() + SAFEKEEPER = auto() + + class ZenithEnvBuilder: """ Builder object to create a Zenith runtime environment @@ -440,6 +482,7 @@ class ZenithEnvBuilder: broker: Etcd, mock_s3_server: MockS3Server, remote_storage: Optional[RemoteStorage] = None, + remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, pageserver_auth_enabled: bool = False, @@ -449,6 +492,7 @@ class ZenithEnvBuilder: self.rust_log_override = rust_log_override self.port_distributor = port_distributor self.remote_storage = remote_storage + self.remote_storage_users = remote_storage_users self.broker = broker self.mock_s3_server = mock_s3_server self.pageserver_config_override = pageserver_config_override @@ -497,9 +541,9 @@ class ZenithEnvBuilder: aws_access_key_id=self.mock_s3_server.access_key(), aws_secret_access_key=self.mock_s3_server.secret_key(), ).create_bucket(Bucket=bucket_name) - self.remote_storage = S3Storage(bucket=bucket_name, + self.remote_storage = S3Storage(bucket_name=bucket_name, endpoint=mock_endpoint, - region=mock_region) + 
bucket_region=mock_region) def __enter__(self): return self @@ -557,6 +601,7 @@ class ZenithEnv: self.safekeepers: List[Safekeeper] = [] self.broker = config.broker self.remote_storage = config.remote_storage + self.remote_storage_users = config.remote_storage_users # generate initial tenant ID here instead of letting 'zenith init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -605,8 +650,12 @@ class ZenithEnv: id = {id} pg_port = {port.pg} http_port = {port.http} - sync = false # Disable fsyncs to make the tests go faster - """) + sync = false # Disable fsyncs to make the tests go faster""") + if bool(self.remote_storage_users + & RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None: + toml += textwrap.dedent(f""" + remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}" + """) safekeeper = Safekeeper(env=self, id=id, port=port) self.safekeepers.append(safekeeper) @@ -638,7 +687,7 @@ def _shared_simple_env(request: Any, mock_s3_server: MockS3Server, default_broker: Etcd) -> Iterator[ZenithEnv]: """ - Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES + # Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES is set, this is shared by all tests using `zenith_simple_env`. """ @@ -822,20 +871,6 @@ class PageserverPort: http: int -@dataclass -class LocalFsStorage: - root: Path - - -@dataclass -class S3Storage: - bucket: str - region: str - endpoint: Optional[str] - - -RemoteStorage = Union[LocalFsStorage, S3Storage] - CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", re.MULTILINE) CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", @@ -998,6 +1033,7 @@ class ZenithCli: append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, pageserver_config_override=self.env.pageserver.config_override) res = self.raw_cli(cmd) @@ -1022,14 +1058,10 @@ class ZenithCli: append_pageserver_param_overrides( params_to_update=start_args, remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, pageserver_config_override=self.env.pageserver.config_override) - s3_env_vars = None - if self.env.s3_mock_server: - s3_env_vars = { - 'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(), - 'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(), - } + s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None return self.raw_cli(start_args, extra_env_vars=s3_env_vars) def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': @@ -1041,7 +1073,8 @@ class ZenithCli: return self.raw_cli(cmd) def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['safekeeper', 'start', str(id)]) + s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None + return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars) def safekeeper_stop(self, id: Optional[int] = None, @@ -1237,22 +1270,13 @@ class ZenithPageserver(PgProtocol): def append_pageserver_param_overrides( params_to_update: List[str], remote_storage: Optional[RemoteStorage], + remote_storage_users: RemoteStorageUsers, pageserver_config_override: Optional[str] = None, ): - if remote_storage is not None: - if isinstance(remote_storage, LocalFsStorage): - pageserver_storage_override = 
f"local_path='{remote_storage.root}'" - elif isinstance(remote_storage, S3Storage): - pageserver_storage_override = f"bucket_name='{remote_storage.bucket}',\ - bucket_region='{remote_storage.region}'" - - if remote_storage.endpoint is not None: - pageserver_storage_override += f",endpoint='{remote_storage.endpoint}'" - - else: - raise Exception(f'Unknown storage configuration {remote_storage}') + if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None: + remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) params_to_update.append( - f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}') + f'--pageserver-config-override=remote_storage={remote_storage_toml_table}') env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES') if env_overrides is not None: @@ -1786,8 +1810,9 @@ class Safekeeper: class SafekeeperTimelineStatus: acceptor_epoch: int flush_lsn: str - remote_consistent_lsn: str timeline_start_lsn: str + backup_lsn: str + remote_consistent_lsn: str @dataclass @@ -1812,8 +1837,9 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], flush_lsn=resj['flush_lsn'], - remote_consistent_lsn=resj['remote_consistent_lsn'], - timeline_start_lsn=resj['timeline_start_lsn']) + timeline_start_lsn=resj['timeline_start_lsn'], + backup_lsn=resj['backup_lsn'], + remote_consistent_lsn=resj['remote_consistent_lsn']) def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): res = self.post( From 54b75248ff53cd3530916200d9156a491c16b8dd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 27 May 2022 13:09:17 +0400 Subject: [PATCH 21/50] s3 WAL offloading staging review. - Uncomment accidently `self.keep_alive.abort()` commented line, due to this task never finished, which blocked launcher. - Mess up with initialization one more time, to fix offloader trying to back up segment 0. Now we initialize all required LSNs in handle_elected, where we learn start LSN for the first time. - Fix blind attempt to provide safekeeper service file with remote storage params. 
--- .circleci/ansible/systemd/safekeeper.service | 2 +- libs/utils/src/zid.rs | 2 +- safekeeper/src/broker.rs | 2 +- safekeeper/src/safekeeper.rs | 50 +++++++++----------- safekeeper/src/wal_backup.rs | 19 ++++---- 5 files changed, 35 insertions(+), 40 deletions(-) diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service index a6b443c3e7..e4a395a60e 100644 --- a/.circleci/ansible/systemd/safekeeper.service +++ b/.circleci/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote_storage='{bucket_name={{bucket_name}}, bucket_region={{bucket_region}}, prefix_in_bucket=wal}' +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index 02f781c49a..0ef174da4d 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -218,7 +218,7 @@ impl ZTenantTimelineId { impl fmt::Display for ZTenantTimelineId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}-{}", self.tenant_id, self.timeline_id) + write!(f, "{}/{}", self.tenant_id, self.timeline_id) } } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 676719b60d..5bcb197205 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -83,7 +83,7 @@ impl ElectionLeader { } pub async fn give_up(self) { - // self.keep_alive.abort(); + self.keep_alive.abort(); // TODO: it'll be wise to resign here but it'll happen after lease expiration anyway // should we await for keep alive termination? let _ = self.keep_alive.await; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 9a07127771..0a7adb96b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -731,24 +731,36 @@ where { let mut state = self.state.clone(); - // Remeber point where WAL begins globally, if not yet. + // Here we learn initial LSN for the first time, set fields + // interested in that. + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. state.timeline_start_lsn = msg.timeline_start_lsn; info!( "setting timeline_start_lsn to {:?}", state.timeline_start_lsn ); - } - // Remember point where WAL begins locally, if not yet. (I doubt the - // second condition is ever possible) - if state.local_start_lsn == Lsn(0) || state.local_start_lsn >= msg.start_streaming_at { state.local_start_lsn = msg.start_streaming_at; info!("setting local_start_lsn to {:?}", state.local_start_lsn); } + // Initializing commit_lsn before acking first flushed record is + // important to let find_end_of_wal skip the whole in the beginning + // of the first segment. + // + // NB: on new clusters, this happens at the same time as + // timeline_start_lsn initialization, it is taken outside to provide + // upgrade. 
+ self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); + self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); + self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); + + // Initalizing backup_lsn is useful to avoid making backup think it should upload 0 segment. + self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); - self.state.persist(&state)?; + self.persist_control_file(state)?; } info!("start receiving WAL since {:?}", msg.start_streaming_at); @@ -764,14 +776,6 @@ where self.inmem.commit_lsn = commit_lsn; self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); - // We got our first commit_lsn, which means we should sync - // everything to disk, to initialize the state. - if self.state.commit_lsn == Lsn::INVALID && commit_lsn != Lsn::INVALID { - self.inmem.backup_lsn = self.inmem.commit_lsn; // initialize backup_lsn - self.wal_store.flush_wal()?; - self.persist_control_file()?; - } - // If new commit_lsn reached epoch switch, force sync of control // file: walproposer in sync mode is very interested when this // happens. Note: this is for sync-safekeepers mode only, as @@ -780,15 +784,14 @@ where // that we receive new epoch_start_lsn, and we still need to sync // control file in this case. if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } Ok(()) } - /// Persist in-memory state to the disk. - fn persist_control_file(&mut self) -> Result<()> { - let mut state = self.state.clone(); + /// Persist in-memory state to the disk, taking other data from state. + fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; @@ -823,13 +826,6 @@ where // do the job if !msg.wal_data.is_empty() { self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - - // If this was the first record we ever received, initialize - // commit_lsn to help find_end_of_wal skip the hole in the - // beginning. 
- if self.global_commit_lsn == Lsn(0) { - self.global_commit_lsn = msg.h.begin_lsn; - } } // flush wal to the disk, if required @@ -852,7 +848,7 @@ where if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) < self.inmem.peer_horizon_lsn { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } trace!( @@ -920,7 +916,7 @@ where self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; } if sync_control_file { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } Ok(()) } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index ef8ebe14e1..83dc312d28 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -71,7 +71,7 @@ async fn wal_backup_launcher_main_loop( mut wal_backup_launcher_rx: Receiver, ) { info!( - "wal backup launcher started, remote config {:?}", + "WAL backup launcher: started, remote config {:?}", conf.remote_storage ); @@ -95,7 +95,7 @@ async fn wal_backup_launcher_main_loop( if is_wal_backup_required != tasks.contains_key(&zttid) { if is_wal_backup_required { // need to start the task - info!("starting wal backup task for {}", zttid); + info!("starting WAL backup task for {}", zttid); // TODO: decide who should offload in launcher itself by simply checking current state let election_name = broker::get_campaign_name( @@ -115,7 +115,7 @@ async fn wal_backup_launcher_main_loop( let handle = tokio::spawn( backup_task_main(zttid, timeline_dir, shutdown_rx, election) - .instrument(info_span!("WAL backup", zttid = %zttid)), + .instrument(info_span!("WAL backup task", zttid = %zttid)), ); tasks.insert( @@ -127,7 +127,7 @@ async fn wal_backup_launcher_main_loop( ); } else { // need to stop the task - info!("stopping wal backup task for {}", zttid); + info!("stopping WAL backup task for {}", zttid); let wb_handle = tasks.remove(&zttid).unwrap(); // Tell the task to shutdown. Error means task exited earlier, that's ok. @@ -236,20 +236,19 @@ impl WalBackupTask { } let commit_lsn = *self.commit_lsn_watch_rx.borrow(); - assert!( - commit_lsn >= backup_lsn, - "backup lsn should never pass commit lsn" - ); + // Note that backup_lsn can be higher than commit_lsn if we + // don't have much local WAL and others already uploaded + // segments we don't even have. if backup_lsn.segment_number(self.wal_seg_size) - == commit_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) { continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ } // Perhaps peers advanced the position, check shmem value. backup_lsn = self.timeline.get_wal_backup_lsn(); if backup_lsn.segment_number(self.wal_seg_size) - == commit_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) { continue; } From 75f71a63801c687a8bebe6aea28d751da52ac677 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 27 May 2022 11:43:06 -0400 Subject: [PATCH 22/50] Handle broken timelines on startup (#1809) Resolve #1663. 
## Changes - ignore a "broken" [1] timeline on page server startup - fix the race condition when creating multiple timelines in parallel for a tenant - added tests for the above changes [1]: a timeline is marked as "broken" if either - failed to load the timeline's metadata or - the timeline's disk consistent LSN is zero --- pageserver/src/layered_repository.rs | 2 +- pageserver/src/tenant_mgr.rs | 31 +++++++++++++++- pageserver/src/timelines.rs | 9 ++++- .../batch_others/test_broken_timeline.py | 37 ++++++++++++++++++- 4 files changed, 74 insertions(+), 5 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index d10c795214..0d7c6f54c8 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -2518,7 +2518,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } -fn load_metadata( +pub fn load_metadata( conf: &'static PageServerConf, timeline_id: ZTimelineId, tenant_id: ZTenantId, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index bba67394c3..cc35d79d16 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -2,7 +2,7 @@ //! page server. use crate::config::PageServerConf; -use crate::layered_repository::LayeredRepository; +use crate::layered_repository::{load_metadata, LayeredRepository}; use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::{Repository, TimelineSyncStatusUpdate}; use crate::storage_sync::index::RemoteIndex; @@ -22,6 +22,7 @@ use std::collections::HashMap; use std::fmt; use std::sync::Arc; use tracing::*; +use utils::lsn::Lsn; use utils::zid::{ZTenantId, ZTimelineId}; @@ -399,6 +400,26 @@ pub fn list_tenants() -> Vec { .collect() } +/// Check if a given timeline is "broken" \[1\]. +/// The function returns an error if the timeline is "broken". +/// +/// \[1\]: it's not clear now how should we classify a timeline as broken. +/// A timeline is categorized as broken when any of following conditions is true: +/// - failed to load the timeline's metadata +/// - the timeline's disk consistent LSN is zero +fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> { + let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id()) + .context("failed to load metadata")?; + + // A timeline with zero disk consistent LSN can happen when the page server + // failed to checkpoint the timeline import data when creating that timeline. 
+ if metadata.disk_consistent_lsn() == Lsn::INVALID { + bail!("Timeline {timeline_id} has a zero disk consistent LSN."); + } + + Ok(()) +} + fn init_local_repository( conf: &'static PageServerConf, tenant_id: ZTenantId, @@ -414,7 +435,13 @@ fn init_local_repository( match init_status { LocalTimelineInitStatus::LocallyComplete => { debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); + if let Err(err) = check_broken_timeline(&repo, timeline_id) { + info!( + "Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository" + ); + } else { + status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); + } } LocalTimelineInitStatus::NeedsSync => { debug!( diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 408eca6501..9ab063107c 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -285,7 +285,9 @@ fn bootstrap_timeline( ) -> Result<()> { let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); - let initdb_path = conf.tenant_path(&tenantid).join("tmp"); + let initdb_path = conf + .tenant_path(&tenantid) + .join(format!("tmp-timeline-{}", tli)); // Init temporarily repo to get bootstrap data run_initdb(conf, &initdb_path)?; @@ -300,6 +302,11 @@ fn bootstrap_timeline( let timeline = repo.create_empty_timeline(tli, lsn)?; let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + page_tline.tline.checkpoint(CheckpointConfig::Forced)?; info!( diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index 17eadb33b4..f0aa44e0a4 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -1,6 +1,7 @@ import pytest +import concurrent.futures from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv from fixtures.log_helper import log import os @@ -78,3 +79,37 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="Cannot load local timeline") as err: pg.start() log.info(f'compute startup failed as expected: {err}') + + +def test_create_multiple_timelines_parallel(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + + tenant_id, _ = env.zenith_cli.create_tenant() + + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [ + executor.submit(env.zenith_cli.create_timeline, + f"test-create-multiple-timelines-{i}", + tenant_id) for i in range(4) + ] + for future in futures: + future.result() + + +def test_fix_broken_timelines_on_startup(zenith_simple_env: ZenithEnv): + env = zenith_simple_env + + tenant_id, _ = env.zenith_cli.create_tenant() + + # Introduce failpoint when creating a new timeline + env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return") + with pytest.raises(Exception, match="before-checkpoint-new-timeline"): + _ = env.zenith_cli.create_timeline("test_fix_broken_timelines", tenant_id) + + # Restart the page server + env.zenith_cli.pageserver_stop(immediate=True) + env.zenith_cli.pageserver_start() + + # 
Check that the "broken" timeline is not loaded + timelines = env.zenith_cli.list_timelines(tenant_id) + assert len(timelines) == 1 From cb8bf1beb606fa97eeee0f038d28af4c7327af34 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 27 May 2022 14:10:10 +0400 Subject: [PATCH 23/50] Prevent commit_lsn <= flush_lsn violation after a42eba3cd7. Nothing complained about that yet, but we definitely don't hold at least one assert, so let's keep it this way until better version. --- safekeeper/src/safekeeper.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 0a7adb96b6..c254f2c57c 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -576,13 +576,16 @@ where self.state .acceptor_state .term_history - .up_to(self.wal_store.flush_lsn()) + .up_to(self.flush_lsn()) } pub fn get_epoch(&self) -> Term { - self.state - .acceptor_state - .get_epoch(self.wal_store.flush_lsn()) + self.state.acceptor_state.get_epoch(self.flush_lsn()) + } + + /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet. + fn flush_lsn(&self) -> Lsn { + max(self.wal_store.flush_lsn(), self.state.timeline_start_lsn) } /// Process message from proposer and possibly form reply. Concurrent @@ -671,7 +674,7 @@ where let mut resp = VoteResponse { term: self.state.acceptor_state.term, vote_given: false as u64, - flush_lsn: self.wal_store.flush_lsn(), + flush_lsn: self.flush_lsn(), truncate_lsn: self.state.peer_horizon_lsn, term_history: self.get_term_history(), timeline_start_lsn: self.state.timeline_start_lsn, @@ -703,7 +706,7 @@ where fn append_response(&self) -> AppendResponse { let ar = AppendResponse { term: self.state.acceptor_state.term, - flush_lsn: self.wal_store.flush_lsn(), + flush_lsn: self.flush_lsn(), commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), @@ -770,7 +773,7 @@ where /// Advance commit_lsn taking into account what we have locally pub fn update_commit_lsn(&mut self) -> Result<()> { - let commit_lsn = min(self.global_commit_lsn, self.wal_store.flush_lsn()); + let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); assert!(commit_lsn >= self.inmem.commit_lsn); self.inmem.commit_lsn = commit_lsn; From 757746b5717eec6e0c338e41f19844ec077852e7 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Fri, 27 May 2022 13:33:53 -0400 Subject: [PATCH 24/50] Fix `test_pageserver_http_get_wal_receiver_success` flaky test. (#1786) Fixes #1768. ## Context Previously, to test `get_wal_receiver` API, we make run some DB transactions then call the API to check the latest message's LSN from the WAL receiver. However, this test won't work because it's not guaranteed that the WAL receiver will get the latest WAL from the postgres/safekeeper at the time of making the API call. This PR resolves the above issue by adding a "poll and wait" code that waits to retrieve the latest data from the WAL receiver. This PR also fixes a bug that tries to compare two hex LSNs, should convert to number before the comparison. See: https://github.com/neondatabase/neon/issues/1768#issuecomment-1133752122. 
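
The hex-comparison bug is easy to reproduce in isolation. Below is a small, self-contained sketch; `lsn_from_hex` here is a simplified stand-in written for illustration (the test itself uses the Python fixture helper of the same name), and the LSN values are made up.

```rust
// Hex LSN strings compare lexicographically, so a numerically larger LSN can
// sort as "smaller". Parse to an integer before comparing.
fn lsn_from_hex(lsn: &str) -> u64 {
    let (hi, lo) = lsn.split_once('/').expect("expected a HI/LO hex LSN");
    (u64::from_str_radix(hi, 16).unwrap() << 32) | u64::from_str_radix(lo, 16).unwrap()
}

fn main() {
    let newer = "0/16B5A50"; // numerically the larger LSN
    let older = "0/9A3C80";  // numerically the smaller LSN
    assert!(newer < older);                             // string comparison: wrong order
    assert!(lsn_from_hex(newer) > lsn_from_hex(older)); // numeric comparison: correct
}
```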
--- .../batch_others/test_pageserver_api.py | 40 ++++++++++++++----- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 7fe3b4dff5..2b0e5ae8bd 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,11 +1,14 @@ +from typing import Optional from uuid import uuid4, UUID import pytest +from fixtures.utils import lsn_from_hex from fixtures.zenith_fixtures import ( DEFAULT_BRANCH_NAME, ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient, ZenithPageserverApiException, + wait_until, ) @@ -73,18 +76,35 @@ def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): tenant_id, timeline_id = env.zenith_cli.create_tenant() pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) - res = client.wal_receiver_get(tenant_id, timeline_id) - assert list(res.keys()) == [ - "thread_id", - "wal_producer_connstr", - "last_received_msg_lsn", - "last_received_msg_ts", - ] + def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int: + res = client.wal_receiver_get(tenant_id, timeline_id) - # make a DB modification then expect getting a new WAL receiver's data + # a successful `wal_receiver_get` response must contain the below fields + assert list(res.keys()) == [ + "thread_id", + "wal_producer_connstr", + "last_received_msg_lsn", + "last_received_msg_ts", + ] + + assert res["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" + + last_msg_lsn = lsn_from_hex(res["last_received_msg_lsn"]) + assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \ + f"the last received message's LSN {last_msg_lsn} hasn't been updated \ + compared to the previous message's LSN {prev_msg_lsn}" + + return last_msg_lsn + + # Wait to make sure that we get a latest WAL receiver data. + # We need to wait here because it's possible that we don't have access to + # the latest WAL during the time the `wal_receiver_get` API is called. + # See: https://github.com/neondatabase/neon/issues/1768. + lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None)) + + # Make a DB modification then expect getting a new WAL receiver's data. pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - res2 = client.wal_receiver_get(tenant_id, timeline_id) - assert res2["last_received_msg_lsn"] > res["last_received_msg_lsn"] + wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn)) def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): From 5d813f97386b34f020c8051bf2c5a1b06dc4e408 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 18 May 2022 16:01:56 +0300 Subject: [PATCH 25/50] [proxy] Refactoring This patch attempts to fix some of the technical debt we had to introduce in previous patches. 
--- proxy/src/auth.rs | 67 ++--- proxy/src/auth/backend.rs | 109 ++++++++ proxy/src/auth/backend/console.rs | 225 ++++++++++++++++ .../backend}/legacy_console.rs | 32 ++- .../{auth_backend => auth/backend}/link.rs | 14 +- proxy/src/auth/backend/postgres.rs | 88 +++++++ proxy/src/auth/credentials.rs | 30 ++- proxy/src/auth/flow.rs | 6 +- proxy/src/auth_backend.rs | 31 --- proxy/src/auth_backend/console.rs | 243 ------------------ proxy/src/auth_backend/postgres.rs | 93 ------- proxy/src/compute.rs | 4 +- proxy/src/config.rs | 35 ++- proxy/src/main.rs | 2 +- proxy/src/mgmt.rs | 8 +- proxy/src/url.rs | 82 ++++++ 16 files changed, 599 insertions(+), 470 deletions(-) create mode 100644 proxy/src/auth/backend.rs create mode 100644 proxy/src/auth/backend/console.rs rename proxy/src/{auth_backend => auth/backend}/legacy_console.rs (90%) rename proxy/src/{auth_backend => auth/backend}/link.rs (75%) create mode 100644 proxy/src/auth/backend/postgres.rs delete mode 100644 proxy/src/auth_backend.rs delete mode 100644 proxy/src/auth_backend/console.rs delete mode 100644 proxy/src/auth_backend/postgres.rs create mode 100644 proxy/src/url.rs diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 2463f31645..082a7bcf20 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,56 +1,58 @@ -mod credentials; -mod flow; +//! Client authentication mechanisms. -use crate::auth_backend::{console, legacy_console, link, postgres}; -use crate::config::{AuthBackendType, ProxyConfig}; -use crate::error::UserFacingError; -use crate::stream::PqStream; -use crate::{auth_backend, compute, waiters}; -use console::ConsoleAuthError::SniMissing; +pub mod backend; +pub use backend::DatabaseInfo; + +mod credentials; +pub use credentials::ClientCredentials; + +mod flow; +pub use flow::*; + +use crate::{error::UserFacingError, waiters}; use std::io; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; -pub use credentials::ClientCredentials; -pub use flow::*; +/// Convenience wrapper for the authentication error. +pub type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] pub enum AuthErrorImpl { /// Authentication error reported by the console. #[error(transparent)] - Console(#[from] auth_backend::AuthError), + Console(#[from] backend::AuthError), #[error(transparent)] - GetAuthInfo(#[from] auth_backend::console::ConsoleAuthError), + GetAuthInfo(#[from] backend::console::ConsoleAuthError), #[error(transparent)] Sasl(#[from] crate::sasl::Error), - /// For passwords that couldn't be processed by [`parse_password`]. + /// For passwords that couldn't be processed by [`backend::legacy_console::parse_password`]. #[error("Malformed password message")] MalformedPassword, - /// Errors produced by [`PqStream`]. + /// Errors produced by [`crate::stream::PqStream`]. 
#[error(transparent)] Io(#[from] io::Error), } impl AuthErrorImpl { pub fn auth_failed(msg: impl Into) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::auth_failed(msg)) + Self::Console(backend::AuthError::auth_failed(msg)) } } impl From for AuthErrorImpl { fn from(e: waiters::RegisterError) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::from(e)) + Self::Console(backend::AuthError::from(e)) } } impl From for AuthErrorImpl { fn from(e: waiters::WaitError) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::from(e)) + Self::Console(backend::AuthError::from(e)) } } @@ -63,7 +65,7 @@ where AuthErrorImpl: From, { fn from(e: T) -> Self { - AuthError(Box::new(e.into())) + Self(Box::new(e.into())) } } @@ -72,34 +74,9 @@ impl UserFacingError for AuthError { use AuthErrorImpl::*; match self.0.as_ref() { Console(e) => e.to_string_client(), + GetAuthInfo(e) => e.to_string_client(), MalformedPassword => self.to_string(), - GetAuthInfo(e) if matches!(e, SniMissing) => e.to_string(), _ => "Internal error".to_string(), } } } - -async fn handle_user( - config: &ProxyConfig, - client: &mut PqStream, - creds: ClientCredentials, -) -> Result { - match config.auth_backend { - AuthBackendType::LegacyConsole => { - legacy_console::handle_user( - &config.auth_endpoint, - &config.auth_link_uri, - client, - &creds, - ) - .await - } - AuthBackendType::Console => { - console::handle_user(config.auth_endpoint.as_ref(), client, &creds).await - } - AuthBackendType::Postgres => { - postgres::handle_user(&config.auth_endpoint, client, &creds).await - } - AuthBackendType::Link => link::handle_user(config.auth_link_uri.as_ref(), client).await, - } -} diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs new file mode 100644 index 0000000000..1d41f7f932 --- /dev/null +++ b/proxy/src/auth/backend.rs @@ -0,0 +1,109 @@ +mod legacy_console; +mod link; +mod postgres; + +pub mod console; + +pub use legacy_console::{AuthError, AuthErrorImpl}; + +use super::ClientCredentials; +use crate::{ + compute, + config::{AuthBackendType, ProxyConfig}, + mgmt, + stream::PqStream, + waiters::{self, Waiter, Waiters}, +}; +use lazy_static::lazy_static; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; + +lazy_static! { + static ref CPLANE_WAITERS: Waiters = Default::default(); +} + +/// Give caller an opportunity to wait for the cloud's reply. +pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result +where + R: std::future::Future>, + E: From, +{ + let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; + action(waiter).await +} + +pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { + CPLANE_WAITERS.notify(psql_session_id, msg) +} + +/// Compute node connection params provided by the cloud. +/// Note how it implements serde traits, since we receive it over the wire. +#[derive(Serialize, Deserialize, Default)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + pub password: Option, +} + +// Manually implement debug to omit personal and sensitive info. 
+impl std::fmt::Debug for DatabaseInfo { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .finish() + } +} + +impl From for tokio_postgres::Config { + fn from(db_info: DatabaseInfo) -> Self { + let mut config = tokio_postgres::Config::new(); + + config + .host(&db_info.host) + .port(db_info.port) + .dbname(&db_info.dbname) + .user(&db_info.user); + + if let Some(password) = db_info.password { + config.password(password); + } + + config + } +} + +pub(super) async fn handle_user( + config: &ProxyConfig, + client: &mut PqStream, + creds: ClientCredentials, +) -> super::Result { + use AuthBackendType::*; + match config.auth_backend { + LegacyConsole => { + legacy_console::handle_user( + &config.auth_endpoint, + &config.auth_link_uri, + client, + &creds, + ) + .await + } + Console => { + console::Api::new(&config.auth_endpoint, &creds)? + .handle_user(client) + .await + } + Postgres => { + postgres::Api::new(&config.auth_endpoint, &creds)? + .handle_user(client) + .await + } + Link => link::handle_user(&config.auth_link_uri, client).await, + } +} diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs new file mode 100644 index 0000000000..252522affb --- /dev/null +++ b/proxy/src/auth/backend/console.rs @@ -0,0 +1,225 @@ +//! Cloud API V2. + +use crate::{ + auth::{self, AuthFlow, ClientCredentials, DatabaseInfo}, + compute, + error::UserFacingError, + scram, + stream::PqStream, + url::ApiUrl, +}; +use serde::{Deserialize, Serialize}; +use std::{future::Future, io}; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +pub type Result = std::result::Result; + +#[derive(Debug, Error)] +pub enum ConsoleAuthError { + #[error(transparent)] + BadProjectName(#[from] auth::credentials::ProjectNameError), + + // We shouldn't include the actual secret here. + #[error("Bad authentication secret")] + BadSecret, + + #[error("Console responded with a malformed compute address: '{0}'")] + BadComputeAddress(String), + + #[error("Console responded with a malformed JSON: '{0}'")] + BadResponse(#[from] serde_json::Error), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl UserFacingError for ConsoleAuthError { + fn to_string_client(&self) -> String { + use ConsoleAuthError::*; + match self { + BadProjectName(e) => e.to_string_client(), + _ => "Internal error".to_string(), + } + } +} + +// TODO: convert into an enum with "error" +#[derive(Serialize, Deserialize, Debug)] +struct GetRoleSecretResponse { + role_secret: String, +} + +// TODO: convert into an enum with "error" +#[derive(Serialize, Deserialize, Debug)] +struct GetWakeComputeResponse { + address: String, +} + +/// Auth secret which is managed by the cloud. +pub enum AuthInfo { + /// Md5 hash of user's password. + Md5([u8; 16]), + + /// [SCRAM](crate::scram) authentication info. + Scram(scram::ServerSecret), +} + +#[must_use] +pub(super) struct Api<'a> { + endpoint: &'a ApiUrl, + creds: &'a ClientCredentials, + /// Cache project name, since we'll need it several times. + project: &'a str, +} + +impl<'a> Api<'a> { + /// Construct an API object containing the auth parameters. 
+ pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { + Ok(Self { + endpoint, + creds, + project: creds.project_name()?, + }) + } + + /// Authenticate the existing user or throw an error. + pub(super) async fn handle_user( + self, + client: &mut PqStream, + ) -> auth::Result { + handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + } + + async fn get_auth_info(&self) -> Result { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push("proxy_get_role_secret"); + url.query_pairs_mut() + .append_pair("project", self.project) + .append_pair("role", &self.creds.user); + + // TODO: use a proper logger + println!("cplane request: {url}"); + + let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetRoleSecretResponse = + serde_json::from_str(&resp.text().await.map_err(io_error)?)?; + + scram::ServerSecret::parse(response.role_secret.as_str()) + .map(AuthInfo::Scram) + .ok_or(ConsoleAuthError::BadSecret) + } + + /// Wake up the compute node and return the corresponding connection info. + async fn wake_compute(&self) -> Result { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push("proxy_wake_compute"); + url.query_pairs_mut().append_pair("project", self.project); + + // TODO: use a proper logger + println!("cplane request: {url}"); + + let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetWakeComputeResponse = + serde_json::from_str(&resp.text().await.map_err(io_error)?)?; + + let (host, port) = parse_host_port(&response.address) + .ok_or(ConsoleAuthError::BadComputeAddress(response.address))?; + + Ok(DatabaseInfo { + host, + port, + dbname: self.creds.dbname.to_owned(), + user: self.creds.user.to_owned(), + password: None, + }) + } +} + +/// Common logic for user handling in API V2. +/// We reuse this for a mock API implementation in [`super::postgres`]. +pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( + client: &mut PqStream, + endpoint: &'a Endpoint, + get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo, + wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, +) -> auth::Result +where + GetAuthInfo: Future>, + WakeCompute: Future>, +{ + let auth_info = get_auth_info(endpoint).await?; + + let flow = AuthFlow::new(client); + let scram_keys = match auth_info { + AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + return Err(auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); + } + AuthInfo::Scram(secret) => { + let scram = auth::Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + Ok(compute::NodeInfo { + db_info: wake_compute(endpoint).await?, + scram_keys, + }) +} + +/// Upcast (almost) any error into an opaque [`io::Error`]. 
+pub(super) fn io_error(e: impl Into>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} + +fn parse_host_port(input: &str) -> Option<(String, u16)> { + let (host, port) = input.split_once(':')?; + Some((host.to_owned(), port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } +} diff --git a/proxy/src/auth_backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs similarity index 90% rename from proxy/src/auth_backend/legacy_console.rs rename to proxy/src/auth/backend/legacy_console.rs index 29997d2389..467da63a98 100644 --- a/proxy/src/auth_backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -1,20 +1,18 @@ //! Cloud API V1. -use super::console::DatabaseInfo; - -use crate::auth::ClientCredentials; -use crate::stream::PqStream; - -use crate::{compute, waiters}; +use super::DatabaseInfo; +use crate::{ + auth::{self, ClientCredentials}, + compute, + error::UserFacingError, + stream::PqStream, + waiters, +}; use serde::{Deserialize, Serialize}; - +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; -use thiserror::Error; - -use crate::error::UserFacingError; - #[derive(Debug, Error)] pub enum AuthErrorImpl { /// Authentication error reported by the console. @@ -45,7 +43,7 @@ pub struct AuthError(Box); impl AuthError { /// Smart constructor for authentication error reported by `mgmt`. 
pub fn auth_failed(msg: impl Into) -> Self { - AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + Self(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) } } @@ -54,7 +52,7 @@ where AuthErrorImpl: From, { fn from(e: T) -> Self { - AuthError(Box::new(e.into())) + Self(Box::new(e.into())) } } @@ -120,7 +118,7 @@ async fn handle_existing_user( auth_endpoint: &reqwest::Url, client: &mut PqStream, creds: &ClientCredentials, -) -> Result { +) -> Result { let psql_session_id = super::link::new_psql_session_id(); let md5_salt = rand::random(); @@ -130,7 +128,7 @@ async fn handle_existing_user( // Read client's password hash let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(crate::auth::AuthErrorImpl::MalformedPassword)?; + let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword)?; let db_info = authenticate_proxy_client( auth_endpoint, @@ -156,11 +154,11 @@ pub async fn handle_user( auth_link_uri: &reqwest::Url, client: &mut PqStream, creds: &ClientCredentials, -) -> Result { +) -> auth::Result { if creds.is_existing_user() { handle_existing_user(auth_endpoint, client, creds).await } else { - super::link::handle_user(auth_link_uri.as_ref(), client).await + super::link::handle_user(auth_link_uri, client).await } } diff --git a/proxy/src/auth_backend/link.rs b/proxy/src/auth/backend/link.rs similarity index 75% rename from proxy/src/auth_backend/link.rs rename to proxy/src/auth/backend/link.rs index 8e5fcb32a9..669c9e00e9 100644 --- a/proxy/src/auth_backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,4 +1,4 @@ -use crate::{compute, stream::PqStream}; +use crate::{auth, compute, stream::PqStream}; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; @@ -19,13 +19,13 @@ pub fn new_psql_session_id() -> String { } pub async fn handle_user( - redirect_uri: &str, + redirect_uri: &reqwest::Url, client: &mut PqStream, -) -> Result { +) -> auth::Result { let psql_session_id = new_psql_session_id(); - let greeting = hello_message(redirect_uri, &psql_session_id); + let greeting = hello_message(redirect_uri.as_str(), &psql_session_id); - let db_info = crate::auth_backend::with_waiter(psql_session_id, |waiter| async { + let db_info = super::with_waiter(psql_session_id, |waiter| async { // Give user a URL to spawn a new database client .write_message_noflush(&Be::AuthenticationOk)? @@ -34,9 +34,7 @@ pub async fn handle_user( .await?; // Wait for web console response (see `mgmt`) - waiter - .await? - .map_err(crate::auth::AuthErrorImpl::auth_failed) + waiter.await?.map_err(auth::AuthErrorImpl::auth_failed) }) .await?; diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs new file mode 100644 index 0000000000..721b9db095 --- /dev/null +++ b/proxy/src/auth/backend/postgres.rs @@ -0,0 +1,88 @@ +//! Local mock of Cloud API V2. + +use crate::{ + auth::{ + self, + backend::console::{self, io_error, AuthInfo, Result}, + ClientCredentials, DatabaseInfo, + }, + compute, scram, + stream::PqStream, + url::ApiUrl, +}; +use tokio::io::{AsyncRead, AsyncWrite}; + +#[must_use] +pub(super) struct Api<'a> { + endpoint: &'a ApiUrl, + creds: &'a ClientCredentials, +} + +impl<'a> Api<'a> { + /// Construct an API object containing the auth parameters. + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { + Ok(Self { endpoint, creds }) + } + + /// Authenticate the existing user or throw an error. 
+ pub(super) async fn handle_user( + self, + client: &mut PqStream, + ) -> auth::Result { + // We reuse user handling logic from a production module. + console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + } + + /// This implementation fetches the auth info from a local postgres instance. + async fn get_auth_info(&self) -> Result { + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls) + .await + .map_err(io_error)?; + + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client + .query(query, &[&self.creds.user]) + .await + .map_err(io_error)?; + + match &rows[..] { + // We can't get a secret if there's no such user. + [] => Err(io_error(format!("unknown user '{}'", self.creds.user)).into()), + + // We shouldn't get more than one row anyway. + [row, ..] => { + let entry = row.try_get(0).map_err(io_error)?; + scram::ServerSecret::parse(entry) + .map(AuthInfo::Scram) + .or_else(|| { + // It could be an md5 hash if it's not a SCRAM secret. + let text = entry.strip_prefix("md5")?; + Some(AuthInfo::Md5({ + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + bytes + })) + }) + // Putting the secret into this message is a security hazard! + .ok_or(console::ConsoleAuthError::BadSecret) + } + } + } + + /// We don't need to wake anything locally, so we just return the connection info. + async fn wake_compute(&self) -> Result { + Ok(DatabaseInfo { + // TODO: handle that near CLI params parsing + host: self.endpoint.host_str().unwrap_or("localhost").to_owned(), + port: self.endpoint.port().unwrap_or(5432), + dbname: self.creds.dbname.to_owned(), + user: self.creds.user.to_owned(), + password: None, + }) + } +} diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 9d2272b5ad..467e7db282 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,6 +1,5 @@ //! User credentials used in authentication. -use super::AuthError; use crate::compute; use crate::config::ProxyConfig; use crate::error::UserFacingError; @@ -36,6 +35,27 @@ impl ClientCredentials { } } +#[derive(Debug, Error)] +pub enum ProjectNameError { + #[error("SNI is missing, please upgrade the postgres client library")] + Missing, + + #[error("SNI is malformed")] + Bad, +} + +impl UserFacingError for ProjectNameError {} + +impl ClientCredentials { + /// Determine project name from SNI. 
+ pub fn project_name(&self) -> Result<&str, ProjectNameError> { + // Currently project name is passed as a top level domain + let sni = self.sni_data.as_ref().ok_or(ProjectNameError::Missing)?; + let (first, _) = sni.split_once('.').ok_or(ProjectNameError::Bad)?; + Ok(first) + } +} + impl TryFrom> for ClientCredentials { type Error = ClientCredsParseError; @@ -47,11 +67,11 @@ impl TryFrom> for ClientCredentials { }; let user = get_param("user")?; - let db = get_param("database")?; + let dbname = get_param("database")?; Ok(Self { user, - dbname: db, + dbname, sni_data: None, }) } @@ -63,8 +83,8 @@ impl ClientCredentials { self, config: &ProxyConfig, client: &mut PqStream, - ) -> Result { + ) -> super::Result { // This method is just a convenient facade for `handle_user` - super::handle_user(config, client, self).await + super::backend::handle_user(config, client, self).await } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 3eed0f0a23..7efff13bfc 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,6 +1,6 @@ //! Main authentication flow. -use super::{AuthError, AuthErrorImpl}; +use super::AuthErrorImpl; use crate::stream::PqStream; use crate::{sasl, scram}; use std::io; @@ -32,7 +32,7 @@ impl AuthMethod for Scram<'_> { pub struct AuthFlow<'a, Stream, State> { /// The underlying stream which implements libpq's protocol. stream: &'a mut PqStream, - /// State might contain ancillary data (see [`AuthFlow::begin`]). + /// State might contain ancillary data (see [`Self::begin`]). state: State, } @@ -60,7 +60,7 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> Result { + pub async fn authenticate(self) -> super::Result { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; diff --git a/proxy/src/auth_backend.rs b/proxy/src/auth_backend.rs deleted file mode 100644 index 54362bf719..0000000000 --- a/proxy/src/auth_backend.rs +++ /dev/null @@ -1,31 +0,0 @@ -pub mod console; -pub mod legacy_console; -pub mod link; -pub mod postgres; - -pub use legacy_console::{AuthError, AuthErrorImpl}; - -use crate::mgmt; -use crate::waiters::{self, Waiter, Waiters}; -use lazy_static::lazy_static; - -lazy_static! { - static ref CPLANE_WAITERS: Waiters = Default::default(); -} - -/// Give caller an opportunity to wait for the cloud's reply. -pub async fn with_waiter( - psql_session_id: impl Into, - action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, -) -> Result -where - R: std::future::Future>, - E: From, -{ - let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - action(waiter).await -} - -pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { - CPLANE_WAITERS.notify(psql_session_id, msg) -} diff --git a/proxy/src/auth_backend/console.rs b/proxy/src/auth_backend/console.rs deleted file mode 100644 index 41a822701f..0000000000 --- a/proxy/src/auth_backend/console.rs +++ /dev/null @@ -1,243 +0,0 @@ -//! Declaration of Cloud API V2. 
- -use crate::{ - auth::{self, AuthFlow}, - compute, scram, -}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -use crate::auth::ClientCredentials; -use crate::stream::PqStream; - -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; - -#[derive(Debug, Error)] -pub enum ConsoleAuthError { - // We shouldn't include the actual secret here. - #[error("Bad authentication secret")] - BadSecret, - - #[error("Bad client credentials: {0:?}")] - BadCredentials(crate::auth::ClientCredentials), - - #[error("SNI info is missing, please upgrade the postgres client library")] - SniMissing, - - #[error("Unexpected SNI content")] - SniWrong, - - #[error(transparent)] - BadUrl(#[from] url::ParseError), - - #[error(transparent)] - Io(#[from] std::io::Error), - - /// HTTP status (other than 200) returned by the console. - #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error(transparent)] - Transport(#[from] reqwest::Error), - - #[error("Console responded with a malformed JSON: '{0}'")] - MalformedResponse(#[from] serde_json::Error), - - #[error("Console responded with a malformed compute address: '{0}'")] - MalformedComputeAddress(String), -} - -#[derive(Serialize, Deserialize, Debug)] -struct GetRoleSecretResponse { - role_secret: String, -} - -#[derive(Serialize, Deserialize, Debug)] -struct GetWakeComputeResponse { - address: String, -} - -/// Auth secret which is managed by the cloud. -pub enum AuthInfo { - /// Md5 hash of user's password. - Md5([u8; 16]), - /// [SCRAM](crate::scram) authentication info. - Scram(scram::ServerSecret), -} - -/// Compute node connection params provided by the cloud. -/// Note how it implements serde traits, since we receive it over the wire. -#[derive(Serialize, Deserialize, Default)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - - /// [Cloud API V1](super::legacy) returns cleartext password, - /// but [Cloud API V2](super::api) implements [SCRAM](crate::scram) - /// authentication, so we can leverage this method and cope without password. - pub password: Option, -} - -// Manually implement debug to omit personal and sensitive info. 
-impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .finish() - } -} - -impl From for tokio_postgres::Config { - fn from(db_info: DatabaseInfo) -> Self { - let mut config = tokio_postgres::Config::new(); - - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); - - if let Some(password) = db_info.password { - config.password(password); - } - - config - } -} - -async fn get_auth_info( - auth_endpoint: &str, - user: &str, - cluster: &str, -) -> Result { - let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?; - - url.query_pairs_mut() - .append_pair("project", cluster) - .append_pair("role", user); - - // TODO: use a proper logger - println!("cplane request: {}", url); - - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(ConsoleAuthError::HttpStatus(resp.status())); - } - - let response: GetRoleSecretResponse = serde_json::from_str(resp.text().await?.as_str())?; - - scram::ServerSecret::parse(response.role_secret.as_str()) - .map(AuthInfo::Scram) - .ok_or(ConsoleAuthError::BadSecret) -} - -/// Wake up the compute node and return the corresponding connection info. -async fn wake_compute( - auth_endpoint: &str, - cluster: &str, -) -> Result<(String, u16), ConsoleAuthError> { - let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?; - url.query_pairs_mut().append_pair("project", cluster); - - // TODO: use a proper logger - println!("cplane request: {}", url); - - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(ConsoleAuthError::HttpStatus(resp.status())); - } - - let response: GetWakeComputeResponse = serde_json::from_str(resp.text().await?.as_str())?; - let (host, port) = response - .address - .split_once(':') - .ok_or_else(|| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; - let port: u16 = port - .parse() - .map_err(|_| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; - - Ok((host.to_string(), port)) -} - -pub async fn handle_user( - auth_endpoint: &str, - client: &mut PqStream, - creds: &ClientCredentials, -) -> Result { - // Determine cluster name from SNI. - let cluster = creds - .sni_data - .as_ref() - .ok_or(ConsoleAuthError::SniMissing)? - .split_once('.') - .ok_or(ConsoleAuthError::SniWrong)? - .0; - - let user = creds.user.as_str(); - - // Step 1: get the auth secret - let auth_info = get_auth_info(auth_endpoint, user, cluster).await?; - - let flow = AuthFlow::new(client); - let scram_keys = match auth_info { - AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 - return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); - } - AuthInfo::Scram(secret) => { - let scram = auth::Scram(&secret); - Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), - server_key: secret.server_key.as_bytes(), - }) - } - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? 
- .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - // Step 2: wake compute - let (host, port) = wake_compute(auth_endpoint, cluster).await?; - - Ok(compute::NodeInfo { - db_info: DatabaseInfo { - host, - port, - dbname: creds.dbname.clone(), - user: creds.user.clone(), - password: None, - }, - scram_keys, - }) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/auth_backend/postgres.rs b/proxy/src/auth_backend/postgres.rs deleted file mode 100644 index 148c2a2518..0000000000 --- a/proxy/src/auth_backend/postgres.rs +++ /dev/null @@ -1,93 +0,0 @@ -//! Local mock of Cloud API V2. - -use super::console::{self, AuthInfo, DatabaseInfo}; -use crate::scram; -use crate::{auth::ClientCredentials, compute}; - -use crate::stream::PqStream; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; - -async fn get_auth_info( - auth_endpoint: &str, - creds: &ClientCredentials, -) -> Result { - // We wrap `tokio_postgres::Error` because we don't want to infect the - // method's error type with a detail that's specific to debug mode only. - let io_error = |e| std::io::Error::new(std::io::ErrorKind::Other, e); - - // Perhaps we could persist this connection, but then we'd have to - // write more code for reopening it if it got closed, which doesn't - // seem worth it. - let (client, connection) = tokio_postgres::connect(auth_endpoint, tokio_postgres::NoTls) - .await - .map_err(io_error)?; - - tokio::spawn(connection); - let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; - let rows = client - .query(query, &[&creds.user]) - .await - .map_err(io_error)?; - - match &rows[..] { - // We can't get a secret if there's no such user. - [] => Err(console::ConsoleAuthError::BadCredentials(creds.to_owned())), - // We shouldn't get more than one row anyway. - [row, ..] => { - let entry = row.try_get(0).map_err(io_error)?; - scram::ServerSecret::parse(entry) - .map(AuthInfo::Scram) - .or_else(|| { - // It could be an md5 hash if it's not a SCRAM secret. - let text = entry.strip_prefix("md5")?; - Some(AuthInfo::Md5({ - let mut bytes = [0u8; 16]; - hex::decode_to_slice(text, &mut bytes).ok()?; - bytes - })) - }) - // Putting the secret into this message is a security hazard! - .ok_or(console::ConsoleAuthError::BadSecret) - } - } -} - -pub async fn handle_user( - auth_endpoint: &reqwest::Url, - client: &mut PqStream, - creds: &ClientCredentials, -) -> Result { - let auth_info = get_auth_info(auth_endpoint.as_ref(), creds).await?; - - let flow = crate::auth::AuthFlow::new(client); - let scram_keys = match auth_info { - AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 - return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); - } - AuthInfo::Scram(secret) => { - let scram = crate::auth::Scram(&secret); - Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), - server_key: secret.server_key.as_bytes(), - }) - } - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? 
- .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(compute::NodeInfo { - db_info: DatabaseInfo { - // TODO: handle that near CLI params parsing - host: auth_endpoint.host_str().unwrap_or("localhost").to_owned(), - port: auth_endpoint.port().unwrap_or(5432), - dbname: creds.dbname.to_owned(), - user: creds.user.to_owned(), - password: None, - }, - scram_keys, - }) -} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index c3c5ba47fb..cccd6e60d4 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,4 +1,4 @@ -use crate::auth_backend::console::DatabaseInfo; +use crate::auth::DatabaseInfo; use crate::cancellation::CancelClosure; use crate::error::UserFacingError; use std::io; @@ -37,7 +37,7 @@ pub struct NodeInfo { impl NodeInfo { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { - let host_port = format!("{}:{}", self.db_info.host, self.db_info.port); + let host_port = (self.db_info.host.as_str(), self.db_info.port); let socket = TcpStream::connect(host_port).await?; let socket_addr = socket.peer_addr()?; socket2::SockRef::from(&socket).set_keepalive(true)?; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6f1b56bfe4..a5cd17eb55 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,39 +1,38 @@ -use anyhow::{ensure, Context}; +use crate::url::ApiUrl; +use anyhow::{bail, ensure, Context}; use std::{str::FromStr, sync::Arc}; -#[non_exhaustive] pub enum AuthBackendType { + /// Legacy Cloud API (V1). LegacyConsole, - Console, - Postgres, + /// Authentication via a web browser. Link, + /// Current Cloud API (V2). + Console, + /// Local mock of Cloud API (V2). + Postgres, } impl FromStr for AuthBackendType { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { - println!("ClientAuthMethod::from_str: '{}'", s); use AuthBackendType::*; - match s { - "legacy" => Ok(LegacyConsole), - "console" => Ok(Console), - "postgres" => Ok(Postgres), - "link" => Ok(Link), - _ => Err(anyhow::anyhow!("Invlid option for auth method")), - } + Ok(match s { + "legacy" => LegacyConsole, + "console" => Console, + "postgres" => Postgres, + "link" => Link, + _ => bail!("Invalid option `{s}` for auth method"), + }) } } pub struct ProxyConfig { - /// TLS configuration for the proxy. pub tls_config: Option, - pub auth_backend: AuthBackendType, - - pub auth_endpoint: reqwest::Url, - - pub auth_link_uri: reqwest::Url, + pub auth_endpoint: ApiUrl, + pub auth_link_uri: ApiUrl, } pub type TlsConfig = Arc; diff --git a/proxy/src/main.rs b/proxy/src/main.rs index b457d46824..672f24b6fb 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -5,7 +5,6 @@ //! in somewhat transparent manner (again via communication with control plane API). mod auth; -mod auth_backend; mod cancellation; mod compute; mod config; @@ -17,6 +16,7 @@ mod proxy; mod sasl; mod scram; mod stream; +mod url; mod waiters; use anyhow::{bail, Context}; diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 93618fff68..8737d170b1 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,4 +1,4 @@ -use crate::auth_backend; +use crate::auth; use anyhow::Context; use serde::Deserialize; use std::{ @@ -77,12 +77,12 @@ struct PsqlSessionResponse { #[derive(Deserialize)] enum PsqlSessionResult { - Success(auth_backend::console::DatabaseInfo), + Success(auth::DatabaseInfo), Failure(String), } /// A message received by `mgmt` when a compute node is ready. 
-pub type ComputeReady = Result; +pub type ComputeReady = Result; impl PsqlSessionResult { fn into_compute_ready(self) -> ComputeReady { @@ -113,7 +113,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - match auth_backend::notify(&resp.session_id, resp.result.into_compute_ready()) { + match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/url.rs b/proxy/src/url.rs new file mode 100644 index 0000000000..76d6ad0e66 --- /dev/null +++ b/proxy/src/url.rs @@ -0,0 +1,82 @@ +use anyhow::bail; +use url::form_urlencoded::Serializer; + +/// A [url](url::Url) type with additional guarantees. +#[derive(Debug, Clone)] +pub struct ApiUrl(url::Url); + +impl ApiUrl { + /// Consume the wrapper and return inner [url](url::Url). + pub fn into_inner(self) -> url::Url { + self.0 + } + + /// See [`url::Url::query_pairs_mut`]. + pub fn query_pairs_mut(&mut self) -> Serializer<'_, url::UrlQuery<'_>> { + self.0.query_pairs_mut() + } + + /// See [`url::Url::path_segments_mut`]. + pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { + // We've already verified that it works during construction. + self.0.path_segments_mut().expect("bad API url") + } +} + +/// This instance imposes additional requirements on the url. +impl std::str::FromStr for ApiUrl { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let mut url: url::Url = s.parse()?; + + // Make sure that we can build upon this URL. + if url.path_segments_mut().is_err() { + bail!("bad API url provided"); + } + + Ok(Self(url)) + } +} + +/// This instance is safe because it doesn't allow us to modify the object. +impl std::ops::Deref for ApiUrl { + type Target = url::Url; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Display for ApiUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bad_url() { + let url = "test:foobar"; + url.parse::().expect("unexpected parsing failure"); + let _ = url.parse::().expect_err("should not parse"); + } + + #[test] + fn good_url() { + let url = "test://foobar"; + let mut a = url.parse::().expect("unexpected parsing failure"); + let mut b = url.parse::().expect("unexpected parsing failure"); + + a.path_segments_mut().unwrap().push("method"); + a.query_pairs_mut().append_pair("key", "value"); + + b.path_segments_mut().push("method"); + b.query_pairs_mut().append_pair("key", "value"); + + assert_eq!(a, b.into_inner()); + } +} From b3ec6e0661e1f08beb1cd08b265cc64af0cd4035 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Thu, 26 May 2022 20:39:33 +0300 Subject: [PATCH 26/50] [proxy] Propagate SASL/SCRAM auth errors to the user This will replace the vague (and incorrect) "Internal error" with a nice and helpful authentication error, e.g. "password doesn't match". 
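To make the intent concrete, here is a minimal, self-contained sketch of the error-mapping idea (illustrative only: the enum, the default method body and the messages below are simplified stand-ins for the real `sasl::Error` and `UserFacingError` in this series). Internal errors keep their detailed Display text for server-side logs, while clients only ever see the sanitized `to_string_client()` text:

    use std::fmt;

    /// Mirrors the proxy's `UserFacingError` trait; the default body is illustrative.
    trait UserFacingError: fmt::Display {
        /// Message that is safe to show to the client.
        fn to_string_client(&self) -> String {
            "Internal error".to_string()
        }
    }

    /// Simplified stand-in for `sasl::Error`; the real enum has more variants.
    #[derive(Debug)]
    enum SaslError {
        AuthenticationFailed(&'static str),
        ProtocolViolation(String),
    }

    impl fmt::Display for SaslError {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            match self {
                Self::AuthenticationFailed(s) => write!(f, "authentication failed: {s}"),
                Self::ProtocolViolation(s) => write!(f, "protocol violation: {s}"),
            }
        }
    }

    impl UserFacingError for SaslError {
        fn to_string_client(&self) -> String {
            match self {
                // This variant already carries the reason auth failed,
                // so it is safe to propagate it verbatim.
                Self::AuthenticationFailed(s) => s.to_string(),
                // Everything else stays opaque to the client.
                _ => "authentication protocol violation".to_string(),
            }
        }
    }

    fn main() {
        let err = SaslError::AuthenticationFailed("password doesn't match");
        println!("server log: {err}");
        println!("client sees: {}", err.to_string_client());

        let err = SaslError::ProtocolViolation("unexpected message".into());
        println!("server log: {err}");
        println!("client sees: {}", err.to_string_client());
    }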
--- proxy/src/auth.rs | 1 + proxy/src/config.rs | 1 + proxy/src/main.rs | 1 + proxy/src/sasl.rs | 15 +++++++++++++++ proxy/src/scram/exchange.rs | 6 ++++-- 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 082a7bcf20..9bddd58fce 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -75,6 +75,7 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Console(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), + Sasl(e) => e.to_string_client(), MalformedPassword => self.to_string(), _ => "Internal error".to_string(), } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a5cd17eb55..4def11aefc 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,6 +2,7 @@ use crate::url::ApiUrl; use anyhow::{bail, ensure, Context}; use std::{str::FromStr, sync::Arc}; +#[derive(Debug)] pub enum AuthBackendType { /// Legacy Cloud API (V1). LegacyConsole, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 672f24b6fb..b68b2440dd 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -126,6 +126,7 @@ async fn main() -> anyhow::Result<()> { })); println!("Version: {GIT_VERSION}"); + println!("Authentication backend: {:?}", config.auth_backend); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index cd9032bfb9..689fca6049 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -10,6 +10,7 @@ mod channel_binding; mod messages; mod stream; +use crate::error::UserFacingError; use std::io; use thiserror::Error; @@ -36,6 +37,20 @@ pub enum Error { Io(#[from] io::Error), } +impl UserFacingError for Error { + fn to_string_client(&self) -> String { + use Error::*; + match self { + // This constructor contains the reason why auth has failed. + AuthenticationFailed(s) => s.to_string(), + // TODO: add support for channel binding + ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(), + ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), + _ => "authentication protocol violation".to_string(), + } + } +} + /// A convenient result type for SASL exchange. pub type Result = std::result::Result; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index cad77e15f5..fca5585b25 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -106,7 +106,9 @@ impl sasl::Mechanism for Exchange<'_> { } if client_final_message.nonce != server_first_message.nonce() { - return Err(SaslError::AuthenticationFailed("bad nonce")); + return Err(SaslError::AuthenticationFailed( + "combined nonce doesn't match", + )); } let signature_builder = SignatureBuilder { @@ -120,7 +122,7 @@ impl sasl::Mechanism for Exchange<'_> { .derive_client_key(&client_final_message.proof); if client_key.sha256() != self.secret.stored_key { - return Err(SaslError::AuthenticationFailed("keys don't match")); + return Err(SaslError::AuthenticationFailed("password doesn't match")); } let msg = client_final_message From 500e8772f058ccb1a7cccbbfc83c80d14aa26a1e Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Fri, 27 May 2022 17:48:11 -0400 Subject: [PATCH 27/50] Add quick-start guide in readme (#1816) --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 8e8bf1a9b2..97927317d8 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,11 @@ Neon is a serverless open source alternative to AWS Aurora Postgres. 
It separate The project used to be called "Zenith". Many of the commands and code comments still refer to "zenith", but we are in the process of renaming things. +## Quick start +[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor. + +Alternatively, compile and run the project [locally](#running-local-installation). + ## Architecture overview A Neon installation consists of compute nodes and Neon storage engine. From f1c51a12675587fc1c28412e7ee3c31212e01cd7 Mon Sep 17 00:00:00 2001 From: Kian-Meng Ang Date: Sat, 28 May 2022 13:27:30 +0800 Subject: [PATCH 28/50] Fix typos --- .circleci/ansible/get_binaries.sh | 4 ++-- .github/workflows/benchmarking.yml | 2 +- Dockerfile | 2 +- Dockerfile.alpine | 2 +- docs/glossary.md | 2 +- docs/multitenancy.md | 2 +- docs/rfcs/002-storage.md | 6 +++--- docs/rfcs/003-laptop-cli.md | 2 +- docs/rfcs/005-zenith_local.md | 2 +- docs/rfcs/006-laptop-cli-v2-CLI.md | 4 ++-- docs/rfcs/009-snapshot-first-storage-cli.md | 2 +- docs/rfcs/009-snapshot-first-storage-pitr.md | 2 +- docs/rfcs/010-storage_details.md | 2 +- docs/rfcs/013-term-history.md | 2 +- docs/rfcs/015-storage-messaging.md | 6 +++--- docs/rfcs/README.md | 2 +- docs/settings.md | 2 +- docs/sourcetree.md | 4 ++-- libs/postgres_ffi/src/waldecoder.rs | 2 +- libs/postgres_ffi/src/xlog_utils.rs | 2 +- libs/utils/src/bin_ser.rs | 4 ++-- libs/utils/src/lib.rs | 2 +- libs/utils/src/postgres_backend.rs | 2 +- libs/utils/src/pq_proto.rs | 8 ++++---- pageserver/src/config.rs | 2 +- pageserver/src/keyspace.rs | 2 +- pageserver/src/layered_repository.rs | 6 +++--- pageserver/src/layered_repository/disk_btree.rs | 2 +- pageserver/src/page_service.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 2 +- .../src/remote_storage/storage_sync/delete.rs | 2 +- pageserver/src/repository.rs | 10 +++++----- pageserver/src/storage_sync.rs | 16 ++++++++-------- pageserver/src/storage_sync/delete.rs | 2 +- pageserver/src/storage_sync/download.rs | 2 +- pageserver/src/storage_sync/index.rs | 4 ++-- pageserver/src/storage_sync/upload.rs | 2 +- pageserver/src/virtual_file.rs | 2 +- pageserver/src/walingest.rs | 2 +- pageserver/src/walredo.rs | 4 ++-- proxy/src/proxy.rs | 2 +- safekeeper/README.md | 2 +- safekeeper/README_PROTO.md | 4 ++-- safekeeper/spec/ProposerAcceptorConsensus.tla | 2 +- safekeeper/src/bin/safekeeper.rs | 2 +- safekeeper/src/callmemaybe.rs | 2 +- safekeeper/src/control_file_upgrade.rs | 8 ++++---- safekeeper/src/safekeeper.rs | 4 ++-- safekeeper/src/wal_backup.rs | 4 ++-- safekeeper/src/wal_storage.rs | 4 ++-- test_runner/batch_others/test_clog_truncate.py | 2 +- test_runner/batch_others/test_pitr_gc.py | 2 +- test_runner/batch_others/test_remote_storage.py | 2 +- .../batch_others/test_tenant_relocation.py | 2 +- test_runner/batch_others/test_vm_bits.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 4 ++-- test_runner/fixtures/benchmark_fixture.py | 4 ++-- test_runner/fixtures/zenith_fixtures.py | 8 ++++---- 58 files changed, 96 insertions(+), 96 deletions(-) diff --git a/.circleci/ansible/get_binaries.sh b/.circleci/ansible/get_binaries.sh index c613213a75..c9cbe91f34 100755 --- a/.circleci/ansible/get_binaries.sh +++ b/.circleci/ansible/get_binaries.sh @@ -6,7 +6,7 @@ RELEASE=${RELEASE:-false} # look at docker hub for latest tag for neon docker image if [ "${RELEASE}" = "true" ]; then - echo "search latest relase tag" + echo "search latest release 
tag" VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." @@ -31,7 +31,7 @@ echo "found ${VERSION}" rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version mkdir neon_install -# retrive binaries from docker image +# retrieve binaries from docker image echo "getting binaries from docker image" docker pull --quiet neondatabase/neon:${TAG} ID=$(docker create neondatabase/neon:${TAG}) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 72041c9d02..adb53c0009 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -19,7 +19,7 @@ jobs: bench: # this workflow runs on self hosteed runner # it's environment is quite different from usual guthub runner - # probably the most important difference is that it doesnt start from clean workspace each time + # probably the most important difference is that it doesn't start from clean workspace each time # e g if you install system packages they are not cleaned up since you install them directly in host machine # not a container or something # See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners diff --git a/Dockerfile b/Dockerfile index a7afd1f335..62e0de7e15 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/inclu COPY . . # Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. +# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ && sudo -E "PATH=$PATH" mold -run cargo build --release \ && cachepot -s diff --git a/Dockerfile.alpine b/Dockerfile.alpine index dafb7eaf6b..0f244e4443 100644 --- a/Dockerfile.alpine +++ b/Dockerfile.alpine @@ -4,7 +4,7 @@ # We may also reuse it in CI to unify installation process and as a general binaries building # tool for production servers. # -# Dynamic linking is used for librocksdb and libstdc++ bacause librocksdb-sys calls +# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls # bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust # images which are statically linked and have guards against any dlopen. I would rather # prefer all static binaries so we may change the way librocksdb-sys builds or wait until diff --git a/docs/glossary.md b/docs/glossary.md index a014446010..0de0eea1cb 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -115,7 +115,7 @@ Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/RE * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. -* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. +* `VCL`: the largerst LSN for which we can guarantee availability of all prior records. Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. 
diff --git a/docs/multitenancy.md b/docs/multitenancy.md index 4f1d45e970..c697ae93cd 100644 --- a/docs/multitenancy.md +++ b/docs/multitenancy.md @@ -6,7 +6,7 @@ Zenith supports multitenancy. One pageserver can serve multiple tenants at once. ### Tenants in other commands -By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct arugment `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. +By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. Examples for cli: diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index 5cac377272..6e756df4bf 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -111,13 +111,13 @@ Since we are storing page diffs of variable sizes there is no structural depende ### **Chunk metadata** -Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunck should always consult this data when merging SSTables and applying delete markers. +Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunk should always consult this data when merging SSTables and applying delete markers. ### **Chunk splitting** *(NB: following paragraph is about how to avoid page splitting)* -When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global matadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following: +When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global metadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following: 1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries. @@ -166,7 +166,7 @@ Multi-tenant storage makes sense even on a laptop, when you work with different Few databases are stored in one chunk, replicated three times -- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we alway may manually move chunks around the cluster. +- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we always may manually move chunks around the cluster. Screenshot_2021-02-22_at_16 49 10 diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 4d1f0a68f0..8520249bf1 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -123,7 +123,7 @@ Show currently attached storages. 
For example: > zenith storage list NAME USED TYPE OPTIONS PATH local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local comression=on /opt/zenith/store/local.compr +local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr zcloud 60G zenith-remote zenith.tech/stas/mystore s3tank 80G S3 ``` diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index 7b078e9ec0..e36d0a9ae3 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -31,7 +31,7 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responces to show them in a user-friendly way. +- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index a04536922a..84dc932211 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -25,9 +25,9 @@ To make changes in the catalog you need to run compute nodes zenith start /home/pipedpiper/northwind:main -- starts a compute instance zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start anothe compute instance (on different port) +zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start anothe compute instance (on different port) +zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run -- zenith status and see how there are two WAL streams one on top of diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 11ded3a724..0139569721 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -4,7 +4,7 @@ We may think about backups as snapshots in a different format (i.e plain pgdata Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postges to zenith. -So here is an attemt to design consistent CLI for diferent usage scenarios: +So here is an attempt to design consistent CLI for different usage scenarios: #### 1. Start empty pageserver. That is what we have now. diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md index 801613e2c9..a4d978324b 100644 --- a/docs/rfcs/009-snapshot-first-storage-pitr.md +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -3,7 +3,7 @@ GetPage@LSN can be called with older LSNs, and the page server needs to be able to reconstruct older page versions. That's needed for having read-only replicas that lag behind the primary, or that are -"anchored" at an older LSN, and internally in the page server whne you +"anchored" at an older LSN, and internally in the page server when you branch at an older point in time. How do you do that? 
For now, I'm not considering incremental snapshots at all. I don't diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md index 8429a2d9e3..5c279b7dc8 100644 --- a/docs/rfcs/010-storage_details.md +++ b/docs/rfcs/010-storage_details.md @@ -123,7 +123,7 @@ As far as I understand Bookfile/Aversion addresses versioning and serialization As for exact data that should go to snapshots I think it is the following for each snapshot: * format version number -* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknow key are present. If we add something backward compatible to the file we can keep the version number. +* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknown key are present. If we add something backward compatible to the file we can keep the version number. * array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile * array of [(BuffTag, LSN), corresponding offset in file] for the WAL records * pages, one by one diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 0c359028ed..7fe505456d 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -13,7 +13,7 @@ https://github.com/zenithdb/rfcs/pull/3/files This makes our biggest our difference from Raft. In Raft, every log record is -stamped with term in which it was generated; while we essentialy store in +stamped with term in which it was generated; while we essentially store in `epoch` only the term of the highest record on this safekeeper -- when we know it -- because during recovery generally we don't, and `epoch` is bumped directly to the term of the proposer who performs the recovery when it is finished. It is diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index 47bc9eb89c..a415b90459 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -124,7 +124,7 @@ Each storage node can subscribe to the relevant sets of keys and maintain a loca ### Safekeeper address discovery -During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertize something more useful. +During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertise something more useful. 
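A minimal sketch of this publish/resolve flow, assuming the `etcd-client` and `tokio` crates (the key format, helper names and addresses here are illustrative, not the actual broker code):

    use etcd_client::{Client, Error};

    /// Safekeeper side: publish the advertised address under the safekeeper's id.
    async fn publish_safekeeper(broker: &mut Client, sk_id: u64, advertised_addr: &str) -> Result<(), Error> {
        broker.put(format!("sk_{sk_id}"), advertised_addr, None).await?;
        Ok(())
    }

    /// Pageserver side: resolve a safekeeper id back to an address.
    async fn resolve_safekeeper(broker: &mut Client, sk_id: u64) -> Result<Option<String>, Error> {
        let resp = broker.get(format!("sk_{sk_id}"), None).await?;
        Ok(match resp.kvs().first() {
            Some(kv) => Some(kv.value_str()?.to_string()),
            None => None,
        })
    }

    #[tokio::main]
    async fn main() -> Result<(), Error> {
        let mut broker = Client::connect(["localhost:2379"], None).await?;
        publish_safekeeper(&mut broker, 1, "10.0.0.5:5454").await?;
        println!("sk_1 -> {:?}", resolve_safekeeper(&mut broker, 1).await?);
        Ok(())
    }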
### Safekeeper behavior @@ -195,7 +195,7 @@ sequenceDiagram PS1->>SK1: start replication ``` -#### Behavour of services during typical operations +#### Behaviour of services during typical operations ```mermaid sequenceDiagram @@ -250,7 +250,7 @@ sequenceDiagram PS2->>M: Register downloaded timeline PS2->>M: Get safekeepers for timeline, subscribe to changes PS2->>SK1: Start replication to catch up - note over O: PS2 catched up, time to switch compute + note over O: PS2 caught up, time to switch compute O->>C: Restart compute with new pageserver url in config note over C: Wal push is restarted loop request pages diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index fdf6885929..f7b0b3a587 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -49,7 +49,7 @@ topics. RFC lifecycle: -- Should be submitted in a pull request with and full RFC text in a commited markdown file and copy of the Summary and Motivation sections also included in the PR body. +- Should be submitted in a pull request with and full RFC text in a committed markdown file and copy of the Summary and Motivation sections also included in the PR body. - RFC should be published for review before most of the actual code is written. This isn’t a strict rule, don’t hesitate to experiment and build a POC in parallel with writing an RFC. - Add labels to the PR in the same manner as you do Issues. Example TBD - Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code. diff --git a/docs/settings.md b/docs/settings.md index 9564ef626f..7773dbf17f 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -105,7 +105,7 @@ Interval at which garbage collection is triggered. Default is 100 s. #### image_creation_threshold -L0 delta layer threshold for L1 iamge layer creation. Default is 3. +L0 delta layer threshold for L1 image layer creation. Default is 3. #### pitr_interval diff --git a/docs/sourcetree.md b/docs/sourcetree.md index c8d4baff62..5384d334df 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -10,7 +10,7 @@ Intended to be used in integration tests and in CLI tools for local installation `/docs`: -Documentaion of the Zenith features and concepts. +Documentation of the Zenith features and concepts. Now it is mostly dev documentation. `/monitoring`: @@ -92,7 +92,7 @@ A single virtual environment with all dependencies is described in the single `P ### Prerequisites - Install Python 3.9 (the minimal supported version) or greater. - - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected. + - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected. - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 95ea9660e8..91542d268f 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -73,7 +73,7 @@ impl WalStreamDecoder { /// Returns one of the following: /// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself /// Ok(None): there is not enough data in the input buffer. 
Feed more by calling the `feed_bytes` function - /// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid. + /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. /// pub fn poll_decode(&mut self) -> Result, WalDecodeError> { let recordbuf; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 32a3022c5a..67541d844e 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -531,7 +531,7 @@ impl CheckPoint { /// /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { - // nextXid should nw greate than any XID in WAL, so increment provided XID and check for wraparround. + // nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround. let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 063d69557d..70f54ea02f 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -71,7 +71,7 @@ impl From for SerializeError { /// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01) /// /// Does not allow trailing bytes in deserialization. If this is desired, you -/// may set [`Options::allow_trailing_bytes`] to explicitly accomodate this. +/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this. pub fn be_coder() -> impl Options { bincode::DefaultOptions::new() .with_big_endian() @@ -85,7 +85,7 @@ pub fn be_coder() -> impl Options { /// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01) /// /// Does not allow trailing bytes in deserialization. If this is desired, you -/// may set [`Options::allow_trailing_bytes`] to explicitly accomodate this. +/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this. pub fn le_coder() -> impl Options { bincode::DefaultOptions::new() .with_little_endian() diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 15d4c7a81e..1b011bb73a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -64,7 +64,7 @@ pub mod signals; /// One thing to note is that .git is not available in docker (and it is bad to include it there). /// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required. /// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro. -/// Git version received from environment variable used as a fallback in git_version invokation. +/// Git version received from environment variable used as a fallback in git_version invocation. /// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option. /// So the build script will be run only when GIT_VERSION envvar has changed. /// diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 857df0ec84..5fdb1ff9d2 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -475,7 +475,7 @@ impl PostgresBackend { self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; } // NOTE there is no ReadyForQuery message. 
This handler is used - // for basebackup and it uses CopyOut which doesnt require + // for basebackup and it uses CopyOut which doesn't require // ReadyForQuery message and backend just switches back to // processing mode after sending CopyDone or ErrorResponse. } diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index ce86cf8c91..a36e8342b0 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -464,7 +464,7 @@ impl BeParameterStatusMessage<'static> { } } -// One row desciption in RowDescription packet. +// One row description in RowDescription packet. #[derive(Debug)] pub struct RowDescriptor<'a> { pub name: &'a [u8], @@ -613,7 +613,7 @@ fn cstr_to_str(b: &Bytes) -> Result<&str> { impl<'a> BeMessage<'a> { /// Write message to the given buf. // Unlike the reading side, we use BytesMut - // here as msg len preceeds its body and it is handy to write it down first + // here as msg len precedes its body and it is handy to write it down first // and then fill the length. With Write we would have to either calc it // manually or have one more buffer. pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> { @@ -1047,7 +1047,7 @@ mod tests { #[test] fn test_zenithfeedback_serialization() { let mut zf = ZenithFeedback::empty(); - // Fill zf wih some values + // Fill zf with some values zf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. @@ -1062,7 +1062,7 @@ mod tests { #[test] fn test_zenithfeedback_unknown_key() { let mut zf = ZenithFeedback::empty(); - // Fill zf wih some values + // Fill zf with some values zf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index dc9d7161a2..8add7b8b8f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -114,7 +114,7 @@ pub struct PageServerConf { pub default_tenant_conf: TenantConf, /// A prefix to add in etcd brokers before every key. - /// Can be used for isolating different pageserver groups withing the same etcd cluster. + /// Can be used for isolating different pageserver groups within the same etcd cluster. pub broker_etcd_prefix: String, /// Etcd broker endpoints to connect to. diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index f6f0d7b7cf..da213704f3 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -15,7 +15,7 @@ pub struct KeySpace { impl KeySpace { /// /// Partition a key space into roughly chunks of roughly 'target_size' bytes - /// in each patition. + /// in each partition. /// pub fn partition(&self, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 0d7c6f54c8..c13407a14b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -823,7 +823,7 @@ impl LayeredRepository { for (timeline_id, timeline_entry) in timelines.iter() { timeline_ids.push(*timeline_id); - // This is unresolved question for now, how to do gc in presense of remote timelines + // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. 
// Somewhat related: https://github.com/zenithdb/zenith/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { @@ -1831,7 +1831,7 @@ impl LayeredTimeline { // collect any page versions that are no longer needed because // of the new image layers we created in step 2. // - // TODO: This hight level strategy hasn't been implemented yet. + // TODO: This high level strategy hasn't been implemented yet. // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. @@ -2268,7 +2268,7 @@ impl LayeredTimeline { } // 3. Is it needed by a child branch? - // NOTE With that wee would keep data that + // NOTE With that we would keep data that // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index 0c9ad75048..5f9ed8bbea 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -7,7 +7,7 @@ //! - Fixed-width keys //! - Fixed-width values (VALUE_SZ) //! - The tree is created in a bulk operation. Insert/deletion after creation -//! is not suppported +//! is not supported //! - page-oriented //! //! TODO: diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1c07b63072..4f0fca4797 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -634,7 +634,7 @@ impl PageServerHandler { return Ok(()); } // auth is some, just checked above, when auth is some - // then claims are always present because of checks during connetion init + // then claims are always present because of checks during connection init // so this expect won't trigger let claims = self .claims diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c052aa3d69..626ed1b0f1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -521,7 +521,7 @@ pub struct DatadirModification<'a, R: Repository> { lsn: Lsn, - // The modifications are not applied directly to the underyling key-value store. + // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. pending_updates: HashMap, diff --git a/pageserver/src/remote_storage/storage_sync/delete.rs b/pageserver/src/remote_storage/storage_sync/delete.rs index 00e7c85e35..6fb1d254c4 100644 --- a/pageserver/src/remote_storage/storage_sync/delete.rs +++ b/pageserver/src/remote_storage/storage_sync/delete.rs @@ -1,4 +1,4 @@ -//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. +//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d25dc8914d..5bf128e66b 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -19,7 +19,7 @@ use utils::{ #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. 
/// -/// The Repository treates this as an opaque struct, but see the code in pgdatadir_mapping.rs +/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs /// for what we actually store in these fields. pub struct Key { pub field1: u8, @@ -210,7 +210,7 @@ pub trait Repository: Send + Sync { ) -> Result<()>; /// Get Timeline handle for given zenith timeline ID. - /// This function is idempotent. It doesnt change internal state in any way. + /// This function is idempotent. It doesn't change internal state in any way. fn get_timeline(&self, timelineid: ZTimelineId) -> Option>; /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. @@ -345,11 +345,11 @@ pub trait Timeline: Send + Sync { /// Look up given page version. /// - /// NOTE: It is considerd an error to 'get' a key that doesn't exist. The abstraction + /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction /// above this needs to store suitable metadata to track what data exists with /// what keys, in separate metadata entries. If a non-existent key is requested, - /// the Repository implementation may incorrectly return a value from an ancestore - /// branch, for exampel, or waste a lot of cycles chasing the non-existing key. + /// the Repository implementation may incorrectly return a value from an ancestor + /// branch, for example, or waste a lot of cycles chasing the non-existing key. /// fn get(&self, key: Key, lsn: Lsn) -> Result; diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index bbebcd1f36..1c33d8315c 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -69,7 +69,7 @@ //! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. //! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], -//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files. +//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. //! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, @@ -96,7 +96,7 @@ //! timeline uploads and downloads can happen concurrently, in no particular order due to incremental nature of the timeline layers. //! Deletion happens only after a successful upload only, otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors. //! Upload and download update the remote data (inmemory index and S3 json index part file) only after every layer is successfully synchronized, while the deletion task -//! does otherwise: it requires to have the remote data updated first succesfully: blob files will be invisible to pageserver this way. +//! 
does otherwise: it requires to have the remote data updated first successfully: blob files will be invisible to pageserver this way. //! //! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines, //! present locally. @@ -440,7 +440,7 @@ fn collect_timeline_files( // initial collect will fail because there is no metadata. // We either need to start download if we see empty dir after restart or attach caller should // be aware of that and retry attach if awaits_download for timeline switched from true to false - // but timelinne didnt appear locally. + // but timelinne didn't appear locally. // Check what happens with remote index in that case. let timeline_metadata_path = match timeline_metadata_path { Some(path) => path, @@ -1007,7 +1007,7 @@ where // in local (implicitly, via Lsn values and related memory state) or remote (explicitly via remote layer file paths) metadata. // When operating in a system without tasks failing over the error threshold, // current batching and task processing systems aim to update the layer set and metadata files (remote and local), - // without "loosing" such layer files. + // without "losing" such layer files. let (upload_result, status_update) = tokio::join!( async { if let Some(upload_data) = upload_data { @@ -1162,7 +1162,7 @@ where return Some(TimelineSyncStatusUpdate::Downloaded); } Err(e) => { - error!("Timeline {sync_id} was expected to be in the remote index after a sucessful download, but it's absent: {e:?}"); + error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}"); } }, Err(e) => { @@ -1549,10 +1549,10 @@ fn compare_local_and_remote_timeline( let remote_files = remote_entry.stored_files(); // TODO probably here we need more sophisticated logic, - // if more data is available remotely can we just download whats there? + // if more data is available remotely can we just download what's there? // without trying to upload something. It may be tricky, needs further investigation. // For now looks strange that we can request upload - // and dowload for the same timeline simultaneously. + // and download for the same timeline simultaneously. // (upload needs to be only for previously unsynced files, not whole timeline dir). // If one of the tasks fails they will be reordered in the queue which can lead // to timeline being stuck in evicted state @@ -1565,7 +1565,7 @@ fn compare_local_and_remote_timeline( }), )); (LocalTimelineInitStatus::NeedsSync, true) - // we do not need to manupulate with remote consistent lsn here + // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { (LocalTimelineInitStatus::LocallyComplete, false) diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 91c618d201..0dcd9c97fc 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -1,4 +1,4 @@ -//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. +//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. 
use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index a28867f27e..99ccf27e1c 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -1,4 +1,4 @@ -//! Timeline synchrnonization logic to fetch the layer files from remote storage into pageserver's local directory. +//! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory. use std::{collections::HashSet, fmt::Debug, path::Path}; diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 7764a810bc..2ba48ddf53 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -273,7 +273,7 @@ mod tests { }; let index_part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline.clone()) - .expect("Correct remote timeline should be convertable to index part"); + .expect("Correct remote timeline should be convertible to index part"); assert_eq!( index_part.timeline_layers.iter().collect::>(), @@ -305,7 +305,7 @@ mod tests { ); let restored_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) - .expect("Correct index part should be convertable to remote timeline"); + .expect("Correct index part should be convertible to remote timeline"); let original_metadata = &remote_timeline.metadata; let restored_metadata = &restored_timeline.metadata; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 625ec7aed6..2f88fa95ba 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -391,7 +391,7 @@ mod tests { assert_eq!( upload.metadata, Some(metadata), - "Successful upload should not chage its metadata" + "Successful upload should not change its metadata" ); let storage_files = storage.list().await?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 37d70372b5..a16e772238 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -336,7 +336,7 @@ impl VirtualFile { // library RwLock doesn't allow downgrading without releasing the lock, // and that doesn't seem worth the trouble. // - // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implemenation is fair and + // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and // may deadlock on subsequent read calls. // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. let result = STORAGE_IO_TIME diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 5223125ce6..2f39007e9f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -12,7 +12,7 @@ //! The zenith Repository can store page versions in two formats: as //! page images, or a WAL records. WalIngest::ingest_record() extracts //! page images out of some WAL records, but most it stores as WAL -//! records. If a WAL record modifies multple pages, WalIngest +//! records. If a WAL record modifies multiple pages, WalIngest //! will call Repository::put_wal_record or put_page_image functions //! separately for each modified page. //! diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e556c24548..edfd36f51a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -122,7 +122,7 @@ lazy_static! 
{ /// /// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the processs at a time, +/// perform WAL replay. Only one thread can use the process at a time, /// that is controlled by the Mutex. In the future, we might want to /// launch a pool of processes to allow concurrent replay of multiple /// records. @@ -134,7 +134,7 @@ pub struct PostgresRedoManager { process: Mutex>, } -/// Can this request be served by zenith redo funcitons +/// Can this request be served by zenith redo functions /// or we need to pass it to wal-redo postgres process? fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { // Currently, we don't have bespoken Rust code to replay any diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 642e50c2c1..0e3e17359e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -95,7 +95,7 @@ async fn handle_client( /// Establish a (most probably, secure) connection with the client. /// For better testing experience, `stream` can be any object satisfying the traits. -/// It's easier to work with owned `stream` here as we need to updgrade it to TLS; +/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; /// we also take an extra care of propagating only the select handshake errors to client. async fn handshake( stream: S, diff --git a/safekeeper/README.md b/safekeeper/README.md index a4bb260932..7b217ddbec 100644 --- a/safekeeper/README.md +++ b/safekeeper/README.md @@ -75,7 +75,7 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only one primary node can be actively streaming WAL to the quorum of safekeepers. -See README_PROTO.md for a more detailed desription of the consensus +See README_PROTO.md for a more detailed description of the consensus protocol. spec/ contains TLA+ specification of it. # Q&A diff --git a/safekeeper/README_PROTO.md b/safekeeper/README_PROTO.md index 6b2ae50254..7f3da3563a 100644 --- a/safekeeper/README_PROTO.md +++ b/safekeeper/README_PROTO.md @@ -143,7 +143,7 @@ Restart of PostgreSQL initiates new round of voting and switching new epoch. ## Limitations Right now message queue is maintained in main memory and is not spilled to the disk. It can cause memory overflow in case of presence of lagging safekeepers. -It is assumed that in case of loosing local data by some safekeepers, it should be recovered using some external mechanism. +It is assumed that in case of losing local data by some safekeepers, it should be recovered using some external mechanism. ## Glossary @@ -153,7 +153,7 @@ It is assumed that in case of loosing local data by some safekeepers, it should * `NodeID`: pair (term,UUID) * `Pager`: Neon component restoring pages from WAL stream * `Replica`: read-only computatio node -* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. +* `VCL`: the largerst LSN for which we can guarantee availability of all prior records. 
## Algorithm diff --git a/safekeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla index 993edfcf23..e5f0bb270f 100644 --- a/safekeeper/spec/ProposerAcceptorConsensus.tla +++ b/safekeeper/spec/ProposerAcceptorConsensus.tla @@ -88,7 +88,7 @@ TypeOk == \* in campaign proposer sends RequestVote and waits for acks; \* in leader he is elected /\ prop_state[p].state \in {"campaign", "leader"} - \* 0..max_term should be actually Nat in the unbouned model, but TLC won't + \* 0..max_term should be actually Nat in the unbounded model, but TLC won't \* swallow it /\ prop_state[p].term \in 0..max_term \* votes received diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index a7628482d9..e792a854d5 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -100,7 +100,7 @@ fn main() -> anyhow::Result<()> { Arg::new("dump-control-file") .long("dump-control-file") .takes_value(true) - .help("Dump control file at path specifed by this argument and exit"), + .help("Dump control file at path specified by this argument and exit"), ) .arg( Arg::new("id").long("id").takes_value(true).help("safekeeper node id: integer") diff --git a/safekeeper/src/callmemaybe.rs b/safekeeper/src/callmemaybe.rs index 8c3fbe26ba..53d38c5e25 100644 --- a/safekeeper/src/callmemaybe.rs +++ b/safekeeper/src/callmemaybe.rs @@ -39,7 +39,7 @@ async fn request_callback( } }); - // use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses + // use Config parsing because SockAddr parsing doesn't allow to use host names instead of ip addresses let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); let me_conf: postgres::config::Config = me_connstr.parse().unwrap(); let (host, port) = connection_host_port(&me_conf); diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 8d36472540..e1740cdcbf 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -27,7 +27,7 @@ struct SafeKeeperStateV1 { acceptor_state: AcceptorStateV1, /// information about server server: ServerInfoV2, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. proposer_uuid: PgUuid, /// part of WAL acknowledged by quorum and available locally @@ -57,7 +57,7 @@ pub struct SafeKeeperStateV2 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfoV2, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. pub proposer_uuid: PgUuid, /// part of WAL acknowledged by quorum and available locally @@ -89,7 +89,7 @@ pub struct SafeKeeperStateV3 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfoV3, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, @@ -114,7 +114,7 @@ pub struct SafeKeeperStateV4 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfo, - /// Unique id of the last *elected* proposer we dealed with. 
Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index c254f2c57c..df4b202063 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -180,7 +180,7 @@ pub struct SafeKeeperState { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfo, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, @@ -759,7 +759,7 @@ where self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); - // Initalizing backup_lsn is useful to avoid making backup think it should upload 0 segment. + // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 83dc312d28..a4b779649d 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -257,7 +257,7 @@ impl WalBackupTask { // Optimization idea for later: // Avoid checking election leader every time by returning current lease grant expiration time // Re-check leadership only after expiration time, - // such approach woud reduce overhead on write-intensive workloads + // such approach would reduce overhead on write-intensive workloads match l .check_am_i( @@ -389,7 +389,7 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> { let file = File::open(&source_file).await?; - // Storage is initialized by launcher at ths point. + // Storage is initialized by launcher at this point. match storage.as_ref().unwrap() { GenericRemoteStorage::Local(local_storage) => { let destination = local_storage.remote_object_id(source_file)?; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 503bd7c543..7285cedc03 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -126,7 +126,7 @@ pub struct PhysicalStorage { conf: SafeKeeperConf, // fields below are filled upon initialization - /// None if unitialized, Some(usize) if storage is initialized. + /// None if uninitialized, Some(usize) if storage is initialized. wal_seg_size: Option, /// Written to disk, but possibly still in the cache and not fully persisted. 
@@ -456,7 +456,7 @@ impl Storage for PhysicalStorage { segno += 1; let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; - // TODO: better use fs::try_exists which is currenty avaialble only in nightly build + // TODO: better use fs::try_exists which is currently available only in nightly build if wal_file_path.exists() { fs::remove_file(&wal_file_path)?; } else if wal_file_partial_path.exists() { diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index b7eeedb23e..1a49a4582e 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -14,7 +14,7 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): env = zenith_simple_env env.zenith_cli.create_branch('test_clog_truncate', 'empty') - # set agressive autovacuum to make sure that truncation will happen + # set aggressive autovacuum to make sure that truncation will happen config = [ 'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0', diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index ee19bddfe8..a5149f7ad9 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -55,7 +55,7 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: pscur.execute(f"compact {env.initial_tenant.hex} {timeline}") - # perform agressive GC. Data still should be kept because of the PITR setting. + # perform aggressive GC. Data still should be kept because of the PITR setting. pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") row = pscur.fetchone() print_gc_result(row) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index afbe3c55c7..864cccf736 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -116,7 +116,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None log.info("Timeline detail after attach completed: %s", detail) - assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should shoud not be less than the one stored on remote storage' + assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should should not be less than the one stored on remote storage' assert not detail['remote']['awaits_download'] pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 91506e120d..8ecc731ae9 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -92,7 +92,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve # if we recovered after failure verify that we have correct number of rows log.info("recovering at %s", inserted_ctr) cur.execute("SELECT count(*) FROM load") - # it seems that sometimes transaction gets commited before we can acknowledge + # it seems that sometimes transaction gets committed before we can acknowledge # the result, so sometimes selected value is larger by one than we expect assert cur.fetchone()[0] - inserted_ctr <= 1 
log.info("successfully recovered %s", inserted_ctr) diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 49e48dd450..98854111f6 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -28,7 +28,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur.execute('INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g') cur.execute('VACUUM FREEZE vmtest_update') - # DELETE and UDPATE the rows. + # DELETE and UPDATE the rows. cur.execute('DELETE FROM vmtest_delete WHERE id = 1') cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index fc192c28e8..8837725b84 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -905,8 +905,8 @@ def test_delete_force(zenith_env_builder: ZenithEnvBuilder): # Create two tenants: one will be deleted, other should be preserved. tenant_id = env.initial_tenant.hex - timeline_id_1 = env.zenith_cli.create_branch('br1').hex # Acive, delete explicitly - timeline_id_2 = env.zenith_cli.create_branch('br2').hex # Inactive, delete explictly + timeline_id_1 = env.zenith_cli.create_branch('br1').hex # Active, delete explicitly + timeline_id_2 = env.zenith_cli.create_branch('br2').hex # Inactive, delete explicitly timeline_id_3 = env.zenith_cli.create_branch('br3').hex # Active, delete with the tenant timeline_id_4 = env.zenith_cli.create_branch('br4').hex # Inactive, delete with the tenant diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 5fc6076f51..75fece6818 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -206,7 +206,7 @@ class ZenithBenchmarker: f"{prefix}.number_of_transactions_actually_processed", pg_bench_result.number_of_transactions_actually_processed, '', - # thats because this is predefined by test matrix and doesnt change across runs + # that's because this is predefined by test matrix and doesn't change across runs report=MetricReport.TEST_PARAM, ) self.record(f"{prefix}.latency_average", @@ -302,7 +302,7 @@ def pytest_addoption(parser): parser.addoption( "--out-dir", dest="out_dir", - help="Directory to ouput performance tests results to.", + help="Directory to output performance tests results to.", ) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index a2e8c82d30..8d9a4ccd85 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -75,7 +75,7 @@ def pytest_addoption(parser): "--skip-interfering-proc-check", dest="skip_interfering_proc_check", action="store_true", - help="skip check for interferring processes", + help="skip check for interfering processes", ) @@ -88,7 +88,7 @@ top_output_dir = "" def check_interferring_processes(config): if config.getoption("skip_interfering_proc_check"): - warnings.warn("interferring process check is skipped") + warnings.warn("interfering process check is skipped") return # does not use -c as it is not supported on macOS @@ -107,7 +107,7 @@ def check_interferring_processes(config): def pytest_configure(config): """ Ensure that no unwanted daemons are running before we start testing. - Check that we do not owerflow available ports range. + Check that we do not overflow available ports range. 
""" check_interferring_processes(config) @@ -1417,7 +1417,7 @@ class RemotePostgres(PgProtocol): raise Exception('cannot stop a remote Postgres instance') def get_subdir_size(self, subdir) -> int: - # TODO: Could use the server's Generic File Acccess functions if superuser. + # TODO: Could use the server's Generic File Access functions if superuser. # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE raise Exception('cannot get size of a Postgres instance') From 4b4d3073b8c479b4bcb1bec4681120c2f49065da Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 28 May 2022 14:30:59 +0300 Subject: [PATCH 29/50] Fix misc typos --- control_plane/src/etcd.rs | 2 +- docs/glossary.md | 2 +- pageserver/src/layered_repository/disk_btree.rs | 4 ++-- safekeeper/README_PROTO.md | 2 +- test_runner/fixtures/zenith_fixtures.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index bc39b7dea3..0123d9c491 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -77,7 +77,7 @@ pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let etcd_pid_file_path = etcd_pid_file_path(env); let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| { format!( - "Failed to read etcd pid filea at {}", + "Failed to read etcd pid file at {}", etcd_pid_file_path.display() ) })?); diff --git a/docs/glossary.md b/docs/glossary.md index 0de0eea1cb..a5bb154793 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -115,7 +115,7 @@ Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/RE * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. -* `VCL`: the largerst LSN for which we can guarantee availability of all prior records. +* `VCL`: the largest LSN for which we can guarantee availability of all prior records. Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index 5f9ed8bbea..dc8d7a2ad3 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -498,8 +498,8 @@ where return Ok(()); } - // It did not fit. Try to compress, and it it succeeds to make some room - // on the node, try appending to it again. + // It did not fit. Try to compress, and if it succeeds to make + // some room on the node, try appending to it again. #[allow(clippy::collapsible_if)] if last.compress() { if last.push(key, value) { diff --git a/safekeeper/README_PROTO.md b/safekeeper/README_PROTO.md index 7f3da3563a..0cd1f510e6 100644 --- a/safekeeper/README_PROTO.md +++ b/safekeeper/README_PROTO.md @@ -153,7 +153,7 @@ It is assumed that in case of losing local data by some safekeepers, it should b * `NodeID`: pair (term,UUID) * `Pager`: Neon component restoring pages from WAL stream * `Replica`: read-only computatio node -* `VCL`: the largerst LSN for which we can guarantee availability of all prior records. +* `VCL`: the largest LSN for which we can guarantee availability of all prior records. 
## Algorithm diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 8d9a4ccd85..336f1f1348 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -2139,7 +2139,7 @@ def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, if detail['remote'] is None: # No remote information at all. This happens right after creating - # a timeline, before any part of it it has been uploaded to remote + # a timeline, before any part of it has been uploaded to remote # storage yet. return 0 else: From e3b320daabe4d140f7963c1ffed996128567264c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 28 May 2022 21:22:19 +0300 Subject: [PATCH 30/50] Remove obsolete Dockerfile.alpine It hasn't been used for anything for a long time. The comments still talked about librocksdb, which we also haven't used for a long time. --- Dockerfile.alpine | 95 ----------------------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 Dockerfile.alpine diff --git a/Dockerfile.alpine b/Dockerfile.alpine deleted file mode 100644 index 0f244e4443..0000000000 --- a/Dockerfile.alpine +++ /dev/null @@ -1,95 +0,0 @@ -# -# Docker image for console integration testing. -# -# We may also reuse it in CI to unify installation process and as a general binaries building -# tool for production servers. -# -# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls -# bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust -# images which are statically linked and have guards against any dlopen. I would rather -# prefer all static binaries so we may change the way librocksdb-sys builds or wait until -# we will have our own storage and drop rockdb dependency. -# -# Cargo-chef is used to separate dependencies building from main binaries building. This -# way `docker build` will download and install dependencies only of there are changes to -# out Cargo.toml files. -# - - -# -# build postgres separately -- this layer will be rebuilt only if one of -# mentioned paths will get any changes -# -FROM alpine:3.13 as pg-build -RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \ - make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev -WORKDIR zenith -COPY ./vendor/postgres vendor/postgres -COPY ./Makefile Makefile -# Build using clang and lld -RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4 - -# -# Calculate cargo dependencies. -# This will always run, but only generate recipe.json with list of dependencies without -# installing them. -# -FROM alpine:20210212 as cargo-deps-inspect -RUN apk add --update rust cargo -RUN cargo install cargo-chef -WORKDIR zenith -COPY . . -RUN cargo chef prepare --recipe-path recipe.json - -# -# Build cargo dependencies. -# This temp cantainner would be build only if recipe.json was changed. -# -FROM alpine:20210212 as deps-build -RUN apk add --update rust cargo openssl-dev clang build-base -# rust-rocksdb can be built against system-wide rocksdb -- that saves about -# 10 minutes during build. Rocksdb apk package is in testing now, but use it -# anyway. In case of any troubles we can download and build rocksdb here manually -# (to cache it as a docker layer). 
-RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev -WORKDIR zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/ -COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json -RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json - -# -# Build zenith binaries -# -FROM alpine:20210212 as build -RUN apk add --update rust cargo openssl-dev clang build-base -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev -WORKDIR zenith -COPY . . -# Copy cached dependencies -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=deps-build /zenith/target target -COPY --from=deps-build /root/.cargo /root/.cargo -RUN cargo build --release - -# -# Copy binaries to resulting image. -# build-base hare to provide libstdc++ (it will also bring gcc, but leave it this way until we figure -# out how to statically link rocksdb or avoid it at all). -# -FROM alpine:3.13 -RUN apk add --update openssl build-base libseccomp-dev -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb -COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/safekeeper /usr/local/bin -COPY --from=build /zenith/target/release/proxy /usr/local/bin -COPY --from=pg-build /zenith/tmp_install /usr/local -COPY docker-entrypoint.sh /docker-entrypoint.sh - -RUN addgroup zenith && adduser -h /data -D -G zenith zenith -VOLUME ["/data"] -WORKDIR /data -USER zenith -EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] -CMD ["pageserver"] From 3accde613d8a57fea149471dab22ee0d6843035e Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 18 Apr 2022 19:43:57 +0300 Subject: [PATCH 31/50] Rename contrib/zenith to contrib/neon. 
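For illustration, a minimal compute-node postgresql.conf fragment written against the renamed extension and GUCs might look like the sketch below. All setting names come from this patch; the connection string, tenant id, timeline id and size limit are simply the example values used in the cluster spec and tests updated here, not required values.

```
# sketch of a compute postgresql.conf after the rename (example values only)
shared_preload_libraries = 'neon'
neon.pageserver_connstring = 'host=127.0.0.1 port=6400'
neon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'
neon.timelineid = '2414a61ffc94e428f14b5758fe308e13'
neon.max_cluster_size = '30MB'
```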
Rename custom GUCs: - zenith.page_server_connstring -> neon.pageserver_connstring - zenith.zenith_tenant -> neon.tenantid - zenith.zenith_timeline -> neon.timelineid - zenith.max_cluster_size -> neon.max_cluster_size --- Makefile | 10 ++++---- compute_tools/src/bin/compute_ctl.rs | 6 ++--- compute_tools/tests/cluster_spec.json | 8 +++---- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/compute.rs | 14 +++++------ docs/rfcs/cluster-size-limits.md | 8 +++---- docs/sourcetree.md | 6 ++--- libs/postgres_ffi/wal_generate/src/lib.rs | 4 ++-- pageserver/src/walredo.rs | 4 ++-- .../batch_others/test_ancestor_branch.py | 6 ++--- test_runner/batch_others/test_backpressure.py | 2 +- .../batch_others/test_branch_behind.py | 2 +- .../batch_others/test_broken_timeline.py | 2 +- .../batch_others/test_clog_truncate.py | 2 +- .../batch_others/test_gc_aggressive.py | 2 +- .../batch_others/test_old_request_lsn.py | 2 +- test_runner/batch_others/test_pitr_gc.py | 2 +- .../batch_others/test_read_validation.py | 2 +- .../batch_others/test_remote_storage.py | 4 ++-- .../batch_others/test_tenant_relocation.py | 4 ++-- .../test_tenants_with_remote_storage.py | 8 +++---- .../batch_others/test_timeline_size.py | 6 ++--- test_runner/batch_others/test_vm_bits.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 24 +++++++++---------- .../batch_others/test_wal_acceptor_async.py | 4 ++-- test_runner/batch_others/test_wal_restore.py | 2 +- test_runner/fixtures/compare_fixtures.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 2 +- vendor/postgres | 2 +- 29 files changed, 72 insertions(+), 72 deletions(-) diff --git a/Makefile b/Makefile index fdfc64f6fa..e3d183eaee 100644 --- a/Makefile +++ b/Makefile @@ -74,16 +74,16 @@ postgres-headers: postgres-configure +@echo "Installing PostgreSQL headers" $(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install -# Compile and install PostgreSQL and contrib/zenith +# Compile and install PostgreSQL and contrib/neon .PHONY: postgres postgres: postgres-configure \ postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` +@echo "Compiling PostgreSQL" $(MAKE) -C tmp_install/build MAKELEVEL=0 install - +@echo "Compiling contrib/zenith" - $(MAKE) -C tmp_install/build/contrib/zenith install - +@echo "Compiling contrib/zenith_test_utils" - $(MAKE) -C tmp_install/build/contrib/zenith_test_utils install + +@echo "Compiling contrib/neon" + $(MAKE) -C tmp_install/build/contrib/neon install + +@echo "Compiling contrib/neon_test_utils" + $(MAKE) -C tmp_install/build/contrib/neon_test_utils install +@echo "Compiling pg_buffercache" $(MAKE) -C tmp_install/build/contrib/pg_buffercache install +@echo "Compiling pageinspect" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 5c951b7779..b97429c223 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -116,17 +116,17 @@ fn main() -> Result<()> { let pageserver_connstr = spec .cluster .settings - .find("zenith.page_server_connstring") + .find("neon.pageserver_connstring") .expect("pageserver connstr should be provided"); let tenant = spec .cluster .settings - .find("zenith.zenith_tenant") + .find("neon.tenantid") .expect("tenant id should be provided"); let timeline = spec .cluster .settings - .find("zenith.zenith_timeline") + .find("neon.timelineid") .expect("tenant id should be provided"); let compute_state = ComputeNode { diff --git a/compute_tools/tests/cluster_spec.json 
b/compute_tools/tests/cluster_spec.json index 4a1672919c..4821848678 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -150,7 +150,7 @@ "vartype": "integer" }, { - "name": "zenith.zenith_tenant", + "name": "neon.tenantid", "value": "b0554b632bd4d547a63b86c3630317e8", "vartype": "string" }, @@ -160,13 +160,13 @@ "vartype": "integer" }, { - "name": "zenith.zenith_timeline", + "name": "neon.timelineid", "value": "2414a61ffc94e428f14b5758fe308e13", "vartype": "string" }, { "name": "shared_preload_libraries", - "value": "zenith", + "value": "neon", "vartype": "string" }, { @@ -175,7 +175,7 @@ "vartype": "string" }, { - "name": "zenith.page_server_connstring", + "name": "neon.pageserver_connstring", "value": "host=127.0.0.1 port=6400", "vartype": "string" } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 33f903f0e1..a81c6512bc 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nzenith.zenith_tenant = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nzenith.zenith_timeline = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'zenith'\nsynchronous_standby_names = 'walproposer'\nzenith.page_server_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timelineid = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 045acd7519..3fefd32389 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -148,8 +148,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; - let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; + let timeline_id: ZTimelineId = conf.parse_field("neon.timelineid", &context)?; + let tenant_id: ZTenantId = conf.parse_field("neon.tenantid", &context)?; let uses_wal_proposer = conf.get("wal_acceptors").is_some(); // parse recovery_target_lsn, if any @@ -303,11 +303,11 @@ impl PostgresNode { // uses only needed variables namely host, port, user, password. 
format!("postgresql://no_user:{}@{}:{}", password, host, port) }; - conf.append("shared_preload_libraries", "zenith"); + conf.append("shared_preload_libraries", "neon"); conf.append_line(""); - conf.append("zenith.page_server_connstring", &pageserver_connstr); - conf.append("zenith.zenith_tenant", &self.tenant_id.to_string()); - conf.append("zenith.zenith_timeline", &self.timeline_id.to_string()); + conf.append("neon.pageserver_connstring", &pageserver_connstr); + conf.append("neon.tenantid", &self.tenant_id.to_string()); + conf.append("neon.timelineid", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } @@ -352,7 +352,7 @@ impl PostgresNode { // This isn't really a supported configuration, but can be useful for // testing. conf.append("synchronous_standby_names", "pageserver"); - conf.append("zenith.callmemaybe_connstring", &self.connstr()); + conf.append("neon.callmemaybe_connstring", &self.connstr()); } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md index 4696f2c7f0..bd12fb6eee 100644 --- a/docs/rfcs/cluster-size-limits.md +++ b/docs/rfcs/cluster-size-limits.md @@ -22,8 +22,8 @@ so we don't want to give users access to the functionality that we don't think i * pageserver - calculate the size consumed by a timeline and add it to the feedback message. * safekeeper - pass feedback message from pageserver to compute. -* compute - receive feedback message, enforce size limit based on GUC `zenith.max_cluster_size`. -* console - set and update `zenith.max_cluster_size` setting +* compute - receive feedback message, enforce size limit based on GUC `neon.max_cluster_size`. +* console - set and update `neon.max_cluster_size` setting ## Proposed implementation @@ -49,7 +49,7 @@ This message is received by the safekeeper and propagated to compute node as a p Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable. -And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > zenith.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. +And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. (see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html)) TODO: @@ -75,5 +75,5 @@ We should warn users if the limit is soon to be reached. ### **Security implications** We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM. -Malicious users may change the `zenith.max_cluster_size`, so we need an extra size limit check. +Malicious users may change the `neon.max_cluster_size`, so we need an extra size limit check. To cover this case, we also monitor the compute node size in the console. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 5384d334df..05eaa96938 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -42,13 +42,13 @@ Integration tests, written in Python using the `pytest` framework. `/vendor/postgres`: -PostgreSQL source tree, with the modifications needed for Zenith. +PostgreSQL source tree, with the modifications needed for Neon. 
-`/vendor/postgres/contrib/zenith`: +`/vendor/postgres/contrib/neon`: PostgreSQL extension that implements storage manager API and network communications with remote page server. -`/vendor/postgres/contrib/zenith_test_utils`: +`/vendor/postgres/contrib/neon_test_utils`: PostgreSQL extension that contains functions needed for testing and debugging. diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_generate/src/lib.rs index a5cd81d68a..3b19afb826 100644 --- a/libs/postgres_ffi/wal_generate/src/lib.rs +++ b/libs/postgres_ffi/wal_generate/src/lib.rs @@ -80,7 +80,7 @@ impl Conf { .arg(self.datadir.as_os_str()) .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output - .args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup + .args(&["-c", "shared_preload_libraries=neon"]) // can only be loaded at startup // Disable background processes as much as possible .args(&["-c", "wal_writer_delay=10s"]) .args(&["-c", "autovacuum=off"]) @@ -178,7 +178,7 @@ fn generate_internal( client: &mut C, f: impl Fn(&mut C, PgLsn) -> Result>, ) -> Result { - client.execute("create extension if not exists zenith_test_utils", &[])?; + client.execute("create extension if not exists neon_test_utils", &[])?; let wal_segment_size = client.query_one( "select cast(setting as bigint) as setting, unit \ diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index edfd36f51a..d263bf0e9a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -607,8 +607,8 @@ impl PostgresRedoProcess { .open(PathBuf::from(&datadir).join("postgresql.conf"))?; config.write_all(b"shared_buffers=128kB\n")?; config.write_all(b"fsync=off\n")?; - config.write_all(b"shared_preload_libraries=zenith\n")?; - config.write_all(b"zenith.wal_redo=on\n")?; + config.write_all(b"shared_preload_libraries=neon\n")?; + config.write_all(b"neon.wal_redo=on\n")?; } // Start postgres itself let mut child = Command::new(conf.pg_bin_dir().join("postgres")) diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 5dbd6d2e26..e05a550fdf 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -30,7 +30,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_cur.execute("SHOW zenith.zenith_timeline") + branch0_cur.execute("SHOW neon.timelineid") branch0_timeline = branch0_cur.fetchone()[0] log.info(f"b0 timeline {branch0_timeline}") @@ -55,7 +55,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_cur.execute("SHOW zenith.zenith_timeline") + branch1_cur.execute("SHOW neon.timelineid") branch1_timeline = branch1_cur.fetchone()[0] log.info(f"b1 timeline {branch1_timeline}") @@ -79,7 +79,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_cur.execute("SHOW zenith.zenith_timeline") + branch2_cur.execute("SHOW neon.timelineid") branch2_timeline = branch2_cur.fetchone()[0] log.info(f"b2 timeline {branch2_timeline}") diff --git a/test_runner/batch_others/test_backpressure.py 
b/test_runner/batch_others/test_backpressure.py index 81f45b749b..5debb2ee61 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -26,7 +26,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv log.info("checks started") with pg_cur(pg) as cur: - cur.execute("CREATE EXTENSION zenith") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to zenith_fixtures? cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))") res = cur.fetchone() diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index fc84af5283..9bb04f574b 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -31,7 +31,7 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW zenith.zenith_timeline") + main_cur.execute("SHOW neon.timelineid") timeline = main_cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index f0aa44e0a4..45fe69748d 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -26,7 +26,7 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline_id = cur.fetchone()[0] pg.stop() tenant_timelines.append((tenant_id, timeline_id, pg)) diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index 1a49a4582e..2382cd93b3 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -29,7 +29,7 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): log.info('postgres is running on test_clog_truncate branch') # Install extension containing function needed for test - pg.safe_psql('CREATE EXTENSION zenith_test_utils') + pg.safe_psql('CREATE EXTENSION neon_test_utils') # Consume many xids to advance clog with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 519a6dda1c..6beee49d2f 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -62,7 +62,7 @@ def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder): conn = pg.connect() cur = conn.cursor() - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline = cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index cf7fe09b1e..1ec429ea34 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -26,7 +26,7 @@ def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline = cur.fetchone()[0] psconn = env.pageserver.connect() diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index a5149f7ad9..6456acd214 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -25,7 +25,7 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW zenith.zenith_timeline") + main_cur.execute("SHOW neon.timelineid") timeline = main_cur.fetchone()[0] # Create table diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/batch_others/test_read_validation.py index ee41e6511c..9d2248ac89 100644 --- a/test_runner/batch_others/test_read_validation.py +++ b/test_runner/batch_others/test_read_validation.py @@ -8,7 +8,7 @@ from psycopg2.errors import IoError pytest_plugins = ("fixtures.zenith_fixtures") -extensions = ["pageinspect", "zenith_test_utils", "pg_buffercache"] +extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] # diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 864cccf736..e5c94980f0 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -48,8 +48,8 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] checkpoint_numbers = range(1, 3) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 8ecc731ae9..6ad9c6305f 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -130,7 +130,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, with closing(tenant_pg.connect()) as conn: with conn.cursor() as cur: # save timeline for later gc call - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline = UUID(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline.hex) @@ -223,7 +223,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, tenant_pg_config_file_path = pathlib.Path(tenant_pg.config_file_path()) tenant_pg_config_file_path.open('a').write( - f"\nzenith.page_server_connstring = 'postgresql://no_user:@localhost:{new_pageserver_pg_port}'" + f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_pg_port}'" ) tenant_pg.start() diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index c00f077fcd..8eb72437fd 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -21,8 +21,8 @@ async def tenant_workload(env: ZenithEnv, pg: Postgres): pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") - timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline") + tenant_id = await pg_conn.fetchval("show neon.tenantid") + timeline_id = await pg_conn.fetchval("show 
neon.timelineid") await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): @@ -82,9 +82,9 @@ def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): for tenant, pg in tenants_pgs: with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("show zenith.zenith_tenant") + cur.execute("show neon.tenantid") tenant_id = cur.fetchone()[0] - cur.execute("show zenith.zenith_timeline") + cur.execute("show neon.timelineid") timeline_id = cur.fetchone()[0] cur.execute("SELECT pg_current_wal_flush_lsn()") current_lsn = lsn_from_hex(cur.fetchone()[0]) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 0b33b56df3..86f9ed247b 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -21,7 +21,7 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (t text)") @@ -81,12 +81,12 @@ def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test - config_lines=['zenith.max_cluster_size=30MB']) + config_lines=['neon.max_cluster_size=30MB']) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("CREATE EXTENSION zenith") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to zenith_fixtures? cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 98854111f6..8a14959eff 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -17,7 +17,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur = pg_conn.cursor() # Install extension containing function needed for test - cur.execute('CREATE EXTENSION zenith_test_utils') + cur.execute('CREATE EXTENSION neon_test_utils') # Create a test table and freeze it to set the VM bit. 
cur.execute('CREATE TABLE vmtest_delete (id integer PRIMARY KEY)') diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 8837725b84..46fb6601b1 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -337,8 +337,8 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -384,8 +384,8 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): cur.execute('CREATE TABLE t(key int primary key, value text)') cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] # force checkpoint to advance remote_consistent_lsn with closing(env.pageserver.connect()) as psconn: @@ -497,10 +497,10 @@ class ProposerPostgres(PgProtocol): with open(self.config_file_path(), "w") as f: cfg = [ "synchronous_standby_names = 'walproposer'\n", - "shared_preload_libraries = 'zenith'\n", - f"zenith.zenith_timeline = '{self.timeline_id.hex}'\n", - f"zenith.zenith_tenant = '{self.tenant_id.hex}'\n", - f"zenith.page_server_connstring = ''\n", + "shared_preload_libraries = 'neon'\n", + f"neon.timelineid = '{self.timeline_id.hex}'\n", + f"neon.tenantid = '{self.tenant_id.hex}'\n", + f"neon.pageserver_connstring = ''\n", f"wal_acceptors = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", @@ -612,8 +612,8 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): wa_http_cli.check_status() # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) @@ -798,8 +798,8 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): pg.start() # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + timeline_id = pg.safe_psql("show neon.timelineid")[0][0] execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index c484b6401c..1e7edcc8df 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -151,8 +151,8 @@ async def run_restarts_under_load(env: ZenithEnv, test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") - timeline_id = await pg_conn.fetchval("show 
zenith.zenith_timeline") + tenant_id = await pg_conn.fetchval("show neon.tenantid") + timeline_id = await pg_conn.fetchval("show neon.timelineid") bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index f4aceac5e8..eacc742880 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -19,7 +19,7 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, env.zenith_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] + tenant_id = pg.safe_psql("show neon.tenantid")[0][0] env.zenith_cli.pageserver_stop() port = port_distributor.get_port() data_dir = os.path.join(test_output_dir, 'pgsql.restored') diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index d572901ed1..f5a97b5a84 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -66,7 +66,7 @@ class ZenithCompare(PgCompare): # We only use one branch and one timeline self.env.zenith_cli.create_branch(branch_name, 'empty') self._pg = self.env.postgres.create_start(branch_name) - self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0] + self.timeline = self.pg.safe_psql("SHOW neon.timelineid")[0][0] # Long-lived cursor, useful for flushing self.psconn = self.env.pageserver.connect() diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 336f1f1348..6d859b17d2 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -2039,7 +2039,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos # Get the timeline ID. 
We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timelineid") timeline = cur.fetchone()[0] # stop postgres to ensure that files won't change diff --git a/vendor/postgres b/vendor/postgres index 038b2b98e5..165e61b5e0 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91 +Subproject commit 165e61b5e0a7e003b28d8dca7a6825b3a03f065d From 751f1191b42a5c65b601bd5ab3e15f7301f8cf5f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 19 Apr 2022 15:36:43 +0300 Subject: [PATCH 32/50] Rename 'wal_acceptors' GUC to 'safekeepers' --- compute_tools/tests/cluster_spec.json | 2 +- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/compute.rs | 4 ++-- test_runner/batch_others/test_wal_acceptor.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 6 +++--- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 4821848678..5d8104ab4c 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "wal_acceptors", + "name": "safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", "vartype": "string" }, diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index a81c6512bc..9e606ec7c2 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timelineid = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timelineid = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 3fefd32389..e81dddc287 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -150,7 +150,7 @@ impl PostgresNode { let port: u16 = conf.parse_field("port", &context)?; let timeline_id: ZTimelineId = conf.parse_field("neon.timelineid", &context)?; let tenant_id: ZTenantId = conf.parse_field("neon.tenantid", &context)?; - let uses_wal_proposer = 
conf.get("wal_acceptors").is_some(); + let uses_wal_proposer = conf.get("safekeepers").is_some(); // parse recovery_target_lsn, if any let recovery_target_lsn: Option = @@ -341,7 +341,7 @@ impl PostgresNode { .map(|sk| format!("localhost:{}", sk.pg_port)) .collect::>() .join(","); - conf.append("wal_acceptors", &safekeepers); + conf.append("safekeepers", &safekeepers); } else { // We only use setup without safekeepers for tests, // and don't care about data durability on pageserver, diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 46fb6601b1..b176faa46a 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -501,7 +501,7 @@ class ProposerPostgres(PgProtocol): f"neon.timelineid = '{self.timeline_id.hex}'\n", f"neon.tenantid = '{self.tenant_id.hex}'\n", f"neon.pageserver_connstring = ''\n", - f"wal_acceptors = '{safekeepers}'\n", + f"safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", ] diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 6d859b17d2..533a6cfa8c 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1590,12 +1590,12 @@ class Postgres(PgProtocol): if ("synchronous_standby_names" in cfg_line or # don't ask pageserver to fetch WAL from compute "callmemaybe_connstring" in cfg_line or - # don't repeat safekeepers/wal_acceptors multiple times - "wal_acceptors" in cfg_line): + # don't repeat safekeepers multiple times + "safekeepers" in cfg_line): continue f.write(cfg_line) f.write("synchronous_standby_names = 'walproposer'\n") - f.write("wal_acceptors = '{}'\n".format(safekeepers)) + f.write("safekeepers = '{}'\n".format(safekeepers)) return self def config(self, lines: List[str]) -> 'Postgres': From 6a867bce6db5c4a0b6bd0d56c6d6a6df92ef2279 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 26 May 2022 12:48:07 +0300 Subject: [PATCH 33/50] Rename 'zenith_admin' role to 'cloud_admin' --- README.md | 10 +++++----- compute_tools/README.md | 2 +- compute_tools/src/bin/compute_ctl.rs | 2 +- compute_tools/src/monitor.rs | 2 +- control_plane/src/compute.rs | 2 +- docs/settings.md | 6 +++--- libs/utils/scripts/restore_from_wal.sh | 2 +- libs/utils/scripts/restore_from_wal_archive.sh | 2 +- pageserver/src/config.rs | 4 ++-- test_runner/batch_others/test_wal_acceptor.py | 4 ++-- test_runner/batch_others/test_wal_restore.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 4 ++-- vendor/postgres | 2 +- 13 files changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 97927317d8..131d5da110 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ Safekeeper started > ./target/debug/neon_local pg start main Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ... Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 -Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' +Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' # check list of running postgres instances > ./target/debug/neon_local pg list @@ -123,7 +123,7 @@ Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=po 2. 
Now it is possible to connect to postgres and run some queries: ```text -> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); @@ -150,7 +150,7 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: > ./target/debug/neon_local pg start migration_check --branch-name migration_check Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 -Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres' +Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' # check the new list of running postgres instances > ./target/debug/neon_local pg list @@ -160,7 +160,7 @@ Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=po # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55433 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -171,7 +171,7 @@ postgres=# insert into t values(2,2); INSERT 0 1 # check that the new change doesn't affect the 'main' postgres -> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- diff --git a/compute_tools/README.md b/compute_tools/README.md index 15876ed246..97a7513344 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -22,7 +22,7 @@ Also `compute_ctl` spawns two separate service threads: Usage example: ```sh compute_ctl -D /var/db/postgres/compute \ - -C 'postgresql://zenith_admin@localhost/postgres' \ + -C 'postgresql://cloud_admin@localhost/postgres' \ -S /var/db/postgres/specs/current.json \ -b /usr/local/bin/postgres ``` diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b97429c223..2e8d864830 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -21,7 +21,7 @@ //! Usage example: //! ```sh //! compute_ctl -D /var/db/postgres/compute \ -//! -C 'postgresql://zenith_admin@localhost/postgres' \ +//! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres //! ``` diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 496a5aae3b..041b4875bd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -43,7 +43,7 @@ fn watch_compute_activity(compute: &Arc) { FROM pg_stat_activity WHERE backend_type = 'client backend' AND pid != pg_backend_pid() - AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors? + AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? 
&[], ); let mut last_active = compute.state.read().unwrap().last_active; diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index e81dddc287..d2d1d840c9 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -499,7 +499,7 @@ impl PostgresNode { "host={} port={} user={} dbname={}", self.address.ip(), self.address.port(), - "zenith_admin", + "cloud_admin", "postgres" ) } diff --git a/docs/settings.md b/docs/settings.md index 7773dbf17f..98439a094c 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -23,7 +23,7 @@ gc_horizon = '67108864' max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zenith_admin' +initial_superuser_name = 'cloud_admin' broker_etcd_prefix = 'neon' broker_endpoints = ['some://etcd'] @@ -38,7 +38,7 @@ Yet, it validates the config values it can (e.g. postgres install dir) and error Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and -- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'` +- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'cloud_admin'` - or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` @@ -115,7 +115,7 @@ WAL retention duration for PITR branching. Default is 30 days. Name of the initial superuser role, passed to initdb when a new tenant is initialized. It doesn't affect anything after initialization. The -default is Note: The default is 'zenith_admin', and the console +default is Note: The default is 'cloud_admin', and the console depends on that, so if you change it, bad things will happen. 
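(Illustrative sketch only, not part of the patches: the two `[remote_storage]` placements that the settings.md note above describes, reusing that note's own `foo = 2` placeholder rather than real remote-storage keys.)

```toml
# Placement 1: a [remote_storage] table must come after all table-less values.
initial_superuser_name = 'cloud_admin'
broker_etcd_prefix = 'neon'

[remote_storage]
foo = 2

# Placement 2: rewritten as an inline table, it may sit among the table-less values:
# remote_storage = {foo = 2}
```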
#### page_cache_size diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 4983449f24..9bd860affb 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -5,7 +5,7 @@ DATA_DIR=$3 PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 diff --git a/libs/utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh index 07f4fe1e4f..ce58b349fc 100755 --- a/libs/utils/scripts/restore_from_wal_archive.sh +++ b/libs/utils/scripts/restore_from_wal_archive.sh @@ -5,7 +5,7 @@ PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR /tmp/pg_wals mkdir /tmp/pg_wals -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8add7b8b8f..f44b0846a8 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -34,7 +34,7 @@ pub mod defaults { pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; + pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -499,7 +499,7 @@ impl PageServerConf { max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - superuser: "zenith_admin".to_string(), + superuser: "cloud_admin".to_string(), workdir: repo_dir, pg_distrib_dir: PathBuf::new(), auth_type: AuthType::Trust, diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index b176faa46a..97bac5fed4 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -473,7 +473,7 @@ class ProposerPostgres(PgProtocol): tenant_id: uuid.UUID, listen_addr: str, port: int): - super().__init__(host=listen_addr, port=port, user='zenith_admin', dbname='postgres') + super().__init__(host=listen_addr, port=port, user='cloud_admin', dbname='postgres') self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin @@ -529,7 +529,7 @@ class ProposerPostgres(PgProtocol): def initdb(self): """ Run initdb """ - args = ["initdb", "-U", "zenith_admin", "-D", self.pg_data_dir_path()] + args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()] self.pg_bin.run(args) def start(self): diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index eacc742880..69249e75ff 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -32,4 
+32,4 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, str(port) ]) restored.start() - assert restored.safe_psql('select count(*) from t', user='zenith_admin') == [(300000, )] + assert restored.safe_psql('select count(*) from t', user='cloud_admin') == [(300000, )] diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 533a6cfa8c..4459e0ac55 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1226,7 +1226,7 @@ class ZenithPageserver(PgProtocol): Initializes the repository via `zenith init`. """ def __init__(self, env: ZenithEnv, port: PageserverPort, config_override: Optional[str] = None): - super().__init__(host='localhost', port=port.pg, user='zenith_admin') + super().__init__(host='localhost', port=port.pg, user='cloud_admin') self.env = env self.running = False self.service_port = port @@ -1495,7 +1495,7 @@ def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]: class Postgres(PgProtocol): """ An object representing a running postgres daemon. """ def __init__(self, env: ZenithEnv, tenant_id: uuid.UUID, port: int): - super().__init__(host='localhost', port=port, user='zenith_admin', dbname='postgres') + super().__init__(host='localhost', port=port, user='cloud_admin', dbname='postgres') self.env = env self.running = False self.node_name: Optional[str] = None # dubious, see asserts below diff --git a/vendor/postgres b/vendor/postgres index 165e61b5e0..7a2aa6035b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 165e61b5e0a7e003b28d8dca7a6825b3a03f065d +Subproject commit 7a2aa6035bf0b4f676597b7b90de7fee20824fff From 67d6ff41009a38a8c96e7058737220518f2267c5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 26 May 2022 21:18:52 +0300 Subject: [PATCH 34/50] Rename custom GUCs: - zenith.zenith_tenant -> neon.tenant_id - zenith.zenith_timeline -> neon.timeline_id --- compute_tools/src/bin/compute_ctl.rs | 4 ++-- compute_tools/tests/cluster_spec.json | 4 ++-- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/compute.rs | 8 ++++---- .../batch_others/test_ancestor_branch.py | 6 +++--- .../batch_others/test_branch_behind.py | 2 +- .../batch_others/test_broken_timeline.py | 2 +- .../batch_others/test_gc_aggressive.py | 2 +- .../batch_others/test_old_request_lsn.py | 2 +- test_runner/batch_others/test_pitr_gc.py | 2 +- .../batch_others/test_remote_storage.py | 4 ++-- .../batch_others/test_tenant_relocation.py | 2 +- .../test_tenants_with_remote_storage.py | 8 ++++---- .../batch_others/test_timeline_size.py | 2 +- test_runner/batch_others/test_wal_acceptor.py | 20 +++++++++---------- .../batch_others/test_wal_acceptor_async.py | 4 ++-- test_runner/batch_others/test_wal_restore.py | 2 +- test_runner/fixtures/compare_fixtures.py | 2 +- test_runner/fixtures/zenith_fixtures.py | 2 +- vendor/postgres | 2 +- 20 files changed, 41 insertions(+), 41 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2e8d864830..ba116af11b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -121,12 +121,12 @@ fn main() -> Result<()> { let tenant = spec .cluster .settings - .find("neon.tenantid") + .find("neon.tenant_id") .expect("tenant id should be provided"); let timeline = spec .cluster .settings - .find("neon.timelineid") + .find("neon.timeline_id") .expect("tenant id should be provided"); let compute_state = ComputeNode { diff --git 
a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 5d8104ab4c..bdd6e60a69 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -150,7 +150,7 @@ "vartype": "integer" }, { - "name": "neon.tenantid", + "name": "neon.tenant_id", "value": "b0554b632bd4d547a63b86c3630317e8", "vartype": "string" }, @@ -160,7 +160,7 @@ "vartype": "integer" }, { - "name": "neon.timelineid", + "name": "neon.timeline_id", "value": "2414a61ffc94e428f14b5758fe308e13", "vartype": "string" }, diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 9e606ec7c2..1f2e188398 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenantid = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timelineid = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index d2d1d840c9..06a14d8a41 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -148,8 +148,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: ZTimelineId = conf.parse_field("neon.timelineid", &context)?; - let tenant_id: ZTenantId = conf.parse_field("neon.tenantid", &context)?; + let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?; + let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("safekeepers").is_some(); // parse recovery_target_lsn, if any @@ -306,8 +306,8 @@ impl PostgresNode { conf.append("shared_preload_libraries", "neon"); conf.append_line(""); conf.append("neon.pageserver_connstring", &pageserver_connstr); - conf.append("neon.tenantid", &self.tenant_id.to_string()); - conf.append("neon.timelineid", &self.timeline_id.to_string()); + conf.append("neon.tenant_id", &self.tenant_id.to_string()); + conf.append("neon.timeline_id", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } diff --git 
a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index e05a550fdf..d87bebcc11 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -30,7 +30,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_cur.execute("SHOW neon.timelineid") + branch0_cur.execute("SHOW neon.timeline_id") branch0_timeline = branch0_cur.fetchone()[0] log.info(f"b0 timeline {branch0_timeline}") @@ -55,7 +55,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_cur.execute("SHOW neon.timelineid") + branch1_cur.execute("SHOW neon.timeline_id") branch1_timeline = branch1_cur.fetchone()[0] log.info(f"b1 timeline {branch1_timeline}") @@ -79,7 +79,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_cur.execute("SHOW neon.timelineid") + branch2_cur.execute("SHOW neon.timeline_id") branch2_timeline = branch2_cur.fetchone()[0] log.info(f"b2 timeline {branch2_timeline}") diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 9bb04f574b..7a00ecfca2 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -31,7 +31,7 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW neon.timelineid") + main_cur.execute("SHOW neon.timeline_id") timeline = main_cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index 45fe69748d..05391f7e4d 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -26,7 +26,7 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline_id = cur.fetchone()[0] pg.stop() tenant_timelines.append((tenant_id, timeline_id, pg)) diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 6beee49d2f..79af54c1de 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -62,7 +62,7 @@ def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder): conn = pg.connect() cur = conn.cursor() - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index 1ec429ea34..fd0cbe26cc 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -26,7 +26,7 @@ def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] psconn = env.pageserver.connect() diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index 6456acd214..1a1562ca5f 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -25,7 +25,7 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW neon.timelineid") + main_cur.execute("SHOW neon.timeline_id") timeline = main_cur.fetchone()[0] # Create table diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index e5c94980f0..e7097e2ef5 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -48,8 +48,8 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] checkpoint_numbers = range(1, 3) diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 6ad9c6305f..af96cc8524 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -130,7 +130,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, with closing(tenant_pg.connect()) as conn: with conn.cursor() as cur: # save timeline for later gc call - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline = UUID(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline.hex) diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index 8eb72437fd..dbe07c4aba 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -21,8 +21,8 @@ async def tenant_workload(env: ZenithEnv, pg: Postgres): pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show neon.tenantid") - timeline_id = await pg_conn.fetchval("show neon.timelineid") + tenant_id = await pg_conn.fetchval("show neon.tenant_id") + timeline_id = await pg_conn.fetchval("show neon.timeline_id") await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): @@ -82,9 +82,9 @@ def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): for tenant, pg in tenants_pgs: with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("show neon.tenantid") + cur.execute("show neon.tenant_id") tenant_id = cur.fetchone()[0] - cur.execute("show neon.timelineid") + cur.execute("show neon.timeline_id") timeline_id = cur.fetchone()[0] cur.execute("SELECT pg_current_wal_flush_lsn()") current_lsn = lsn_from_hex(cur.fetchone()[0]) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 86f9ed247b..d43e793df8 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -21,7 +21,7 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): with 
closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 97bac5fed4..fd80313f94 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -337,8 +337,8 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn zenith timeline from compute - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -384,8 +384,8 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): cur.execute('CREATE TABLE t(key int primary key, value text)') cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # force checkpoint to advance remote_consistent_lsn with closing(env.pageserver.connect()) as psconn: @@ -498,8 +498,8 @@ class ProposerPostgres(PgProtocol): cfg = [ "synchronous_standby_names = 'walproposer'\n", "shared_preload_libraries = 'neon'\n", - f"neon.timelineid = '{self.timeline_id.hex}'\n", - f"neon.tenantid = '{self.tenant_id.hex}'\n", + f"neon.timeline_id = '{self.timeline_id.hex}'\n", + f"neon.tenant_id = '{self.tenant_id.hex}'\n", f"neon.pageserver_connstring = ''\n", f"safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", @@ -612,8 +612,8 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): wa_http_cli.check_status() # learn zenith timeline from compute - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) @@ -798,8 +798,8 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): pg.start() # learn zenith timeline from compute - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] - timeline_id = pg.safe_psql("show neon.timelineid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 1e7edcc8df..bd3b3027c5 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -151,8 +151,8 @@ async def run_restarts_under_load(env: ZenithEnv, test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show neon.tenantid") - timeline_id = await pg_conn.fetchval("show neon.timelineid") + tenant_id = await pg_conn.fetchval("show neon.tenant_id") + 
timeline_id = await pg_conn.fetchval("show neon.timeline_id") bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index 69249e75ff..85c6e776c5 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -19,7 +19,7 @@ def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, env.zenith_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = pg.safe_psql("show neon.tenantid")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] env.zenith_cli.pageserver_stop() port = port_distributor.get_port() data_dir = os.path.join(test_output_dir, 'pgsql.restored') diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index f5a97b5a84..c61bc6d81f 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -66,7 +66,7 @@ class ZenithCompare(PgCompare): # We only use one branch and one timeline self.env.zenith_cli.create_branch(branch_name, 'empty') self._pg = self.env.postgres.create_start(branch_name) - self.timeline = self.pg.safe_psql("SHOW neon.timelineid")[0][0] + self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] # Long-lived cursor, useful for flushing self.psconn = self.env.pageserver.connect() diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 4459e0ac55..5f3c16c4e6 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -2039,7 +2039,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos # Get the timeline ID. We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW neon.timelineid") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] # stop postgres to ensure that files won't change diff --git a/vendor/postgres b/vendor/postgres index 7a2aa6035b..a424e3ccff 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 7a2aa6035bf0b4f676597b7b90de7fee20824fff +Subproject commit a424e3ccff7d6af97d9ee5d4b727fb8324c78e11 From 915e5c911483ca10716615bf2e14574710e6844e Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 26 May 2022 19:18:32 +0300 Subject: [PATCH 35/50] Rename 'zenith_admin' to 'cloud_admin' on compute node start --- compute_tools/src/compute.rs | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index fd60b80305..a2e6874a28 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -262,7 +262,30 @@ impl ComputeNode { .unwrap_or_else(|| "5432".to_string()); wait_for_postgres(&mut pg, &port, pgdata_path)?; - let mut client = Client::connect(&self.connstr, NoTls)?; + // If connection fails, + // it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin`name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. 
+ let mut client = match Client::connect(&self.connstr, NoTls) { + Err(e) => { + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1); + + let mut client = Client::connect(&zenith_admin_connstr, NoTls)?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + drop(client); + + // reconnect with connsting with expected name + Client::connect(&self.connstr, NoTls)? + } + Ok(client) => client, + }; handle_roles(&self.spec, &mut client)?; handle_databases(&self.spec, &mut client)?; From e014cb6026f5b5f0105a7db5f81ac16affd9a1a7 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 30 May 2022 12:03:04 +0300 Subject: [PATCH 36/50] rename zenith.zenith_tenant to neon.tenant_id in test --- test_runner/batch_others/test_wal_acceptor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index fd80313f94..35b7d9585a 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -431,8 +431,8 @@ def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): pg = env.postgres.create_start('test_safekeepers_wal_backup') # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] pg_conn = pg.connect() cur = pg_conn.cursor() From 36281e3b475ac46570dd4f89a61fc525ff3f0a1c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 28 May 2022 07:13:15 +0400 Subject: [PATCH 37/50] Extend test_wal_backup with compute restart. 
--- safekeeper/src/wal_backup.rs | 4 +- test_runner/batch_others/test_wal_acceptor.py | 40 +++++++++++++------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index a4b779649d..1723d03ee3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -204,6 +204,7 @@ impl WalBackupTask { l.give_up().await; } + info!("acquiring leadership"); match broker::get_leader(&self.election).await { Ok(l) => { self.leader = Some(l); @@ -214,6 +215,7 @@ impl WalBackupTask { continue; } } + info!("acquired leadership"); // offload loop loop { @@ -268,7 +270,7 @@ impl WalBackupTask { { Ok(leader) => { if !leader { - info!("leader has changed"); + info!("lost leadership"); break; } } diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 35b7d9585a..40a9b48a18 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -414,6 +414,22 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): time.sleep(0.5) +def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): + started_at = time.time() + http_cli = live_sk.http_client() + while True: + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"live sk status is {tli_status}") + + if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded") + time.sleep(0.5) + + @pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): zenith_env_builder.num_safekeepers = 3 @@ -446,23 +462,21 @@ def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str): # roughly fills one segment cur.execute("insert into t select generate_series(1,250000), 'payload'") live_sk = [sk for sk in env.safekeepers if sk != victim][0] - http_cli = live_sk.http_client() - started_at = time.time() - while True: - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"live sk status is {tli_status}") - - if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): - break - elapsed = time.time() - started_at - if elapsed > 20: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s segment ending at {seg_end} get offloaded") - time.sleep(0.5) + wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end) victim.start() + # put one of safekeepers down again + env.safekeepers[0].stop() + # restart postgres + pg.stop_and_destroy().create_start('test_safekeepers_wal_backup') + # and ensure offloading still works + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("insert into t select generate_series(1,250000), 'payload'") + wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000') + class ProposerPostgres(PgProtocol): """Object for running postgres without ZenithEnv""" From c3e0b6c839fa37bc9734a09dc8288d577557cb27 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 31 May 2022 11:10:50 +0300 Subject: [PATCH 38/50] Implement timeline-based metrics in safekeeper (#1823) Now there's timelines metrics collector, which goes through all timelines and reports metrics only for active ones --- libs/metrics/src/lib.rs | 1 + safekeeper/src/bin/safekeeper.rs | 6 + safekeeper/src/lib.rs | 1 + safekeeper/src/metrics.rs 
| 336 +++++++++++++++++++++++++++++++ safekeeper/src/safekeeper.rs | 37 +--- safekeeper/src/timeline.rs | 38 ++++ safekeeper/src/wal_storage.rs | 14 +- 7 files changed, 384 insertions(+), 49 deletions(-) create mode 100644 safekeeper/src/metrics.rs diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 9929fc6d45..3b5da9f7ff 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,6 +3,7 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use lazy_static::lazy_static; +pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e792a854d5..9feb984c4f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -264,6 +264,12 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo } } + // Register metrics collector for active timelines. It's important to do this + // after daemonizing, otherwise process collector will be upset. + let registry = metrics::default_registry(); + let timeline_collector = safekeeper::metrics::TimelineCollector::new(); + registry.register(Box::new(timeline_collector))?; + let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index c092f5185b..1fae9b00f8 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -14,6 +14,7 @@ pub mod control_file_upgrade; pub mod handler; pub mod http; pub mod json_ctrl; +pub mod metrics; pub mod receive_wal; pub mod remove_wal; pub mod safekeeper; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs new file mode 100644 index 0000000000..5a2e5f125f --- /dev/null +++ b/safekeeper/src/metrics.rs @@ -0,0 +1,336 @@ +//! This module exports metrics for all active timelines. 
+ +use std::time::{Instant, SystemTime}; + +use metrics::{ + core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, + proto::MetricFamily, + Gauge, IntGaugeVec, +}; +use postgres_ffi::xlog_utils::XLogSegNo; +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + +use crate::{ + safekeeper::{SafeKeeperState, SafekeeperMemState}, + timeline::{GlobalTimelines, ReplicaState}, +}; + +pub struct FullTimelineInfo { + pub zttid: ZTenantTimelineId, + pub replicas: Vec, + pub wal_backup_active: bool, + pub timeline_is_active: bool, + pub num_computes: u32, + pub last_removed_segno: XLogSegNo, + + pub epoch_start_lsn: Lsn, + pub mem_state: SafekeeperMemState, + pub persisted_state: SafeKeeperState, + + pub flush_lsn: Lsn, +} + +pub struct TimelineCollector { + descs: Vec, + commit_lsn: GenericGaugeVec, + backup_lsn: GenericGaugeVec, + flush_lsn: GenericGaugeVec, + epoch_start_lsn: GenericGaugeVec, + peer_horizon_lsn: GenericGaugeVec, + remote_consistent_lsn: GenericGaugeVec, + feedback_ps_write_lsn: GenericGaugeVec, + feedback_last_time_seconds: GenericGaugeVec, + timeline_active: GenericGaugeVec, + wal_backup_active: GenericGaugeVec, + connected_computes: IntGaugeVec, + disk_usage: GenericGaugeVec, + acceptor_term: GenericGaugeVec, + collect_timeline_metrics: Gauge, +} + +impl Default for TimelineCollector { + fn default() -> Self { + Self::new() + } +} + +impl TimelineCollector { + pub fn new() -> TimelineCollector { + let mut descs = Vec::new(); + + let commit_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_commit_lsn", + "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(commit_lsn.desc().into_iter().cloned()); + + let backup_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_backup_lsn", + "Current backup_lsn, up to which WAL is backed up, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(backup_lsn.desc().into_iter().cloned()); + + let flush_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_flush_lsn", + "Current flush_lsn, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flush_lsn.desc().into_iter().cloned()); + + let epoch_start_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_epoch_start_lsn", + "Point since which compute generates new WAL in the current consensus term", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(epoch_start_lsn.desc().into_iter().cloned()); + + let peer_horizon_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_peer_horizon_lsn", + "LSN of the most lagging safekeeper", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(peer_horizon_lsn.desc().into_iter().cloned()); + + let remote_consistent_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_remote_consistent_lsn", + "LSN which is persisted to the remote storage in pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(remote_consistent_lsn.desc().into_iter().cloned()); + + let feedback_ps_write_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_feedback_ps_write_lsn", + "Last LSN received by the pageserver, acknowledged in the feedback", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned()); + + let feedback_last_time_seconds = GenericGaugeVec::new( + Opts::new( + "safekeeper_feedback_last_time_seconds", + "Timestamp of the last feedback from the pageserver", + ), + 
&["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(feedback_last_time_seconds.desc().into_iter().cloned()); + + let timeline_active = GenericGaugeVec::new( + Opts::new( + "safekeeper_timeline_active", + "Reports 1 for active timelines, 0 for inactive", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(timeline_active.desc().into_iter().cloned()); + + let wal_backup_active = GenericGaugeVec::new( + Opts::new( + "safekeeper_wal_backup_active", + "Reports 1 for timelines with active WAL backup, 0 otherwise", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(wal_backup_active.desc().into_iter().cloned()); + + let connected_computes = IntGaugeVec::new( + Opts::new( + "safekeeper_connected_computes", + "Number of active compute connections", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(connected_computes.desc().into_iter().cloned()); + + let disk_usage = GenericGaugeVec::new( + Opts::new( + "safekeeper_disk_usage_bytes", + "Estimated disk space used to store WAL segments", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(disk_usage.desc().into_iter().cloned()); + + let acceptor_term = GenericGaugeVec::new( + Opts::new("safekeeper_acceptor_term", "Current consensus term"), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(acceptor_term.desc().into_iter().cloned()); + + let collect_timeline_metrics = Gauge::new( + "safekeeper_collect_timeline_metrics_seconds", + "Time spent collecting timeline metrics, including obtaining mutex lock for all timelines", + ) + .unwrap(); + descs.extend(collect_timeline_metrics.desc().into_iter().cloned()); + + TimelineCollector { + descs, + commit_lsn, + backup_lsn, + flush_lsn, + epoch_start_lsn, + peer_horizon_lsn, + remote_consistent_lsn, + feedback_ps_write_lsn, + feedback_last_time_seconds, + timeline_active, + wal_backup_active, + connected_computes, + disk_usage, + acceptor_term, + collect_timeline_metrics, + } + } +} + +impl Collector for TimelineCollector { + fn desc(&self) -> Vec<&Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let start_collecting = Instant::now(); + + // reset all metrics to clean up inactive timelines + self.commit_lsn.reset(); + self.backup_lsn.reset(); + self.flush_lsn.reset(); + self.epoch_start_lsn.reset(); + self.peer_horizon_lsn.reset(); + self.remote_consistent_lsn.reset(); + self.feedback_ps_write_lsn.reset(); + self.feedback_last_time_seconds.reset(); + self.timeline_active.reset(); + self.wal_backup_active.reset(); + self.connected_computes.reset(); + self.disk_usage.reset(); + self.acceptor_term.reset(); + + let timelines = GlobalTimelines::active_timelines_metrics(); + + for tli in timelines { + let tenant_id = tli.zttid.tenant_id.to_string(); + let timeline_id = tli.zttid.timeline_id.to_string(); + let labels = &[tenant_id.as_str(), timeline_id.as_str()]; + + let mut most_advanced: Option = None; + for replica in tli.replicas.iter() { + if let Some(replica_feedback) = replica.zenith_feedback { + if let Some(current) = most_advanced { + if current.ps_writelsn < replica_feedback.ps_writelsn { + most_advanced = Some(replica_feedback); + } + } else { + most_advanced = Some(replica_feedback); + } + } + } + + self.commit_lsn + .with_label_values(labels) + .set(tli.mem_state.commit_lsn.into()); + self.backup_lsn + .with_label_values(labels) + .set(tli.mem_state.backup_lsn.into()); + self.flush_lsn + .with_label_values(labels) + .set(tli.flush_lsn.into()); + self.epoch_start_lsn + 
.with_label_values(labels) + .set(tli.epoch_start_lsn.into()); + self.peer_horizon_lsn + .with_label_values(labels) + .set(tli.mem_state.peer_horizon_lsn.into()); + self.remote_consistent_lsn + .with_label_values(labels) + .set(tli.mem_state.remote_consistent_lsn.into()); + self.timeline_active + .with_label_values(labels) + .set(tli.timeline_is_active as u64); + self.wal_backup_active + .with_label_values(labels) + .set(tli.wal_backup_active as u64); + self.connected_computes + .with_label_values(labels) + .set(tli.num_computes as i64); + self.acceptor_term + .with_label_values(labels) + .set(tli.persisted_state.acceptor_state.term as u64); + + if let Some(feedback) = most_advanced { + self.feedback_ps_write_lsn + .with_label_values(labels) + .set(feedback.ps_writelsn); + if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH) + { + self.feedback_last_time_seconds + .with_label_values(labels) + .set(unix_time.as_secs()); + } + } + + if tli.last_removed_segno != 0 { + let segno_count = tli + .flush_lsn + .segment_number(tli.persisted_state.server.wal_seg_size as usize) + - tli.last_removed_segno; + let disk_usage_bytes = segno_count * tli.persisted_state.server.wal_seg_size as u64; + self.disk_usage + .with_label_values(labels) + .set(disk_usage_bytes); + } + } + + // collect MetricFamilys. + let mut mfs = Vec::new(); + mfs.extend(self.commit_lsn.collect()); + mfs.extend(self.backup_lsn.collect()); + mfs.extend(self.flush_lsn.collect()); + mfs.extend(self.epoch_start_lsn.collect()); + mfs.extend(self.peer_horizon_lsn.collect()); + mfs.extend(self.remote_consistent_lsn.collect()); + mfs.extend(self.feedback_ps_write_lsn.collect()); + mfs.extend(self.feedback_last_time_seconds.collect()); + mfs.extend(self.timeline_active.collect()); + mfs.extend(self.wal_backup_active.collect()); + mfs.extend(self.connected_computes.collect()); + mfs.extend(self.disk_usage.collect()); + mfs.extend(self.acceptor_term.collect()); + + // report time it took to collect all info + let elapsed = start_collecting.elapsed().as_secs_f64(); + self.collect_timeline_metrics.set(elapsed); + mfs.extend(self.collect_timeline_metrics.collect()); + + mfs + } +} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index df4b202063..1c00af7043 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -15,13 +15,10 @@ use std::fmt; use std::io::Read; use tracing::*; -use lazy_static::lazy_static; - use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use metrics::{register_gauge_vec, Gauge, GaugeVec}; use postgres_ffi::xlog_utils::MAX_SEND_SIZE; use utils::{ bin_ser::LeSer, @@ -487,45 +484,16 @@ impl AcceptorProposerMessage { } } -lazy_static! { - // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). - // i64 is faster than f64, so update to u64 when available. 
- static ref COMMIT_LSN_GAUGE: GaugeVec = register_gauge_vec!( - "safekeeper_commit_lsn", - "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("Failed to register safekeeper_commit_lsn gauge vec"); -} - -struct SafeKeeperMetrics { - commit_lsn: Gauge, - // WAL-related metrics are in WalStorageMetrics -} - -impl SafeKeeperMetrics { - fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self { - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - Self { - commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), - } - } -} - /// SafeKeeper which consumes events (messages from compute) and provides /// replies. pub struct SafeKeeper { - // Cached metrics so we don't have to recompute labels on each update. - metrics: SafeKeeperMetrics, - /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. /// Note: be careful to set only if we are sure our WAL (term history) matches /// committed one. pub global_commit_lsn: Lsn, /// LSN since the proposer safekeeper currently talking to appends WAL; /// determines epoch switch point. - epoch_start_lsn: Lsn, + pub epoch_start_lsn: Lsn, pub inmem: SafekeeperMemState, // in memory part pub state: CTRL, // persistent state storage @@ -555,7 +523,6 @@ where wal_store.init_storage(&state)?; Ok(SafeKeeper { - metrics: SafeKeeperMetrics::new(state.tenant_id, ztli), global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { @@ -757,7 +724,6 @@ where // upgrade. self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); - self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); @@ -777,7 +743,6 @@ where assert!(commit_lsn >= self.inmem.commit_lsn); self.inmem.commit_lsn = commit_lsn; - self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); // If new commit_lsn reached epoch switch, force sync of control // file: walproposer in sync mode is very interested when this diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 74a61410fd..2fc5bcc1f6 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -33,6 +33,7 @@ use crate::safekeeper::{ }; use crate::send_wal::HotStandbyFeedback; +use crate::metrics::FullTimelineInfo; use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; @@ -450,6 +451,33 @@ impl Timeline { shared_state.active } + /// Returns full timeline info, required for the metrics. + /// If the timeline is not active, returns None instead. 
+ pub fn info_for_metrics(&self) -> Option { + let shared_state = self.mutex.lock().unwrap(); + if !shared_state.active { + return None; + } + + Some(FullTimelineInfo { + zttid: self.zttid, + replicas: shared_state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: shared_state.wal_backup_active, + timeline_is_active: shared_state.active, + num_computes: shared_state.num_computes, + last_removed_segno: shared_state.last_removed_segno, + epoch_start_lsn: shared_state.sk.epoch_start_lsn, + mem_state: shared_state.sk.inmem.clone(), + persisted_state: shared_state.sk.state.clone(), + flush_lsn: shared_state.sk.wal_store.flush_lsn(), + }) + } + /// Timed wait for an LSN to be committed. /// /// Returns the last committed LSN, which will be at least @@ -777,6 +805,16 @@ impl GlobalTimelines { .collect() } + /// Return FullTimelineInfo for all active timelines. + pub fn active_timelines_metrics() -> Vec { + let state = TIMELINES_STATE.lock().unwrap(); + state + .timelines + .iter() + .filter_map(|(_, tli)| tli.info_for_metrics()) + .collect() + } + fn delete_force_internal( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 7285cedc03..e3f1ce7333 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -31,20 +31,11 @@ use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; use postgres_ffi::waldecoder::WalStreamDecoder; -use metrics::{ - register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec, - DISK_WRITE_SECONDS_BUCKETS, -}; +use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; lazy_static! { // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). // i64 is faster than f64, so update to u64 when available. - static ref FLUSH_LSN_GAUGE: GaugeVec = register_gauge_vec!( - "safekeeper_flush_lsn", - "Current flush_lsn, grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("Failed to register safekeeper_flush_lsn gauge vec"); static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!( "safekeeper_write_wal_bytes", "Bytes written to WAL in a single request, grouped by timeline", @@ -69,7 +60,6 @@ lazy_static! { } struct WalStorageMetrics { - flush_lsn: Gauge, write_wal_bytes: Histogram, write_wal_seconds: Histogram, flush_wal_seconds: Histogram, @@ -80,7 +70,6 @@ impl WalStorageMetrics { let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); Self { - flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), flush_wal_seconds: FLUSH_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), @@ -171,7 +160,6 @@ impl PhysicalStorage { /// Wrapper for flush_lsn updates that also updates metrics. fn update_flush_lsn(&mut self) { self.flush_record_lsn = self.write_record_lsn; - self.metrics.flush_lsn.set(self.flush_record_lsn.0 as f64); } /// Call fdatasync if config requires so. From 595a6bc1e15390782b38d2cdf48a5bb24b7a061b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 31 May 2022 14:47:06 +0300 Subject: [PATCH 39/50] Bump vendor/postgres to fix basebackup LSN comparison. 
(#1835) Co-authored-by: Arseny Sher --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index a424e3ccff..8a6cc09624 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit a424e3ccff7d6af97d9ee5d4b727fb8324c78e11 +Subproject commit 8a6cc09624fe921b6191f1f524a8051dc476404e From 54e163ac03c9ba556d8055cc81d3a70825bf5aaa Mon Sep 17 00:00:00 2001 From: Ryan Russell Date: Mon, 30 May 2022 07:00:23 -0500 Subject: [PATCH 40/50] Improve Readability in Docs Signed-off-by: Ryan Russell --- docs/README.md | 2 +- docs/glossary.md | 2 +- docs/rfcs/003-laptop-cli.md | 4 ++-- docs/rfcs/006-laptop-cli-v2-repository-structure.md | 2 +- docs/rfcs/009-snapshot-first-storage-cli.md | 2 +- docs/rfcs/009-snapshot-first-storage-pitr.md | 2 +- docs/rfcs/009-snapshot-first-storage.md | 2 +- docs/rfcs/010-storage_details.md | 4 ++-- docs/rfcs/013-term-history.md | 2 +- docs/settings.md | 2 +- pageserver/src/layered_repository/README.md | 2 +- safekeeper/README_PROTO.md | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/README.md b/docs/README.md index 886363dccc..60114c5fd5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,7 +6,7 @@ - [docker.md](docker.md) — Docker images and building pipeline. - [glossary.md](glossary.md) — Glossary of all the terms used in codebase. - [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. -- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout. +- [sourcetree.md](sourcetree.md) — Overview of the source tree layout. - [pageserver/README.md](/pageserver/README.md) — pageserver overview. - [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview. - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. diff --git a/docs/glossary.md b/docs/glossary.md index a5bb154793..7aeae27a39 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -2,7 +2,7 @@ ### Authentication -### Backpresssure +### Backpressure Backpressure is used to limit the lag between pageserver and compute node or WAL service. diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 8520249bf1..1a549c2df5 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -136,9 +136,9 @@ s3tank 80G S3 ## pg -Manages postgres data directories and can start postgreses with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themself. +Manages postgres data directories and can start postgres instances with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themselves. -Pg is a term for a single postgres running on some data. I'm trying to avoid here separation of datadir management and postgres instance management -- both that concepts bundled here together. +Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. 
**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index ee4e432182..e6e6e172ad 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -121,7 +121,7 @@ repository, launch an instance on the same branch in both clones, and later try to push/pull between them? Perhaps create a new timeline every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept -that we have in the WAL safekeepr +that we have in the WAL safekeeper ### zenith checkout/commit diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 0139569721..0acbd68f86 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postges to zenith. +Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith. So here is an attempt to design consistent CLI for different usage scenarios: diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md index a4d978324b..29d3614d34 100644 --- a/docs/rfcs/009-snapshot-first-storage-pitr.md +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -192,7 +192,7 @@ for a particular relation readily available alongside the snapshot files, and you don't need to track what snapshot LSNs exist separately. -(If we wanted to minize the number of files, you could include the +(If we wanted to minimize the number of files, you could include the snapshot @300 and the WAL between 200 and 300 in the same file, but I feel it's probably better to keep them separate) diff --git a/docs/rfcs/009-snapshot-first-storage.md b/docs/rfcs/009-snapshot-first-storage.md index aeef54898a..75ed490f21 100644 --- a/docs/rfcs/009-snapshot-first-storage.md +++ b/docs/rfcs/009-snapshot-first-storage.md @@ -121,7 +121,7 @@ The properties of s3 that we depend on are: list objects streaming read of entire object read byte range from object -streaming write new object (may use multipart upload for better relialibity) +streaming write new object (may use multipart upload for better reliability) delete object (that should not disrupt an already-started read). Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, Corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully. 
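The list of object-store operations above is the whole surface the snapshot-first design asks of S3. As a rough sketch, the interface it implies could look like the following; the trait and type names are illustrative only and are not the repository's actual remote_storage API:

```rust
use std::io::Read;
use std::ops::Range;

type StorageResult<T> = Result<T, std::io::Error>;

/// Hypothetical trait mirroring the five S3 properties listed above.
trait ObjectStore {
    /// list objects under a prefix
    fn list(&self, prefix: &str) -> StorageResult<Vec<String>>;
    /// streaming read of an entire object
    fn read(&self, key: &str) -> StorageResult<Box<dyn Read>>;
    /// read a byte range from an object
    fn read_range(&self, key: &str, range: Range<u64>) -> StorageResult<Box<dyn Read>>;
    /// streaming write of a new object; an implementation may split this
    /// into a multipart upload for better reliability
    fn write(&self, key: &str, data: &mut dyn Read) -> StorageResult<()>;
    /// delete an object without disrupting reads that already started
    fn delete(&self, key: &str) -> StorageResult<()>;
}
```

Content validation sits on top of whatever implements such an interface: callers still have to verify that fetched objects contain what they are supposed to and fail gracefully otherwise.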
diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md index 5c279b7dc8..bc79924e7b 100644 --- a/docs/rfcs/010-storage_details.md +++ b/docs/rfcs/010-storage_details.md @@ -40,7 +40,7 @@ b) overwrite older pages with the newer pages -- if there is no replica we proba I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete. -With option b) we can also treat PageStor as an uncompleted increamental snapshot. +With option b) we can also treat PageStor as an uncompleted incremental snapshot. ### LocalStore @@ -131,7 +131,7 @@ As for exact data that should go to snapshots I think it is the following for ea It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if would store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))). -1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when realtion_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset delatas would be small). +1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when relation_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small). 2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1Gb files with the size of metadata ~ 1Mb then the time to transfer this metadata would be comparable with access latency itself (which is about a half of a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor. I think both of that optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 7fe505456d..59833526c5 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -7,7 +7,7 @@ and e.g. prevents electing two proposers with the same term -- it is actually called `term` in the code. The second, called `epoch`, reflects progress of log receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. 
This roughly -correspones to proposed in +corresponds to proposed in https://github.com/zenithdb/rfcs/pull/3/files diff --git a/docs/settings.md b/docs/settings.md index 98439a094c..0ca7223faa 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -185,7 +185,7 @@ If no IAM bucket access is used during the remote storage usage, use the `AWS_AC ###### General remote storage configuration -Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. +Pageserver allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. No default values are used for the remote storage configuration parameters. Besides, there are parameters common for all types of remote storage that can be configured, those have defaults: diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index 70c571a507..81f585d2e2 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -260,7 +260,7 @@ Whenever a GetPage@LSN request comes in from the compute node, the page server needs to reconstruct the requested page, as it was at the requested LSN. To do that, the page server first checks the recent in-memory layer; if the requested page version is found there, it can -be returned immediatedly without looking at the files on +be returned immediately without looking at the files on disk. Otherwise the page server needs to locate the layer file that contains the requested page version. diff --git a/safekeeper/README_PROTO.md b/safekeeper/README_PROTO.md index 0cd1f510e6..a2d4fa455d 100644 --- a/safekeeper/README_PROTO.md +++ b/safekeeper/README_PROTO.md @@ -152,7 +152,7 @@ It is assumed that in case of losing local data by some safekeepers, it should b * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `NodeID`: pair (term,UUID) * `Pager`: Neon component restoring pages from WAL stream -* `Replica`: read-only computatio node +* `Replica`: read-only computation node * `VCL`: the largest LSN for which we can guarantee availability of all prior records. 
## Algorithm From c97cd684e0d925cc21d9e484c6d65ba69629b458 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Tue, 31 May 2022 11:20:51 -0400 Subject: [PATCH 41/50] Use `HOMEBREW_PREFIX` instead of hard-coded path (#1833) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e3d183eaee..50e2c8ab7f 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ endif # macOS with brew-installed openssl requires explicit paths UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - PG_CONFIGURE_OPTS += --with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib + PG_CONFIGURE_OPTS += --with-includes=$(HOMEBREW_PREFIX)/opt/openssl/include --with-libraries=$(HOMEBREW_PREFIX)/opt/openssl/lib endif # Choose whether we should be silent or verbose From ca10cc12c1fe40c3ca5c020a219d96aa1f06de92 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 31 May 2022 14:14:09 -0400 Subject: [PATCH 42/50] Close file descriptors for redo process (#1834) --- Cargo.lock | 11 +++++++++ pageserver/Cargo.toml | 1 + pageserver/src/walredo.rs | 49 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index e39375c221..6f8382de27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -363,6 +363,16 @@ dependencies = [ "textwrap 0.14.2", ] +[[package]] +name = "close_fds" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" +dependencies = [ + "cfg-if", + "libc", +] + [[package]] name = "cmake" version = "0.1.48" @@ -1789,6 +1799,7 @@ dependencies = [ "bytes", "chrono", "clap 3.0.14", + "close_fds", "const_format", "crc32c", "crossbeam-utils", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 290f52e0b2..d78d3622c4 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -60,6 +60,7 @@ metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } +close_fds = "0.3.2" [dev-dependencies] hex-literal = "0.3" diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d263bf0e9a..cad211b1bd 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -28,6 +28,7 @@ use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::os::unix::io::AsRawFd; +use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; @@ -554,6 +555,40 @@ impl PostgresRedoManager { } } +/// +/// Command with ability not to give all file descriptors to child process +/// +trait CloseFileDescriptors: CommandExt { + /// + /// Close file descriptors (other than stdin, stdout, stderr) in child process + /// + fn close_fds(&mut self) -> &mut Command; +} + +impl CloseFileDescriptors for C { + fn close_fds(&mut self) -> &mut Command { + unsafe { + self.pre_exec(move || { + // SAFETY: Code executed inside pre_exec should have async-signal-safety, + // which means it should be safe to execute inside a signal handler. + // The precise meaning depends on platform. See `man signal-safety` + // for the linux definition. + // + // The set_fds_cloexec_threadsafe function is documented to be + // async-signal-safe. + // + // Aside from this function, the rest of the code is re-entrant and + // doesn't make any syscalls. 
We're just passing constants. + // + // NOTE: It's easy to indirectly cause a malloc or lock a mutex, + // which is not async-signal-safe. Be careful. + close_fds::set_fds_cloexec_threadsafe(3, &[]); + Ok(()) + }) + } + } +} + /// /// Handle to the Postgres WAL redo process /// @@ -610,6 +645,7 @@ impl PostgresRedoProcess { config.write_all(b"shared_preload_libraries=neon\n")?; config.write_all(b"neon.wal_redo=on\n")?; } + // Start postgres itself let mut child = Command::new(conf.pg_bin_dir().join("postgres")) .arg("--wal-redo") @@ -620,6 +656,19 @@ impl PostgresRedoProcess { .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .env("PGDATA", &datadir) + // The redo process is not trusted, so it runs in seccomp mode + // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't + // inherit any file descriptors from the pageserver that would allow + // an attacker to do bad things. + // + // The Rust standard library makes sure to mark any file descriptors with + // as close-on-exec by default, but that's not enough, since we use + // libraries that directly call libc open without setting that flag. + // + // One example is the pidfile of the daemonize library, which doesn't + // currently mark file descriptors as close-on-exec. Either way, we + // want to be on the safe side and prevent accidental regression. + .close_fds() .spawn() .map_err(|e| { Error::new( From b1b67cc5a055561a3d60c4e0194b0a3103cb8624 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 31 May 2022 19:13:12 +0300 Subject: [PATCH 43/50] improve test normal work to start several computes --- .../batch_others/test_ancestor_branch.py | 4 +- test_runner/batch_others/test_normal_work.py | 47 +++++++++++++++++++ test_runner/batch_others/test_wal_acceptor.py | 19 -------- test_runner/fixtures/zenith_fixtures.py | 22 +++++++-- 4 files changed, 65 insertions(+), 27 deletions(-) create mode 100644 test_runner/batch_others/test_normal_work.py diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index d87bebcc11..78724c434e 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -24,9 +24,7 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): 'compaction_target_size': '4194304', }) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute("failpoints flush-frozen=sleep(10000)") + env.pageserver.safe_psql("failpoints flush-frozen=sleep(10000)") pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/batch_others/test_normal_work.py new file mode 100644 index 0000000000..87dd2d5e18 --- /dev/null +++ b/test_runner/batch_others/test_normal_work.py @@ -0,0 +1,47 @@ +from fixtures.log_helper import log +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient + + +def check_tenant(env: ZenithEnv, pageserver_http: ZenithPageserverHttpClient): + tenant_id, timeline_id = env.zenith_cli.create_tenant() + pg = env.postgres.create_start('main', tenant_id=tenant_id) + # we rely upon autocommit after each statement + res_1 = pg.safe_psql_many(queries=[ + 'CREATE TABLE t(key int primary key, value text)', + 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', + 'SELECT sum(key) FROM t', + ]) + + assert 
res_1[-1][0] == (5000050000, ) + # TODO check detach on live instance + log.info("stopping compute") + pg.stop() + log.info("compute stopped") + + pg.start() + res_2 = pg.safe_psql('SELECT sum(key) FROM t') + assert res_2[0] == (5000050000, ) + + pg.stop() + pageserver_http.timeline_detach(tenant_id, timeline_id) + + +def test_normal_work(zenith_env_builder: ZenithEnvBuilder): + """ + Basic test: + * create new tenant with a timeline + * write some data + * ensure that it was successfully written + * restart compute + * check that the data is there + * stop compute + * detach timeline + + Repeat check for several tenants/timelines. + """ + + env = zenith_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + for _ in range(3): + check_tenant(env, pageserver_http) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 40a9b48a18..007641417e 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -18,25 +18,6 @@ from fixtures.log_helper import log from typing import List, Optional, Any -# basic test, write something in setup with wal acceptors, ensure that commits -# succeed and data is written -def test_normal_work(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() - - env.zenith_cli.create_branch('test_safekeepers_normal_work') - pg = env.postgres.create_start('test_safekeepers_normal_work') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (5000050000, ) - - @dataclass class TimelineMetrics: timeline_id: str diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 5f3c16c4e6..ff905efa53 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -338,18 +338,30 @@ class PgProtocol: conn_options['server_settings'] = {key: val} return await asyncpg.connect(**conn_options) - def safe_psql(self, query: str, **kwargs: Any) -> List[Any]: + def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: """ Execute query against the node and return all rows. This method passes all extra params to connstr. """ + return self.safe_psql_many([query], **kwargs)[0] + def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + """ + Execute queries against the node and return all rows. + This method passes all extra params to connstr. 
+ """ + result: List[List[Any]] = [] with closing(self.connect(**kwargs)) as conn: with conn.cursor() as cur: - cur.execute(query) - if cur.description is None: - return [] # query didn't return data - return cast(List[Any], cur.fetchall()) + for query in queries: + log.info(f"Executing query: {query}") + cur.execute(query) + + if cur.description is None: + result.append([]) # query didn't return data + else: + result.append(cast(List[Any], cur.fetchall())) + return result @dataclass From ff233cf4c28a29086de28627aee2d8753855d77f Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 17:36:35 +0200 Subject: [PATCH 44/50] Use :local compute-tools tag to build compute-node image --- .circleci/config.yml | 24 ++++++++++++++---------- vendor/postgres | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 624d367053..fde6cbd35f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -453,9 +453,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:latest image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -463,7 +460,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:local \ + --tag neondatabase/compute-tools:latest \ + -f Dockerfile.compute-tools . + # Only push :latest image docker push neondatabase/compute-tools:latest - run: name: Init postgres submodule @@ -473,7 +473,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:latest vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:latest @@ -510,9 +512,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:release image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -520,7 +519,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:release -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:release \ + --tag neondatabase/compute-tools:local \ + -f Dockerfile.compute-tools . 
+ # Only push :release image docker push neondatabase/compute-tools:release - run: name: Init postgres submodule @@ -530,7 +532,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:release vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:release diff --git a/vendor/postgres b/vendor/postgres index 8a6cc09624..50b6edfbe0 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 8a6cc09624fe921b6191f1f524a8051dc476404e +Subproject commit 50b6edfbe0c3b171bd6d407652e1e31a4c97aa8b From af6143ea1ffb3987279745af6c70071b16e5fcee Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 18:35:06 +0200 Subject: [PATCH 45/50] Install missing openssl packages in the Github Actions workflow --- .github/workflows/testing.yml | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 1ce1b64a49..41f9f51e86 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -40,11 +40,11 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt update - sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev + sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev - - name: Install macOs postgres dependencies + - name: Install macOS postgres dependencies if: matrix.os == 'macos-latest' - run: brew install flex bison + run: brew install flex bison openssl - name: Set pg revision for caching id: pg_ver @@ -58,10 +58,27 @@ jobs: tmp_install/ key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} + - name: Set extra env for macOS + if: matrix.os == 'macos-latest' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' + # and the real cause will be inside config.log + - name: Print configure logs in case of failure + if: failure() + continue-on-error: true + run: | + echo '' && echo '=== config.log ===' && echo '' + cat tmp_install/build/config.log + echo '' && echo '=== configure.log ===' && echo '' + cat tmp_install/build/configure.log + - name: Cache cargo deps id: cache_cargo uses: actions/cache@v2 From e5a2b0372d73854121c159c0ea7092bd72d0d8dd Mon Sep 17 00:00:00 2001 From: Anton Chaporgin Date: Wed, 1 Jun 2022 15:40:45 +0300 Subject: [PATCH 46/50] remove sk1 from inventory (#1845) https://github.com/neondatabase/cloud/issues/1454 --- .circleci/ansible/staging.hosts | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/ansible/staging.hosts b/.circleci/ansible/staging.hosts index cf5b98eaa1..4273b885e1 100644 --- a/.circleci/ansible/staging.hosts +++ b/.circleci/ansible/staging.hosts @@ -3,7 +3,6 @@ zenith-us-stage-ps-2 console_region_id=27 [safekeepers] -zenith-us-stage-sk-1 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 zenith-us-stage-sk-6 
console_region_id=27 From 6623c5b9d5322da766674319beb03a56cb68e462 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 30 May 2022 16:02:57 +0300 Subject: [PATCH 47/50] add installation instructions for Fedora Linux --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 131d5da110..be5032e87d 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,18 @@ Pageserver consists of: ## Running local installation -#### building on Ubuntu/ Debian (Linux) +#### building on Linux 1. Install build dependencies and other useful packages -On Ubuntu or Debian this set of packages should be sufficient to build the code: -```text +* On Ubuntu or Debian this set of packages should be sufficient to build the code: +```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd +libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client +``` +* On Fedora these packages are needed: +```bash +dnf install flex bison readline-devel zlib-devel openssl-devel \ + libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -44,16 +49,11 @@ libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh ``` -3. Install PostgreSQL Client -``` -apt install postgresql-client -``` - -4. Build neon and patched postgres +3. Build neon and patched postgres ```sh git clone --recursive https://github.com/neondatabase/neon.git cd neon -make -j5 +make -j`nproc` ``` #### building on OSX (12.3.1) From e5cb72757250457a61eeb4bdd7c613527ce7ec98 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 22 Apr 2022 13:56:48 +0300 Subject: [PATCH 48/50] Replace callmemaybe with etcd subscriptions on safekeeper timeline info --- control_plane/src/compute.rs | 1 - control_plane/src/storage.rs | 47 +- docs/settings.md | 18 +- libs/etcd_broker/src/lib.rs | 6 +- libs/utils/src/postgres_backend.rs | 13 +- libs/utils/src/zid.rs | 2 +- pageserver/Cargo.toml | 2 +- pageserver/src/config.rs | 15 + pageserver/src/http/models.rs | 23 + pageserver/src/http/routes.rs | 50 +- pageserver/src/layered_repository.rs | 22 + pageserver/src/page_service.rs | 26 - pageserver/src/repository.rs | 3 + pageserver/src/tenant_config.rs | 54 + pageserver/src/tenant_mgr.rs | 178 +- pageserver/src/thread_mgr.rs | 4 +- pageserver/src/walreceiver.rs | 1554 ++++++++++++----- .../src/walreceiver/connection_handler.rs | 405 +++++ safekeeper/src/bin/safekeeper.rs | 26 +- safekeeper/src/lib.rs | 1 - safekeeper/src/send_wal.rs | 67 - safekeeper/src/timeline.rs | 116 +- .../batch_others/test_pageserver_api.py | 10 +- test_runner/fixtures/zenith_fixtures.py | 4 +- .../performance/test_bulk_tenant_create.py | 14 +- 25 files changed, 1968 insertions(+), 693 deletions(-) create mode 100644 pageserver/src/walreceiver/connection_handler.rs diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 06a14d8a41..e78f96074e 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -352,7 +352,6 @@ impl PostgresNode { // This isn't really a supported configuration, but can be useful for // testing. 
conf.append("synchronous_standby_names", "pageserver"); - conf.append("neon.callmemaybe_connstring", &self.connstr()); } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 24cdbce8f3..a8f21406fb 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::io::Write; use std::net::TcpStream; +use std::num::NonZeroU64; use std::path::PathBuf; use std::process::Command; use std::time::Duration; @@ -11,6 +12,7 @@ use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest}; +use pageserver::tenant_mgr::TenantInfo; use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -26,7 +28,6 @@ use utils::{ use crate::local_env::LocalEnv; use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; -use pageserver::tenant_mgr::TenantInfo; #[derive(Error, Debug)] pub enum PageserverHttpError { @@ -37,6 +38,12 @@ pub enum PageserverHttpError { Response(String), } +impl From for PageserverHttpError { + fn from(e: anyhow::Error) -> Self { + Self::Response(e.to_string()) + } +} + type Result = result::Result; pub trait ResponseErrorMessageExt: Sized { @@ -410,6 +417,15 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose()?, pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .get("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), + max_lsn_wal_lag: settings + .get("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, }) .send()? .error_from_body()? 
@@ -433,22 +449,41 @@ impl PageServerNode { tenant_id, checkpoint_distance: settings .get("checkpoint_distance") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'checkpoint_distance' as an integer")?, compaction_target_size: settings .get("compaction_target_size") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_target_size' as an integer")?, compaction_period: settings.get("compaction_period").map(|x| x.to_string()), compaction_threshold: settings .get("compaction_threshold") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_threshold' as an integer")?, gc_horizon: settings .get("gc_horizon") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_horizon' as an integer")?, gc_period: settings.get("gc_period").map(|x| x.to_string()), image_creation_threshold: settings .get("image_creation_threshold") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_threshold' as non zero integer")?, pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .get("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), + max_lsn_wal_lag: settings + .get("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, }) .send()? .error_from_body()?; diff --git a/docs/settings.md b/docs/settings.md index 0ca7223faa..4d828f22bc 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -31,7 +31,7 @@ broker_endpoints = ['some://etcd'] # [remote_storage] ``` -The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, +The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. @@ -54,7 +54,7 @@ Note that TOML distinguishes between strings and integers, the former require si A list of endpoints (etcd currently) to connect and pull the information from. Mandatory, does not have a default, since requires etcd to be started as a separate process, -and its connection url should be specified separately. +and its connection url should be specified separately. #### broker_etcd_prefix @@ -111,6 +111,20 @@ L0 delta layer threshold for L1 image layer creation. Default is 3. WAL retention duration for PITR branching. Default is 30 days. +#### walreceiver_connect_timeout + +Time to wait to establish the wal receiver connection before failing + +#### lagging_wal_timeout + +Time the pageserver did not get any WAL updates from safekeeper (if any). +Avoids lagging pageserver preemptively by forcing to switch it from stalled connections. + +#### max_lsn_wal_lag + +Difference between Lsn values of the latest available WAL on safekeepers: if currently connected safekeeper starts to lag too long and too much, +it gets swapped to the different one. 
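Taken together, the settings documented above describe when the pageserver should abandon its current safekeeper connection. A simplified reading of that policy is sketched below; this is not the actual walreceiver code, all names are illustrative, and `walreceiver_connect_timeout` (which only bounds the initial connection attempt) is not shown:

```rust
use std::time::{Duration, Instant};

/// Hypothetical view of the currently connected safekeeper.
struct CurrentConnection {
    last_wal_update: Instant,
    commit_lsn: u64,
}

/// Switch away from the current safekeeper once it has been quiet for
/// `lagging_wal_timeout` and another safekeeper either has newer WAL or
/// is ahead by at least `max_lsn_wal_lag`.
fn should_switch(
    current: &CurrentConnection,
    best_other_commit_lsn: u64,
    lagging_wal_timeout: Duration,
    max_lsn_wal_lag: u64,
) -> bool {
    let quiet_too_long = current.last_wal_update.elapsed() >= lagging_wal_timeout;
    let far_behind =
        best_other_commit_lsn.saturating_sub(current.commit_lsn) >= max_lsn_wal_lag;
    // Waiting for the timeout even when lagging avoids eager reconnects.
    quiet_too_long && (far_behind || best_other_commit_lsn > current.commit_lsn)
}
```

Per-tenant overrides for these values are wired through the tenant configuration later in this patch.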
+ #### initial_superuser_name Name of the initial superuser role, passed to initdb when a new tenant diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 7fe142502b..0bfce66a5d 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -31,7 +31,7 @@ struct SafekeeperTimeline { /// Published data about safekeeper's timeline. Fields made optional for easy migrations. #[serde_as] -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Clone, Deserialize, Serialize)] pub struct SkTimelineInfo { /// Term of the last entry. pub last_log_term: Option, @@ -55,7 +55,9 @@ pub struct SkTimelineInfo { #[serde(default)] pub peer_horizon_lsn: Option, #[serde(default)] - pub safekeeper_connection_string: Option, + pub safekeeper_connstr: Option, + #[serde(default)] + pub pageserver_connstr: Option, } #[derive(Debug, thiserror::Error)] diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 5fdb1ff9d2..ff71423122 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -336,11 +336,11 @@ impl PostgresBackend { let have_tls = self.tls_config.is_some(); match msg { FeMessage::StartupPacket(m) => { - trace!("got startup message {:?}", m); + trace!("got startup message {m:?}"); match m { FeStartupPacket::SslRequest => { - info!("SSL requested"); + debug!("SSL requested"); self.write_message(&BeMessage::EncryptionResponse(have_tls))?; if have_tls { @@ -349,7 +349,7 @@ impl PostgresBackend { } } FeStartupPacket::GssEncRequest => { - info!("GSS requested"); + debug!("GSS requested"); self.write_message(&BeMessage::EncryptionResponse(false))?; } FeStartupPacket::StartupMessage { .. } => { @@ -433,12 +433,7 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. - if query_string.starts_with("callmemaybe") { - // FIXME avoid printing a backtrace for tenant x not found errors until this is properly fixed - error!("query handler for '{}' failed: {}", query_string, e); - } else { - error!("query handler for '{}' failed: {:?}", query_string, e); - } + error!("query handler for '{}' failed: {:?}", query_string, e); self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index 0ef174da4d..6da5355f61 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -193,7 +193,7 @@ pub struct ZTenantId(ZId); zid_newtype!(ZTenantId); // A pair uniquely identifying Zenith instance. 
-#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct ZTenantTimelineId { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index d78d3622c4..298addb838 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [features] # It is simpler infra-wise to have failpoints enabled by default -# It shouldn't affect perf in any way because failpoints +# It shouldn't affect performance in any way because failpoints # are not placed in hot code paths default = ["failpoints"] profiling = ["pprof"] diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f44b0846a8..01b626e046 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -480,6 +480,21 @@ impl PageServerConf { if let Some(pitr_interval) = item.get("pitr_interval") { t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?); } + if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") { + t_conf.walreceiver_connect_timeout = Some(parse_toml_duration( + "walreceiver_connect_timeout", + walreceiver_connect_timeout, + )?); + } + if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") { + t_conf.lagging_wal_timeout = Some(parse_toml_duration( + "lagging_wal_timeout", + lagging_wal_timeout, + )?); + } + if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") { + t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?); + } Ok(t_conf) } diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index e00ccda2a1..c947cebcb6 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,3 +1,5 @@ +use std::num::NonZeroU64; + use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ @@ -33,6 +35,9 @@ pub struct TenantCreateRequest { pub gc_period: Option, pub image_creation_threshold: Option, pub pitr_interval: Option, + pub walreceiver_connect_timeout: Option, + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } #[serde_as] @@ -68,6 +73,9 @@ pub struct TenantConfigRequest { pub gc_period: Option, pub image_creation_threshold: Option, pub pitr_interval: Option, + pub walreceiver_connect_timeout: Option, + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } impl TenantConfigRequest { @@ -82,6 +90,21 @@ impl TenantConfigRequest { gc_period: None, image_creation_threshold: None, pitr_interval: None, + walreceiver_connect_timeout: None, + lagging_wal_timeout: None, + max_lsn_wal_lag: None, } } } + +/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. +/// We keep one WAL receiver active per timeline. 
+#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct WalReceiverEntry { + pub wal_producer_connstr: Option, + #[serde_as(as = "Option")] + pub last_received_msg_lsn: Option, + /// the timestamp (in microseconds) of the last received message + pub last_received_msg_ts: Option, +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bb650a34ed..a1198051a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -229,23 +229,16 @@ async fn wal_receiver_get_handler(request: Request) -> Result) -> Result, ApiError> { @@ -402,6 +395,19 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout) + } + + pub fn get_lagging_wal_timeout(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout) + } + + pub fn get_max_lsn_wal_lag(&self) -> NonZeroU64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag) + } + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) -> Result<()> { let mut tenant_conf = self.tenant_conf.write().unwrap(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 4f0fca4797..df43b8c0df 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,7 +7,6 @@ // *status* -- show actual info about this pageserver, // *pagestream* -- enter mode where smgr and pageserver talk with their // custom protocol. 
-// *callmemaybe $url* -- ask pageserver to start walreceiver on $url // use anyhow::{bail, ensure, Context, Result}; @@ -38,7 +37,6 @@ use crate::repository::Timeline; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; -use crate::walreceiver; use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; use postgres_ffi::xlog_utils::to_pg_timestamp; @@ -716,30 +714,6 @@ impl postgres_backend::Handler for PageServerHandler { // Check that the timeline exists self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("callmemaybe ") { - // callmemaybe - // TODO lazy static - let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap(); - let caps = re - .captures(query_string) - .with_context(|| format!("invalid callmemaybe: '{}'", query_string))?; - - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let connstr = caps.get(3).unwrap().as_str().to_owned(); - - self.check_permission(Some(tenantid))?; - - let _enter = - info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered(); - - // Check that the timeline exists - tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; - - walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5bf128e66b..9d5056cd16 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -469,6 +469,9 @@ pub mod repo_harness { gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), pitr_interval: Some(tenant_conf.pitr_interval), + walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), + lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), + max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), } } } diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 9bf223e59e..f68a820e95 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -10,6 +10,7 @@ //! use crate::config::PageServerConf; use serde::{Deserialize, Serialize}; +use std::num::NonZeroU64; use std::path::PathBuf; use std::time::Duration; use utils::zid::ZTenantId; @@ -34,6 +35,9 @@ pub mod defaults { pub const DEFAULT_GC_PERIOD: &str = "100 s"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1_000_000; } /// Per-tenant configuration options @@ -68,6 +72,17 @@ pub struct TenantConf { // Page versions older than this are garbage collected away. #[serde(with = "humantime_serde")] pub pitr_interval: Duration, + /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. 
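As a quick sanity check of the new per-tenant defaults introduced in this hunk, the following stand-alone sketch parses the same strings with the humantime crate (already used by this patch) and builds the NonZeroU64 lag threshold, mirroring what TenantConf::default() is expected to do. The variable names are illustrative, not part of the patch.

use std::num::NonZeroU64;

fn main() {
    // The three defaults added in tenant_config.rs: "2 seconds", "10 seconds", 1_000_000.
    let connect_timeout = humantime::parse_duration("2 seconds").unwrap();
    let lagging_wal_timeout = humantime::parse_duration("10 seconds").unwrap();
    let max_lsn_wal_lag = NonZeroU64::new(1_000_000).unwrap();

    assert_eq!(connect_timeout.as_secs(), 2);
    assert_eq!(lagging_wal_timeout.as_secs(), 10);
    println!("defaults: {connect_timeout:?}, {lagging_wal_timeout:?}, lag threshold {max_lsn_wal_lag}");
}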
+ #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Duration, + /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. + /// A stalled safekeeper will be changed to a newer one when it appears. + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Duration, + /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. + /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, + /// to avoid eager reconnects. + pub max_lsn_wal_lag: NonZeroU64, } /// Same as TenantConf, but this struct preserves the information about @@ -85,6 +100,11 @@ pub struct TenantConfOpt { pub image_creation_threshold: Option, #[serde(with = "humantime_serde")] pub pitr_interval: Option, + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } impl TenantConfOpt { @@ -108,6 +128,13 @@ impl TenantConfOpt { .image_creation_threshold .unwrap_or(global_conf.image_creation_threshold), pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), + walreceiver_connect_timeout: self + .walreceiver_connect_timeout + .unwrap_or(global_conf.walreceiver_connect_timeout), + lagging_wal_timeout: self + .lagging_wal_timeout + .unwrap_or(global_conf.lagging_wal_timeout), + max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), } } @@ -136,6 +163,15 @@ impl TenantConfOpt { if let Some(pitr_interval) = other.pitr_interval { self.pitr_interval = Some(pitr_interval); } + if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout { + self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout); + } + if let Some(lagging_wal_timeout) = other.lagging_wal_timeout { + self.lagging_wal_timeout = Some(lagging_wal_timeout); + } + if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { + self.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } } } @@ -155,6 +191,14 @@ impl TenantConf { image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), } } @@ -175,6 +219,16 @@ impl TenantConf { gc_period: Duration::from_secs(10), image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD, pitr_interval: Duration::from_secs(60 * 60), + walreceiver_connect_timeout: humantime::parse_duration( + defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .unwrap(), + lagging_wal_timeout: humantime::parse_duration( + defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT, + ) + .unwrap(), + max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .unwrap(), } } } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index cc35d79d16..c48b021d1f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -8,11 +8,10 @@ use crate::repository::{Repository, TimelineSyncStatusUpdate}; use 
crate::storage_sync::index::RemoteIndex; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; -use crate::thread_mgr; use crate::thread_mgr::ThreadKind; -use crate::timelines; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; +use crate::{thread_mgr, timelines, walreceiver}; use crate::{DatadirTimelineImpl, RepositoryImpl}; use anyhow::{bail, Context}; use serde::{Deserialize, Serialize}; @@ -21,23 +20,30 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fmt; use std::sync::Arc; +use tokio::sync::mpsc; use tracing::*; use utils::lsn::Lsn; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; mod tenants_state { + use anyhow::ensure; use std::{ collections::HashMap, sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, }; + use tokio::sync::mpsc; + use tracing::{debug, error}; use utils::zid::ZTenantId; - use crate::tenant_mgr::Tenant; + use crate::tenant_mgr::{LocalTimelineUpdate, Tenant}; lazy_static::lazy_static! { static ref TENANTS: RwLock> = RwLock::new(HashMap::new()); + /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, + /// so that it can enable/disable corresponding processes. + static ref TIMELINE_UPDATE_SENDER: RwLock>> = RwLock::new(None); } pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { @@ -51,6 +57,39 @@ mod tenants_state { .write() .expect("Failed to write() tenants lock, it got poisoned") } + + pub(super) fn set_timeline_update_sender( + timeline_updates_sender: mpsc::UnboundedSender, + ) -> anyhow::Result<()> { + let mut sender_guard = TIMELINE_UPDATE_SENDER + .write() + .expect("Failed to write() timeline_update_sender lock, it got poisoned"); + ensure!(sender_guard.is_none(), "Timeline update sender already set"); + *sender_guard = Some(timeline_updates_sender); + Ok(()) + } + + pub(super) fn try_send_timeline_update(update: LocalTimelineUpdate) { + match TIMELINE_UPDATE_SENDER + .read() + .expect("Failed to read() timeline_update_sender lock, it got poisoned") + .as_ref() + { + Some(sender) => { + if let Err(e) = sender.send(update) { + error!("Failed to send timeline update: {}", e); + } + } + None => debug!("Timeline update sender is not enabled, cannot send update {update:?}"), + } + } + + pub(super) fn stop_timeline_update_sender() { + TIMELINE_UPDATE_SENDER + .write() + .expect("Failed to write() timeline_update_sender lock, it got poisoned") + .take(); + } } struct Tenant { @@ -87,10 +126,10 @@ pub enum TenantState { impl fmt::Display for TenantState { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - TenantState::Active => f.write_str("Active"), - TenantState::Idle => f.write_str("Idle"), - TenantState::Stopping => f.write_str("Stopping"), - TenantState::Broken => f.write_str("Broken"), + Self::Active => f.write_str("Active"), + Self::Idle => f.write_str("Idle"), + Self::Stopping => f.write_str("Stopping"), + Self::Broken => f.write_str("Broken"), } } } @@ -99,6 +138,11 @@ impl fmt::Display for TenantState { /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the repository once download is completed. 
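The tenants_state changes above introduce a process-wide channel that tenant_mgr uses to tell the WAL receiver which timelines appeared or disappeared. A minimal stand-alone sketch of that flow is shown next, assuming only tokio; the TimelineUpdate enum and the u64 ids are simplified stand-ins for LocalTimelineUpdate and ZTenantTimelineId, not the pageserver types themselves.

use tokio::sync::mpsc;

// Simplified stand-in for LocalTimelineUpdate; the real enum carries a
// ZTenantTimelineId and, for Attach, an Arc to the local timeline.
#[derive(Debug)]
enum TimelineUpdate {
    Attach(u64),
    Detach(u64),
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    // tenant_mgr side: the channel is created once at startup and the sender is kept globally.
    let (sender, mut receiver) = mpsc::unbounded_channel::<TimelineUpdate>();

    // walreceiver main loop side: react to attach/detach events as they arrive.
    let consumer = tokio::spawn(async move {
        while let Some(update) = receiver.recv().await {
            match update {
                TimelineUpdate::Attach(id) => println!("start WAL streaming for timeline {id}"),
                TimelineUpdate::Detach(id) => println!("stop WAL streaming for timeline {id}"),
            }
        }
    });

    // load_local_timeline / detach_timeline side: best-effort, fire-and-forget notifications.
    sender.send(TimelineUpdate::Attach(1)).unwrap();
    sender.send(TimelineUpdate::Detach(1)).unwrap();
    drop(sender); // closing the channel lets the consumer loop finish
    consumer.await.unwrap();
}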
pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result { + let (timeline_updates_sender, timeline_updates_receiver) = + mpsc::unbounded_channel::(); + tenants_state::set_timeline_update_sender(timeline_updates_sender)?; + walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; + let SyncStartupData { remote_index, local_timeline_init_statuses, @@ -113,16 +157,27 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result), +} + +impl std::fmt::Debug for LocalTimelineUpdate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Detach(ttid) => f.debug_tuple("Remove").field(ttid).finish(), + Self::Attach(ttid, _) => f.debug_tuple("Add").field(ttid).finish(), + } + } +} + /// Updates tenants' repositories, changing their timelines state in memory. pub fn apply_timeline_sync_status_updates( conf: &'static PageServerConf, @@ -160,6 +215,7 @@ pub fn apply_timeline_sync_status_updates( /// Shut down all tenants. This runs as part of pageserver shutdown. /// pub fn shutdown_all_tenants() { + tenants_state::stop_timeline_update_sender(); let mut m = tenants_state::write_tenants(); let mut tenantids = Vec::new(); for (tenantid, tenant) in m.iter_mut() { @@ -173,7 +229,7 @@ pub fn shutdown_all_tenants() { } drop(m); - thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None); + thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None); thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None); thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None); @@ -247,32 +303,49 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { Some(tenants_state::read_tenants().get(&tenantid)?.state) } -/// -/// Change the state of a tenant to Active and launch its compactor and GC -/// threads. If the tenant was already in Active state or Stopping, does nothing. -/// -pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { +pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) .with_context(|| format!("Tenant not found for id {tenant_id}"))?; + let old_state = tenant.state; + tenant.state = new_state; + drop(m); - info!("activating tenant {tenant_id}"); - - match tenant.state { - // If the tenant is already active, nothing to do. 
- TenantState::Active => {} - - // If it's Idle, launch the compactor and GC threads - TenantState::Idle => { - thread_mgr::spawn( + match (old_state, new_state) { + (TenantState::Broken, TenantState::Broken) + | (TenantState::Active, TenantState::Active) + | (TenantState::Idle, TenantState::Idle) + | (TenantState::Stopping, TenantState::Stopping) => { + debug!("tenant {tenant_id} already in state {new_state}"); + } + (TenantState::Broken, ignored) => { + debug!("Ignoring {ignored} since tenant {tenant_id} is in broken state"); + } + (_, TenantState::Broken) => { + debug!("Setting tenant {tenant_id} status to broken"); + } + (TenantState::Stopping, ignored) => { + debug!("Ignoring {ignored} since tenant {tenant_id} is in stopping state"); + } + (TenantState::Idle, TenantState::Active) => { + info!("activating tenant {tenant_id}"); + let compactor_spawn_result = thread_mgr::spawn( ThreadKind::Compactor, Some(tenant_id), None, "Compactor thread", false, move || crate::tenant_threads::compact_loop(tenant_id), - )?; + ); + if compactor_spawn_result.is_err() { + let mut m = tenants_state::write_tenants(); + m.get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {tenant_id}"))? + .state = old_state; + drop(m); + } + compactor_spawn_result?; let gc_spawn_result = thread_mgr::spawn( ThreadKind::GarbageCollector, @@ -286,21 +359,31 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); if let Err(e) = &gc_spawn_result { + let mut m = tenants_state::write_tenants(); + m.get_mut(&tenant_id) + .with_context(|| format!("Tenant not found for id {tenant_id}"))? + .state = old_state; + drop(m); error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}"); thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); return gc_spawn_result; } - tenant.state = TenantState::Active; } - - TenantState::Stopping => { - // don't re-activate it if it's being stopped + (TenantState::Idle, TenantState::Stopping) => { + info!("stopping idle tenant {tenant_id}"); } - - TenantState::Broken => { - // cannot activate + (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { + info!("stopping tenant {tenant_id} threads due to new state {new_state}"); + thread_mgr::shutdown_threads( + Some(ThreadKind::WalReceiverManager), + Some(tenant_id), + None, + ); + thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None); + thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); } } + Ok(()) } @@ -325,15 +408,15 @@ pub fn get_local_timeline_with_load( .with_context(|| format!("Tenant {tenant_id} not found"))?; if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) { - return Ok(Arc::clone(page_tline)); + Ok(Arc::clone(page_tline)) + } else { + let page_tline = load_local_timeline(&tenant.repo, timeline_id) + .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; + tenant + .local_timelines + .insert(timeline_id, Arc::clone(&page_tline)); + Ok(page_tline) } - - let page_tline = load_local_timeline(&tenant.repo, timeline_id) - .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; - tenant - .local_timelines - .insert(timeline_id, Arc::clone(&page_tline)); - Ok(page_tline) } pub fn detach_timeline( @@ -351,6 +434,9 @@ pub fn detach_timeline( .detach_timeline(timeline_id) .context("Failed to detach inmem tenant timeline")?; 
tenant.local_timelines.remove(&timeline_id); + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach( + ZTenantTimelineId::new(tenant_id, timeline_id), + )); } None => bail!("Tenant {tenant_id} not found in local tenant state"), } @@ -379,6 +465,12 @@ fn load_local_timeline( repartition_distance, )); page_tline.init_logical_size()?; + + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach( + ZTenantTimelineId::new(repo.tenant_id(), timeline_id), + Arc::clone(&page_tline), + )); + Ok(page_tline) } diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 473cddda58..8264bdd97c 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -91,8 +91,8 @@ pub enum ThreadKind { // associated with one later, after receiving a command from the client. PageRequestHandler, - // Thread that connects to a safekeeper to fetch WAL for one timeline. - WalReceiver, + // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL. + WalReceiverManager, // Thread that handles compaction of all timelines for a tenant. Compactor, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index b8f349af8f..df8dd2fc29 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -1,61 +1,77 @@ +//! WAL receiver manages an open connection to safekeeper, to get the WAL it streams into. +//! To do so, a current implementation needs to do the following: //! -//! WAL receiver connects to the WAL safekeeper service, streams WAL, -//! decodes records and saves them in the repository for the correct -//! timeline. +//! * acknowledge the timelines that it needs to stream WAL into. +//! Pageserver is able to dynamically (un)load tenants on attach and detach, +//! hence WAL receiver needs to react on such events. //! -//! We keep one WAL receiver active per timeline. +//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. +//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically. +//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. +//! Without this data, no WAL streaming is possible currently. +//! +//! Only one active WAL streaming connection is allowed at a time. +//! The connection is supposed to be updated periodically, based on safekeeper timeline data. +//! +//! * handle the actual connection and WAL streaming +//! +//! Handle happens dynamically, by portions of WAL being processed and registered in the server. +//! Along with the registration, certain metadata is written to show WAL streaming progress and rely on that when considering safekeepers for connection. +//! +//! ## Implementation details +//! +//! WAL receiver's implementation consists of 3 kinds of nested loops, separately handling the logic from the bullets above: +//! +//! * [`init_wal_receiver_main_thread`], a wal receiver main thread, containing the control async loop: timeline addition/removal and interruption of a whole thread handling. +//! The loop is infallible, always trying to continue with the new tasks, the only place where it can fail is its initialization. +//! All of the code inside the loop is either async or a spawn_blocking wrapper around the sync code. +//! +//! * [`timeline_wal_broker_loop_step`], a broker task, handling the etcd broker subscription and polling, safekeeper selection logic and [re]connects. +//! 
On every concequent broker/wal streamer connection attempt, the loop steps are forced to wait for some time before running, +//! increasing with the number of attempts (capped with some fixed value). +//! This is done endlessly, to ensure we don't miss the WAL streaming when it gets available on one of the safekeepers. +//! +//! Apart from the broker management, it keeps the wal streaming connection open, with the safekeeper having the most advanced timeline state. +//! The connection could be closed from safekeeper side (with error or not), could be cancelled from pageserver side from time to time. +//! +//! * [`connection_handler::handle_walreceiver_connection`], a wal streaming task, opening the libpq connection and reading the data out of it to the end. +//! Does periodic reporting of the progress, to share some of the data via external HTTP API and to ensure we're able to switch connections when needed. +//! +//! Every task is cancellable via its separate cancellation channel, +//! also every such task's dependency (broker subscription or the data source channel) cancellation/drop triggers the corresponding task cancellation either. + +mod connection_handler; use crate::config::PageServerConf; -use crate::repository::{Repository, Timeline}; -use crate::tenant_mgr; -use crate::thread_mgr; +use crate::http::models::WalReceiverEntry; +use crate::repository::Timeline; +use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; use crate::thread_mgr::ThreadKind; -use crate::walingest::WalIngest; -use anyhow::{bail, Context, Error, Result}; -use bytes::BytesMut; -use fail::fail_point; -use lazy_static::lazy_static; -use postgres_ffi::waldecoder::*; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; +use crate::{thread_mgr, DatadirTimelineImpl}; +use anyhow::{ensure, Context}; +use chrono::{NaiveDateTime, Utc}; +use etcd_broker::{Client, SkTimelineInfo, SkTimelineSubscription, SkTimelineSubscriptionKind}; +use itertools::Itertools; +use once_cell::sync::Lazy; use std::cell::Cell; -use std::collections::HashMap; -use std::str::FromStr; -use std::sync::Mutex; +use std::collections::{hash_map, HashMap, HashSet}; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::Arc; use std::thread_local; -use std::time::SystemTime; -use tokio::pin; -use tokio_postgres::replication::ReplicationStream; -use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow}; -use tokio_stream::StreamExt; -use tracing::*; -use utils::{ - lsn::Lsn, - pq_proto::ZenithFeedback, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, +use std::time::Duration; +use tokio::select; +use tokio::{ + sync::{mpsc, watch, RwLock}, + task::JoinHandle, }; +use tracing::*; +use url::Url; +use utils::lsn::Lsn; +use utils::pq_proto::ZenithFeedback; +use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; -/// -/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. -/// We keep one WAL receiver active per timeline. -/// -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct WalReceiverEntry { - thread_id: u64, - wal_producer_connstr: String, - #[serde_as(as = "Option")] - last_received_msg_lsn: Option, - /// the timestamp (in microseconds) of the last received message - last_received_msg_ts: Option, -} - -lazy_static! 
{ - static ref WAL_RECEIVERS: Mutex> = - Mutex::new(HashMap::new()); -} +use self::connection_handler::{WalConnectionEvent, WalReceiverConnection}; thread_local! { // Boolean that is true only for WAL receiver threads @@ -64,375 +80,1133 @@ thread_local! { pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); } -fn drop_wal_receiver(tenantid: ZTenantId, timelineid: ZTimelineId) { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - receivers.remove(&(tenantid, timelineid)); -} +/// WAL receiver state for sharing with the outside world. +/// Only entries for timelines currently available in pageserver are stored. +static WAL_RECEIVER_ENTRIES: Lazy>> = + Lazy::new(|| RwLock::new(HashMap::new())); -// Launch a new WAL receiver, or tell one that's running about change in connection string -pub fn launch_wal_receiver( - conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, - wal_producer_connstr: &str, -) -> Result<()> { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - - match receivers.get_mut(&(tenantid, timelineid)) { - Some(receiver) => { - debug!("wal receiver already running, updating connection string"); - receiver.wal_producer_connstr = wal_producer_connstr.into(); - } - None => { - let thread_id = thread_mgr::spawn( - ThreadKind::WalReceiver, - Some(tenantid), - Some(timelineid), - "WAL receiver thread", - false, - move || { - IS_WAL_RECEIVER.with(|c| c.set(true)); - thread_main(conf, tenantid, timelineid); - Ok(()) - }, - )?; - - let receiver = WalReceiverEntry { - thread_id, - wal_producer_connstr: wal_producer_connstr.into(), - last_received_msg_lsn: None, - last_received_msg_ts: None, - }; - receivers.insert((tenantid, timelineid), receiver); - - // Update tenant state and start tenant threads, if they are not running yet. - tenant_mgr::activate_tenant(tenantid)?; - } - }; - Ok(()) -} - -/// Look up a WAL receiver's data in the global `WAL_RECEIVERS` -pub fn get_wal_receiver_entry( +/// Gets the public WAL streaming entry for a certain timeline. +pub async fn get_wal_receiver_entry( tenant_id: ZTenantId, timeline_id: ZTimelineId, ) -> Option { - let receivers = WAL_RECEIVERS.lock().unwrap(); - receivers.get(&(tenant_id, timeline_id)).cloned() + WAL_RECEIVER_ENTRIES + .read() + .await + .get(&ZTenantTimelineId::new(tenant_id, timeline_id)) + .cloned() } -// -// This is the entry point for the WAL receiver thread. -// -fn thread_main(conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId) { - let _enter = info_span!("WAL receiver", timeline = %timeline_id, tenant = %tenant_id).entered(); - info!("WAL receiver thread started"); - - // Look up the current WAL producer address - let wal_producer_connstr = { - match get_wal_receiver_entry(tenant_id, timeline_id) { - Some(e) => e.wal_producer_connstr, - None => { - info!( - "Unable to create the WAL receiver thread: no WAL receiver entry found for tenant {} and timeline {}", - tenant_id, timeline_id - ); - return; - } - } - }; - - // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, - // and start streaming WAL from it. 
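The new WAL_RECEIVER_ENTRIES above replaces the old std::sync::Mutex map with a once_cell Lazy holding a tokio RwLock, so handlers can read it without blocking a runtime thread. A reduced sketch of that pattern, with stand-in key and value types instead of ZTenantTimelineId and WalReceiverEntry, might look like this (assuming the once_cell and tokio crates already used by this patch):

use once_cell::sync::Lazy;
use std::collections::HashMap;
use tokio::sync::RwLock;

// Stand-in for WAL_RECEIVER_ENTRIES: key and value types are simplified here.
static ENTRIES: Lazy<RwLock<HashMap<u64, String>>> = Lazy::new(|| RwLock::new(HashMap::new()));

// Mirrors get_wal_receiver_entry: take the read lock asynchronously and clone the value out.
async fn get_entry(key: u64) -> Option<String> {
    ENTRIES.read().await.get(&key).cloned()
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    ENTRIES.write().await.insert(7, "wal producer connstr for timeline 7".to_owned());
    println!("{:?}", get_entry(7).await);
}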
- let res = walreceiver_main(conf, tenant_id, timeline_id, &wal_producer_connstr); - - // TODO cleanup info messages - if let Err(e) = res { - info!("WAL streaming connection failed ({})", e); - } else { - info!( - "walreceiver disconnected tenant {}, timelineid {}", - tenant_id, timeline_id - ); - } - - // Drop it from list of active WAL_RECEIVERS - // so that next callmemaybe request launched a new thread - drop_wal_receiver(tenant_id, timeline_id); -} - -fn walreceiver_main( - _conf: &PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - wal_producer_connstr: &str, -) -> anyhow::Result<(), Error> { - // Connect to the database in replication mode. - info!("connecting to {:?}", wal_producer_connstr); - let connect_cfg = format!( - "{} application_name=pageserver replication=true", - wal_producer_connstr +/// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline. +/// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities. +pub fn init_wal_receiver_main_thread( + conf: &'static PageServerConf, + mut timeline_updates_receiver: mpsc::UnboundedReceiver, +) -> anyhow::Result<()> { + let etcd_endpoints = conf.broker_endpoints.clone(); + ensure!( + !etcd_endpoints.is_empty(), + "Cannot start wal receiver: etcd endpoints are empty" ); - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - let (mut replication_client, connection) = - runtime.block_on(tokio_postgres::connect(&connect_cfg, NoTls))?; - // This is from tokio-postgres docs, but it is a bit weird in our case because we extensively use block_on - runtime.spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - info!("connected!"); - - // Immediately increment the gauge, then create a job to decrement it on thread exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } - - let identify = runtime.block_on(identify_system(&mut replication_client))?; - info!("{:?}", identify); - let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); - let mut caught_up = false; - - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {}", tenant_id))?; - let timeline = - tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).with_context(|| { - format!( - "local timeline {} not found for tenant {}", - timeline_id, tenant_id - ) - })?; - let remote_index = repo.get_remote_index(); - - // - // Start streaming the WAL, from where we left off previously. - // - // If we had previously received WAL up to some point in the middle of a WAL record, we - // better start from the end of last full WAL record, not in the middle of one. - let mut last_rec_lsn = timeline.get_last_record_lsn(); - let mut startpoint = last_rec_lsn; - - if startpoint == Lsn(0) { - bail!("No previous WAL position"); - } - - // There might be some padding after the last full record, skip it. 
- startpoint += startpoint.calc_padding(8u32); - + let broker_prefix = &conf.broker_etcd_prefix; info!( - "last_record_lsn {} starting replication from {}, server is at {}...", - last_rec_lsn, startpoint, end_of_wal + "Starting wal receiver main thread, etdc endpoints: {}", + etcd_endpoints.iter().map(Url::to_string).join(", ") ); - let query = format!("START_REPLICATION PHYSICAL {}", startpoint); + let runtime = tokio::runtime::Builder::new_multi_thread() + .thread_name("wal-receiver-runtime-thread") + .worker_threads(40) + .enable_all() + .on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true))) + .build() + .context("Failed to create storage sync runtime")?; + let etcd_client = runtime + .block_on(etcd_broker::Client::connect(etcd_endpoints, None)) + .context("Failed to connect to etcd")?; - let copy_stream = runtime.block_on(replication_client.copy_both_simple(&query))?; - let physical_stream = ReplicationStream::new(copy_stream); - pin!(physical_stream); - - let mut waldecoder = WalStreamDecoder::new(startpoint); - - let mut walingest = WalIngest::new(&*timeline, startpoint)?; - - while let Some(replication_message) = runtime.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! { - // check for shutdown first - biased; - _ = shutdown_watcher => { - info!("walreceiver interrupted"); - None - } - replication_message = physical_stream.next() => replication_message, - } - }) { - let replication_message = replication_message?; - let status_update = match replication_message { - ReplicationMessage::XLogData(xlog_data) => { - // Pass the WAL data to the decoder, and see if we can decode - // more records as a result. - let data = xlog_data.data(); - let startlsn = Lsn::from(xlog_data.wal_start()); - let endlsn = startlsn + data.len() as u64; - - trace!("received XLogData between {} and {}", startlsn, endlsn); - - waldecoder.feed_bytes(data); - - while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let _enter = info_span!("processing record", lsn = %lsn).entered(); - - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hitting a deadlock. - anyhow::ensure!(lsn.is_aligned()); - - walingest.ingest_record(&timeline, recdata, lsn)?; - - fail_point!("walreceiver-after-ingest"); - - last_rec_lsn = lsn; + thread_mgr::spawn( + ThreadKind::WalReceiverManager, + None, + None, + "WAL receiver manager main thread", + true, + move || { + runtime.block_on(async move { + let mut local_timeline_wal_receivers = HashMap::new(); + loop { + select! { + _ = thread_mgr::shutdown_watcher() => { + info!("Shutdown signal received"); + shutdown_all_wal_connections(&mut local_timeline_wal_receivers).await; + break; + }, + _ = wal_receiver_main_thread_loop_step( + broker_prefix, + &etcd_client, + &mut timeline_updates_receiver, + &mut local_timeline_wal_receivers, + ) => {}, + } } + }.instrument(info_span!("wal_receiver_main"))); - if !caught_up && endlsn >= end_of_wal { - info!("caught up at LSN {}", endlsn); - caught_up = true; + info!("Wal receiver main thread stopped"); + Ok(()) + }, + ) + .map(|_thread_id| ()) + .context("Failed to spawn wal receiver main thread") +} + +/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. +/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. 
+/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. +/// +/// Cannot fail, should always try to process the next timeline event even if the other one was not processed properly. +async fn wal_receiver_main_thread_loop_step<'a>( + broker_prefix: &'a str, + etcd_client: &'a Client, + timeline_updates_receiver: &'a mut mpsc::UnboundedReceiver, + local_timeline_wal_receivers: &'a mut HashMap< + ZTenantId, + HashMap, + >, +) { + // Only react on updates from [`tenant_mgr`] on local timeline attach/detach. + match timeline_updates_receiver.recv().await { + Some(update) => { + info!("Processing timeline update: {update:?}"); + match update { + // Timeline got detached, stop all related tasks and remove public timeline data. + LocalTimelineUpdate::Detach(id) => { + match local_timeline_wal_receivers.get_mut(&id.tenant_id) { + Some(wal_receivers) => { + if let hash_map::Entry::Occupied(mut o) = wal_receivers.entry(id.timeline_id) { + if let Err(e) = o.get_mut().shutdown(id).await { + error!("Failed to shut down timeline {id} wal receiver handle: {e:#}"); + return; + } else { + o.remove(); + } + } + if wal_receivers.is_empty() { + if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Idle).await { + error!("Failed to make tenant idle for id {id}: {e:#}"); + } + } + } + None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"), + }; + { + WAL_RECEIVER_ENTRIES.write().await.remove(&id); + } } + // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. + LocalTimelineUpdate::Attach(new_id, new_timeline) => { + let timelines = local_timeline_wal_receivers + .entry(new_id.tenant_id) + .or_default(); - timeline.tline.check_checkpoint_distance()?; + if timelines.is_empty() { + if let Err(e) = + change_tenant_state(new_id.tenant_id, TenantState::Active).await + { + error!("Failed to make tenant active for id {new_id}: {e:#}"); + return; + } + } - Some(endlsn) - } + let vacant_timeline_entry = match timelines.entry(new_id.timeline_id) { + hash_map::Entry::Occupied(_) => { + debug!("Attepted to readd an existing timeline {new_id}, ignoring"); + return; + } + hash_map::Entry::Vacant(v) => v, + }; - ReplicationMessage::PrimaryKeepAlive(keepalive) => { - let wal_end = keepalive.wal_end(); - let timestamp = keepalive.timestamp(); - let reply_requested = keepalive.reply() != 0; + let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = + match fetch_tenant_settings(new_id.tenant_id).await { + Ok(settings) => settings, + Err(e) => { + error!("Failed to fetch tenant settings for id {new_id}: {e:#}"); + return; + } + }; - trace!( - "received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})", - wal_end, - timestamp, - reply_requested, - ); - - if reply_requested { - Some(last_rec_lsn) - } else { - None - } - } - - _ => None, - }; - - if let Some(last_lsn) = status_update { - let timeline_remote_consistent_lsn = runtime.block_on(async { - remote_index - .read() - .await - // here we either do not have this timeline in remote index - // or there were no checkpoints for it yet - .timeline_entry(&ZTenantTimelineId { - tenant_id, - timeline_id, - }) - .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn()) - .unwrap_or(Lsn(0)) // no checkpoint was uploaded - }); - - // The last LSN we processed. It is not guaranteed to survive pageserver crash. 
- let write_lsn = u64::from(last_lsn); - // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); - // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash - // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let apply_lsn = u64::from(timeline_remote_consistent_lsn); - let ts = SystemTime::now(); - - // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` - { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - let entry = match receivers.get_mut(&(tenant_id, timeline_id)) { - Some(e) => e, - None => { - anyhow::bail!( - "no WAL receiver entry found for tenant {} and timeline {}", - tenant_id, - timeline_id + { + WAL_RECEIVER_ENTRIES.write().await.insert( + new_id, + WalReceiverEntry { + wal_producer_connstr: None, + last_received_msg_lsn: None, + last_received_msg_ts: None, + }, ); } - }; - entry.last_received_msg_lsn = Some(last_lsn); - entry.last_received_msg_ts = Some( - ts.duration_since(SystemTime::UNIX_EPOCH) - .expect("Received message time should be before UNIX EPOCH!") - .as_micros(), - ); + let (cancellation_sender, mut cancellation_receiver) = watch::channel(()); + let mut wal_connection_manager = WalConnectionManager { + id: new_id, + timeline: Arc::clone(&new_timeline), + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + wal_connection_data: None, + wal_connection_attempt: 0, + }; + + let broker_prefix = broker_prefix.to_string(); + let mut loop_client = etcd_client.clone(); + let broker_join_handle = tokio::spawn(async move { + info!("WAL receiver broker started, connecting to etcd"); + let mut cancellation = cancellation_receiver.clone(); + loop { + select! { + _ = cancellation.changed() => { + info!("Wal broker loop cancelled, shutting down"); + break; + }, + step_result = timeline_wal_broker_loop_step( + &broker_prefix, + &mut loop_client, + &mut wal_connection_manager, + &mut cancellation_receiver, + ) => match step_result { + Ok(ControlFlow::Break(())) => { + break; + } + Ok(ControlFlow::Continue(())) => {} + Err(e) => warn!("Error during wal receiver main thread step for timeline {new_id}: {e:#}"), + } + } + } + }.instrument(info_span!("timeline", id = %new_id))); + + vacant_timeline_entry.insert(TimelineWalBrokerLoopHandle { + broker_join_handle, + cancellation_sender, + }); + } } + } + None => { + info!("Local timeline update channel closed"); + shutdown_all_wal_connections(local_timeline_wal_receivers).await; + } + } +} - // Send zenith feedback message. - // Regular standby_status_update fields are put into this message. - let zenith_status_update = ZenithFeedback { - current_timeline_size: timeline.get_current_logical_size() as u64, - ps_writelsn: write_lsn, - ps_flushlsn: flush_lsn, - ps_applylsn: apply_lsn, - ps_replytime: ts, - }; +async fn fetch_tenant_settings( + tenant_id: ZTenantId, +) -> anyhow::Result<(Duration, Duration, NonZeroU64)> { + tokio::task::spawn_blocking(move || { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + Ok::<_, anyhow::Error>(( + repo.get_wal_receiver_connect_timeout(), + repo.get_lagging_wal_timeout(), + repo.get_max_lsn_wal_lag(), + )) + }) + .await + .with_context(|| format!("Failed to join on tenant {tenant_id} settings fetch task"))? 
+} - debug!("zenith_status_update {:?}", zenith_status_update); +async fn change_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { + tokio::task::spawn_blocking(move || { + tenant_mgr::set_tenant_state(tenant_id, new_state) + .with_context(|| format!("Failed to activate tenant {tenant_id}")) + }) + .await + .with_context(|| format!("Failed to spawn activation task for tenant {tenant_id}"))? +} - let mut data = BytesMut::new(); - zenith_status_update.serialize(&mut data)?; - runtime.block_on( - physical_stream - .as_mut() - .zenith_status_update(data.len() as u64, &data), - )?; +async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) { + if n == 0 { + return; + } + let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds); + info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task"); + tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; +} + +async fn shutdown_all_wal_connections( + local_timeline_wal_receivers: &mut HashMap< + ZTenantId, + HashMap, + >, +) { + info!("Shutting down all WAL connections"); + let mut broker_join_handles = Vec::new(); + for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { + for (timeline_id, handles) in timelines { + handles.cancellation_sender.send(()).ok(); + broker_join_handles.push(( + ZTenantTimelineId::new(tenant_id, timeline_id), + handles.broker_join_handle, + )); } } - Ok(()) -} - -/// Data returned from the postgres `IDENTIFY_SYSTEM` command -/// -/// See the [postgres docs] for more details. -/// -/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html -#[derive(Debug)] -// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as -// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 -#[allow(dead_code)] -pub struct IdentifySystem { - systemid: u64, - timeline: u32, - xlogpos: PgLsn, - dbname: Option, -} - -/// There was a problem parsing the response to -/// a postgres IDENTIFY_SYSTEM command. -#[derive(Debug, thiserror::Error)] -#[error("IDENTIFY_SYSTEM parse error")] -pub struct IdentifyError; - -/// Run the postgres `IDENTIFY_SYSTEM` command -pub async fn identify_system(client: &mut Client) -> Result { - let query_str = "IDENTIFY_SYSTEM"; - let response = client.simple_query(query_str).await?; - - // get(N) from row, then parse it as some destination type. - fn get_parse(row: &SimpleQueryRow, idx: usize) -> Result - where - T: FromStr, + let mut tenants = HashSet::with_capacity(broker_join_handles.len()); + for (id, broker_join_handle) in broker_join_handles { + tenants.insert(id.tenant_id); + debug!("Waiting for wal broker for timeline {id} to finish"); + if let Err(e) = broker_join_handle.await { + error!("Failed to join on wal broker for timeline {id}: {e}"); + } + } + if let Err(e) = tokio::task::spawn_blocking(move || { + for tenant_id in tenants { + if let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { + error!("Failed to make tenant {tenant_id} idle: {e:?}"); + } + } + }) + .await { - let val = row.get(idx).ok_or(IdentifyError)?; - val.parse::().or(Err(IdentifyError)) - } - - // extract the row contents into an IdentifySystem struct. - // written as a closure so I can use ? for Option here. 
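To make the retry cadence of exponential_backoff concrete, here is a small stand-alone restatement of the same formula (no wait for n == 0, otherwise min(base^(n-1), max_seconds)) evaluated with the base 2.0 and 60-second cap that the broker loop passes in: the waits grow 1 s, 2 s, 4 s and saturate at 60 s once 2^(n-1) exceeds the cap. The backoff_seconds name is illustrative.

// Same formula as exponential_backoff: no wait for n == 0, otherwise min(base^(n-1), max).
fn backoff_seconds(n: u32, base: f64, max_seconds: f64) -> f64 {
    if n == 0 {
        return 0.0;
    }
    base.powf(f64::from(n) - 1.0).min(max_seconds)
}

fn main() {
    // With the broker loop's arguments (base 2.0, cap 60.0): 0, 1, 2, 4, 8, 16, 32, 60, 60, ...
    for attempt in 0..9u32 {
        println!("attempt {attempt}: wait {} s", backoff_seconds(attempt, 2.0, 60.0));
    }
}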
- if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) { - Ok(IdentifySystem { - systemid: get_parse(first_row, 0)?, - timeline: get_parse(first_row, 1)?, - xlogpos: get_parse(first_row, 2)?, - dbname: get_parse(first_row, 3).ok(), - }) - } else { - Err(IdentifyError.into()) + error!("Failed to spawn a task to make all tenants idle: {e:?}"); + } +} + +/// Broker WAL loop handle to cancel the loop safely when needed. +struct TimelineWalBrokerLoopHandle { + broker_join_handle: JoinHandle<()>, + cancellation_sender: watch::Sender<()>, +} + +impl TimelineWalBrokerLoopHandle { + /// Stops the broker loop, waiting for its current task to finish. + async fn shutdown(&mut self, id: ZTenantTimelineId) -> anyhow::Result<()> { + self.cancellation_sender.send(()).context( + "Unexpected: cancellation sender is dropped before the receiver in the loop is", + )?; + debug!("Waiting for wal receiver for timeline {id} to finish"); + let handle = &mut self.broker_join_handle; + handle + .await + .with_context(|| format!("Failed to join the wal reveiver broker for timeline {id}")) + } +} + +/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. +/// Based on the updates, desides whether to start, keep or stop a WAL receiver task. +async fn timeline_wal_broker_loop_step( + broker_prefix: &str, + etcd_client: &mut Client, + wal_connection_manager: &mut WalConnectionManager, + cancellation: &mut watch::Receiver<()>, +) -> anyhow::Result> { + let id = wal_connection_manager.id; + + // Endlessly try to subscribe for broker updates for a given timeline. + // If there are no safekeepers to maintain the lease, the timeline subscription will be inavailable in the broker and the operation will fail constantly. + // This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. + let mut broker_subscription: SkTimelineSubscription; + let mut attempt = 0; + loop { + select! { + _ = cancellation.changed() => { + info!("Subscription backoff cancelled, shutting down"); + return Ok(ControlFlow::Break(())); + }, + _ = exponential_backoff(attempt, 2.0, 60.0) => {}, + } + attempt += 1; + + select! { + _ = cancellation.changed() => { + info!("Broker subscription loop cancelled, shutting down"); + return Ok(ControlFlow::Break(())); + }, + new_subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( + etcd_client, + SkTimelineSubscriptionKind::timeline(broker_prefix.to_owned(), id), + ) + .instrument(info_span!("etcd_subscription")) => match new_subscription { + Ok(new_subscription) => { + broker_subscription = new_subscription; + break; + } + Err(e) => { + warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}"); + continue; + } + }, + + } + } + + info!("Subscribed for etcd timeline changes, considering walreceiver connections"); + + loop { + select! { + // the order of the polls is especially important here, since the first task to complete gets selected and the others get dropped (cancelled). + // place more frequetly updated tasks below to ensure the "slow" tasks are also reacted to. + biased; + // first, the cancellations are checked, to ensure we exit eagerly + _ = cancellation.changed() => { + info!("Broker loop cancelled, shutting down"); + break; + } + // then, we check for new events from the WAL connection: the existing connection should either return some progress data, + // or block, allowing other tasks in this `select!` to run first. 
+ // + // We set a "timebomb" in the polling method, that waits long enough and cancels the entire loop if nothing happens during the wait. + // The wait is only initiated when no data (or a "channel closed" data) is received from the loop, ending with the break flow return. + // While waiting, more broker events are expected to be retrieved from etcd (currently, every safekeeper posts ~1 message/second). + // The timebomb ensures that we don't get stuck for too long on any of the WAL/etcd event polling, rather restarting the subscription entirely. + // + // We cannot return here eagerly on no WAL task data, since the result will get selected to early, not allowing etcd tasks to be polled properly. + // We cannot move etcd tasks above this select, since they are very frequent to finish and WAL events might get ignored. + // We need WAL events to periodically update the external data, so we cannot simply await the task result on the handler here. + wal_receiver_poll_result = wal_connection_manager.poll_connection_event_or_cancel() => match wal_receiver_poll_result { + ControlFlow::Break(()) => break, + ControlFlow::Continue(()) => {}, + }, + // finally, if no other tasks are completed, get another broker update and possibly reconnect + updates = broker_subscription.fetch_data() => match updates { + Some(mut all_timeline_updates) => { + if let Some(subscribed_timeline_updates) = all_timeline_updates.remove(&id) { + match wal_connection_manager.select_connection_candidate(subscribed_timeline_updates) { + Some(candidate) => { + info!("Switching to different safekeeper {} for timeline {id}, reason: {:?}", candidate.safekeeper_id, candidate.reason); + wal_connection_manager.change_connection(candidate.safekeeper_id, candidate.wal_producer_connstr).await; + }, + None => {} + } + } + }, + None => { + info!("Subscription source end was dropped, no more updates are possible, shutting down"); + break; + }, + }, + } + } + + info!("Waiting for the current connection to close"); + wal_connection_manager.close_connection().await; + broker_subscription + .cancel() + .await + .with_context(|| format!("Failed to cancel timeline {id} subscription in etcd"))?; + Ok(ControlFlow::Continue(())) +} + +/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. +struct WalConnectionManager { + id: ZTenantTimelineId, + timeline: Arc, + wal_connect_timeout: Duration, + lagging_wal_timeout: Duration, + max_lsn_wal_lag: NonZeroU64, + wal_connection_attempt: u32, + wal_connection_data: Option, +} + +#[derive(Debug)] +struct WalConnectionData { + safekeeper_id: NodeId, + connection: WalReceiverConnection, + connection_init_time: NaiveDateTime, + last_wal_receiver_data: Option<(ZenithFeedback, NaiveDateTime)>, +} + +#[derive(Debug, PartialEq, Eq)] +struct NewWalConnectionCandidate { + safekeeper_id: NodeId, + wal_producer_connstr: String, + reason: ReconnectReason, +} + +/// Stores the reason why WAL connection was switched, for furter debugging purposes. +#[derive(Debug, PartialEq, Eq)] +enum ReconnectReason { + NoExistingConnection, + LaggingWal { + current_lsn: Lsn, + new_lsn: Lsn, + threshold: NonZeroU64, + }, + NoWalTimeout { + last_wal_interaction: NaiveDateTime, + check_time: NaiveDateTime, + threshold: Duration, + }, +} + +impl WalConnectionManager { + /// Tries to get more data from the WAL connection. + /// If the WAL connection channel is dropped or no data is retrieved, a "timebomb" future is started to break the existing broker subscription. 
+ /// This future is intended to be used in the `select!` loop, so lengthy future normally gets dropped due to other futures completing. + /// If not, it's better to cancel the entire "stuck" loop and start over. + async fn poll_connection_event_or_cancel(&mut self) -> ControlFlow<(), ()> { + let (connection_data, wal_receiver_event) = match self.wal_connection_data.as_mut() { + Some(connection_data) => match connection_data.connection.next_event().await { + Some(event) => (connection_data, event), + None => { + warn!("WAL receiver event source stopped sending messages, waiting for other events to arrive"); + tokio::time::sleep(Duration::from_secs(30)).await; + warn!("WAL receiver without a connection spent sleeping 30s without being interrupted, aborting the loop"); + return ControlFlow::Break(()); + } + }, + None => { + tokio::time::sleep(Duration::from_secs(30)).await; + warn!("WAL receiver without a connection spent sleeping 30s without being interrupted, aborting the loop"); + return ControlFlow::Break(()); + } + }; + + match wal_receiver_event { + WalConnectionEvent::Started => { + self.wal_connection_attempt = 0; + } + WalConnectionEvent::NewWal(new_wal_data) => { + self.wal_connection_attempt = 0; + connection_data.last_wal_receiver_data = + Some((new_wal_data, Utc::now().naive_utc())); + } + WalConnectionEvent::End(wal_receiver_result) => { + match wal_receiver_result { + Ok(()) => { + info!("WAL receiver task finished, reconnecting"); + self.wal_connection_attempt = 0; + } + Err(e) => { + error!("WAL receiver task failed: {e:#}, reconnecting"); + self.wal_connection_attempt += 1; + } + } + self.close_connection().await; + } + } + + ControlFlow::Continue(()) + } + + /// Shuts down current connection (if any), waiting for it to finish. + async fn close_connection(&mut self) { + if let Some(data) = self.wal_connection_data.as_mut() { + match data.connection.shutdown().await { + Err(e) => { + error!("Failed to shutdown wal receiver connection: {e:#}"); + } + Ok(()) => self.wal_connection_data = None, + } + } + } + + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. + async fn change_connection( + &mut self, + new_safekeeper_id: NodeId, + new_wal_producer_connstr: String, + ) { + self.close_connection().await; + self.wal_connection_data = Some(WalConnectionData { + safekeeper_id: new_safekeeper_id, + connection: WalReceiverConnection::open( + self.id, + new_safekeeper_id, + new_wal_producer_connstr, + self.wal_connect_timeout, + ), + connection_init_time: Utc::now().naive_utc(), + last_wal_receiver_data: None, + }); + } + + /// Checks current state against every fetched safekeeper state of a given timeline. + /// Returns a new candidate, if the current state is somewhat lagging, or `None` otherwise. + /// The current rules for approving new candidates: + /// * pick the safekeeper with biggest `commit_lsn` that's after than pageserver's latest Lsn for the timeline + /// * if the leader is a different SK and either + /// * no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection) — reconnect + /// * same time amount had passed since the connection, WAL updates happened recently, but the new leader SK has timeline Lsn way ahead of the old one — reconnect + /// + /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. + /// Both thresholds are configured per tenant. 
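The "lagging WAL" rule described above reduces to a small LSN arithmetic check. The sketch below restates it with plain u64 values and the 1_000_000 default threshold from tenant_config.rs; lagging_behind is an illustrative name, and the reason_to_reconnect helper further down performs the same checked_sub comparison on Lsn values.

// Reduced version of the checked_sub comparison in reason_to_reconnect, with Lsn as a bare u64.
fn lagging_behind(last_ps_writelsn: u64, candidate_commit_lsn: u64, max_lsn_wal_lag: u64) -> bool {
    match candidate_commit_lsn.checked_sub(last_ps_writelsn) {
        Some(advantage) => advantage >= max_lsn_wal_lag,
        // The candidate is behind what the pageserver already wrote; no reason to switch.
        None => false,
    }
}

fn main() {
    let threshold = 1_000_000; // DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG
    assert!(!lagging_behind(100_000, 600_000, threshold)); // 500_000 bytes ahead: keep the current connection
    assert!(lagging_behind(100_000, 1_200_000, threshold)); // 1_100_000 bytes ahead: switch safekeepers
    println!("lag checks behave as expected");
}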
+ fn select_connection_candidate( + &self, + safekeeper_timelines: HashMap, + ) -> Option { + let (&new_sk_id, new_sk_timeline, new_wal_producer_connstr) = safekeeper_timelines + .iter() + .filter(|(_, info)| { + info.commit_lsn > Some(self.timeline.tline.get_last_record_lsn()) + }) + .filter_map(|(sk_id, info)| { + match wal_stream_connection_string( + self.id, + info.safekeeper_connstr.as_deref()?, + info.pageserver_connstr.as_deref()?, + ) { + Ok(connstr) => Some((sk_id, info, connstr)), + Err(e) => { + error!("Failed to create wal receiver connection string from broker data of safekeeper node {sk_id}: {e:#}"); + None + } + } + }) + .max_by_key(|(_, info, _)| info.commit_lsn)?; + + match self.wal_connection_data.as_ref() { + None => Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason: ReconnectReason::NoExistingConnection, + }), + Some(current_connection) => { + if current_connection.safekeeper_id == new_sk_id { + None + } else { + self.reason_to_reconnect(current_connection, new_sk_timeline) + .map(|reason| NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_producer_connstr: new_wal_producer_connstr, + reason, + }) + } + } + } + } + + fn reason_to_reconnect( + &self, + current_connection: &WalConnectionData, + new_sk_timeline: &SkTimelineInfo, + ) -> Option { + let last_sk_interaction_time = match current_connection.last_wal_receiver_data.as_ref() { + Some((last_wal_receiver_data, data_submission_time)) => { + let new_lsn = new_sk_timeline.commit_lsn?; + match new_lsn.0.checked_sub(last_wal_receiver_data.ps_writelsn) + { + Some(sk_lsn_advantage) => { + if sk_lsn_advantage >= self.max_lsn_wal_lag.get() { + return Some(ReconnectReason::LaggingWal { current_lsn: Lsn(last_wal_receiver_data.ps_writelsn), new_lsn, threshold: self.max_lsn_wal_lag }); + } + } + None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"), + } + *data_submission_time + } + None => current_connection.connection_init_time, + }; + + let now = Utc::now().naive_utc(); + match (now - last_sk_interaction_time).to_std() { + Ok(last_interaction) => { + if last_interaction > self.lagging_wal_timeout { + return Some(ReconnectReason::NoWalTimeout { + last_wal_interaction: last_sk_interaction_time, + check_time: now, + threshold: self.lagging_wal_timeout, + }); + } + } + Err(_e) => { + warn!("Last interaction with safekeeper {} happened in the future, ignoring the candidate. 
Interaction time: {last_sk_interaction_time}, now: {now}", + current_connection.safekeeper_id); + } + } + None + } +} + +fn wal_stream_connection_string( + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + listen_pg_addr_str: &str, + pageserver_connstr: &str, +) -> anyhow::Result { + let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); + let me_conf = sk_connstr + .parse::() + .with_context(|| { + format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one") + })?; + let (host, port) = utils::connstring::connection_host_port(&me_conf); + Ok(format!( + "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id} pageserver_connstr={pageserver_connstr}'", + )) +} + +#[cfg(test)] +mod tests { + use std::time::SystemTime; + + use crate::repository::{ + repo_harness::{RepoHarness, TIMELINE_ID}, + Repository, + }; + + use super::*; + + #[test] + fn no_connection_no_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("no_connection_no_candidate")?; + let mut data_manager_with_no_connection = dummy_wal_connection_manager(&harness); + data_manager_with_no_connection.wal_connection_data = None; + + let no_candidate = + data_manager_with_no_connection.select_connection_candidate(HashMap::from([ + ( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: None, + pageserver_connstr: Some("no safekeeper_connstr".to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("no pageserver_connstr".to_string()), + pageserver_connstr: None, + }, + ), + ( + NodeId(2), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: None, + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("no commit_lsn".to_string()), + pageserver_connstr: Some("no commit_lsn (p)".to_string()), + }, + ), + ( + NodeId(3), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: None, + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("no commit_lsn".to_string()), + pageserver_connstr: Some("no commit_lsn (p)".to_string()), + }, + ), + ])); + + assert!( + no_candidate.is_none(), + "Expected no candidate selected out of non full data options, but got {no_candidate:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn connection_no_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("connection_no_candidate")?; + + let current_lsn = 100_000; + let connected_sk_id = NodeId(0); + let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); + let mut dummy_connection_data = dummy_connection_data( + ZTenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }, + connected_sk_id, + ) + .await; + let now = Utc::now().naive_utc(); + dummy_connection_data.last_wal_receiver_data = Some(( + ZenithFeedback { + current_timeline_size: 1, + ps_writelsn: 1, + ps_applylsn: current_lsn, + ps_flushlsn: 1, + ps_replytime: SystemTime::now(), + }, + now, + )); + dummy_connection_data.connection_init_time = now; + data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); + + let no_candidate = + 
data_manager_with_connection.select_connection_candidate(HashMap::from([ + ( + connected_sk_id, + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn( + current_lsn + data_manager_with_connection.max_lsn_wal_lag.get() * 2 + )), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(current_lsn)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("not advanced Lsn".to_string()), + pageserver_connstr: Some("not advanced Lsn (p)".to_string()), + }, + ), + ( + NodeId(2), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn( + current_lsn + data_manager_with_connection.max_lsn_wal_lag.get() / 2 + )), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("not enough advanced Lsn".to_string()), + pageserver_connstr: Some("not enough advanced Lsn (p)".to_string()), + }, + ), + ])); + + assert!( + no_candidate.is_none(), + "Expected no candidate selected out of valid options since candidate Lsn data is ignored and others' was not advanced enough, but got {no_candidate:?}" + ); + + Ok(()) + } + + #[test] + fn no_connection_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("no_connection_candidate")?; + let mut data_manager_with_no_connection = dummy_wal_connection_manager(&harness); + data_manager_with_no_connection.wal_connection_data = None; + + let only_candidate = data_manager_with_no_connection + .select_connection_candidate(HashMap::from([( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1 + data_manager_with_no_connection + .max_lsn_wal_lag + .get())), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + )])) + .expect("Expected one candidate selected out of the only data option, but got none"); + assert_eq!(only_candidate.safekeeper_id, NodeId(0)); + assert_eq!( + only_candidate.reason, + ReconnectReason::NoExistingConnection, + "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" + ); + assert!(only_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + assert!(only_candidate + .wal_producer_connstr + .contains(DUMMY_PAGESERVER_CONNSTR)); + + let selected_lsn = 100_000; + let biggest_wal_candidate = data_manager_with_no_connection + .select_connection_candidate(HashMap::from([ + ( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn - 100)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("smaller commit_lsn".to_string()), + pageserver_connstr: Some("smaller commit_lsn (p)".to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + ), + ( + NodeId(2), + 
SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn + 100)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: None, + pageserver_connstr: Some( + "no safekeeper_connstr despite bigger commit_lsn".to_string(), + ), + }, + ), + ])) + .expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(biggest_wal_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + biggest_wal_candidate.reason, + ReconnectReason::NoExistingConnection, + "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" + ); + assert!(biggest_wal_candidate + .wal_producer_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + assert!(biggest_wal_candidate + .wal_producer_connstr + .contains(DUMMY_PAGESERVER_CONNSTR)); + + Ok(()) + } + + #[tokio::test] + async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let current_lsn = Lsn(100_000).align(); + + let id = ZTenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }; + + let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); + let connected_sk_id = NodeId(0); + let mut dummy_connection_data = dummy_connection_data(id, NodeId(0)).await; + let lagging_wal_timeout = + chrono::Duration::from_std(data_manager_with_connection.lagging_wal_timeout)?; + let time_over_threshold = + Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + dummy_connection_data.last_wal_receiver_data = Some(( + ZenithFeedback { + current_timeline_size: 1, + ps_writelsn: current_lsn.0, + ps_applylsn: 1, + ps_flushlsn: 1, + ps_replytime: SystemTime::now(), + }, + time_over_threshold, + )); + dummy_connection_data.connection_init_time = time_over_threshold; + data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); + + let new_lsn = Lsn(current_lsn.0 + data_manager_with_connection.max_lsn_wal_lag.get() + 1); + let candidates = HashMap::from([ + ( + connected_sk_id, + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(new_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()), + pageserver_connstr: Some("advanced by Lsn safekeeper (p)".to_string()), + }, + ), + ]); + + let over_threshcurrent_candidate = data_manager_with_connection + .select_connection_candidate(candidates) + .expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + over_threshcurrent_candidate.reason, + ReconnectReason::LaggingWal { + current_lsn, + new_lsn, + threshold: data_manager_with_connection.max_lsn_wal_lag + }, + "Should select bigger WAL safekeeper if it starts to lag enough" + ); + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains("advanced by Lsn safekeeper")); + assert!(over_threshcurrent_candidate + .wal_producer_connstr + .contains("advanced by Lsn 
safekeeper (p)")); + + Ok(()) + } + + #[tokio::test] + async fn timeout_wal_over_threshcurrent_candidate() -> anyhow::Result<()> { + let harness = RepoHarness::create("timeout_wal_over_threshcurrent_candidate")?; + let current_lsn = Lsn(100_000).align(); + + let id = ZTenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }; + + let mut data_manager_with_connection = dummy_wal_connection_manager(&harness); + let mut dummy_connection_data = dummy_connection_data(id, NodeId(1)).await; + let lagging_wal_timeout = + chrono::Duration::from_std(data_manager_with_connection.lagging_wal_timeout)?; + let time_over_threshold = + Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + dummy_connection_data.last_wal_receiver_data = None; + dummy_connection_data.connection_init_time = time_over_threshold; + data_manager_with_connection.wal_connection_data = Some(dummy_connection_data); + + let new_lsn = Lsn(current_lsn.0 + data_manager_with_connection.max_lsn_wal_lag.get() + 1); + let over_threshcurrent_candidate = data_manager_with_connection + .select_connection_candidate(HashMap::from([ + ( + NodeId(0), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(new_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + pageserver_connstr: Some(DUMMY_PAGESERVER_CONNSTR.to_string()), + }, + ), + ( + NodeId(1), + SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + safekeeper_connstr: Some("not advanced by Lsn safekeeper".to_string()), + pageserver_connstr: Some("not advanced by Lsn safekeeper".to_string()), + }, + ), + ])) + .expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); + match over_threshcurrent_candidate.reason { + ReconnectReason::NoWalTimeout { + last_wal_interaction, + threshold, + .. 
+ } => {
+ assert_eq!(last_wal_interaction, time_over_threshold);
+ assert_eq!(threshold, data_manager_with_connection.lagging_wal_timeout);
+ }
+ unexpected => panic!("Unexpected reason: {unexpected:?}"),
+ }
+ assert!(over_threshcurrent_candidate
+ .wal_producer_connstr
+ .contains(DUMMY_SAFEKEEPER_CONNSTR));
+ assert!(over_threshcurrent_candidate
+ .wal_producer_connstr
+ .contains(DUMMY_PAGESERVER_CONNSTR));
+
+ Ok(())
+ }
+
+ fn dummy_wal_connection_manager(harness: &RepoHarness) -> WalConnectionManager {
+ WalConnectionManager {
+ id: ZTenantTimelineId {
+ tenant_id: harness.tenant_id,
+ timeline_id: TIMELINE_ID,
+ },
+ timeline: Arc::new(DatadirTimelineImpl::new(
+ harness
+ .load()
+ .create_empty_timeline(TIMELINE_ID, Lsn(0))
+ .expect("Failed to create an empty timeline for dummy wal connection manager"),
+ 10_000,
+ )),
+ wal_connect_timeout: Duration::from_secs(1),
+ lagging_wal_timeout: Duration::from_secs(10),
+ max_lsn_wal_lag: NonZeroU64::new(300_000).unwrap(),
+ wal_connection_attempt: 0,
+ wal_connection_data: None,
+ }
+ }
+
+ const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr";
+ const DUMMY_PAGESERVER_CONNSTR: &str = "pageserver_connstr";
+
+ // The function itself does not need async, but it spawns a tokio::task underneath and hence needs
+ // a runtime to avoid panicking.
+ async fn dummy_connection_data(
+ id: ZTenantTimelineId,
+ safekeeper_id: NodeId,
+ ) -> WalConnectionData {
+ let dummy_connstr =
+ wal_stream_connection_string(id, DUMMY_SAFEKEEPER_CONNSTR, DUMMY_PAGESERVER_CONNSTR)
+ .expect("Failed to construct dummy wal producer connstr");
+ WalConnectionData {
+ safekeeper_id,
+ connection: WalReceiverConnection::open(
+ id,
+ safekeeper_id,
+ dummy_connstr,
+ Duration::from_secs(1),
+ ),
+ connection_init_time: Utc::now().naive_utc(),
+ last_wal_receiver_data: None,
+ }
+ }
+}
diff --git a/pageserver/src/walreceiver/connection_handler.rs b/pageserver/src/walreceiver/connection_handler.rs
new file mode 100644
index 0000000000..aaccee9730
--- /dev/null
+++ b/pageserver/src/walreceiver/connection_handler.rs
@@ -0,0 +1,405 @@
+//! Actual Postgres connection handler to stream WAL to the server.
+//! Runs as a separate, cancellable Tokio task.
+use std::{
+ str::FromStr,
+ sync::Arc,
+ time::{Duration, SystemTime},
+};
+
+use anyhow::{bail, ensure, Context};
+use bytes::BytesMut;
+use fail::fail_point;
+use postgres::{SimpleQueryMessage, SimpleQueryRow};
+use postgres_ffi::waldecoder::WalStreamDecoder;
+use postgres_protocol::message::backend::ReplicationMessage;
+use postgres_types::PgLsn;
+use tokio::{pin, select, sync::watch, time};
+use tokio_postgres::{replication::ReplicationStream, Client};
+use tokio_stream::StreamExt;
+use tracing::{debug, error, info, info_span, trace, warn, Instrument};
+use utils::{
+ lsn::Lsn,
+ pq_proto::ZenithFeedback,
+ zid::{NodeId, ZTenantTimelineId},
+};
+
+use crate::{
+ http::models::WalReceiverEntry,
+ repository::{Repository, Timeline},
+ tenant_mgr,
+ walingest::WalIngest,
+};
+
+#[derive(Debug, Clone)]
+pub enum WalConnectionEvent {
+ Started,
+ NewWal(ZenithFeedback),
+ End(Result<(), String>),
+}
+
+/// A wrapper around a standalone Tokio task, used to poll its updates or cancel the task.
+#[derive(Debug)]
+pub struct WalReceiverConnection {
+ handle: tokio::task::JoinHandle<()>,
+ cancellation: watch::Sender<()>,
+ events_receiver: watch::Receiver<WalConnectionEvent>,
+}
+
+impl WalReceiverConnection {
+ /// Initializes the connection task, returning a set of handles on top of it.
+ /// The task starts immediately after creation and fails if no connection is established within the given timeout.
+ pub fn open(
+ id: ZTenantTimelineId,
+ safekeeper_id: NodeId,
+ wal_producer_connstr: String,
+ connect_timeout: Duration,
+ ) -> Self {
+ let (cancellation, mut cancellation_receiver) = watch::channel(());
+ let (events_sender, events_receiver) = watch::channel(WalConnectionEvent::Started);
+
+ let handle = tokio::spawn(
+ async move {
+ let connection_result = handle_walreceiver_connection(
+ id,
+ &wal_producer_connstr,
+ &events_sender,
+ &mut cancellation_receiver,
+ connect_timeout,
+ )
+ .await
+ .map_err(|e| {
+ format!("Walreceiver connection for id {id} failed with error: {e:#}")
+ });
+
+ match &connection_result {
+ Ok(()) => {
+ debug!("Walreceiver connection for id {id} ended successfully")
+ }
+ Err(e) => warn!("{e}"),
+ }
+ events_sender
+ .send(WalConnectionEvent::End(connection_result))
+ .ok();
+ }
+ .instrument(info_span!("safekeeper_handle", sk = %safekeeper_id)),
+ );
+
+ Self {
+ handle,
+ cancellation,
+ events_receiver,
+ }
+ }
+
+ /// Polls for the next WAL receiver event, if there's any available since the last check.
+ /// Blocks if there's no new event available, returns `None` if no new events will ever occur.
+ /// Only the last event is returned; all events received between observations are lost.
+ pub async fn next_event(&mut self) -> Option<WalConnectionEvent> {
+ match self.events_receiver.changed().await {
+ Ok(()) => Some(self.events_receiver.borrow().clone()),
+ Err(_cancellation_error) => None,
+ }
+ }
+
+ /// Gracefully aborts the current WAL streaming task, waiting for it to finish processing the WAL streamed so far.
+ pub async fn shutdown(&mut self) -> anyhow::Result<()> {
+ self.cancellation.send(()).ok();
+ let handle = &mut self.handle;
+ handle
+ .await
+ .context("Failed to join on a walreceiver connection task")?;
+ Ok(())
+ }
+}
+
+async fn handle_walreceiver_connection(
+ id: ZTenantTimelineId,
+ wal_producer_connstr: &str,
+ events_sender: &watch::Sender<WalConnectionEvent>,
+ cancellation: &mut watch::Receiver<()>,
+ connect_timeout: Duration,
+) -> anyhow::Result<()> {
+ // Connect to the database in replication mode.
+ info!("connecting to {wal_producer_connstr}");
+ let connect_cfg =
+ format!("{wal_producer_connstr} application_name=pageserver replication=true");
+
+ let (mut replication_client, connection) = time::timeout(
+ connect_timeout,
+ tokio_postgres::connect(&connect_cfg, postgres::NoTls),
+ )
+ .await
+ .context("Timed out while waiting for walreceiver connection to open")?
+ .context("Failed to open walreceiver connection")?;
+ // The connection object performs the actual communication with the database,
+ // so spawn it off to run on its own.
+ let mut connection_cancellation = cancellation.clone();
+ tokio::spawn(
+ async move {
+ info!("connected!");
+ select! {
+ connection_result = connection => match connection_result{
+ Ok(()) => info!("Walreceiver db connection closed"),
+ Err(connection_error) => {
+ if connection_error.is_closed() {
+ info!("Connection closed regularly: {connection_error}")
+ } else {
+ warn!("Connection aborted: {connection_error}")
+ }
+ }
+ },
+
+ _ = connection_cancellation.changed() => info!("Connection cancelled"),
+ }
+ }
+ .instrument(info_span!("safekeeper_handle_db")),
+ );
+
+ // Immediately increment the gauge, then create a job to decrement it on task exit.
+ // One of the pros of `defer!` is that this will *most probably*
+ // get called, even in presence of panics.
+ let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); + gauge.inc(); + scopeguard::defer! { + gauge.dec(); + } + + let identify = identify_system(&mut replication_client).await?; + info!("{identify:?}"); + let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); + let mut caught_up = false; + let ZTenantTimelineId { + tenant_id, + timeline_id, + } = id; + + let (repo, timeline) = tokio::task::spawn_blocking(move || { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id) + .with_context(|| { + format!("local timeline {timeline_id} not found for tenant {tenant_id}") + })?; + Ok::<_, anyhow::Error>((repo, timeline)) + }) + .await + .with_context(|| format!("Failed to spawn blocking task to get repository and timeline for tenant {tenant_id} timeline {timeline_id}"))??; + + // + // Start streaming the WAL, from where we left off previously. + // + // If we had previously received WAL up to some point in the middle of a WAL record, we + // better start from the end of last full WAL record, not in the middle of one. + let mut last_rec_lsn = timeline.get_last_record_lsn(); + let mut startpoint = last_rec_lsn; + + if startpoint == Lsn(0) { + bail!("No previous WAL position"); + } + + // There might be some padding after the last full record, skip it. + startpoint += startpoint.calc_padding(8u32); + + info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, server is at {end_of_wal}..."); + + let query = format!("START_REPLICATION PHYSICAL {startpoint}"); + + let copy_stream = replication_client.copy_both_simple(&query).await?; + let physical_stream = ReplicationStream::new(copy_stream); + pin!(physical_stream); + + let mut waldecoder = WalStreamDecoder::new(startpoint); + + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; + + while let Some(replication_message) = { + select! { + // check for shutdown first + biased; + _ = cancellation.changed() => { + info!("walreceiver interrupted"); + None + } + replication_message = physical_stream.next() => replication_message, + } + } { + let replication_message = replication_message?; + let status_update = match replication_message { + ReplicationMessage::XLogData(xlog_data) => { + // Pass the WAL data to the decoder, and see if we can decode + // more records as a result. + let data = xlog_data.data(); + let startlsn = Lsn::from(xlog_data.wal_start()); + let endlsn = startlsn + data.len() as u64; + + trace!("received XLogData between {startlsn} and {endlsn}"); + + waldecoder.feed_bytes(data); + + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let _enter = info_span!("processing record", lsn = %lsn).entered(); + + // It is important to deal with the aligned records as lsn in getPage@LSN is + // aligned and can be several bytes bigger. Without this alignment we are + // at risk of hitting a deadlock. + ensure!(lsn.is_aligned()); + + walingest.ingest_record(&timeline, recdata, lsn)?; + + fail_point!("walreceiver-after-ingest"); + + last_rec_lsn = lsn; + } + + if !caught_up && endlsn >= end_of_wal { + info!("caught up at LSN {endlsn}"); + caught_up = true; + } + + let timeline_to_check = Arc::clone(&timeline.tline); + tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) + .await + .with_context(|| { + format!("Spawned checkpoint check task panicked for timeline {id}") + })? 
+ .with_context(|| { + format!("Failed to check checkpoint distance for timeline {id}") + })?; + + Some(endlsn) + } + + ReplicationMessage::PrimaryKeepAlive(keepalive) => { + let wal_end = keepalive.wal_end(); + let timestamp = keepalive.timestamp(); + let reply_requested = keepalive.reply() != 0; + + trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + + if reply_requested { + Some(last_rec_lsn) + } else { + None + } + } + + _ => None, + }; + + if let Some(last_lsn) = status_update { + let remote_index = repo.get_remote_index(); + let timeline_remote_consistent_lsn = remote_index + .read() + .await + // here we either do not have this timeline in remote index + // or there were no checkpoints for it yet + .timeline_entry(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn()) + // no checkpoint was uploaded + .unwrap_or(Lsn(0)); + + // The last LSN we processed. It is not guaranteed to survive pageserver crash. + let write_lsn = u64::from(last_lsn); + // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data + let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); + // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash + // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. + let apply_lsn = u64::from(timeline_remote_consistent_lsn); + let ts = SystemTime::now(); + + // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` + { + super::WAL_RECEIVER_ENTRIES.write().await.insert( + id, + WalReceiverEntry { + wal_producer_connstr: Some(wal_producer_connstr.to_owned()), + last_received_msg_lsn: Some(last_lsn), + last_received_msg_ts: Some( + ts.duration_since(SystemTime::UNIX_EPOCH) + .expect("Received message time should be before UNIX EPOCH!") + .as_micros(), + ), + }, + ); + } + + // Send zenith feedback message. + // Regular standby_status_update fields are put into this message. + let zenith_status_update = ZenithFeedback { + current_timeline_size: timeline.get_current_logical_size() as u64, + ps_writelsn: write_lsn, + ps_flushlsn: flush_lsn, + ps_applylsn: apply_lsn, + ps_replytime: ts, + }; + + debug!("zenith_status_update {zenith_status_update:?}"); + + let mut data = BytesMut::new(); + zenith_status_update.serialize(&mut data)?; + physical_stream + .as_mut() + .zenith_status_update(data.len() as u64, &data) + .await?; + if let Err(e) = events_sender.send(WalConnectionEvent::NewWal(zenith_status_update)) { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + } + } + + Ok(()) +} + +/// Data returned from the postgres `IDENTIFY_SYSTEM` command +/// +/// See the [postgres docs] for more details. +/// +/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html +#[derive(Debug)] +// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as +// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 +#[allow(dead_code)] +struct IdentifySystem { + systemid: u64, + timeline: u32, + xlogpos: PgLsn, + dbname: Option, +} + +/// There was a problem parsing the response to +/// a postgres IDENTIFY_SYSTEM command. 
+#[derive(Debug, thiserror::Error)] +#[error("IDENTIFY_SYSTEM parse error")] +struct IdentifyError; + +/// Run the postgres `IDENTIFY_SYSTEM` command +async fn identify_system(client: &mut Client) -> anyhow::Result { + let query_str = "IDENTIFY_SYSTEM"; + let response = client.simple_query(query_str).await?; + + // get(N) from row, then parse it as some destination type. + fn get_parse(row: &SimpleQueryRow, idx: usize) -> Result + where + T: FromStr, + { + let val = row.get(idx).ok_or(IdentifyError)?; + val.parse::().or(Err(IdentifyError)) + } + + // extract the row contents into an IdentifySystem struct. + // written as a closure so I can use ? for Option here. + if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) { + Ok(IdentifySystem { + systemid: get_parse(first_row, 0)?, + timeline: get_parse(first_row, 1)?, + xlogpos: get_parse(first_row, 2)?, + dbname: get_parse(first_row, 3).ok(), + }) + } else { + Err(IdentifyError.into()) + } +} diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 9feb984c4f..5ce2591ff3 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -16,7 +16,8 @@ use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; -use safekeeper::control_file::{self}; +use safekeeper::broker; +use safekeeper::control_file; use safekeeper::defaults::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, }; @@ -26,7 +27,6 @@ use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; -use safekeeper::{broker, callmemaybe}; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, zid::NodeId, @@ -272,9 +272,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; - let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - GlobalTimelines::init(callmemaybe_tx, wal_backup_launcher_tx); + GlobalTimelines::init(wal_backup_launcher_tx); let conf_ = conf.clone(); threads.push( @@ -296,29 +295,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) .spawn(|| { - // thread code - let thread_result = wal_service::thread_main(conf_cloned, pg_listener); - if let Err(e) = thread_result { - info!("safekeeper thread terminated: {}", e); + if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) { + info!("safekeeper thread terminated: {e}"); } }) .unwrap(); threads.push(safekeeper_thread); - let conf_cloned = conf.clone(); - let callmemaybe_thread = thread::Builder::new() - .name("callmemaybe thread".into()) - .spawn(|| { - // thread code - let thread_result = callmemaybe::thread_main(conf_cloned, callmemaybe_rx); - if let Err(e) = thread_result { - error!("callmemaybe thread terminated: {}", e); - } - }) - .unwrap(); - threads.push(callmemaybe_thread); - if !conf.broker_endpoints.is_empty() { let conf_ = conf.clone(); threads.push( diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 1fae9b00f8..f328d2e85a 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -8,7 +8,6 @@ use url::Url; use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; pub mod broker; -pub mod callmemaybe; pub mod control_file; pub mod control_file_upgrade; pub mod 
handler; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a89ed18071..7a6a8ca9b9 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -8,7 +8,6 @@ use anyhow::{bail, Context, Result}; use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; -use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::cmp::min; @@ -17,7 +16,6 @@ use std::sync::Arc; use std::thread::sleep; use std::time::Duration; use std::{str, thread}; -use tokio::sync::mpsc::UnboundedSender; use tracing::*; use utils::{ bin_ser::BeSer, @@ -25,7 +23,6 @@ use utils::{ postgres_backend::PostgresBackend, pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody, ZenithFeedback}, sock_split::ReadStream, - zid::{ZTenantId, ZTimelineId}, }; // See: https://www.postgresql.org/docs/13/protocol-replication.html @@ -83,40 +80,6 @@ impl Drop for ReplicationConnGuard { } } -// XXX: Naming is a bit messy here. -// This ReplicationStreamGuard lives as long as ReplicationConn -// and current ReplicationConnGuard is tied to the background thread -// that receives feedback. -struct ReplicationStreamGuard { - tx: UnboundedSender, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - pageserver_connstr: String, -} - -impl Drop for ReplicationStreamGuard { - fn drop(&mut self) { - // the connection with pageserver is lost, - // resume callback subscription - debug!( - "Connection to pageserver is gone. Resume callmemaybe subsciption if necessary. tenantid {} timelineid {}", - self.tenant_id, self.timeline_id, - ); - - let subscription_key = SubscriptionStateKey::new( - self.tenant_id, - self.timeline_id, - self.pageserver_connstr.to_owned(), - ); - - self.tx - .send(CallmeEvent::Resume(subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Resume request to callmemaybe thread {}", e); - }); - } -} - impl ReplicationConn { /// Create a new `ReplicationConn` pub fn new(pgb: &mut PostgresBackend) -> Self { @@ -256,36 +219,6 @@ impl ReplicationConn { }; info!("Start replication from {:?} till {:?}", start_pos, stop_pos); - // Don't spam pageserver with callmemaybe queries - // when replication connection with pageserver is already established. 
- let _guard = { - if spg.appname == Some("wal_proposer_recovery".to_string()) { - None - } else { - let pageserver_connstr = pageserver_connstr.expect("there should be a pageserver connection string since this is not a wal_proposer_recovery"); - let zttid = spg.timeline.get().zttid; - let tx_clone = spg.timeline.get().callmemaybe_tx.clone(); - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.clone(), - ); - tx_clone - .send(CallmeEvent::Pause(subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Pause request to callmemaybe thread {}", e); - }); - - // create a guard to subscribe callback again, when this connection will exit - Some(ReplicationStreamGuard { - tx: tx_clone, - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, - pageserver_connstr, - }) - } - }; - // switch to copy pgb.write_message(&BeMessage::CopyBothResponse)?; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2fc5bcc1f6..b7a549fef8 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -16,7 +16,7 @@ use std::fs::{self}; use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use std::time::Duration; -use tokio::sync::mpsc::{Sender, UnboundedSender}; +use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ @@ -25,7 +25,6 @@ use utils::{ zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; -use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, @@ -191,79 +190,33 @@ impl SharedState { self.wal_backup_active } - /// start/change walsender (via callmemaybe). - fn callmemaybe_sub( + /// Activate timeline's walsender: start/change timeline information propagated into etcd for further pageserver connections. + fn activate_walsender( &mut self, zttid: &ZTenantTimelineId, - pageserver_connstr: Option<&String>, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - // unsub old sub. xxx: callmemaybe is going out - let old_subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(old_subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Pause request to callmemaybe thread {}", e); - }); + new_pageserver_connstr: Option, + ) { + if self.pageserver_connstr != new_pageserver_connstr { + self.deactivate_walsender(zttid); + + if new_pageserver_connstr.is_some() { + info!( + "timeline {} has activated its walsender with connstr {new_pageserver_connstr:?}", + zttid.timeline_id, + ); + } + self.pageserver_connstr = new_pageserver_connstr; } - if let Some(pageserver_connstr) = pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - // xx: sending to channel under lock is not very cool, but - // shouldn't be a problem here. If it is, we can grab a counter - // here and later augment channel messages with it. 
- callmemaybe_tx
- .send(CallmeEvent::Subscribe(subscription_key))
- .unwrap_or_else(|e| {
- error!(
- "failed to send Subscribe request to callmemaybe thread {}",
- e
- );
- });
- info!(
- "timeline {} is subscribed to callmemaybe to {}",
- zttid.timeline_id, pageserver_connstr
- );
- }
- self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned());
- Ok(())
 }
- /// Deactivate the timeline: stop callmemaybe.
- fn callmemaybe_unsub(
- &mut self,
- zttid: &ZTenantTimelineId,
- callmemaybe_tx: &UnboundedSender<CallmeEvent>,
- ) -> Result<()> {
- if let Some(ref pageserver_connstr) = self.pageserver_connstr {
- let subscription_key = SubscriptionStateKey::new(
- zttid.tenant_id,
- zttid.timeline_id,
- pageserver_connstr.to_owned(),
- );
- callmemaybe_tx
- .send(CallmeEvent::Unsubscribe(subscription_key))
- .unwrap_or_else(|e| {
- error!(
- "failed to send Unsubscribe request to callmemaybe thread {}",
- e
- );
- });
+ /// Deactivate the timeline: stop sending the timeline data into etcd, so no pageserver can connect for WAL streaming.
+ fn deactivate_walsender(&mut self, zttid: &ZTenantTimelineId) {
+ if let Some(pageserver_connstr) = self.pageserver_connstr.take() {
 info!(
- "timeline {} is unsubscribed from callmemaybe to {}",
+ "timeline {} has deactivated its walsender with connstr {pageserver_connstr:?}",
 zttid.timeline_id,
- self.pageserver_connstr.as_ref().unwrap()
- );
+ )
 }
- Ok(())
 }
 fn get_wal_seg_size(&self) -> usize {
@@ -332,7 +285,6 @@ impl SharedState {
 /// Database instance (tenant)
 pub struct Timeline {
 pub zttid: ZTenantTimelineId,
- pub callmemaybe_tx: UnboundedSender<CallmeEvent>,
 /// Sending here asks for wal backup launcher attention (start/stop
 /// offloading). Sending zttid instead of concrete command allows to do
 /// sending without timeline lock.
@@ -348,7 +300,6 @@ pub struct Timeline {
 impl Timeline {
 fn new(
 zttid: ZTenantTimelineId,
- callmemaybe_tx: UnboundedSender<CallmeEvent>,
 wal_backup_launcher_tx: Sender<ZTenantTimelineId>,
 shared_state: SharedState,
 ) -> Timeline {
@@ -356,7 +307,6 @@ impl Timeline {
 watch::channel(shared_state.sk.inmem.commit_lsn);
 Timeline {
 zttid,
- callmemaybe_tx,
 wal_backup_launcher_tx,
 commit_lsn_watch_tx,
 commit_lsn_watch_rx,
@@ -378,7 +328,7 @@ impl Timeline {
 // should have kind of generations assigned by compute to distinguish
 // the latest one or even pass it through consensus to reliably deliver
 // to all safekeepers.
- shared_state.callmemaybe_sub(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?;
+ shared_state.activate_walsender(&self.zttid, pageserver_connstr.cloned());
 }
 // Wake up wal backup launcher, if offloading not started yet.
 if is_wal_backup_action_pending {
@@ -414,7 +364,7 @@ impl Timeline {
 (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
 replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);
 if stop {
- shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?;
+ shared_state.deactivate_walsender(&self.zttid);
 return Ok(true);
 }
 }
@@ -431,16 +381,14 @@ impl Timeline {
 /// Deactivates the timeline, assuming it is being deleted.
 /// Returns whether the timeline was already active.
 ///
- /// The callmemaybe thread is stopped by the deactivation message. We assume all other threads
- /// will stop by themselves eventually (possibly with errors, but no panics). There should be no
- /// compute threads (as we're deleting the timeline), actually.
Some WAL may be left unsent, but + /// We assume all threads will stop by themselves eventually (possibly with errors, but no panics). + /// There should be no compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but /// we're deleting the timeline anyway. pub async fn deactivate_for_delete(&self) -> Result { let was_active: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let shared_state = self.mutex.lock().unwrap(); was_active = shared_state.active; - shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?; } self.wal_backup_launcher_tx.send(self.zttid).await?; Ok(was_active) @@ -576,7 +524,8 @@ impl Timeline { shared_state.sk.inmem.remote_consistent_lsn, )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), - safekeeper_connection_string: Some(conf.listen_pg_addr.clone()), + safekeeper_connstr: Some(conf.listen_pg_addr.clone()), + pageserver_connstr: shared_state.pageserver_connstr.clone(), backup_lsn: Some(shared_state.sk.inmem.backup_lsn), }) } @@ -675,14 +624,12 @@ impl TimelineTools for Option> { struct GlobalTimelinesState { timelines: HashMap>, - callmemaybe_tx: Option>, wal_backup_launcher_tx: Option>, } lazy_static! { static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - callmemaybe_tx: None, wal_backup_launcher_tx: None, }); } @@ -697,13 +644,8 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn init( - callmemaybe_tx: UnboundedSender, - wal_backup_launcher_tx: Sender, - ) { + pub fn init(wal_backup_launcher_tx: Sender) { let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.callmemaybe_tx.is_none()); - state.callmemaybe_tx = Some(callmemaybe_tx); assert!(state.wal_backup_launcher_tx.is_none()); state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); } @@ -726,7 +668,6 @@ impl GlobalTimelines { let new_tli = Arc::new(Timeline::new( zttid, - state.callmemaybe_tx.as_ref().unwrap().clone(), state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); @@ -778,7 +719,6 @@ impl GlobalTimelines { let new_tli = Arc::new(Timeline::new( zttid, - state.callmemaybe_tx.as_ref().unwrap().clone(), state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 2b0e5ae8bd..d22654ad3e 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -63,10 +63,11 @@ def test_pageserver_http_get_wal_receiver_not_found(zenith_simple_env: ZenithEnv tenant_id, timeline_id = env.zenith_cli.create_tenant() - # no PG compute node is running, so no WAL receiver is running - with pytest.raises(ZenithPageserverApiException) as e: - _ = client.wal_receiver_get(tenant_id, timeline_id) - assert "Not Found" in str(e.value) + empty_response = client.wal_receiver_get(tenant_id, timeline_id) + + assert empty_response.get('wal_producer_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert empty_response.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert empty_response.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): @@ -81,7 +82,6 @@ def 
test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): # a successful `wal_receiver_get` response must contain the below fields assert list(res.keys()) == [ - "thread_id", "wal_producer_connstr", "last_received_msg_lsn", "last_received_msg_ts", diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index ff905efa53..37bc5fe541 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1600,9 +1600,7 @@ class Postgres(PgProtocol): for cfg_line in cfg_lines: # walproposer uses different application_name if ("synchronous_standby_names" in cfg_line or - # don't ask pageserver to fetch WAL from compute - "callmemaybe_connstring" in cfg_line or - # don't repeat safekeepers multiple times + # don't repeat safekeepers/wal_acceptors multiple times "safekeepers" in cfg_line): continue f.write(cfg_line) diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 0e16d3e749..a8a1ff7687 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -13,16 +13,12 @@ from fixtures.zenith_fixtures import ZenithEnvBuilder @pytest.mark.parametrize('tenants_count', [1, 5, 10]) -@pytest.mark.parametrize('use_safekeepers', ['with_wa', 'without_wa']) def test_bulk_tenant_create( zenith_env_builder: ZenithEnvBuilder, - use_safekeepers: str, tenants_count: int, zenbenchmark, ): - """Measure tenant creation time (with and without wal acceptors)""" - if use_safekeepers == 'with_wa': - zenith_env_builder.num_safekeepers = 3 + zenith_env_builder.num_safekeepers = 3 env = zenith_env_builder.init_start() time_slices = [] @@ -31,15 +27,15 @@ def test_bulk_tenant_create( start = timeit.default_timer() tenant, _ = env.zenith_cli.create_tenant() - env.zenith_cli.create_timeline( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) + env.zenith_cli.create_timeline(f'test_bulk_tenant_create_{tenants_count}_{i}', + tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
#if use_safekeepers == 'with_sa': # wa_factory.start_n_new(3) - pg_tenant = env.postgres.create_start( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) + pg_tenant = env.postgres.create_start(f'test_bulk_tenant_create_{tenants_count}_{i}', + tenant_id=tenant) end = timeit.default_timer() time_slices.append(end - start) From 1188c9a95c6fe55a8b37e8f52402ef2e954f934e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 30 May 2022 20:38:28 +0300 Subject: [PATCH 49/50] remove extra span as this code is already covered by create timeline span E g this log line contains duplicated data: INFO /timeline_create{tenant=8d367870988250a755101b5189bbbc17 new_timeline=Some(27e2580f51f5660642d8ce124e9ee4ac) lsn=None}: bootstrapping{timeline=27e2580f51f5660642d8ce124e9ee4ac tenant=8d367870988250a755101b5189bbbc17}: created root timeline 27e2580f51f5660642d8ce124e9ee4ac timeline.lsn 0/16960E8 this avoids variable duplication in `bootstrapping` subspan --- pageserver/src/timelines.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 9ab063107c..a3939661c1 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -283,8 +283,6 @@ fn bootstrap_timeline( tli: ZTimelineId, repo: &R, ) -> Result<()> { - let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); - let initdb_path = conf .tenant_path(&tenantid) .join(format!("tmp-timeline-{}", tli)); From de7eda2dc6a6dbad3c3ec96e71673c5a8a48bb79 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 1 Jun 2022 23:23:35 +0300 Subject: [PATCH 50/50] Fix url path printing --- control_plane/src/local_env.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2623f65242..f7bb890893 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -119,16 +119,24 @@ impl EtcdBroker { } pub fn comma_separated_endpoints(&self) -> String { - self.broker_endpoints.iter().map(Url::as_str).fold( - String::new(), - |mut comma_separated_urls, url| { + self.broker_endpoints + .iter() + .map(|url| { + // URL by default adds a '/' path at the end, which is not what etcd CLI wants. + let url_string = url.as_str(); + if url_string.ends_with('/') { + &url_string[0..url_string.len() - 1] + } else { + url_string + } + }) + .fold(String::new(), |mut comma_separated_urls, url| { if !comma_separated_urls.is_empty() { comma_separated_urls.push(','); } comma_separated_urls.push_str(url); comma_separated_urls - }, - ) + }) } }
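For illustration only (not part of the patch series), a minimal standalone sketch of the trailing-slash trimming the final commit introduces, assuming the `url` crate's usual behaviour of serializing a root URL with a trailing '/'; the helper name and endpoint values below are hypothetical:

    use url::Url;

    // Join broker endpoints with commas, dropping the single trailing '/'
    // that `Url::as_str()` appends to a root URL, mirroring the trimming in
    // the patched `comma_separated_endpoints` (the join itself is simplified).
    fn comma_separated(endpoints: &[Url]) -> String {
        endpoints
            .iter()
            .map(|url| {
                let url_string = url.as_str();
                // Strip exactly one trailing '/' if present, as the etcd CLI expects.
                url_string.strip_suffix('/').unwrap_or(url_string)
            })
            .collect::<Vec<_>>()
            .join(",")
    }

    fn main() {
        // Hypothetical etcd endpoints; `Url` serializes them as "http://127.0.0.1:2379/" etc.
        let endpoints = vec![
            Url::parse("http://127.0.0.1:2379").unwrap(),
            Url::parse("http://127.0.0.1:2479").unwrap(),
        ];
        // Prints "http://127.0.0.1:2379,http://127.0.0.1:2479" with no trailing slashes.
        println!("{}", comma_separated(&endpoints));
    }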